This is the code from CL 501725670, which removed TensorCore code from LLVM.
diff --git a/tpu_recision/platforms/xla/service/jellyfish/llvm_code_generator.cc b/tpu_recision/platforms/xla/service/jellyfish/llvm_code_generator.cc
new file mode 100644
index 0000000..240632c
--- /dev/null
+++ b/tpu_recision/platforms/xla/service/jellyfish/llvm_code_generator.cc
@@ -0,0 +1,4942 @@
+#include "platforms/xla/service/jellyfish/llvm_code_generator.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <functional>
+#include <iomanip>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <queue>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "base/timer.h"
+#include "learning/brain/tpu/runtime/tpu_chip_enums.h"
+#include "platforms/asic_sw/lib/deepsea/pxc/pfc/isa/barnacore_channel_codec.h"
+#include "platforms/deepsea/jellyfish/mnemonics/parser_iss.h"
+#include "platforms/deepsea/jellyfish/xdb/debugger.proto.h"
+#include "platforms/deepsea/software/jfc/mnemonics/parser_factory.h"
+#include "platforms/deepsea/software/pxc/mnemonics/parser_pf.h"
+#include "platforms/xla/port/util.h"
+#include "platforms/xla/service/jellyfish/bundle_requirement.h"
+#include "platforms/xla/service/jellyfish/bundle_requirement_tracker.h"
+#include "platforms/xla/service/jellyfish/dma_strides.h"
+#include "platforms/xla/service/jellyfish/hlo_deduplication.h"
+#include "platforms/xla/service/jellyfish/llo_allocation_assignment.h"
+#include "platforms/xla/service/jellyfish/llo_constant.h"
+#include "platforms/xla/service/jellyfish/llo_dumper.h"
+#include "platforms/xla/service/jellyfish/llo_instruction.h"
+#include "platforms/xla/service/jellyfish/llo_mnemonic_printer.h"
+#include "platforms/xla/service/jellyfish/llo_module.h"
+#include "platforms/xla/service/jellyfish/llo_phi_classifier.h"
+#include "platforms/xla/service/jellyfish/llo_region.h"
+#include "platforms/xla/service/jellyfish/llo_region_visitor.h"
+#include "platforms/xla/service/jellyfish/llo_verifier.h"
+#include "platforms/xla/service/jellyfish/llvm_mc_program_processor.h"
+#include "platforms/xla/service/jellyfish/memory_space_enum.h"
+#include "platforms/xla/service/jellyfish/metadata/llo_opcode.h"
+#include "platforms/xla/service/jellyfish/sched/arch_register_tracker.h"
+#include "platforms/xla/service/jellyfish/sched/fifo_tracker.h"
+#include "platforms/xla/service/jellyfish/vpack_format.h"
+#include "security/util/sha256.h"
+#include "strings/numbers.h"
+#include "third_party/absl/container/flat_hash_map.h"
+#include "third_party/absl/container/flat_hash_set.h"
+#include "third_party/absl/strings/str_join.h"
+#include "third_party/absl/strings/str_replace.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/Analysis/TargetTransformInfo.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/AsmParser/Parser.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/Bitcode/BitcodeWriter.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/IR/CFG.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/IR/Constants.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/IR/DIBuilder.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/IR/DebugInfoMetadata.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/IR/DebugLoc.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/IR/DiagnosticInfo.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/IR/DiagnosticPrinter.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/IR/IRBuilder.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/IR/IntrinsicsTPU.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/IR/LLVMContext.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/IR/LegacyPassManager.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/IR/MDBuilder.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/IR/Module.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/IR/NoFolder.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/IR/PassTimingInfo.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/IR/Verifier.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/IRReader/IRReader.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/MC/TargetRegistry.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/Passes/PassBuilder.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/Support/CommandLine.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/Transforms/IPO.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "third_party/llvm/llvm/lib/Target/GoogleTPU/TPU.h"
+#include "third_party/tensorflow/compiler/xla/hlo/ir/hlo_opcode.h"
+#include "third_party/tensorflow/compiler/xla/map_util.h"
+#include "third_party/tensorflow/compiler/xla/status_macros.h"
+#include "third_party/tensorflow/compiler/xla/stream_executor/lib/statusor.h"
+#include "third_party/tensorflow/compiler/xla/util.h"
+#include "third_party/tensorflow/core/platform/errors.h"
+#include "thread/fiber/bundle.h"
+#include "util/gtl/comparator.h"
+#include "util/regexp/re2/re2.h"
+
+// Explicitly extern these initializers here instead of relying on
+// TargetSelect.h. TargetSelect.h only contains these declarations if we set the
+// blaze option --define=tpu=1. This allows us to link against the TPU backend
+// even if we didn't set that define.
+extern "C" void LLVMInitializeTPUAsmParser();
+extern "C" void LLVMInitializeTPUAsmPrinter();
+extern "C" void LLVMInitializeTPUTarget();
+extern "C" void LLVMInitializeTPUTargetMC();
+extern "C" void LLVMInitializeTPUTargetInfo();
+
+// We need to extend the default fibers stack size because LLVM does not manage
+// stack depth very well, so the default of 64K is not nearly enough.
+DECLARE_int32(fibers_default_thread_stack_size);
+REGISTER_MODULE_INITIALIZER(XLA_LLVM_code_generator, {
+ constexpr int32_t kFibersStackMin = 1 * 1024 * 1024;
+ if (absl::GetFlag(FLAGS_fibers_default_thread_stack_size) < kFibersStackMin) {
+ FLAGS_fibers_default_thread_stack_size = kFibersStackMin;
+ }
+});
+
+namespace xla {
+namespace jellyfish {
+
+using ::strings::ParseLeadingHex64Value;
+using ::tpu::TpuSequencerType;
+using ::tpu::TpuVersion;
+
+using StatusOrIsaProgram = StatusOr<std::unique_ptr<IsaProgramProto>>;
+
+namespace {
+
+constexpr const char* kCpuPfcBarnaCoreChannelController = "barnacore-cc-pf";
+constexpr const char* kCpuPfcTensorCore = "tensorcore-pf";
+constexpr const char* kCpuDfcTensorCore = "tensorcore-df";
+constexpr const char* kCpuJfcTensorCore = "tensorcore-jf";
+
+// One-time module initializer.
+// Must be called only once -- DO NOT CALL DIRECTLY.
+void GoogleTPUBackendInit(const std::string& cpu, const LloModule& module) {
+ std::vector<const char*> fake_argv = {""};
+ for (absl::string_view f : module.comp_env().xla_jf_llvm_flags().values()) {
+ fake_argv.push_back(f.data());
+ }
+ fake_argv.push_back("-tpu-enable-overlayer-passes");
+  // Limit GVN of instructions with side effects to reduce compile time and avoid
+  // increasing register pressure. More aggressive load-to-store forwarding can
+ // be done later in the flow when we have more information about scheduling
+ // and register pressure.
+ fake_argv.push_back("-memdep-block-scan-limit=10");
+ // Report used spill slots instead of failing compilation.
+ fake_argv.push_back("-tpu-report-used-spillslots=true");
+ // Do not unroll loops in LLVM.
+ fake_argv.push_back("-unroll-threshold=0");
+  // Except for BarnaCore, we need to propagate the DebugLocs that encode source
+  // LLO instructions to MCInst.
+ if (cpu != kCpuPfcBarnaCoreChannelController) {
+ fake_argv.push_back("-tpu-explicit-debug-loc-on-mcinst=true");
+ }
+  // We need to explicitly specify the enable or disable option since the TPU
+  // target would otherwise decide on the value itself.
+ fake_argv.push_back(cpu == kCpuPfcBarnaCoreChannelController
+ ? "-tpu-encode-mcinst-bundles"
+ : "-tpu-encode-mcinst-bundles=false");
+ if (!module.comp_env().xla_jf_llvm_use_fast_opt()) {
+ fake_argv.push_back("-tpu-skip-fast-opt");
+ }
+ llvm::cl::ParseCommandLineOptions(fake_argv.size(), &fake_argv[0]);
+
+ // Initialize GoogleTPU LLVM backend.
+ LLVMInitializeTPUAsmParser();
+ LLVMInitializeTPUAsmPrinter();
+ LLVMInitializeTPUTarget();
+ LLVMInitializeTPUTargetMC();
+ LLVMInitializeTPUTargetInfo();
+}
+
+// Create an instance of the LLVM target machine for the target specified.
+StatusOr<std::unique_ptr<llvm::TargetMachine>> GetLlvmTargetMachine(
+ const LloModule& module) {
+ // Get CPU for LLVM target.
+ const auto get_llvm_cpu = [&] {
+ switch (module.SequencerType()) {
+ case TpuSequencerType::kTensorCoreSequencer: {
+ return module.target().TensorCoreSequencerLlvmCpu();
+ }
+ case TpuSequencerType::kBarnaCoreAddressHandler: {
+ return module.target().BarnaCoreAddressHandlerLlvmCpu();
+ }
+ default: {
+ LOG(FATAL) << "Unsupported sequencer type: " << module.SequencerType();
+ }
+ }
+ };
+ const std::string cpu = get_llvm_cpu();
+ static std::once_flag flag;
+ std::call_once(flag, [&] { GoogleTPUBackendInit(cpu, module); });
+
+ constexpr auto kGoogleTpuTriple = "googletpu--";
+
+ std::string error_message;
+ const llvm::Target* llvm_target =
+ llvm::TargetRegistry::lookupTarget(kGoogleTpuTriple, error_message);
+ TF_RET_CHECK(llvm_target != nullptr)
+ << "Cannot create target: " << error_message;
+ TF_RET_CHECK(llvm_target->hasTargetMachine()) << "Target machine is missing";
+
+ // We use the default ctor for TargetOptions. If we wanted to initialize via
+ // InitTargetOptionsFromCodeGenFlags(), we would need to #include
+ // ".../CodeGen/CommandFlags.inc" but that would conflict with gpu_backend_lib
+ // which is already doing that, leading to a startup error like:
+ // CommandLine Error: Option 'mc-relax-all' registered more than once!
+ //
+ // Alternatively, we could wrap CommandFlags.inc in a cc_library(), use it
+ // here and in gpu_backend_lib, and let blaze deduplicate. See b/144117667.
+ llvm::TargetOptions options;
+ options.MCOptions.AsmVerbose = true;
+ if (cpu == kCpuPfcBarnaCoreChannelController) {
+ options.MCOptions.ShowMCEncoding = true;
+ }
+ std::unique_ptr<llvm::TargetMachine> target_machine(
+ llvm_target->createTargetMachine(kGoogleTpuTriple, cpu, "", options,
+ std::nullopt));
+ TF_RET_CHECK(static_cast<bool>(target_machine))
+ << "Failed to create target machine.";
+
+ return target_machine;
+}
+
+// Return the allocation offset in bytes in the context of the compilation
+// result.
+int64_t GetAdjustedAllocationOffset(
+ const LloCompilationResult* compilation_result,
+ const LloAllocation* allocation) {
+ // Implementation is based on CodeGenerator::AllocationAdjustmentWords(...).
+ if (!allocation->is_scoped() || allocation->is_remote() ||
+ allocation->is_fixed()) {
+ return allocation->offset();
+ }
+ const auto& allocation_offset_adjustments =
+ compilation_result->allocation_offset_adjustment_bytes;
+ return allocation->offset() +
+ allocation_offset_adjustments[allocation->space()]
+ [allocation->sub_space()];
+}
+
+// Maps allocation into offset/size to be used in LLVM. Used as a layer of
+// abstraction between XLA allocations and allocations used in LLVM.
+class AllocationMap {
+ public:
+ AllocationMap(const Target& target,
+ const LloCompilationResult* compilation_result,
+ MemorySpace space)
+ : target_(target),
+ compilation_result_(compilation_result),
+ alignment_in_words_(ComputeAlignment(target, space)) {}
+
+ Status RegisterAllocation(const LloAllocation* allocation);
+ StatusOr<MemRegion> ComputeBestSpillRange(MemRegion bounds_in_words) const;
+ StatusOr<MemRegion> GetAllocationRegionInWords(
+ const LloAllocation* allocation) const;
+ Status Pack(MemRegion bounds_in_words);
+ void Reset() { allocations_.clear(); }
+
+ private:
+ const Target& target_;
+ const LloCompilationResult* const compilation_result_;
+ const int64_t alignment_in_words_;
+
+ // Maps allocation into memory region to be used.
+ absl::flat_hash_map<const LloAllocation*, MemRegion> allocations_;
+
+ static int64_t ComputeAlignment(const Target& target, MemorySpace space);
+};
+
+Status AllocationMap::Pack(MemRegion bounds_in_words) {
+  // Here we try to remap allocations so that empty space is moved to the top of
+  // the memory. To do so, we find gaps between the allocations and shift the
+  // allocations above each gap down while maintaining a valid alignment.
+ if (allocations_.empty()) {
+ return OkStatus();
+ }
+
+  // First sort all allocations by their offset (first) and limit (second).
+ std::vector<const LloAllocation*> ordered_allocations;
+ for (const auto& [allocation, region] : allocations_) {
+ TF_RET_CHECK(region.first >= bounds_in_words.first);
+ TF_RET_CHECK(region.second <= bounds_in_words.second);
+ ordered_allocations.push_back(allocation);
+ }
+ absl::c_sort(ordered_allocations,
+ gtl::OrderBy([&](const LloAllocation* allocation) {
+ return allocations_[allocation];
+ }));
+
+  // Go in allocation order; if there is a gap before an allocation, try shifting
+  // that allocation and all following allocations toward the bottom of memory.
+ int64_t empty_space_start = bounds_in_words.first;
+ for (auto it = ordered_allocations.begin(), end = ordered_allocations.end();
+ it != end; ++it) {
+ MemRegion& region = allocations_[*it];
+ // Assume all allocations are aligned.
+ TF_RET_CHECK(region.first % alignment_in_words_ == 0);
+ TF_RET_CHECK(empty_space_start % alignment_in_words_ == 0);
+
+ if (region.first > empty_space_start &&
+ std::all_of(it, end, [](const LloAllocation* allocation) {
+ return !allocation->is_remote();
+ })) {
+ // We can shift all allocations starting with current one.
+ const int64_t shift_by = region.first - empty_space_start;
+ for (auto nested_it = it; nested_it != end; ++nested_it) {
+ MemRegion& nested_region = allocations_[*nested_it];
+ nested_region.first -= shift_by;
+ nested_region.second -= shift_by;
+ }
+ }
+
+ // Note: if/after the allocations were shifted, the region is updated since
+ // it's a reference.
+ empty_space_start =
+ std::max(empty_space_start,
+ MathUtil::RoundUpTo(region.second, alignment_in_words_));
+ }
+
+ return OkStatus();
+}
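+
+// Illustrative example of Pack() (not part of the original code; offsets are
+// assumed): with an 8-word alignment, bounds starting at 0, and allocations at
+// [0, 16) and [32, 48), the gap [16, 32) lets the second allocation shift down
+// by 16 words to [16, 32), moving all free space to the top of the bounds.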
+
+Status AllocationMap::RegisterAllocation(const LloAllocation* allocation) {
+ TF_RET_CHECK(allocation->has_offset());
+ TF_RET_CHECK(allocation->has_size());
+ const int64_t offset = address_util::ConvertOffsetByteToWord(
+ allocation->space(),
+ GetAdjustedAllocationOffset(compilation_result_, allocation), target_);
+ const int64_t size = address_util::ConvertOffsetByteToWord(
+ allocation->space(), allocation->size(), target_);
+ const auto [iter, inserted] =
+ allocations_.emplace(allocation, std::make_pair(offset, offset + size));
+ TF_RET_CHECK(inserted);
+ return OkStatus();
+}
+
+StatusOr<MemRegion> AllocationMap::GetAllocationRegionInWords(
+ const LloAllocation* allocation) const {
+ const auto it = allocations_.find(allocation);
+ TF_RET_CHECK(it != allocations_.end());
+ return it->second;
+}
+
+StatusOr<MemRegion> AllocationMap::ComputeBestSpillRange(
+ MemRegion bounds_in_words) const {
+ // The pair representing MemRegion start and limit points.
+ using StartOrLimitPoint = std::pair<int64_t, int64_t>;
+ std::vector<StartOrLimitPoint> points;
+
+  // We use these values for kRegionStart and kRegionLimit so that, after
+  // sorting, all start points at a given offset are located before the limit
+  // points at the same offset.
+ constexpr int64_t kRegionStart = -1;
+ constexpr int64_t kRegionLimit = 1;
+
+ // Mark bounds with empty regions.
+ points.emplace_back(bounds_in_words.first, kRegionStart);
+ points.emplace_back(bounds_in_words.first, kRegionLimit);
+ points.emplace_back(bounds_in_words.second, kRegionStart);
+ points.emplace_back(bounds_in_words.second, kRegionLimit);
+
+  // Go through all regions one by one and add their start/limit points, clipped
+  // to the bounds if necessary; ignore out-of-bounds regions.
+ for (const auto& [allocation, range_in_words] : allocations_) {
+ // Ensure we only track allocation inside bounds.
+ const int64_t offset_in_words_cut =
+ std::max(bounds_in_words.first, range_in_words.first);
+ const int64_t limit_in_words_cut =
+ std::min(bounds_in_words.second, range_in_words.second);
+ if (offset_in_words_cut < limit_in_words_cut) {
+ points.emplace_back(offset_in_words_cut, kRegionStart);
+ points.emplace_back(limit_in_words_cut, kRegionLimit);
+ }
+ }
+ absl::c_sort(points);
+
+ TF_RET_CHECK(points.front().first == bounds_in_words.first);
+ TF_RET_CHECK(points.front().second == kRegionStart);
+ TF_RET_CHECK(points.back().first == bounds_in_words.second);
+ TF_RET_CHECK(points.back().second == kRegionLimit);
+
+ MemRegion best{bounds_in_words.first, bounds_in_words.first};
+
+  // The first point must be a region start; it only seeds last_offset, so the
+  // best-gap computation effectively starts from the second point.
+ int64_t last_offset = -1;
+ int64_t open_regions = 0;
+
+ for (const StartOrLimitPoint& point : points) {
+ if (open_regions == 0) {
+ TF_RET_CHECK(point.second == kRegionStart);
+ if (last_offset >= 0 &&
+ (best.second - best.first < point.first - last_offset)) {
+ best = {last_offset, point.first};
+ }
+ }
+ last_offset = point.first;
+ open_regions += (point.second == kRegionStart) ? 1 : -1;
+ }
+
+ return std::make_pair(
+ MathUtil::RoundUpTo(best.first, alignment_in_words_),
+ MathUtil::RoundDownTo(best.second, alignment_in_words_));
+}
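+
+// Illustrative example of ComputeBestSpillRange() (not part of the original
+// code; values are assumed): with bounds [0, 100) and registered allocations
+// covering [0, 20) and [60, 70), the sweep above sees no open regions over
+// [20, 60) and [70, 100) and returns the larger gap, [20, 60), rounded inward
+// to the alignment.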
+
+int64_t AllocationMap::ComputeAlignment(const Target& target,
+ MemorySpace space) {
+ switch (space) {
+ case MemorySpace::kSmem:
+      // We don't really plan to compact SMEM allocations, but we still want to
+      // make sure the spill slots are properly aligned, so we use 2 for SMEM
+      // alignments based on XLA practice.
+ return 1;
+ case MemorySpace::kVmem:
+ // See LloAllocationAssignment::AllocationAlignment().
+ return address_util::ConvertOffsetByteToWord(
+ space, target.ChunkSizeBytes(), target);
+ default:
+ LOG(FATAL) << "Unsupported memory space: " << space;
+ }
+}
+
+class ModuleAllocationMaps {
+ public:
+ ModuleAllocationMaps(const Target& target,
+ const LloCompilationResult* compilation_result)
+ : smem_allocations_(target, compilation_result, MemorySpace::kSmem),
+ vmem_allocations_(target, compilation_result, MemorySpace::kVmem) {}
+
+ Status RegisterAllocation(const LloAllocation* allocation) {
+ const auto space = allocation->space();
+ TF_RET_CHECK(space == MemorySpace::kSmem || space == MemorySpace::kVmem);
+ TF_RETURN_IF_ERROR(map(space).RegisterAllocation(allocation));
+ return OkStatus();
+ }
+
+ StatusOr<MemRegion> GetAllocationRegionInWords(
+ const LloAllocation* allocation) const {
+ return map(allocation->space()).GetAllocationRegionInWords(allocation);
+ }
+
+ StatusOr<MemRegion> ComputeBestSpillRange(MemorySpace space,
+ MemRegion bounds_in_words) const {
+ return map(space).ComputeBestSpillRange(bounds_in_words);
+ }
+
+ Status PackVmemAllocations(MemRegion bounds_in_words) {
+ return map(MemorySpace::kVmem).Pack(bounds_in_words);
+ }
+
+ void Reset() {
+ smem_allocations_.Reset();
+ vmem_allocations_.Reset();
+ }
+
+ private:
+ AllocationMap smem_allocations_;
+ AllocationMap vmem_allocations_;
+
+ AllocationMap& map(MemorySpace space) {
+ return space == MemorySpace::kSmem ? smem_allocations_ : vmem_allocations_;
+ }
+
+ const AllocationMap& map(MemorySpace space) const {
+ return space == MemorySpace::kSmem ? smem_allocations_ : vmem_allocations_;
+ }
+};
+
+class TimingInfo;
+// Timer that can be used to collect timing information. It has a component
+// string that identifies the component being timed. If the timer is created
+// from another timer, the two component strings are concatenated to form a
+// hierarchy of components. For example, if a timer with the component string
+// "program/subcomponent1" is used to construct a timer and the constructor is
+// passed the string "subcomponent2", the final component string will be
+// "program/subcomponent1/subcomponent2". This allows us to create a hierarchy
+// of components that can then be used to determine how different timers relate
+// to each other.
+// The timer is scoped: it starts when created, stops when it goes out of
+// scope, and reports its data to the parent "TimingInfo" structure, which
+// collects all the timing information and generates a report at the end.
+class SubTimer {
+ TimingInfo* const parent_info_;
+ const std::string component_;
+ int64_t start_;
+ int64_t stop_;
+ bool reported_ = false;
+
+ void Start();
+ void Stop();
+ void Report();
+
+ public:
+ SubTimer(TimingInfo* parent, absl::string_view component_str)
+ : parent_info_(parent), component_(component_str) {
+ Start();
+ }
+ SubTimer(const SubTimer& parent_timer, absl::string_view component_str)
+ : parent_info_(parent_timer.parent_info_),
+ component_(absl::StrCat(parent_timer.component_, "/", component_str)),
+ reported_(false) {
+ Start();
+ }
+  // This method can be used to stop and report the timer early, without waiting
+  // for it to go out of scope. It saves otherwise useless braces introduced just
+  // to end the scope.
+ void StopAndReport() {
+ this->Stop();
+ this->Report();
+ }
+ const std::string& getComponent() const { return component_; }
+ int64_t getStart() const { return start_; }
+ int64_t getStop() const { return stop_; }
+ ~SubTimer() { StopAndReport(); }
+};
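+
+// Illustrative usage sketch (not part of the original code; names are
+// assumed). Nested timers produce hierarchical components which the parent
+// TimingInfo prefixes with its base name in the final report:
+//
+//   TimingInfo timing_info("program");
+//   SubTimer codegen_timer(&timing_info, "codegen");
+//   {
+//     SubTimer opt_timer(codegen_timer, "llvm-opt");
+//     // ... work being timed ...
+//   }  // Reported as "program/codegen/llvm-opt".
+//   codegen_timer.StopAndReport();  // Reported as "program/codegen".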
+
+// Collects and returns timing information.
+class TimingInfo {
+ struct TimerInfo {
+ std::string component_name;
+ int64_t start_usec;
+ int64_t end_usec;
+ };
+  // Timer to keep track of elapsed time.
+ SimpleCycleTimer timer_;
+ // Mutex to synchronize access to the timing information vector across
+ // threads.
+ absl::Mutex mutex_;
+ // Base name of this timing info.
+ const std::string base_component_name_;
+ // Vector containing all the recorded timing information from SubTimers that
+ // reported to us.
+ std::vector<TimerInfo> structured_timing_info_;
+
+ public:
+ explicit TimingInfo(absl::string_view base_component_name)
+ : base_component_name_(base_component_name) {
+ timer_.Start();
+ }
+ ~TimingInfo() { timer_.Stop(); }
+
+ // Return current time in microseconds.
+ int64_t GetUsec() const { return timer_.GetInUsec(); }
+  // Add the readings of a SubTimer to the report. Should be called when the
+  // SubTimer has stopped, but we do not check that here.
+ void Report(const SubTimer& subtimer) {
+ absl::MutexLock l(&mutex_);
+ structured_timing_info_.push_back(TimerInfo{
+ subtimer.getComponent(), subtimer.getStart(), subtimer.getStop()});
+ }
+  // Return a string with a report of the timing information collected.
+ std::string GetTimingReport() {
+ absl::MutexLock l(&mutex_);
+ if (structured_timing_info_.empty()) {
+ return std::string();
+ }
+ // Sort by running time.
+ std::stable_sort(
+ structured_timing_info_.begin(), structured_timing_info_.end(),
+ [&](const TimingInfo::TimerInfo& a, const TimingInfo::TimerInfo& b) {
+ return (a.end_usec - a.start_usec) > (b.end_usec - b.start_usec);
+ });
+ int64_t max_time = 0;
+ for (auto& ti : structured_timing_info_) {
+ max_time = std::max(max_time, ti.end_usec);
+ }
+ auto UsecToSec = [](int64_t v) {
+ return static_cast<double>(v) / 1000000.0;
+ };
+ std::stringstream ss;
+ ss << "// Timer percentages might not add up to 100%, because of "
+ "parallelism and because some timers might not be disjoint\n\n";
+ for (auto& ti : structured_timing_info_) {
+ double percent = (static_cast<double>(ti.end_usec - ti.start_usec) /
+ static_cast<double>(max_time)) *
+ 100.0;
+ ss << base_component_name_ << "/" << ti.component_name;
+ ss << "\n -- Start = " << std::setw(10) << UsecToSec(ti.start_usec)
+ << " s -- Stop = " << std::setw(10) << UsecToSec(ti.end_usec)
+ << " s -- Time = " << std::setw(10)
+ << UsecToSec(ti.end_usec - ti.start_usec)
+ << " s -- Percent of Total = " << std::setw(10) << percent << "%\n";
+ }
+ return ss.str();
+ }
+};
+
+void SubTimer::Start() {
+ if (parent_info_) {
+ start_ = parent_info_->GetUsec();
+ }
+}
+
+void SubTimer::Stop() {
+ if (parent_info_) {
+ stop_ = parent_info_->GetUsec();
+ }
+}
+
+void SubTimer::Report() {
+ if (parent_info_ && !reported_) {
+ parent_info_->Report(*this);
+ reported_ = true;
+ }
+}
+
+using ProgramAllocationMaps = std::map<const LloCompilationResult*,
+ std::unique_ptr<ModuleAllocationMaps>>;
+
+// Defines build configuration for HLO.
+class HloBuildConfig {
+ public:
+  // Whether HLOs built with this config support retries.
+ bool supports_retry() const { return supports_retry_; }
+
+ template <typename T>
+ const T* as() const {
+ return dynamic_cast<const T*>(this);
+ }
+
+ virtual ~HloBuildConfig() = default;
+
+ protected:
+ explicit HloBuildConfig(bool supports_retry)
+ : supports_retry_(supports_retry) {}
+
+ const bool supports_retry_;
+};
+
+class ReshapeBuildConfig : public HloBuildConfig {
+ public:
+  // Reshape HLOs may be split into several LLVM modules, each containing one
+  // subregion of the original top-region. This field specifies a list of
+  // subregions to be ignored while building the LLVM module.
+ const std::vector<const LloRegion*>& hidden_regions() const {
+ return hidden_regions_;
+ }
+
+ explicit ReshapeBuildConfig(std::vector<const LloRegion*>&& hidden_regions)
+ : HloBuildConfig(false), hidden_regions_(std::move(hidden_regions)) {}
+
+ protected:
+  // Subregions to be ignored while building the LLVM module.
+ std::vector<const LloRegion*> hidden_regions_;
+};
+
+// Represents an HLO compilation result along with the function signature created
+// to represent this HLO in LLVM. Note that the function is created in the
+// context of the TLP LLVM module and may be deleted by LLVM optimizations; we
+// track this with a WeakVH.
+class HloReference {
+ public:
+ HloReference(LloCompilationResult* compilation_result,
+ llvm::Function* function)
+ : compilation_result_(compilation_result),
+ weak_function_(function),
+ config_() {}
+
+ HloReference(LloCompilationResult* compilation_result,
+ llvm::Function* function, std::shared_ptr<HloBuildConfig> config)
+ : compilation_result_(compilation_result),
+ weak_function_(function),
+ config_(std::move(config)) {}
+
+ LloCompilationResult* compilation_result() const {
+ return compilation_result_;
+ }
+
+ llvm::Function* function_or_null() const {
+ return llvm::cast_or_null<llvm::Function>(weak_function_);
+ }
+
+ const HloBuildConfig* config() const { return config_.get(); }
+
+ private:
+ LloCompilationResult* const compilation_result_;
+ llvm::WeakVH weak_function_;
+ // Use shared pointer since we copy HLO references.
+ std::shared_ptr<HloBuildConfig> config_;
+};
+
+// Inherit from llvm::IRBuilder to allow potentially more efficient code
+// generation.
+template <typename FolderTy = llvm::ConstantFolder,
+ typename InserterTy = llvm::IRBuilderDefaultInserter>
+class llo_converter_builder : public llvm::IRBuilder<FolderTy, InserterTy> {
+ public:
+ explicit llo_converter_builder(llvm::LLVMContext& C)
+ : llvm::IRBuilder<FolderTy, InserterTy>(C) {}
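+  // Note (descriptive comment added for clarity): for constant inputs we return
+  // a ConstantVector splat directly, avoiding the insertelement + shufflevector
+  // sequence that the base IRBuilder implementation would emit.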
+ llvm::Value* CreateVectorSplat(unsigned NumElts, llvm::Value* V,
+ const llvm::Twine& Name = "") {
+ if (auto* SC = llvm::dyn_cast<llvm::Constant>(V)) {
+ return llvm::ConstantVector::getSplat(
+ llvm::ElementCount::getFixed(NumElts), SC);
+ }
+ return llvm::IRBuilder<FolderTy, InserterTy>::CreateVectorSplat(NumElts, V,
+ Name);
+ }
+};
+
+// Mapping used for tracking LLO instructions used in LLVM debug locations.
+using DebugLocationMapping =
+ absl::flat_hash_map<SourceLocation, const LloInstruction*>;
+
+class Converter {
+ friend class LlvmModuleBuilder;
+
+ public:
+ Converter(const std::string& name,
+ const LloCompilationResult* compilation_result,
+ LloModule* main_module, LloDumper* dumper,
+ ProgramAllocationMaps* program_allocation_maps,
+ const SubTimer& timer);
+
+ Converter(const std::string& name,
+ const LloCompilationResult* compilation_result,
+ const HloBuildConfig* hlo_build_config, LloModule* main_module,
+ LloDumper* dumper, ProgramAllocationMaps* program_allocation_maps,
+ SpillRegionCollection spill_regions,
+ const LloCompilationResult* tlp_compilation_result,
+ const SpillRegionCollection* tlp_used_spill_regions,
+ llvm::Function* hlo_function_prototype, int64_t retry,
+ const SubTimer& timer);
+
+ // Build an LLVM module.
+ Status BuildMainModule();
+
+ // Run limited set of optimizations and code gen to ISA, return assembly for
+ // MCInst parser to consume.
+ StatusOr<std::string> OptimizeAndCodeGen();
+
+ // Run limited set of optimizations and code gen to ISA, return McCode object
+ // with MCInst and debug locations.
+ StatusOrMcCode OptimizeAndCodeGen(const IsaProgramTarget& target);
+
+ // Print current LLVM module.
+ std::string PrintAsLlvm();
+
+ // Verify the module.
+ Status VerifyModule();
+
+ // Mapping for LLO module/value ordinal into LLO value.
+ const DebugLocationMapping& GetUsedDebugLocations() const {
+ return debug_locations_;
+ }
+
+ const LloCompilationResult* compilation_result() const {
+ return compilation_result_;
+ }
+
+ const SpillRegionCollection& GetSpillRegions() const {
+ return spill_regions_;
+ }
+
+ const std::vector<HloReference>& GetHloReferences() const {
+ return hlo_references_;
+ }
+
+ ProgramAllocationMaps* GetProgramAllocationMaps() {
+ return program_allocation_maps_;
+ }
+
+ private:
+ using PassPopulationHandler = std::function<Status(
+ llvm::legacy::PassManager*, llvm::raw_svector_ostream*)>;
+ StatusOr<std::string> OptimizeAndCodeGen(PassPopulationHandler handler);
+
+ // LLVM context and module.
+ llvm::LLVMContext context_;
+ llvm::Module module_;
+
+ // Mapping for LLO module/value ordinal into LLO value.
+ DebugLocationMapping debug_locations_;
+
+ // Main LLO module and compilation result.
+ const LloCompilationResult* const compilation_result_;
+ LloModule* const main_module_;
+
+ // Misc conversion context parameters.
+ const TpuSequencerType sequencer_;
+ const Target& target_;
+ LloDumper* const dumper_;
+ std::unique_ptr<llvm::TargetMachine> target_machine_;
+
+ // Note that the original max spill range is defined in the constructor, but
+ // we later adjust it to match real allocations.
+ SpillRegionCollection spill_regions_;
+
+  // When compiling the TLP, a collection of HLO references which use
+  // alternative inlining, i.e. external HLOs needing stitching.
+ std::vector<HloReference> hlo_references_;
+
+  // If specified, indicates that the converter is created for an HLO function
+  // as opposed to the TLP module.
+ llvm::Function* const hlo_function_prototype_;
+
+  // Stores information about VMEM/SMEM allocations used in the TLP and in each
+  // HLO.
+ ProgramAllocationMaps* const program_allocation_maps_;
+
+  // For converters created for HLO modules, holds a pointer to the TLP
+  // compilation result.
+ const LloCompilationResult* const tlp_compilation_result_;
+
+  // For converters created for HLO modules, holds a pointer to the spill
+  // regions used by the TLP.
+ const SpillRegionCollection* const tlp_used_spill_regions_;
+
+ // The current retry attempt.
+ const int64_t retry_;
+
+ // Parent timer for operations performed by Converter.
+ const SubTimer& assigned_timer_;
+
+ // For HLO converters stores optional build config.
+ const HloBuildConfig* hlo_build_config_;
+};
+
+Converter::Converter(const std::string& name,
+ const LloCompilationResult* compilation_result,
+ LloModule* main_module, LloDumper* dumper,
+ ProgramAllocationMaps* program_allocation_maps,
+ const SubTimer& timer)
+ : module_(name, context_),
+ compilation_result_(compilation_result),
+ main_module_(main_module),
+ sequencer_(main_module->SequencerType()),
+ target_(main_module->target()),
+ dumper_(dumper),
+ target_machine_(GetLlvmTargetMachine(*main_module_).value()),
+ hlo_function_prototype_(nullptr),
+ program_allocation_maps_(program_allocation_maps),
+ tlp_compilation_result_(nullptr),
+ tlp_used_spill_regions_(nullptr),
+ retry_(0),
+ assigned_timer_(timer),
+ hlo_build_config_(nullptr) {
+ // Compute default value for spill regions.
+ spill_regions_[MemorySpace::kVmem] = {
+ 0, address_util::ConvertOffsetByteToWord(
+ MemorySpace::kVmem, main_module_->target().VmemSizeBytes(),
+ main_module_->target())};
+ spill_regions_[MemorySpace::kSmem] = {
+ main_module_->target().SmemUserSpaceWordOffset(),
+ main_module_->target().StartReservedSmemWordOffset(
+ main_module_->param_count())};
+}
+
+Converter::Converter(const std::string& name,
+ const LloCompilationResult* compilation_result,
+ const HloBuildConfig* hlo_build_config,
+ LloModule* main_module, LloDumper* dumper,
+ ProgramAllocationMaps* program_allocation_maps,
+ SpillRegionCollection spill_regions,
+ const LloCompilationResult* tlp_compilation_result,
+ const SpillRegionCollection* tlp_used_spill_regions,
+ llvm::Function* hlo_function_prototype, int64_t retry,
+ const SubTimer& timer)
+ : module_(name, context_),
+ compilation_result_(compilation_result),
+ main_module_(main_module),
+ sequencer_(main_module->SequencerType()),
+ target_(main_module->target()),
+ dumper_(dumper),
+ target_machine_(GetLlvmTargetMachine(*main_module_).value()),
+ spill_regions_(std::move(spill_regions)),
+ hlo_function_prototype_(hlo_function_prototype),
+ program_allocation_maps_(program_allocation_maps),
+ tlp_compilation_result_(tlp_compilation_result),
+ tlp_used_spill_regions_(tlp_used_spill_regions),
+ retry_(retry),
+ assigned_timer_(timer),
+ hlo_build_config_(hlo_build_config) {}
+
+std::string NormalizeAsFileName(const std::string& file) {
+ return absl::StrReplaceAll(file, {{".", "-"}});
+}
+
+// Return LLVM timing info if "-time-passes" has been passed as an option.
+// After that, the timing info is reset, ready for new timings to be collected.
+std::string GetAndResetLlvmTimingInfo() {
+ std::string tmp;
+ llvm::raw_string_ostream OS(tmp);
+ llvm::reportAndResetTimings(&OS);
+ OS.flush();
+ return tmp;
+}
+
+std::string TypeAsString(const llvm::Type* type) {
+ if (type == nullptr) {
+ return "<null>";
+ }
+ std::string tmp;
+ llvm::raw_string_ostream OS(tmp);
+ type->print(OS);
+ OS.flush();
+ return tmp;
+}
+
+std::string ValueAsString(const llvm::Value* value) {
+ if (value == nullptr) {
+ return "<null>";
+ }
+ std::string tmp;
+ llvm::raw_string_ostream OS(tmp);
+ value->print(OS);
+ OS.flush();
+ return tmp;
+}
+
+class LlvmModuleBuilder {
+ using LlvmValueTransformer =
+ StatusOr<llvm::Value*> (LlvmModuleBuilder::*)(llvm::Value*);
+
+ struct MxuState {
+ // Number of pushes in the current epoch.
+ int push_count = 0;
+ // MatMuls currently in fifo.
+ std::queue<llvm::Value*> muls;
+
+ // Last vmatpush, nullptr in the beginning of the first epoch.
+ llvm::Value* push_chain = nullptr;
+
+ // Last vdwg, nullptr in the beginning of the first epoch.
+ llvm::Value* mul_chain = nullptr;
+
+ // Tracks the "GMR empty" status of the fifo.
+ bool gmrmt = true;
+
+ DoneWithGainsMode next_dwg_mode = DoneWithGainsMode::kNormal;
+ };
+
+ static constexpr int64_t kMxuCount = 4;
+
+ public:
+ LlvmModuleBuilder(Converter* converter, absl::string_view cpu)
+ : converter_(converter),
+ cpu_(cpu),
+ vector_size_(cpu_ == kCpuPfcBarnaCoreChannelController ? 8 : 1024),
+ hbm_pointer_size_(converter->target_.HbmWordSizeBytes()),
+ fifo_tracker_(&converter->target_),
+ fifo_tracker_post_(&converter->target_),
+ arch_register_tracker_(&converter->target_),
+ builder_(converter->context_),
+ di_builder_(converter->module_) {
+ InitializeFillers();
+
+ // Create compilation unit debug info.
+ constexpr unsigned DW_LANG_Llo_Module = 0x8888; // Fake lang definition.
+ di_compilation_unit_ = di_builder_.createCompileUnit(
+ DW_LANG_Llo_Module,
+ di_builder_.createFile(converter->module_.getName(), "/"), "llo2llvm",
+ /*isOptimized=*/true, /*Flags=*/"", /*RV=*/0);
+ }
+
+ StatusOr<std::vector<HloReference>> BuildMainModule();
+
+ private:
+ const Target& target() const { return converter_->target_; }
+ LloModule* llo_module() const { return converter_->main_module_; }
+ const LloCompilationResult* compilation_result() const {
+ return converter_->compilation_result_;
+ }
+ llvm::LLVMContext& context() const { return converter_->context_; }
+ llvm::Module* module() { return &converter_->module_; }
+ LloDumper* dumper() { return converter_->dumper_; }
+
+  // True if we are building a TLP module, false if we are building a module for
+  // an external HLO.
+ bool is_tlp_module() const {
+ return converter_->tlp_compilation_result_ == nullptr;
+ }
+
+  // Create an LLVM debug location for the specified instruction and add the
+  // appropriate mapping to be used later. Note that we use the LLO module
+  // ordinal as the line and the LLO instruction ordinal as the column; this
+  // should ensure unique debug locations across the compilation.
+ StatusOr<llvm::DebugLoc> CreateDebugLocation(
+ const LloInstruction* instruction) {
+ llvm::DISubprogram* function_scope = llvm_function()->getSubprogram();
+ TF_RET_CHECK(function_scope != nullptr);
+ // Both line and column are expected to start with 1.
+ TF_RET_CHECK(llo_module()->ordinal() >= 0);
+ const int64_t line = llo_module()->ordinal() + 1;
+ TF_RET_CHECK(instruction->ordinal() >= 0);
+ const int64_t column = instruction->ordinal() + 1;
+ // Store the mapping.
+ converter_->debug_locations_.insert({{line, column}, instruction});
+ return llvm::DebugLoc(llvm::DILocation::get(function_scope->getContext(),
+ line, column, function_scope));
+ }
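+
+  // For example (assumed values, for illustration only): an instruction with
+  // ordinal 41 in an LLO module with ordinal 2 is mapped to the debug location
+  // line 3, column 42, and the entry {3, 42} -> instruction is recorded in
+  // debug_locations_.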
+
+ StatusOr<llvm::Function*> CreateMainFunction();
+ StatusOr<llvm::Function*> CreateTlpFunction();
+ StatusOr<llvm::Function*> CreateHloFunction();
+ void InitializeFillers();
+ void InitializeIntrinsicsFillers();
+ void InitializeBinopFillers();
+ void InitializeUnopFillers();
+
+ Status ApplyBuildConfig();
+ Status ComputeSpillRegions(
+ const LloCompilationResult* main_compilation_result);
+ Status BuildBasicBlocks(LloModule* llo_module);
+ Status FillBasicBlocks(LloModule* llo_module);
+
+ llvm::Function* llvm_function() const {
+ CHECK(llvm_function_ != nullptr);
+ return llvm_function_;
+ }
+  // Verify whether the provided string is a valid LLVM name identifier.
+  // Valid LLVM identifiers need to conform to the regular expression
+  // [-a-zA-Z$._][-a-zA-Z$._0-9]*
+ static bool VerifyLlvmIdentifierValidity(const absl::string_view ident_str);
+
+  // Arg types are used for creating or checking the function signature. We could
+  // have tried to reconstruct it based on information in compilation_result, but
+  // chose not to do that to avoid duplicating logic.
+ StatusOr<std::vector<llvm::Function*>> GetOrCreateHloFunction(
+ LloCompilationResult* compilation_result,
+ const std::vector<llvm::Type*>& args_types);
+
+  // Creates an intrinsic call with the specified arguments, adjusting the
+  // arguments to match the types of the intrinsic parameters.
+ StatusOr<llvm::Value*> CreateIntrinsicCallWithArgs(
+ llvm::Intrinsic::TPUIntrinsics intrinsic_id,
+ absl::Span<llvm::Value*> args) {
+ return CreateIntrinsicCallWithArgs(
+ llvm::Intrinsic::getDeclaration(module(), intrinsic_id), args);
+ }
+
+  // Creates an intrinsic call with the specified arguments, adjusting the
+  // arguments to match the types of the intrinsic parameters.
+ StatusOr<llvm::Value*> CreateIntrinsicCallWithArgs(
+ llvm::FunctionCallee func_callee, absl::Span<llvm::Value*> args);
+
+ // Well-known constants.
+ llvm::Value* LlvmTrue() { return builder_.getInt1(true); }
+ llvm::Value* LlvmFalse() { return builder_.getInt1(false); }
+
+ // Well-known types.
+ StatusOr<llvm::PointerType*> PointerTy(MemorySpace space);
+ llvm::Type* ScalarIntTy() { return builder_.getInt32Ty(); }
+ llvm::Type* VectorIntTy() {
+ return llvm::VectorType::get(ScalarIntTy(), vector_size_, false);
+ }
+ llvm::Type* ScalarFloatTy() { return builder_.getFloatTy(); }
+ llvm::Type* VectorFloatTy() {
+ return llvm::VectorType::get(ScalarFloatTy(), vector_size_, false);
+ }
+ llvm::Type* PredicateTy() { return builder_.getInt1Ty(); }
+ llvm::Type* VectorMaskTy() {
+ return llvm::VectorType::get(builder_.getInt1Ty(), vector_size_, false);
+ }
+
+ // Converting the LLO instruction into a value or inferring its type.
+ StatusOr<llvm::Value*> LlvmAddress(const LloValue* address);
+ StatusOr<llvm::Value*> LlvmConstant(const LloValue* llo_value);
+ StatusOr<llvm::Value*> LlvmValue(const LloValue* llo_value);
+ StatusOr<llvm::Type*> LlvmType(const LloValue* llo_value);
+ StatusOr<llvm::Value*> LlvmArgument(int argno);
+
+ // We treat all registers as if they store int32_t (or a vector of i32) and
+ // bitcast the value to float when needed.
+ StatusOr<llvm::Value*> EnsureScalarFloat(llvm::Value* value);
+ StatusOr<llvm::Value*> EnsureScalarOrVectorFloat(llvm::Value* value);
+ StatusOr<llvm::Value*> EnsureVectorFloat(llvm::Value* value);
+
+ StatusOr<llvm::Value*> EnsureScalarInt(llvm::Value* value);
+ StatusOr<llvm::Value*> EnsureScalarOrVectorInt(llvm::Value* value);
+ StatusOr<llvm::Value*> EnsureVectorInt(llvm::Value* value);
+
+ StatusOr<llvm::Value*> EnsurePointer(llvm::Value* value, llvm::Type* type);
+ StatusOr<llvm::Value*> EnsureSmemPointer(llvm::Value* value) {
+ TF_ASSIGN_OR_RETURN(const auto expected_type,
+ PointerTy(MemorySpace::kSmem));
+ return EnsurePointer(value, expected_type);
+ }
+
+ StatusOr<llvm::Value*> EnsureNoOp(llvm::Value* value) { return value; }
+
+ // Convert LLO instruction into LLVM representation.
+ Status FillInstruction(const LloInstruction* instruction);
+
+ // Custom fillers for misc ops.
+ StatusOr<llvm::Value*> FillCompare(const LloInstruction* instruction);
+ StatusOr<llvm::Value*> FillInlinedCall(const LloInstruction* instruction);
+ StatusOr<llvm::Value*> FillInlinedCallOperand(
+ const LloInstruction* instruction);
+ StatusOr<llvm::Value*> FillPhi(const LloInstruction* instruction);
+ StatusOr<llvm::Value*> FillSelect(const LloInstruction* instruction);
+ StatusOr<llvm::Value*> FillParameterAddress(
+ const LloInstruction* instruction);
+
+ // Custom fillers for scalar ops.
+ StatusOr<llvm::Value*> FillScalarHaltOnError(
+ const LloInstruction* instruction);
+ StatusOr<llvm::Value*> FillScalarStore(const LloInstruction* instruction);
+
+ // Custom fillers for vector ops.
+ StatusOr<llvm::Value*> FillVectorDoneWithGains(
+ const LloInstruction* instruction);
+ StatusOr<llvm::Value*> FillVectorLatch(const LloInstruction* instruction);
+ StatusOr<llvm::Value*> FillVectorLoad(const LloInstruction* instruction);
+ StatusOr<llvm::Value*> FillVectorLoadIndexed(
+ const LloInstruction* instruction);
+ StatusOr<llvm::Value*> FillVectorLoadSublaneShuffle(
+ const LloInstruction* instruction);
+ StatusOr<llvm::Value*> FillVectorMatmul(const LloInstruction* instruction);
+ StatusOr<llvm::Value*> FillVectorMatres(const LloInstruction* instruction);
+ StatusOr<llvm::Value*> FillVectorPermute(const LloInstruction* instruction);
+ StatusOr<llvm::Value*> FillVectorSetPermutePattern(
+ const LloInstruction* instruction);
+ StatusOr<llvm::Value*> FillVectorStore(const LloInstruction* instruction);
+ StatusOr<llvm::Value*> FillVectorStoreIndexedMasked(
+ const LloInstruction* instruction);
+ StatusOr<llvm::Value*> FillVectorStoreMasked(
+ const LloInstruction* instruction);
+ StatusOr<llvm::Value*> FillVectorTranspose(const LloInstruction* instruction);
+ StatusOr<llvm::Value*> FillVectorTransposeResult(
+ const LloInstruction* instruction);
+ StatusOr<llvm::Value*> FillScalarLoad(const LloInstruction* instruction);
+
+ StatusOr<llvm::Value*> CreateScalarAddressCalculation(
+ llvm::Value* address, llvm::Value* displacement,
+ MemorySpace memory_space);
+ StatusOr<int64_t> LlvmAddressSpace(MemorySpace memory_space) const;
+ StatusOr<llvm::Value*> CreatePointerToInt(llvm::Value* address);
+ StatusOr<llvm::Value*> CreateIntToPointer(llvm::Value* address,
+ llvm::Type* type);
+
+ // PFC Barna Core specific fillers.
+ StatusOr<llvm::Value*> FillPfcBcVectorStore(
+ const LloInstruction* instruction);
+ StatusOr<llvm::Value*> FillPfcBcVectorLoad(const LloInstruction* instruction);
+ StatusOr<llvm::Value*> FillPfcBcVectorLoadImmediateOffset(
+ const LloInstruction* instruction);
+
+  // Helper used for filling a predicated instruction; creates control flow
+  // around the instruction and allows customizing the terminator.
+ StatusOr<llvm::Value*> FillAsPredicated(
+ const std::optional<PredicateAndPolarity>& predicate,
+ const std::function<StatusOr<llvm::Value*>()>& builder,
+ Status (LlvmModuleBuilder::*terminator)(llvm::BasicBlock*));
+
+ Status FallThroughBlockTerminator(llvm::BasicBlock* join_block) {
+ builder_.CreateBr(join_block);
+ return OkStatus();
+ }
+
+ StatusOr<llvm::BasicBlock*> CreateBasicBlock(llvm::Twine name);
+
+ StatusOr<llvm::BasicBlock*> GetFirstRegionBlock(const LloRegion* region);
+
+ // Find a blocking instruction, if any, that the given instruction depends on.
+ // This uses the LLO fifo_tracker and arch_register_tracker, and as such may
+ // find multiple blocking instructions. These are filtered using the supplied
+ // filter, and the first matching instruction is returned. An undef value is
+ // returned if there are no matching instructions.
+ StatusOr<llvm::Value*> GetFifoDependency(
+ const LloInstruction* instruction,
+ std::function<StatusOr<llvm::Value*>(const LloValue&)>
+ blocking_instruction_filter) {
+ TF_ASSIGN_OR_RETURN(auto blocking_fifo,
+ fifo_tracker_.FindBlockingPushesAndPops(instruction));
+ auto blocking_arch_reg = arch_register_tracker_.FindHazards(instruction);
+ std::vector<const LloValue*> dependencies;
+ for (auto [blocking_instruction, latency] : blocking_fifo) {
+ dependencies.push_back(blocking_instruction);
+ }
+ for (auto [blocking_instruction, latency] : blocking_arch_reg) {
+ dependencies.push_back(blocking_instruction);
+ }
+ for (const LloValue* blocking_instruction : dependencies) {
+ // Enforce an allow-listed set of instruction opcodes.
+ const LloOpcode opcode = blocking_instruction->opcode();
+ TF_RET_CHECK(LloOpcodeIsVectorLatch(opcode) ||
+ LloOpcodeIsVectorMatmul(opcode) ||
+ opcode == LloOpcode::kVectorAddReduceF32 ||
+ opcode == LloOpcode::kVectorMaxReduceF32 ||
+ opcode == LloOpcode::kVectorMinReduceF32 ||
+ opcode == LloOpcode::kVectorMaxIndexReduceF32 ||
+ opcode == LloOpcode::kVectorMinIndexReduceF32 ||
+ opcode == LloOpcode::kVectorDoneWithGains ||
+ opcode == LloOpcode::kVectorMatres ||
+ opcode == LloOpcode::kVectorPermute ||
+ opcode == LloOpcode::kVectorPermuteResult ||
+ opcode == LloOpcode::kVectorRotate ||
+ opcode == LloOpcode::kVectorSetPermutePattern ||
+ opcode == LloOpcode::kVectorTranspose ||
+ opcode == LloOpcode::kVectorTransposeResult ||
+ opcode == LloOpcode::kVectorXlaneResult)
+ << opcode;
+ }
+ for (const LloValue* blocking_instruction : dependencies) {
+ TF_ASSIGN_OR_RETURN(auto filtered,
+ blocking_instruction_filter(*blocking_instruction));
+ if (filtered != nullptr) {
+ return filtered;
+ }
+ }
+ return llvm::UndefValue::get(builder_.getInt32Ty());
+ }
+
+ using LloInstructionFiller =
+ std::function<StatusOr<llvm::Value*>(const LloInstruction*)>;
+ absl::flat_hash_map<LloOpcode, LloInstructionFiller> fillers_;
+
+ // Constant fold on the fly to save compile time.
+ using Builder = llo_converter_builder<llvm::ConstantFolder>;
+
+ Converter* const converter_; // Mostly provides access to the context.
+ const std::string cpu_;
+ const int64_t vector_size_;
+
+ // HBM word size in bytes, used for HBM pointers.
+ const int64_t hbm_pointer_size_;
+
+ // LLVM function created for the module, will be initialized later.
+ llvm::Function* llvm_function_ = nullptr;
+
+ // Tracks the FIFO dependencies as instructions are processed.
+ LloFifoTracker fifo_tracker_;
+ // Tracks the FIFO dependencies as would be seen *after* the current
+ // instruction is processed. This is to determine whether the current
+ // (vmatmul) instruction will clear GMRmt and therefore require a
+ // LoadGainsToGmr() call.
+ LloFifoTracker fifo_tracker_post_;
+ // Tracks the architectural register dependencies as instructions are
+ // processed.
+ LloArchRegisterTracker arch_register_tracker_;
+
+ // Current MxuState.
+ std::vector<MxuState> mxu_{kMxuCount};
+
+  // LLVM values returned from the last emitted load/store iar operation; they
+  // will be used by the following load/store operation. They should be cleared
+  // when control flow merges, since at that point we no longer know which value
+  // should be passed to the appropriate load or store. Unfortunately it appears
+  // that in many cases the iar value is set just once for the whole program and
+  // then used in many places; since precise analysis is not implemented yet, we
+  // just keep this value and hope its proper use is guaranteed by the program.
+ llvm::Value* last_issued_load_iar_token_ = nullptr;
+ llvm::Value* last_issued_store_iar_token_ = nullptr;
+
+ Builder builder_;
+
+ llvm::DIBuilder di_builder_;
+ llvm::DICompileUnit* di_compilation_unit_ = nullptr;
+
+ // Maps LLO compilation result into a range of HLO function references.
+ absl::flat_hash_map<const LloCompilationResult*, std::pair<int64_t, int64_t>>
+ compilation_result_to_hlo_functions_;
+ // Stably ordered HLO function modules.
+ std::vector<HloReference> hlo_functions_;
+
+  // Represents LLO phi value information, used for PHI fixup.
+ std::vector<std::pair<const LloInstruction*, llvm::PHINode*>> phis_to_fix_up_;
+
+ // Maps the first LLO region in a basic block to the first LLVM block for that
+ // basic block (note that LLO predication can result in multiple LLVM blocks
+ // for an LLO basic block). This is used for mapping branch targets.
+ absl::flat_hash_map<const LloRegion*, llvm::BasicBlock*>
+ first_region_to_block_;
+
+ // Maps the final LLO region in a basic block to the last LLVM block for that
+ // basic block. This is used for mapping phi sources.
+ absl::flat_hash_map<const LloRegion*, llvm::BasicBlock*>
+ last_region_to_block_;
+
+ // At the end of a basic block, we capture the branch instruction (if any),
+ // and at the start of the next basic block, we use this to add the
+ // terminating branch instruction to the previous LLVM block.
+ const LloInstruction* next_branch_ = nullptr;
+
+ // Maps LLO value into LLVM value.
+ absl::flat_hash_map<const LloValue*, llvm::Value*> value_mapping_;
+ absl::flat_hash_map<const LloAllocation*, llvm::Value*> allocation_mapping_;
+
+ // Tracks the current LLO region, just for constructing LLVM block names for
+ // predicated instructions.
+ const LloRegion* current_region_ = nullptr;
+
+  // Defines regions to be filtered out and ignored while building the module;
+  // used for creating partial HLO modules, such as for reshape.
+ absl::flat_hash_set<const LloRegion*> region_filter_;
+
+ Status LoadGainsToGmr(int64_t mxu_id);
+
+ // See PufferfishTensorCoreEmitter::EmitVectorStoreEvenOddSublanes() and
+ // PufferfishTensorCoreEmitter::EmitVectorLoadReplicateEvenOddSublanes().
+ // TODO(b/147254380): Move this logic to LLVM lowering. We should add
+ // intrinsics implicitly setting iar registers to match LLO.
+ static constexpr int32_t kVectorStoreEvenOddSublanesIar = 0;
+ static constexpr int32_t kVectorLoadReplicateEvenOddSublanes = 1;
+};
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::CreateIntrinsicCallWithArgs(
+ llvm::FunctionCallee func_callee, absl::Span<llvm::Value*> args) {
+ TF_RET_CHECK(func_callee.getFunctionType()->getNumParams() == args.size())
+ << func_callee.getCallee()->getName().data() << " expects param_count="
+ << func_callee.getFunctionType()->getNumParams() << " but got "
+ << args.size();
+ for (int64_t i = 0; i < args.size(); i++) {
+ llvm::Type* expected_type = func_callee.getFunctionType()->getParamType(i);
+ llvm::Type* arg_type = args[i]->getType();
+ if (expected_type->isPointerTy()) {
+ TF_ASSIGN_OR_RETURN(args[i], EnsurePointer(args[i], expected_type));
+ }
+ if (expected_type->getScalarType()->isFloatTy() &&
+ arg_type->getScalarType()->isIntegerTy()) {
+ TF_ASSIGN_OR_RETURN(args[i], EnsureScalarOrVectorFloat(args[i]));
+ }
+ if (expected_type->getScalarType()->isIntegerTy() &&
+ arg_type->getScalarType()->isFloatTy()) {
+ TF_ASSIGN_OR_RETURN(args[i], EnsureScalarOrVectorInt(args[i]));
+ }
+ TF_RET_CHECK(args[i]->getType() == expected_type)
+ << func_callee.getCallee()->getName().data() << ": arg#" << i
+ << " has type: " << TypeAsString(args[i]->getType())
+ << ", while expected: " << TypeAsString(expected_type);
+ }
+ return builder_.CreateCall(func_callee, {args.begin(), args.end()});
+}
+
+StatusOr<llvm::Function*> LlvmModuleBuilder::CreateMainFunction() {
+ TF_ASSIGN_OR_RETURN(llvm::Function * function, is_tlp_module()
+ ? CreateTlpFunction()
+ : CreateHloFunction());
+
+ // Build debug info metadata.
+ TF_RET_CHECK(di_compilation_unit_ != nullptr);
+ const llvm::StringRef name = function->getName();
+ llvm::DISubprogram* function_scope = di_builder_.createFunction(
+ di_compilation_unit_, name, name,
+ di_builder_.createFile(function->getName(), "/"), /*LineNo=*/0,
+ /*Ty=*/
+ di_builder_.createSubroutineType(
+ di_builder_.getOrCreateTypeArray(std::nullopt)),
+ /*ScopeLine=*/0, llvm::DINode::FlagZero,
+ /*SPFlags=*/llvm::DISubprogram::SPFlagDefinition |
+ llvm::DISubprogram::SPFlagOptimized);
+ function->setSubprogram(function_scope);
+
+ return function;
+}
+
+StatusOr<llvm::Function*> LlvmModuleBuilder::CreateHloFunction() {
+ llvm::Function* prototype = converter_->hlo_function_prototype_;
+ llvm::FunctionType* prototype_type = prototype->getFunctionType();
+
+ // Clone argument types from TLP into HLO context.
+ std::vector<llvm::Type*> args_types;
+ const int64_t count = prototype_type->getNumParams();
+ for (int64_t i = 0; i < count; i++) {
+ llvm::SMDiagnostic Error;
+ std::string type_as_string = TypeAsString(prototype_type->getParamType(i));
+ unsigned read = 0;
+ llvm::Type* type =
+ llvm::parseTypeAtBeginning(type_as_string, read, Error, *module());
+ TF_RET_CHECK(type != nullptr);
+ TF_RET_CHECK(read == type_as_string.length());
+ TF_RET_CHECK(type_as_string == TypeAsString(type));
+ args_types.push_back(type);
+ }
+
+ return llvm::Function::Create(
+ llvm::FunctionType::get(llvm::Type::getVoidTy(context()), args_types,
+ /*IsVarArg=*/false),
+ llvm::Function::ExternalLinkage, prototype->getName(), module());
+}
+
+StatusOr<llvm::Function*> LlvmModuleBuilder::CreateTlpFunction() {
+ std::vector<llvm::Type*> params;
+
+ if (cpu_ == kCpuPfcBarnaCoreChannelController) {
+ // For BarnaCore create function parameters representing hardware sregs.
+ llvm::Type* bmem_address =
+ VectorFloatTy()->getPointerTo(llvm::TPUAddressSpace::TPUAS_Bmem);
+ params.push_back(bmem_address);
+ params.push_back(bmem_address);
+ params.push_back(ScalarFloatTy());
+ params.push_back(ScalarIntTy());
+ }
+
+ llvm::Function* tlp_function = llvm::Function::Create(
+ llvm::FunctionType::get(llvm::Type::getVoidTy(context()),
+ /*Params=*/params, /*IsVarArg=*/false),
+ llvm::Function::ExternalLinkage, "main", module());
+ tlp_function->addFnAttr("is-tlp-function");
+ return tlp_function;
+}
+
+bool LlvmModuleBuilder::VerifyLlvmIdentifierValidity(
+ const absl::string_view ident_str) {
+ // An empty string is not a valid identifier
+ if (ident_str.empty()) {
+ return false;
+ }
+ // First character needs to be [-a-zA-Z$._]
+ if (!absl::ascii_isalpha(ident_str[0]) && ident_str[0] != '-' &&
+ ident_str[0] != '$' && ident_str[0] != '.' && ident_str[0] != '_') {
+ return false;
+ }
+ // Any subsequent character needs to follow [-a-zA-Z$._0-9]*
+ for (char c : ident_str.substr(1)) {
+ if (!absl::ascii_isalnum(c) && c != '-' && c != '$' && c != '.' &&
+ c != '_') {
+ return false;
+ }
+ }
+
+ return true;
+}
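+
+// Illustrative examples for VerifyLlvmIdentifierValidity() (not part of the
+// original code): "fusion.123", "_tmp-0", and "$x" are accepted, while
+// "3way" (leading digit) and "a b" (embedded space) are rejected.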
+
+// Analyzes whether a reshape HLO can be split into separate functions for each
+// of the top-level region's subregions, and enforces a few invariants we expect
+// to hold. Returns the reshape top-region subregions which could be represented
+// as separate LLVM functions.
+StatusOr<std::vector<const LloRegion*>> AnalyzeReshapeForSplitting(
+ std::string hlo, const LloRegion& top_region) {
+ absl::flat_hash_set<const LloValue*> top_region_instructions;
+ std::vector<const LloRegion*> regions;
+
+ // Ensures each top-region's subregion only references values from itself or
+ // from the top-region.
+ const auto analyze_subregion = [&](const LloRegion* region) -> Status {
+ absl::flat_hash_set<const LloValue*> region_instructions;
+ return ConstLloRegionVisitor()
+ .WithRunOnInstruction([&](const LloRegionMember* member) -> Status {
+ const LloInstruction* instruction = member->instruction();
+ region_instructions.insert(instruction);
+ // Check instruction operands.
+ for (const LloValue* operand : instruction->operands_full()) {
+ if (operand->IsConstant() ||
+ region_instructions.contains(operand) ||
+ top_region_instructions.contains(operand)) {
+ continue; // Assumption holds.
+ }
+            // We don't expect control flow at the top-region level, so phi
+            // operands not yet seen are assumed to be in the same subregion.
+ TF_RET_CHECK(instruction->IsPhi());
+ TF_RET_CHECK(instruction->operands(1) == operand);
+ }
+ return OkStatus();
+ })
+ .VisitRegion(region);
+ };
+
+ // Process all the members of the top region.
+ for (const auto& member : top_region.members()) {
+ switch (member->kind()) {
+ case LloRegionMember::kSubRegion:
+ regions.push_back(member->sub_region());
+ TF_RETURN_IF_ERROR(analyze_subregion(member->sub_region()));
+ break;
+ case LloRegionMember::kInstruction: {
+ const LloInstruction* instruction = member->instruction();
+ TF_RET_CHECK(instruction->opcode() == LloOpcode::kInlinedCallOperand ||
+ instruction->opcode() ==
+ LloOpcode::kScalarAddressCalculation);
+ top_region_instructions.insert(instruction);
+ // Ensure all operands are from top-region.
+ for (const LloValue* operand : instruction->operands_full()) {
+ TF_RET_CHECK(operand->IsConstant() ||
+ top_region_instructions.contains(operand));
+ }
+ break;
+ }
+ default:
+ return InternalError("Unexpected reshape top-region member: %s",
+ LloRegionMember::KindToString(member->kind()));
+ }
+ }
+
+ if (regions.size() > 1) {
+ LOG(INFO) << "HLO " << hlo << " was split into " << regions.size()
+ << " LLVM functions.";
+ }
+ return regions;
+}
+
+StatusOr<std::vector<llvm::Function*>>
+LlvmModuleBuilder::GetOrCreateHloFunction(
+ LloCompilationResult* compilation_result,
+ const std::vector<llvm::Type*>& args_types) {
+ TF_RET_CHECK(cpu_ != kCpuPfcBarnaCoreChannelController);
+
+ std::vector<llvm::Function*> result;
+ if (const auto it =
+ compilation_result_to_hlo_functions_.find(compilation_result);
+ it != compilation_result_to_hlo_functions_.end()) {
+ for (int64_t i = it->second.first; i < it->second.second; ++i) {
+ result.push_back(hlo_functions_[i].function_or_null());
+ }
+ } else {
+ // Get the name of the LLVM function from the LloCompilationResult
+ // object containing the LloRegion we want to call. Use "hlo.X"
+ // where X is the number of the function if the name is not a valid
+ // LLVM identifier or if the region is not derived from an HLO
+ // computation.
+ const LloRegion& module_region = *compilation_result->module->top_region();
+ const absl::string_view function_name_str =
+ module_region.IsHlo() ? module_region.hlo_instruction()->name()
+ : absl::string_view();
+ const llvm::Twine function_name =
+ VerifyLlvmIdentifierValidity(function_name_str)
+ ? llvm::Twine(function_name_str.data())
+ : llvm::Twine("hlo.", std::to_string(hlo_functions_.size()));
+
+ const int64_t functions_before = hlo_functions_.size();
+
+ const auto create_function = [&](const llvm::Twine name) {
+ llvm::Function* function = llvm::Function::Create(
+ llvm::FunctionType::get(llvm::Type::getVoidTy(context()), args_types,
+ /*IsVarArg=*/false),
+ llvm::Function::ExternalLinkage, name, module());
+
+ function->addFnAttr(llvm::Attribute::NoInline);
+ return function;
+ };
+
+ if (module_region.IsHlo() &&
+ module_region.hlo_instruction()->opcode() == HloOpcode::kReshape) {
+ // Check whether we can split the reshape into separate LLVM functions.
+ TF_ASSIGN_OR_RETURN(
+ std::vector<const LloRegion*> regions,
+ AnalyzeReshapeForSplitting(function_name.str(), module_region));
+ if (regions.size() > 1) {
+ // Create a separate function for each reshape region.
+ for (int64_t index = 0; index < regions.size(); index++) {
+ llvm::Function* function =
+ create_function(function_name + ".slice" + llvm::Twine(index));
+ result.push_back(function);
+ std::vector<const LloRegion*> hidden;
+ hidden.insert(hidden.end(), regions.begin(), regions.begin() + index);
+ hidden.insert(hidden.end(), regions.begin() + index + 1,
+ regions.end());
+ hlo_functions_.push_back(
+ {compilation_result, function,
+ std::make_shared<ReshapeBuildConfig>(std::move(hidden))});
+ }
+ compilation_result_to_hlo_functions_[compilation_result] = {
+ functions_before, hlo_functions_.size()};
+ }
+ }
+
+ if (result.empty()) {
+ llvm::Function* function = create_function(function_name);
+ result.push_back(function);
+ hlo_functions_.push_back({compilation_result, function});
+ compilation_result_to_hlo_functions_[compilation_result] = {
+ functions_before, hlo_functions_.size()};
+ }
+ }
+
+ for (llvm::Function* function : result) {
+ TF_RET_CHECK(function != nullptr);
+ TF_RET_CHECK(function->getFunctionType()->getNumParams() ==
+ args_types.size())
+ << function->getName().data() << " expects param count "
+ << function->getFunctionType()->getNumParams() << " but got "
+ << args_types.size();
+
+ for (int64_t i = 0; i < args_types.size(); i++) {
+ llvm::Type* expected_type = function->getFunctionType()->getParamType(i);
+ TF_RET_CHECK(args_types[i] == expected_type)
+ << function->getName().data() << ": arg#" << i
+ << " has type: " << TypeAsString(args_types[i])
+ << ", while expected: " << TypeAsString(expected_type);
+ }
+ }
+
+ return result;
+}
+
+void LlvmModuleBuilder::InitializeFillers() {
+ InitializeIntrinsicsFillers();
+ InitializeBinopFillers();
+ InitializeUnopFillers();
+
+ {
+ // Register noarg ops.
+ const auto noarg =
+ [&](const std::function<StatusOr<llvm::Value*>()>& func) {
+ return
+ [=](const LloInstruction* instruction) -> StatusOr<llvm::Value*> {
+ TF_RET_CHECK(!instruction->PredicateOrNull().has_value());
+ TF_RET_CHECK(instruction->operands_size() == 0);
+ return func();
+ };
+ };
+
+ fillers_[LloOpcode::kScalarFence] = noarg([this] {
+ return builder_.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent);
+ });
+ fillers_[LloOpcode::kScalarReadCycleHigh] =
+ noarg([] { return InternalError("Unexpected kScalarReadCycleHigh"); });
+ fillers_[LloOpcode::kScalarReadCycleLow] =
+ noarg([] { return InternalError("Unexpected kScalarReadCycleLow"); });
+ auto read_local_cycle_count = noarg([this] {
+ if (/* DISABLES CODE */ (false)) {
+ // The LLO decomposition of kScalarReadCycleStart and
+ // kScalarReadCycleEnd prepends an sfence instruction. We could
+ // optionally do that here for parity.
+ builder_.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent);
+ }
+ return builder_.CreateCall(llvm::Intrinsic::getDeclaration(
+ module(), llvm::Intrinsic::tpu_read_local_cycle_count));
+ });
+ // Lower both kScalarReadCycleStart/End to tpu_read_local_cycle_count
+ // intrinsic.
+ fillers_[LloOpcode::kScalarReadCycleStart] = read_local_cycle_count;
+ fillers_[LloOpcode::kScalarReadCycleEnd] = read_local_cycle_count;
+ fillers_[LloOpcode::kScalarHalt] =
+ noarg([this]() -> StatusOr<llvm::Value*> {
+ // A BarnaCore channel program should NOT have an explicit program
+ // halt.
+ TF_RET_CHECK(cpu_ != kCpuPfcBarnaCoreChannelController);
+ // TODO(b/146169949): Add an operand to identify the reason for the
+ // trap.
+ // Ignore a Halt in the middle of the program for now. We will fix it
+ // if we find a real-life use for this case.
+ return llvm::UndefValue::get(builder_.getInt32Ty());
+ });
+ fillers_[LloOpcode::kSchedulingBarrier] = noarg([this] {
+ return llvm::UndefValue::get(builder_.getVoidTy()); // TODO(b/139422121)
+ });
+ fillers_[LloOpcode::kVectorInterrupt] =
+ [=](const LloInstruction* instruction) -> StatusOr<llvm::Value*> {
+ std::vector<llvm::Value*> args{
+ builder_.getInt32(instruction->scalar_constant_value())};
+ return noarg([&] {
+ return CreateIntrinsicCallWithArgs(
+ llvm::Intrinsic::getDeclaration(module(),
+ llvm::Intrinsic::tpu_tc_vint),
+ {args.data(), args.size()});
+ })(instruction);
+ };
+ }
+
+ {
+ // Register the instructions filled via special builder methods.
+ using MemberLloInstructionFiller = StatusOr<llvm::Value*> (
+ LlvmModuleBuilder::*)(const LloInstruction* instruction);
+ static constexpr std::pair<LloOpcode, MemberLloInstructionFiller> kPairs[] =
+ {
+ {LloOpcode::kBarnaCoreVectorLoad,
+ &LlvmModuleBuilder::FillPfcBcVectorLoad},
+ {LloOpcode::kBarnaCoreVectorLoadImmediateOffset,
+ &LlvmModuleBuilder::FillPfcBcVectorLoadImmediateOffset},
+ {LloOpcode::kBarnaCoreVectorStore,
+ &LlvmModuleBuilder::FillPfcBcVectorStore},
+ {LloOpcode::kInlinedCall, &LlvmModuleBuilder::FillInlinedCall},
+ {LloOpcode::kInlinedCallOperand,
+ &LlvmModuleBuilder::FillInlinedCallOperand},
+ {LloOpcode::kParameterAddress,
+ &LlvmModuleBuilder::FillParameterAddress},
+ {LloOpcode::kPredicatePhi, &LlvmModuleBuilder::FillPhi},
+ {LloOpcode::kScalarCompare, &LlvmModuleBuilder::FillCompare},
+ {LloOpcode::kScalarHaltOnError,
+ &LlvmModuleBuilder::FillScalarHaltOnError},
+ {LloOpcode::kScalarPhi, &LlvmModuleBuilder::FillPhi},
+ {LloOpcode::kScalarSelect, &LlvmModuleBuilder::FillSelect},
+ {LloOpcode::kScalarStore, &LlvmModuleBuilder::FillScalarStore},
+ {LloOpcode::kVectorCompare, &LlvmModuleBuilder::FillCompare},
+ {LloOpcode::kVectorDoneWithGains,
+ &LlvmModuleBuilder::FillVectorDoneWithGains},
+ {LloOpcode::kVectorLatch, &LlvmModuleBuilder::FillVectorLatch},
+ {LloOpcode::kVectorLatchMsk, &LlvmModuleBuilder::FillVectorLatch},
+ {LloOpcode::kScalarLoad, &LlvmModuleBuilder::FillScalarLoad},
+ {LloOpcode::kVectorLoad, &LlvmModuleBuilder::FillVectorLoad},
+ {LloOpcode::kVectorLoadReplicateEvenOddSublanes,
+ &LlvmModuleBuilder::FillVectorLoadIndexed},
+ {LloOpcode::kVectorLoadSublaneShuffle,
+ &LlvmModuleBuilder::FillVectorLoadSublaneShuffle},
+ {LloOpcode::kVectorMaskPhi, &LlvmModuleBuilder::FillPhi},
+ {LloOpcode::kVectorMatmul, &LlvmModuleBuilder::FillVectorMatmul},
+ {LloOpcode::kVectorMatmulHigh,
+ &LlvmModuleBuilder::FillVectorMatmul},
+ {LloOpcode::kVectorMatmulLow, &LlvmModuleBuilder::FillVectorMatmul},
+ {LloOpcode::kVectorMatmulPacked,
+ &LlvmModuleBuilder::FillVectorMatmul},
+ {LloOpcode::kVectorMatmulMsk, &LlvmModuleBuilder::FillVectorMatmul},
+ {LloOpcode::kVectorMatmulHighMsk,
+ &LlvmModuleBuilder::FillVectorMatmul},
+ {LloOpcode::kVectorMatmulLowMsk,
+ &LlvmModuleBuilder::FillVectorMatmul},
+ {LloOpcode::kVectorMatmulPackedMsk,
+ &LlvmModuleBuilder::FillVectorMatmul},
+ {LloOpcode::kVectorMatres, &LlvmModuleBuilder::FillVectorMatres},
+ {LloOpcode::kVectorPermute, &LlvmModuleBuilder::FillVectorPermute},
+ {LloOpcode::kVectorPhi, &LlvmModuleBuilder::FillPhi},
+ {LloOpcode::kVectorSelect, &LlvmModuleBuilder::FillSelect},
+ {LloOpcode::kVectorSetPermutePattern,
+ &LlvmModuleBuilder::FillVectorSetPermutePattern},
+ {LloOpcode::kVectorStore, &LlvmModuleBuilder::FillVectorStore},
+ {LloOpcode::kVectorStoreEvenOddSublanes,
+ &LlvmModuleBuilder::FillVectorStoreIndexedMasked},
+ {LloOpcode::kVectorStoreMasked,
+ &LlvmModuleBuilder::FillVectorStoreMasked},
+ {LloOpcode::kVectorTranspose,
+ &LlvmModuleBuilder::FillVectorTranspose},
+ {LloOpcode::kVectorTransposeResult,
+ &LlvmModuleBuilder::FillVectorTransposeResult},
+ };
+ for (const auto& pair : kPairs) {
+ fillers_[pair.first] = [=](const LloInstruction* instruction) {
+ return (this->*(pair.second))(instruction);
+ };
+ }
+ }
+}
+
+void LlvmModuleBuilder::InitializeBinopFillers() {
+ // Register binary and binary-like ops.
+ const auto binary = [&](LlvmValueTransformer arg_transformer,
+ const std::function<StatusOr<llvm::Value*>(
+ llvm::Value*, llvm::Value*)>& func,
+ LlvmValueTransformer result_transformer =
+ &LlvmModuleBuilder::EnsureScalarOrVectorInt) {
+ return [=](const LloInstruction* instruction) -> StatusOr<llvm::Value*> {
+ TF_RET_CHECK(!instruction->PredicateOrNull().has_value());
+ TF_RET_CHECK(instruction->operands_size() == 2);
+ TF_ASSIGN_OR_RETURN(auto lhs, LlvmValue(instruction->operands(0)));
+ TF_ASSIGN_OR_RETURN(lhs, (this->*arg_transformer)(lhs));
+ TF_ASSIGN_OR_RETURN(auto rhs, LlvmValue(instruction->operands(1)));
+ TF_ASSIGN_OR_RETURN(rhs, (this->*arg_transformer)(rhs));
+ TF_ASSIGN_OR_RETURN(auto result, func(lhs, rhs));
+ return (this->*result_transformer)(result);
+ };
+ };
+
+ {
+ // Scalar INT operations.
+ const auto ensure_scalar_i32 = &LlvmModuleBuilder::EnsureScalarInt;
+
+ fillers_[LloOpcode::kScalarAddS32] =
+ binary(ensure_scalar_i32, [this](llvm::Value* lhs, llvm::Value* rhs) {
+ return builder_.CreateAdd(lhs, rhs);
+ });
+ fillers_[LloOpcode::kScalarBitwiseAnd] =
+ binary(ensure_scalar_i32, [this](llvm::Value* lhs, llvm::Value* rhs) {
+ return builder_.CreateAnd(lhs, rhs);
+ });
+ fillers_[LloOpcode::kScalarBitwiseOr] =
+ binary(ensure_scalar_i32, [this](llvm::Value* lhs, llvm::Value* rhs) {
+ return builder_.CreateOr(lhs, rhs);
+ });
+ fillers_[LloOpcode::kScalarBitwiseXor] =
+ binary(ensure_scalar_i32, [this](llvm::Value* lhs, llvm::Value* rhs) {
+ return builder_.CreateXor(lhs, rhs);
+ });
+ fillers_[LloOpcode::kScalarMultiplyU24] =
+ binary(ensure_scalar_i32, [this](llvm::Value* lhs, llvm::Value* rhs) {
+ llvm::Value* mask = builder_.getInt32(0x0ffffff);
+ return builder_.CreateMul(builder_.CreateAnd(lhs, mask),
+ builder_.CreateAnd(rhs, mask));
+ });
+ fillers_[LloOpcode::kScalarMultiplyU32] =
+ binary(ensure_scalar_i32, [this](llvm::Value* lhs, llvm::Value* rhs) {
+ return builder_.CreateMul(lhs, rhs);
+ });
+ fillers_[LloOpcode::kScalarShll] =
+ binary(ensure_scalar_i32, [this](llvm::Value* lhs, llvm::Value* rhs) {
+ // LLVM codegen will generate efficient code and pattern match
+ // this to a single shift.
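+ // i.e. (rhs & 31) == rhs ? lhs << rhs : 0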
+ return builder_.CreateSelect(
+ builder_.CreateICmpEQ(
+ builder_.CreateAnd(rhs, builder_.getInt32(31)), rhs),
+ builder_.CreateShl(lhs, rhs), builder_.getInt32(0));
+ });
+ fillers_[LloOpcode::kScalarShrl] =
+ binary(ensure_scalar_i32, [this](llvm::Value* lhs, llvm::Value* rhs) {
+ // LLVM codegen will generate efficient code and pattern match
+ // this to a single shift.
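+ // i.e. (rhs & 31) == rhs ? lhs >> rhs : 0 (logical shift)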
+ return builder_.CreateSelect(
+ builder_.CreateICmpEQ(
+ builder_.CreateAnd(rhs, builder_.getInt32(31)), rhs),
+ builder_.CreateLShr(lhs, rhs), builder_.getInt32(0));
+ });
+ fillers_[LloOpcode::kScalarShra] =
+ binary(ensure_scalar_i32, [this](llvm::Value* lhs, llvm::Value* rhs) {
+ // LLVM codegen will generate efficient code and pattern match
+ // this to a single shift.
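+ // i.e. lhs >> ((rhs & 31) == rhs ? rhs : 31) (arithmetic shift)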
+ return builder_.CreateAShr(
+ lhs, builder_.CreateSelect(
+ builder_.CreateICmpEQ(
+ builder_.CreateAnd(rhs, builder_.getInt32(31)), rhs),
+ rhs, builder_.getInt32(31)));
+ });
+ fillers_[LloOpcode::kScalarSubtractS32] =
+ binary(ensure_scalar_i32, [this](llvm::Value* lhs, llvm::Value* rhs) {
+ return builder_.CreateSub(lhs, rhs);
+ });
+ }
+
+ {
+ // Scalar FLOAT operations.
+ const auto ensure_scalar_f32 = &LlvmModuleBuilder::EnsureScalarFloat;
+
+ fillers_[LloOpcode::kScalarAddF32] =
+ binary(ensure_scalar_f32, [this](llvm::Value* lhs, llvm::Value* rhs) {
+ return builder_.CreateFAdd(lhs, rhs);
+ });
+ fillers_[LloOpcode::kScalarMultiplyF32] =
+ binary(ensure_scalar_f32, [this](llvm::Value* lhs, llvm::Value* rhs) {
+ return builder_.CreateFMul(lhs, rhs);
+ });
+ fillers_[LloOpcode::kScalarSubtractF32] =
+ binary(ensure_scalar_f32, [this](llvm::Value* lhs, llvm::Value* rhs) {
+ return builder_.CreateFSub(lhs, rhs);
+ });
+ }
+
+ {
+ // Vector INT operations.
+ const auto ensure_vector_i32 = &LlvmModuleBuilder::EnsureVectorInt;
+
+ fillers_[LloOpcode::kVectorAddS32] =
+ binary(ensure_vector_i32, [this](llvm::Value* lhs, llvm::Value* rhs) {
+ return builder_.CreateAdd(lhs, rhs);
+ });
+ fillers_[LloOpcode::kVectorAndU32] =
+ binary(ensure_vector_i32, [this](llvm::Value* lhs, llvm::Value* rhs) {
+ return builder_.CreateAnd(lhs, rhs);
+ });
+ fillers_[LloOpcode::kVectorOrU32] =
+ binary(ensure_vector_i32, [this](llvm::Value* lhs, llvm::Value* rhs) {
+ return builder_.CreateOr(lhs, rhs);
+ });
+ fillers_[LloOpcode::kVectorXorU32] =
+ binary(ensure_vector_i32, [this](llvm::Value* lhs, llvm::Value* rhs) {
+ return builder_.CreateXor(lhs, rhs);
+ });
+ // TODO(b/144862005): Use TPU shift intrinsics to remove the range
+ // checking overhead, though keeping in mind that the LLVM optimizer may
+ // be less effective.
+ fillers_[LloOpcode::kVectorShiftLeftLogical] = binary(
+ ensure_vector_i32,
+ [this](llvm::Value* lhs, llvm::Value* rhs) -> StatusOr<llvm::Value*> {
+ // If rhs is outside the range [0:31], return 0 per hardware spec.
+ // (rhs & 31) == rhs ? lhs<<rhs : 0
+ TF_ASSIGN_OR_RETURN(lhs, EnsureVectorInt(lhs));
+ TF_ASSIGN_OR_RETURN(rhs, EnsureVectorInt(rhs));
+ return builder_.CreateSelect(
+ builder_.CreateICmpEQ(
+ builder_.CreateAnd(
+ rhs, builder_.CreateVectorSplat(vector_size_,
+ builder_.getInt32(31))),
+ rhs),
+ builder_.CreateShl(lhs, rhs),
+ builder_.CreateVectorSplat(vector_size_, builder_.getInt32(0)));
+ });
+ fillers_[LloOpcode::kVectorShiftRightLogical] = binary(
+ ensure_vector_i32,
+ [this](llvm::Value* lhs, llvm::Value* rhs) -> StatusOr<llvm::Value*> {
+ // If rhs is outside the range [0:31], return 0 per hardware spec.
+ // (rhs & 31) == rhs ? lhs>>rhs : 0
+ TF_ASSIGN_OR_RETURN(lhs, EnsureVectorInt(lhs));
+ TF_ASSIGN_OR_RETURN(rhs, EnsureVectorInt(rhs));
+ return builder_.CreateSelect(
+ builder_.CreateICmpEQ(
+ builder_.CreateAnd(
+ rhs, builder_.CreateVectorSplat(vector_size_,
+ builder_.getInt32(31))),
+ rhs),
+ builder_.CreateLShr(lhs, rhs),
+ builder_.CreateVectorSplat(vector_size_, builder_.getInt32(0)));
+ });
+ fillers_[LloOpcode::kVectorShiftRightArithmetic] = binary(
+ ensure_vector_i32,
+ [this](llvm::Value* lhs, llvm::Value* rhs) -> StatusOr<llvm::Value*> {
+ // If rhs is outside the range [0:31], return a replication of the
+ // sign bit per hardware spec, by arithmetic-right-shifting 31 bits.
+ // lhs >> ((rhs & 31) == rhs ? rhs : 31)
+ TF_ASSIGN_OR_RETURN(lhs, EnsureVectorInt(lhs));
+ TF_ASSIGN_OR_RETURN(rhs, EnsureVectorInt(rhs));
+ return builder_.CreateAShr(
+ lhs, builder_.CreateSelect(
+ builder_.CreateICmpEQ(
+ builder_.CreateAnd(
+ rhs, builder_.CreateVectorSplat(
+ vector_size_, builder_.getInt32(31))),
+ rhs),
+ rhs,
+ builder_.CreateVectorSplat(vector_size_,
+ builder_.getInt32(31))));
+ });
+ fillers_[LloOpcode::kVectorSubtractS32] =
+ binary(ensure_vector_i32, [this](llvm::Value* lhs, llvm::Value* rhs) {
+ return builder_.CreateSub(lhs, rhs);
+ });
+ }
+
+ {
+ // Vector FLOAT operations.
+ const auto ensure_vector_f32 = &LlvmModuleBuilder::EnsureVectorFloat;
+
+ fillers_[LloOpcode::kVectorAddF32] =
+ binary(ensure_vector_f32, [this](llvm::Value* lhs, llvm::Value* rhs) {
+ return builder_.CreateFAdd(lhs, rhs);
+ });
+ fillers_[LloOpcode::kVectorMultiplyF32] =
+ binary(ensure_vector_f32, [this](llvm::Value* lhs, llvm::Value* rhs) {
+ return builder_.CreateFMul(lhs, rhs);
+ });
+ fillers_[LloOpcode::kVectorClampGezF32] = binary(
+ ensure_vector_f32,
+ [this](llvm::Value* lhs, llvm::Value* rhs) -> StatusOr<llvm::Value*> {
+ // Model vclamp-gez(x, y) as:
+ // (fmaximum (fminimum node:$x, node:$y), (Splat FPZero))
+
+ std::vector<llvm::Value*> min_args{lhs, rhs};
+ TF_ASSIGN_OR_RETURN(
+ auto min,
+ CreateIntrinsicCallWithArgs(
+ llvm::Intrinsic::getDeclaration(
+ module(), llvm::Intrinsic::minimum, {VectorFloatTy()}),
+ {min_args.data(), min_args.size()}));
+
+ std::vector<llvm::Value*> max_args{
+ min,
+ builder_.CreateVectorSplat(
+ vector_size_, builder_.CreateBitCast(builder_.getInt32(0),
+ ScalarFloatTy()))};
+ return CreateIntrinsicCallWithArgs(
+ llvm::Intrinsic::getDeclaration(
+ module(), llvm::Intrinsic::maximum, {VectorFloatTy()}),
+ {max_args.data(), max_args.size()});
+ });
+ fillers_[LloOpcode::kVectorSubtractF32] =
+ binary(ensure_vector_f32, [this](llvm::Value* lhs, llvm::Value* rhs) {
+ return builder_.CreateFSub(lhs, rhs);
+ });
+ }
+
+ {
+ // Predication and vector mask ops.
+ const auto binary_simple = [&](const std::function<llvm::Value*(
+ llvm::Value*, llvm::Value*)>& func) {
+ return binary(&LlvmModuleBuilder::EnsureNoOp, func,
+ /*result_transformer=*/&LlvmModuleBuilder::EnsureNoOp);
+ };
+
+ fillers_[LloOpcode::kPredicateNand] =
+ binary_simple([this](llvm::Value* lhs, llvm::Value* rhs) {
+ return builder_.CreateOr(
+ builder_.CreateICmpEQ(lhs, builder_.getFalse()),
+ builder_.CreateICmpEQ(rhs, builder_.getFalse()));
+ });
+ fillers_[LloOpcode::kPredicateOr] =
+ binary_simple([this](llvm::Value* lhs, llvm::Value* rhs) {
+ return builder_.CreateOr(lhs, rhs);
+ });
+
+ fillers_[LloOpcode::kVectorMaskAnd] =
+ binary_simple([this](llvm::Value* lhs, llvm::Value* rhs) {
+ return builder_.CreateAnd(lhs, rhs);
+ });
+ fillers_[LloOpcode::kVectorMaskOr] =
+ binary_simple([this](llvm::Value* lhs, llvm::Value* rhs) {
+ return builder_.CreateOr(lhs, rhs);
+ });
+ fillers_[LloOpcode::kVectorMaskXor] =
+ binary_simple([this](llvm::Value* lhs, llvm::Value* rhs) {
+ return builder_.CreateXor(lhs, rhs);
+ });
+ }
+
+ {
+ // Special ops.
+ const auto ensure_nothing = &LlvmModuleBuilder::EnsureNoOp;
+
+ fillers_[LloOpcode::kScalarAddressCalculation] =
+ [=](const LloInstruction* instruction) -> StatusOr<llvm::Value*> {
+ const auto memory_space = instruction->memory_space();
+ return binary(&LlvmModuleBuilder::EnsureNoOp, [=](llvm::Value* lhs,
+ llvm::Value* rhs) {
+ return CreateScalarAddressCalculation(lhs, rhs, memory_space);
+ })(instruction);
+ };
+
+ fillers_[LloOpcode::kScalarConvertF32ToS32WithProbRounding] = binary(
+ &LlvmModuleBuilder::EnsureScalarFloat,
+ [this](llvm::Value* value, llvm::Value* mode) {
+ mode = builder_.CreateBitCast(mode, builder_.getInt32Ty());
+ CHECK(llvm::isa<llvm::ConstantInt>(mode)) << ValueAsString(mode);
+ CHECK(llvm::cast<llvm::ConstantInt>(mode)->equalsInt(0xFFFFFFFF))
+ << ValueAsString(mode);
+ return builder_.CreateFPToSI(value, ScalarIntTy());
+ },
+ /*result_transformer=*/ensure_nothing);
+
+ fillers_[LloOpcode::kVectorConvertF32ToS32WithProbRounding] = binary(
+ &LlvmModuleBuilder::EnsureVectorFloat,
+ [this](llvm::Value* value, llvm::Value* mode) {
+ // TODO(vnukov): we need to check whether the second parameter is
+ // kTowardsZero.
+ return builder_.CreateFPToSI(value, VectorIntTy());
+ },
+ /*result_transformer=*/ensure_nothing);
+ }
+}
+
+void LlvmModuleBuilder::InitializeUnopFillers() {
+ // Register unary and unary-like ops.
+ const auto unary =
+ [&](const std::function<StatusOr<llvm::Value*>(llvm::Value*)>& func,
+ LlvmValueTransformer arg_transformer = &LlvmModuleBuilder::EnsureNoOp,
+ LlvmValueTransformer result_transformer =
+ &LlvmModuleBuilder::EnsureNoOp) {
+ return
+ [=](const LloInstruction* instruction) -> StatusOr<llvm::Value*> {
+ TF_RET_CHECK(!instruction->PredicateOrNull().has_value());
+ TF_RET_CHECK(instruction->operands_size() == 1);
+ TF_ASSIGN_OR_RETURN(auto arg,
+ LlvmValue(instruction->operands(0)));
+ TF_ASSIGN_OR_RETURN(arg, (this->*arg_transformer)(arg));
+ TF_ASSIGN_OR_RETURN(auto result, func(arg));
+ return (this->*result_transformer)(result);
+ };
+ };
+
+ {
+ // Moves.
+ fillers_[LloOpcode::kPredicateMove] =
+ unary([](llvm::Value* arg) { return arg; });
+ fillers_[LloOpcode::kSyncFlagToScalarPush] = unary([](llvm::Value* arg) {
+ // Modeled as a load in the kScalarV2SPop filler.
+ return arg;
+ });
+ fillers_[LloOpcode::kScalarMove] =
+ unary([](llvm::Value* arg) { return arg; });
+ fillers_[LloOpcode::kScalarExtractLow32] = unary([this](llvm::Value* arg) {
+ // Transform a scalar extract-low into an LLVM extractvalue of
+ // the first element of a { i32, i32 } structure.
+ CHECK(arg->getType()->isStructTy() &&
+ arg->getType()->getStructElementType(0)->isIntegerTy(32) &&
+ arg->getType()->getStructElementType(1)->isIntegerTy(32));
+ return builder_.CreateExtractValue(arg, {0});
+ });
+ fillers_[LloOpcode::kScalarExtractHigh32] = unary([this](llvm::Value* arg) {
+ // Transform a scalar extract-high into an LLVM extractvalue of
+ // the second element of a { i32, i32 } structure.
+ CHECK(arg->getType()->isStructTy() &&
+ arg->getType()->getStructElementType(0)->isIntegerTy(32) &&
+ arg->getType()->getStructElementType(1)->isIntegerTy(32));
+ return builder_.CreateExtractValue(arg, {1});
+ });
+ {
+ const auto pop_vector = unary([this](llvm::Value* arg) {
+ // Note that 'arg' is expected to represent the original vector value;
+ // here we model the pop as extracting element 0 from it.
+ return builder_.CreateExtractElement(arg, builder_.getInt32(0));
+ });
+ const auto pop_sflag = unary([this](llvm::Value* arg) {
+ // Note that 'arg' is expected to represent the original sflag pointer
+ // value; here we model the pop as a load from that address.
+ return builder_.CreateLoad(ScalarIntTy(), arg);
+ });
+
+ fillers_[LloOpcode::kScalarV2SPop] =
+ [=](const LloInstruction* instruction) -> StatusOr<llvm::Value*> {
+ TF_RET_CHECK(instruction->operands_size());
+ switch (instruction->operands(0)->opcode()) {
+ case LloOpcode::kVectorToScalarPush:
+ return pop_vector(instruction);
+ case LloOpcode::kSyncFlagToScalarPush:
+ return pop_sflag(instruction);
+ default:
+ return InternalError(
+ "Unsupported argument to v2s pop: %s",
+ LloOpcodeString(instruction->operands(0)->opcode()));
+ }
+ };
+ fillers_[LloOpcode::kVectorToScalarPseudo] = pop_vector;
+ }
+ fillers_[LloOpcode::kVectorMaskMove] =
+ unary([](llvm::Value* arg) { return arg; });
+ fillers_[LloOpcode::kVectorMove] =
+ unary([](llvm::Value* arg) { return arg; });
+ fillers_[LloOpcode::kVectorToScalarPush] = unary([](llvm::Value* arg) {
+ // Modeled as an extract-element in the kScalarV2SPop filler.
+ return arg;
+ });
+ fillers_[LloOpcode::kVectorCountLeadingZeros] =
+ unary([=](llvm::Value* arg) -> StatusOr<llvm::Value*> {
+ llvm::Function* callee = llvm::Intrinsic::getDeclaration(
+ module(), llvm::Intrinsic::ctlz, {VectorIntTy()});
+ std::vector<llvm::Value*> args{arg, builder_.getInt1(false)};
+ return CreateIntrinsicCallWithArgs(callee, {args.data(), args.size()});
+ });
+ }
+
+ {
+ // Predication and vector mask ops.
+ fillers_[LloOpcode::kPredicateNegate] = unary([this](llvm::Value* arg) {
+ return builder_.CreateICmpEQ(arg, builder_.getFalse());
+ });
+ fillers_[LloOpcode::kVectorMaskNegate] = unary([this](llvm::Value* arg) {
+ return builder_.CreateXor(
+ arg, builder_.CreateVectorSplat(vector_size_, LlvmTrue()));
+ });
+ }
+
+ {
+ // Misc ops.
+ fillers_[LloOpcode::kIntToPtr] =
+ [=](const LloInstruction* instruction) -> StatusOr<llvm::Value*> {
+ TF_ASSIGN_OR_RETURN(llvm::Type * ptr_type, LlvmType(instruction));
+ return unary([this, ptr_type](llvm::Value* arg) {
+ return CreateIntToPointer(arg, ptr_type);
+ })(instruction);
+ };
+ fillers_[LloOpcode::kScalarConvertS32ToF32] =
+ unary([this](llvm::Value* arg) {
+ return builder_.CreateSIToFP(arg, ScalarFloatTy());
+ });
+ fillers_[LloOpcode::kScalarToVector] = unary(
+ [this](llvm::Value* arg) {
+ return builder_.CreateVectorSplat(vector_size_, arg);
+ },
+ /*arg_transformer=*/&LlvmModuleBuilder::EnsureScalarInt,
+ /*result_transformer=*/&LlvmModuleBuilder::EnsureVectorInt);
+ fillers_[LloOpcode::kVectorConvertS32ToF32] =
+ unary([this](llvm::Value* arg) {
+ return builder_.CreateSIToFP(arg, VectorFloatTy());
+ });
+ fillers_[LloOpcode::kVectorDelay] = unary([this](llvm::Value* arg) {
+ return llvm::UndefValue::get(builder_.getVoidTy()); // TODO(b/140110381)
+ });
+ }
+ fillers_[LloOpcode::kVectorSetIarRaw] =
+ [=](const LloInstruction* instruction) -> StatusOr<llvm::Value*> {
+ return unary(
+ [this, instruction](llvm::Value* arg) -> StatusOr<llvm::Value*> {
+ const std::optional<uint32_t> iar = instruction->iar();
+ CHECK(iar.has_value());
+ std::vector<llvm::Value*> args{arg, builder_.getInt32(*iar)};
+ if (iar == kVectorLoadReplicateEvenOddSublanes) {
+ TF_ASSIGN_OR_RETURN(
+ last_issued_load_iar_token_,
+ CreateIntrinsicCallWithArgs(llvm::Intrinsic::tpu_set_iar_raw,
+ {args.data(), args.size()}));
+ return last_issued_load_iar_token_;
+ }
+ CHECK(iar == kVectorStoreEvenOddSublanesIar);
+ TF_ASSIGN_OR_RETURN(
+ last_issued_store_iar_token_,
+ CreateIntrinsicCallWithArgs(llvm::Intrinsic::tpu_set_iar_raw,
+ {args.data(), args.size()}));
+ return last_issued_store_iar_token_;
+ })(instruction);
+ };
+ fillers_[LloOpcode::kScalarCountLeadingZeros] =
+ unary([=](llvm::Value* arg) -> StatusOr<llvm::Value*> {
+ llvm::Function* callee = llvm::Intrinsic::getDeclaration(
+ module(), llvm::Intrinsic::ctlz, {ScalarIntTy()});
+ std::vector<llvm::Value*> args{arg, builder_.getInt1(false)};
+ return CreateIntrinsicCallWithArgs(callee, {args.data(), args.size()});
+ });
+}
+
+void LlvmModuleBuilder::InitializeIntrinsicsFillers() {
+ using llvm::Intrinsic::ID;
+
+ // Creates a predicated call for the given predicate, callee, and arguments.
+ const auto predicated_call =
+ [this](const std::optional<PredicateAndPolarity>& predicate,
+ llvm::FunctionCallee callee, std::vector<llvm::Value*>* args) {
+ absl::Span<llvm::Value*> arguments{args->data(), args->size()};
+ return FillAsPredicated(
+ predicate,
+ [&] { return CreateIntrinsicCallWithArgs(callee, arguments); },
+ &LlvmModuleBuilder::FallThroughBlockTerminator);
+ };
+
+ {
+ // Create fillers mapping LLO operation operands to intrinsic args 1:1.
+ const auto filler = [=](ID id, std::vector<llvm::Type*> types = {}) {
+ return [=](const LloInstruction* instruction) -> StatusOr<llvm::Value*> {
+ std::vector<llvm::Value*> args;
+ for (LloValue* operand : instruction->operands()) {
+ TF_ASSIGN_OR_RETURN(const auto arg_value, LlvmValue(operand));
+ args.push_back(arg_value);
+ }
+ return predicated_call(
+ instruction->PredicateOrNull(),
+ llvm::Intrinsic::getDeclaration(module(), id, types), &args);
+ };
+ };
+
+ fillers_[LloOpcode::kDma] = filler(llvm::Intrinsic::tpu_dma_descriptor);
+ // NOTE: kDmaDoneWait is the same as kVectorWaitGe.
+ fillers_[LloOpcode::kDmaDoneWait] = filler(llvm::Intrinsic::tpu_waitge);
+ fillers_[LloOpcode::kScalarAddCarryU32] =
+ filler(llvm::Intrinsic::tpu_addcarry, {PredicateTy()});
+ fillers_[LloOpcode::kScalarWeird] = filler(llvm::Intrinsic::tpu_weird_f32);
+ fillers_[LloOpcode::kScalarMaximumF32] =
+ filler(llvm::Intrinsic::maximum, {ScalarFloatTy()});
+ fillers_[LloOpcode::kScalarMinimumF32] =
+ filler(llvm::Intrinsic::minimum, {ScalarFloatTy()});
+ fillers_[LloOpcode::kVectorAddCarryU32] =
+ filler(llvm::Intrinsic::tpu_addcarry, {VectorMaskTy()});
+ fillers_[LloOpcode::kVectorClampSymmetricF32] =
+ filler(llvm::Intrinsic::tpu_clamp_symmetric, {VectorFloatTy()});
+ fillers_[LloOpcode::kVectorEupResult] =
+ filler(llvm::Intrinsic::tpu_eup_pop, {VectorFloatTy()});
+ fillers_[LloOpcode::kVectorExtractSignificand] = filler(
+ llvm::Intrinsic::tpu_significand, {VectorIntTy(), VectorFloatTy()});
+ fillers_[LloOpcode::kVectorGetRngSeed] =
+ filler(llvm::Intrinsic::tpu_tc_getrngseed);
+ fillers_[LloOpcode::kVectorLaneSequence] =
+ filler(llvm::Intrinsic::tpu_vlaneseq, {VectorIntTy()});
+ fillers_[LloOpcode::kVectorLog2F32] =
+ filler(llvm::Intrinsic::tpu_log2, {VectorFloatTy()});
+ fillers_[LloOpcode::kVectorLog2F32AndPop] =
+ filler(llvm::Intrinsic::tpu_log2_macro, {VectorFloatTy()});
+ fillers_[LloOpcode::kVectorMaximumF32] =
+ filler(llvm::Intrinsic::maximum, {VectorFloatTy()});
+ fillers_[LloOpcode::kVectorMinimumF32] =
+ filler(llvm::Intrinsic::minimum, {VectorFloatTy()});
+ fillers_[LloOpcode::kVectorPow2F32] =
+ filler(llvm::Intrinsic::tpu_pow2, {VectorFloatTy()});
+ fillers_[LloOpcode::kVectorPow2F32AndPop] =
+ filler(llvm::Intrinsic::tpu_pow2_macro, {VectorFloatTy()});
+ fillers_[LloOpcode::kVectorPrng] = filler(llvm::Intrinsic::tpu_tc_vrng);
+ fillers_[LloOpcode::kVectorReciprocalF32] =
+ filler(llvm::Intrinsic::tpu_rcp, {VectorFloatTy()});
+ fillers_[LloOpcode::kVectorReciprocalF32AndPop] =
+ filler(llvm::Intrinsic::tpu_rcp_macro, {VectorFloatTy()});
+ fillers_[LloOpcode::kVectorRsqrtF32] =
+ filler(llvm::Intrinsic::tpu_rsqrt, {VectorFloatTy()});
+ fillers_[LloOpcode::kVectorRsqrtF32AndPop] =
+ filler(llvm::Intrinsic::tpu_rsqrt_macro, {VectorFloatTy()});
+ fillers_[LloOpcode::kVectorSetRngSeed] =
+ filler(llvm::Intrinsic::tpu_tc_setrngseed);
+ fillers_[LloOpcode::kVectorSetTracemark] =
+ filler(llvm::Intrinsic::tpu_vsettm);
+ fillers_[LloOpcode::kVectorSublaneRotateTZ] =
+ filler(llvm::Intrinsic::tpu_vrot_sublane_down, {VectorIntTy()});
+ fillers_[LloOpcode::kVectorSyncFlagAdd] =
+ filler(llvm::Intrinsic::tpu_syncadd);
+ fillers_[LloOpcode::kVectorSyncFlagAddRemote] =
+ filler(llvm::Intrinsic::tpu_syncadd_remote);
+ fillers_[LloOpcode::kVectorTanhF32] =
+ filler(llvm::Intrinsic::tpu_tanh, {VectorFloatTy()});
+ fillers_[LloOpcode::kVectorTanhF32AndPop] =
+ filler(llvm::Intrinsic::tpu_tanh_macro, {VectorFloatTy()});
+ fillers_[LloOpcode::kVectorTrace] = filler(llvm::Intrinsic::tpu_vtrace);
+ fillers_[LloOpcode::kVectorWaitEq] = filler(llvm::Intrinsic::tpu_waiteq);
+ fillers_[LloOpcode::kVectorWaitGe] = filler(llvm::Intrinsic::tpu_waitge);
+ fillers_[LloOpcode::kVectorWaitGt] = filler(llvm::Intrinsic::tpu_waitgt);
+ fillers_[LloOpcode::kVectorWaitLe] = filler(llvm::Intrinsic::tpu_waitle);
+ fillers_[LloOpcode::kVectorWaitLt] = filler(llvm::Intrinsic::tpu_waitlt);
+ fillers_[LloOpcode::kVectorWaitNe] = filler(llvm::Intrinsic::tpu_waitne);
+ fillers_[LloOpcode::kVectorWeird] =
+ filler(llvm::Intrinsic::tpu_weird, {VectorMaskTy()});
+ }
+
+ {
+ // Create fillers for a few LLO operations that carry a unit_id.
+ const auto filler = [=](ID id, const std::vector<llvm::Type*>& types = {}) {
+ return [=](const LloInstruction* instruction) -> StatusOr<llvm::Value*> {
+ TF_RET_CHECK(instruction->unit_id().has_value());
+ const int32_t unit_id = *instruction->unit_id();
+
+ std::vector<llvm::Value*> args;
+ for (const auto operand : instruction->operands()) {
+ TF_ASSIGN_OR_RETURN(const auto arg, LlvmValue(operand));
+ args.push_back(arg);
+ }
+ args.push_back(builder_.getInt32(unit_id));
+
+ return predicated_call(
+ instruction->PredicateOrNull(),
+ llvm::Intrinsic::getDeclaration(module(), id, types), &args);
+ };
+ };
+
+ fillers_[LloOpcode::kVectorAddReduceF32] =
+ filler(llvm::Intrinsic::tpu_xlane_add);
+ fillers_[LloOpcode::kVectorMaxReduceF32] =
+ filler(llvm::Intrinsic::tpu_xlane_max);
+ fillers_[LloOpcode::kVectorMaxIndexReduceF32] =
+ filler(llvm::Intrinsic::tpu_xlane_maxindex);
+ fillers_[LloOpcode::kVectorMinReduceF32] =
+ filler(llvm::Intrinsic::tpu_xlane_min);
+ fillers_[LloOpcode::kVectorMinIndexReduceF32] =
+ filler(llvm::Intrinsic::tpu_xlane_minindex);
+ fillers_[LloOpcode::kVectorRotate] =
+ filler(llvm::Intrinsic::tpu_vrotate, {VectorIntTy()});
+ }
+
+ {
+ // Create fillers for DMA.LOCAL LLO operations.
+ const auto filler = [=](ID id) {
+ return [=](const LloInstruction* instruction) -> StatusOr<llvm::Value*> {
+ TF_RET_CHECK(instruction->operands_size() == 4);
+ TF_ASSIGN_OR_RETURN(const auto src,
+ LlvmValue(instruction->operands(0)));
+ TF_ASSIGN_OR_RETURN(const auto size,
+ LlvmValue(instruction->operands(1)));
+ TF_ASSIGN_OR_RETURN(const auto dest,
+ LlvmValue(instruction->operands(2)));
+ TF_ASSIGN_OR_RETURN(const auto sync_flag,
+ LlvmValue(instruction->operands(3)));
+ std::vector<llvm::Value*> args{sync_flag, src, dest, size};
+ return predicated_call(
+ instruction->PredicateOrNull(),
+ llvm::Intrinsic::getDeclaration(
+ module(), id,
+ {builder_.getIntNTy(CHAR_BIT * hbm_pointer_size_)
+ ->getPointerTo(llvm::TPUAddressSpace::TPUAS_Hbm)}),
+ &args);
+ };
+ };
+
+ fillers_[LloOpcode::kDmaHbmToSmem] =
+ filler(llvm::Intrinsic::tpu_dma_hbm_to_smem);
+ fillers_[LloOpcode::kDmaHbmToVmem] =
+ filler(llvm::Intrinsic::tpu_dma_hbm_to_vmem);
+ fillers_[LloOpcode::kDmaHbmToVmemWithHibUpdate] =
+ filler(llvm::Intrinsic::tpu_dma_hbm_to_vmem_hib_update);
+ fillers_[LloOpcode::kDmaSmemToHbm] =
+ filler(llvm::Intrinsic::tpu_dma_smem_to_hbm);
+ fillers_[LloOpcode::kDmaVmemToHbm] =
+ filler(llvm::Intrinsic::tpu_dma_vmem_to_hbm);
+
+ // Create a special version of the filler for DMA.LOCAL to HIB, which does
+ // not have a destination address.
+ const auto filler_with_hib_dest = [=](ID id) {
+ return [=](const LloInstruction* instruction) -> StatusOr<llvm::Value*> {
+ TF_RET_CHECK(instruction->operands_size() == 4);
+ TF_ASSIGN_OR_RETURN(const auto src,
+ LlvmValue(instruction->operands(0)));
+ TF_ASSIGN_OR_RETURN(const auto size,
+ LlvmValue(instruction->operands(1)));
+ TF_ASSIGN_OR_RETURN(const auto sync_flag,
+ LlvmValue(instruction->operands(3)));
+
+ // Make sure we see the expected value of the destination.
+ const LloValue* const llo_dest = instruction->operands(2);
+ TF_RET_CHECK(llo_dest->opcode() == LloOpcode::kIntToPtr);
+ TF_RET_CHECK(llo_dest->memory_space() == MemorySpace::kHib);
+ TF_RET_CHECK(llo_dest->operands(0)->opcode() ==
+ LloOpcode::kScalarConstantU32);
+ TF_RET_CHECK(llo_dest->operands(0)->GetConstantAsS32() == -1);
+
+ std::vector<llvm::Value*> args{sync_flag, src, size};
+ return predicated_call(
+ instruction->PredicateOrNull(),
+ llvm::Intrinsic::getDeclaration(
+ module(), id,
+ {builder_.getIntNTy(CHAR_BIT * hbm_pointer_size_)
+ ->getPointerTo(llvm::TPUAddressSpace::TPUAS_Hbm)}),
+ &args);
+ };
+ };
+
+ fillers_[LloOpcode::kDmaHbmToHib] =
+ filler_with_hib_dest(llvm::Intrinsic::tpu_dma_hbm_to_hib);
+ }
+
+ {
+ // Create fillers for FIFO pop operations.
+ const auto filler = [=](ID id, const std::vector<llvm::Type*>& types = {}) {
+ return [=](const LloInstruction* instruction) -> StatusOr<llvm::Value*> {
+ TF_RET_CHECK(instruction->operands_size() == 1);
+ TF_ASSIGN_OR_RETURN(const auto request,
+ LlvmValue(instruction->operands(0)));
+
+ TF_RET_CHECK(instruction->unit_id().has_value());
+ const int32_t unit_id = *instruction->unit_id();
+
+ std::vector<llvm::Value*> args{builder_.getInt32(unit_id), request};
+ return predicated_call(
+ instruction->PredicateOrNull(),
+ llvm::Intrinsic::getDeclaration(module(), id, types), &args);
+ };
+ };
+
+ fillers_[LloOpcode::kVectorPermuteResult] =
+ filler(llvm::Intrinsic::tpu_tc_vtrfpop, {VectorIntTy()});
+ fillers_[LloOpcode::kVectorXlaneResult] =
+ filler(llvm::Intrinsic::tpu_tc_vtrfpop, {VectorIntTy()});
+ }
+
+ {
+ const auto filler = [=](std::vector<llvm::Type*> types = {}) {
+ return [=](const LloInstruction* instruction) -> StatusOr<llvm::Value*> {
+ std::vector<llvm::Value*> args;
+ for (LloValue* operand : instruction->operands()) {
+ TF_ASSIGN_OR_RETURN(const auto arg_value, LlvmValue(operand));
+ args.push_back(arg_value);
+ }
+ ID id;
+ switch (instruction->vpack_format()) {
+ case VpackFormat::kInterleavedBf16:
+ id = llvm::Intrinsic::tpu_pack;
+ break;
+ case VpackFormat::kCompressedBf16:
+ id = llvm::Intrinsic::tpu_packc;
+ break;
+ default:
+ return InternalError("not yet implemented");
+ }
+ return predicated_call(
+ instruction->PredicateOrNull(),
+ llvm::Intrinsic::getDeclaration(module(), id, types), &args);
+ };
+ };
+
+ fillers_[LloOpcode::kVectorPack] = filler({VectorFloatTy()});
+ }
+}
+
+StatusOr<llvm::BasicBlock*> LlvmModuleBuilder::CreateBasicBlock(
+ llvm::Twine name) {
+ return llvm::BasicBlock::Create(context(), name, llvm_function());
+}
+
+StatusOr<llvm::BasicBlock*> LlvmModuleBuilder::GetFirstRegionBlock(
+ const LloRegion* region) {
+ const auto block_it = first_region_to_block_.find(region);
+ TF_RET_CHECK(block_it != first_region_to_block_.end());
+ TF_RET_CHECK(block_it->second != nullptr);
+ return block_it->second;
+}
+
+StatusOr<llvm::PointerType*> LlvmModuleBuilder::PointerTy(MemorySpace space) {
+ switch (space) {
+ case MemorySpace::kHbm:
+ return builder_.getIntNTy(hbm_pointer_size_ * CHAR_BIT)
+ ->getPointerTo(llvm::TPUAddressSpace::TPUAS_Hbm);
+ case MemorySpace::kSmem:
+ return ScalarIntTy()->getPointerTo(llvm::TPUAddressSpace::TPUAS_Smem);
+ case MemorySpace::kVmem:
+ return VectorIntTy()->getPointerTo(llvm::TPUAddressSpace::TPUAS_Vmem);
+ case MemorySpace::kSflag:
+ return ScalarIntTy()->getPointerTo(llvm::TPUAddressSpace::TPUAS_Sflag);
+ case MemorySpace::kBarnaCoreBmem:
+ return VectorIntTy()->getPointerTo(llvm::TPUAddressSpace::TPUAS_Bmem);
+ default:
+ return InternalError(
+ "Pointer type for memory space %d is not yet supported", space);
+ }
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::FillAsPredicated(
+ const std::optional<PredicateAndPolarity>& predicate,
+ const std::function<StatusOr<llvm::Value*>()>& builder,
+ Status (LlvmModuleBuilder::*terminator)(llvm::BasicBlock*)) {
+ if (!predicate.has_value()) {
+ return builder();
+ }
+
+ // Create new blocks.
+ llvm::BasicBlock* current_block = builder_.GetInsertBlock();
+ std::string original_name =
+ absl::StrCat("llo-region-", current_region_->ordinal());
+ TF_ASSIGN_OR_RETURN(llvm::BasicBlock* const predicated_block,
+ CreateBasicBlock(original_name + ".pred"));
+ TF_ASSIGN_OR_RETURN(llvm::BasicBlock* const join_block,
+ CreateBasicBlock(original_name + ".join"));
+ join_block->moveAfter(current_block);
+
+ // Finish the current block.
+ TF_ASSIGN_OR_RETURN(const auto predicate_value,
+ LlvmValue(predicate->predicate));
+ const bool is_negated = predicate->polarity == PredicationPolarity::kNegative;
+ builder_.CreateCondBr(predicate_value,
+ is_negated ? join_block : predicated_block,
+ is_negated ? predicated_block : join_block);
+
+ // Build and finalize predicated block.
+ builder_.SetInsertPoint(predicated_block);
+ TF_ASSIGN_OR_RETURN(const auto return_value, builder());
+ TF_RETURN_IF_ERROR((this->*terminator)(join_block));
+
+ // Switch to the join block.
+ builder_.SetInsertPoint(join_block);
+ return return_value;
+}
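+// Sketch of the control flow produced by FillAsPredicated for a
+// positive-polarity predicate (block names follow the "llo-region-N" naming
+// used above; with negative polarity the branch targets are swapped):
+//
+//   <current block>:     br %pred, label %llo-region-N.pred, label %llo-region-N.join
+//   llo-region-N.pred:   <predicated instruction(s)>
+//                        <terminator, typically a branch to %llo-region-N.join>
+//   llo-region-N.join:   <subsequent code>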
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::FillScalarHaltOnError(
+ const LloInstruction* instruction) {
+ const auto predicate = instruction->PredicateOrNull();
+ TF_ASSIGN_OR_RETURN(auto predicate_value, LlvmValue(predicate->predicate));
+ if (predicate->polarity == PredicationPolarity::kNegative) {
+ predicate_value = builder_.CreateNot(predicate_value);
+ }
+ return builder_.CreateCall(
+ llvm::Intrinsic::getDeclaration(module(), llvm::Intrinsic::tpu_halt_trap),
+ predicate_value);
+}
+
+StatusOr<int64_t> LlvmModuleBuilder::LlvmAddressSpace(
+ MemorySpace memory_space) const {
+ switch (memory_space) {
+ case MemorySpace::kHbm:
+ return llvm::TPUAddressSpace::TPUAS_Hbm;
+ case MemorySpace::kSmem:
+ return llvm::TPUAddressSpace::TPUAS_Smem;
+ case MemorySpace::kVmem:
+ return llvm::TPUAddressSpace::TPUAS_Vmem;
+ case MemorySpace::kSflag:
+ return llvm::TPUAddressSpace::TPUAS_Sflag;
+ case MemorySpace::kBarnaCoreBmem:
+ return llvm::TPUAddressSpace::TPUAS_Bmem;
+ default:
+ return InternalError("Unsupported memory space: %d", memory_space);
+ }
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::FillScalarStore(
+ const LloInstruction* instruction) {
+ TF_RET_CHECK(!instruction->PredicateOrNull().has_value());
+ TF_RET_CHECK(instruction->operands_size() == 2);
+ TF_ASSIGN_OR_RETURN(auto address, LlvmValue(instruction->operands(0)));
+ TF_ASSIGN_OR_RETURN(auto value, LlvmValue(instruction->operands(1)));
+ TF_ASSIGN_OR_RETURN(value, EnsureScalarInt(value));
+ TF_ASSIGN_OR_RETURN(address, EnsureSmemPointer(address));
+ return builder_.CreateStore(value, address);
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::CreateScalarAddressCalculation(
+ llvm::Value* address, llvm::Value* displacement, MemorySpace memory_space) {
+ TF_ASSIGN_OR_RETURN(int64_t address_space, LlvmAddressSpace(memory_space));
+ TF_ASSIGN_OR_RETURN(llvm::Type * pointer_type, PointerTy(memory_space));
+ TF_ASSIGN_OR_RETURN(address, EnsurePointer(address, pointer_type));
+ TF_ASSIGN_OR_RETURN(displacement, EnsureScalarInt(displacement));
+
+ // A displacement in LLO is measured in words, while a GEP index in LLVM is
+ // measured in pointed-to objects. We therefore cast the pointer to a type
+ // whose pointee size matches the word size of the target address space, so
+ // that the displacement and the GEP index are equal and no extra arithmetic
+ // is needed (such arithmetic would be hard to optimize out, since GEP
+ // lowering happens very late).
+ const int64_t word_size_bits =
+ CHAR_BIT * target().WordSizeBytes(memory_space).value();
+ llvm::Type* word_type = builder_.getIntNTy(word_size_bits);
+ address = builder_.CreatePointerCast(address,
+ word_type->getPointerTo(address_space));
+ address = builder_.CreateGEP(word_type, address, displacement);
+ return builder_.CreatePointerCast(address, pointer_type);
+}
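+// Rough sketch of the emitted sequence (typed-pointer notation; assumes the
+// word size of the target memory space is 4 bytes, so word_type is i32):
+//   %words  = pointer-cast of %address to i32 addrspace(AS)*
+//   %gep    = getelementptr i32, i32 addrspace(AS)* %words, i32 %displacement
+//   %result = pointer-cast of %gep back to PointerTy(memory_space)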
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::CreatePointerToInt(
+ llvm::Value* address) {
+ llvm::Function* callee = llvm::Intrinsic::getDeclaration(
+ module(), llvm::Intrinsic::tpu_ptrtoint, {address->getType()});
+ std::vector<llvm::Value*> args{address};
+ return CreateIntrinsicCallWithArgs(callee, {args.data(), args.size()});
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::CreateIntToPointer(
+ llvm::Value* address, llvm::Type* type) {
+ if (converter_->sequencer_ == TpuSequencerType::kBarnaCoreAddressHandler) {
+ // TODO(b/143907296): Remove once isel for constant is fixed.
+ return builder_.CreateIntToPtr(address, type);
+ }
+
+ llvm::Function* callee = llvm::Intrinsic::getDeclaration(
+ module(), llvm::Intrinsic::tpu_inttoptr, {type});
+ std::vector<llvm::Value*> args{address};
+ return CreateIntrinsicCallWithArgs(callee, {args.data(), args.size()});
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::FillVectorPermute(
+ const LloInstruction* instruction) {
+ TF_RET_CHECK(!instruction->PredicateOrNull().has_value());
+ TF_RET_CHECK(instruction->operands_size() == 2);
+ TF_ASSIGN_OR_RETURN(const auto prc, LlvmValue(instruction->operands(0)));
+ TF_ASSIGN_OR_RETURN(auto source, LlvmValue(instruction->operands(1)));
+ TF_ASSIGN_OR_RETURN(source, EnsureVectorInt(source));
+
+ TF_RET_CHECK(instruction->unit_id().has_value());
+ const int32_t unit_id = *instruction->unit_id();
+
+ llvm::Function* callee = llvm::Intrinsic::getDeclaration(
+ module(), llvm::Intrinsic::tpu_permute, {VectorIntTy()});
+ std::vector<llvm::Value*> args{source, prc, builder_.getInt32(unit_id)};
+ return CreateIntrinsicCallWithArgs(callee, {args.data(), args.size()});
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::FillVectorSetPermutePattern(
+ const LloInstruction* instruction) {
+ TF_RET_CHECK(!instruction->PredicateOrNull().has_value());
+ TF_RET_CHECK(instruction->operands_size() == 1);
+ TF_ASSIGN_OR_RETURN(auto pattern, LlvmValue(instruction->operands(0)));
+ TF_ASSIGN_OR_RETURN(pattern, EnsureVectorInt(pattern));
+
+ const uint32_t scalar_constant_value = instruction->scalar_constant_value();
+ const auto mode = static_cast<SetPermuteMode>(scalar_constant_value);
+ TF_RET_CHECK(mode == SetPermuteMode::kAllSublanes ||
+ mode == SetPermuteMode::kOneSublane)
+ << "Actual value: " << scalar_constant_value;
+
+ TF_RET_CHECK(instruction->unit_id().has_value());
+ const int32_t unit_id = *instruction->unit_id();
+
+ llvm::Function* callee = llvm::Intrinsic::getDeclaration(
+ module(), (mode == SetPermuteMode::kAllSublanes)
+ ? llvm::Intrinsic::tpu_set_permute
+ : llvm::Intrinsic::tpu_set_permute_sublane);
+ std::vector<llvm::Value*> args{pattern, builder_.getInt32(unit_id)};
+ return CreateIntrinsicCallWithArgs(callee, {args.data(), args.size()});
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::FillVectorStore(
+ const LloInstruction* instruction) {
+ TF_RET_CHECK(!instruction->PredicateOrNull().has_value());
+ TF_RET_CHECK(instruction->operands_size() == 4);
+
+ TF_ASSIGN_OR_RETURN(auto address, LlvmValue(instruction->operands(0)));
+ TF_ASSIGN_OR_RETURN(auto value, LlvmValue(instruction->operands(1)));
+ TF_ASSIGN_OR_RETURN(value, EnsureVectorInt(value));
+ TF_ASSIGN_OR_RETURN(const auto displacement,
+ LlvmValue(instruction->operands(2)));
+ TF_ASSIGN_OR_RETURN(address, CreateScalarAddressCalculation(
+ address, displacement, MemorySpace::kVmem));
+ TF_ASSIGN_OR_RETURN(const auto sublane_mask,
+ LlvmValue(instruction->operands(3)));
+
+ if (instruction->sublane_stride() == 1 &&
+ llvm::isa<llvm::ConstantInt>(sublane_mask) &&
+ llvm::cast<llvm::ConstantInt>(sublane_mask)
+ ->equalsInt(target().AllSublanesMask())) {
+ return builder_.CreateStore(value, address);
+ }
+
+ // We have a sublane mask or stride, so represent the store as an intrinsic.
+ std::vector<llvm::Value*> args{
+ value, address, sublane_mask,
+ builder_.getInt32(*instruction->sublane_stride()),
+ builder_.CreateVectorSplat(vector_size_, LlvmTrue())};
+ TF_ASSIGN_OR_RETURN(llvm::PointerType * pointer_type,
+ PointerTy(MemorySpace::kVmem));
+ llvm::Function* callee = llvm::Intrinsic::getDeclaration(
+ module(), llvm::Intrinsic::tpu_vst_strided,
+ {VectorIntTy(), pointer_type});
+ return CreateIntrinsicCallWithArgs(callee, {args.data(), args.size()});
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::FillVectorStoreIndexedMasked(
+ const LloInstruction* instruction) {
+ TF_RET_CHECK(!instruction->PredicateOrNull().has_value());
+ TF_RET_CHECK(instruction->operands_size() == 4);
+
+ TF_ASSIGN_OR_RETURN(auto address, LlvmValue(instruction->operands(0)));
+ TF_ASSIGN_OR_RETURN(const auto mask, LlvmValue(instruction->operands(1)));
+ TF_ASSIGN_OR_RETURN(auto value, LlvmValue(instruction->operands(2)));
+ TF_ASSIGN_OR_RETURN(value, EnsureVectorInt(value));
+ TF_ASSIGN_OR_RETURN(const auto displacement,
+ LlvmValue(instruction->operands(3)));
+ TF_ASSIGN_OR_RETURN(address, CreateScalarAddressCalculation(
+ address, displacement, MemorySpace::kVmem));
+
+ llvm::Value* const sublane_mask =
+ builder_.getInt32(target().AllSublanesMask());
+ llvm::Value* const sublane_stride =
+ builder_.getInt32(*instruction->sublane_stride());
+ TF_RET_CHECK(last_issued_store_iar_token_ != nullptr)
+ << "Were not able to track the last issued store iar token";
+ std::vector<llvm::Value*> args{value, address,
+ sublane_mask, sublane_stride,
+ mask, last_issued_store_iar_token_};
+ TF_ASSIGN_OR_RETURN(llvm::PointerType * pointer_type,
+ PointerTy(MemorySpace::kVmem));
+ llvm::Function* callee = llvm::Intrinsic::getDeclaration(
+ module(), llvm::Intrinsic::tpu_vst_evenodd_sublanes,
+ {VectorIntTy(), pointer_type});
+ return CreateIntrinsicCallWithArgs(callee, {args.data(), args.size()});
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::FillVectorStoreMasked(
+ const LloInstruction* instruction) {
+ TF_RET_CHECK(!instruction->PredicateOrNull().has_value());
+ TF_RET_CHECK(instruction->operands_size() == 5);
+
+ TF_ASSIGN_OR_RETURN(auto address, LlvmValue(instruction->operands(0)));
+ TF_ASSIGN_OR_RETURN(auto mask, LlvmValue(instruction->operands(1)));
+ TF_ASSIGN_OR_RETURN(auto value, LlvmValue(instruction->operands(2)));
+ TF_ASSIGN_OR_RETURN(value, EnsureVectorInt(value));
+ TF_ASSIGN_OR_RETURN(const auto displacement,
+ LlvmValue(instruction->operands(3)));
+ TF_ASSIGN_OR_RETURN(address, CreateScalarAddressCalculation(
+ address, displacement, MemorySpace::kVmem));
+ TF_ASSIGN_OR_RETURN(const auto sublane_mask,
+ LlvmValue(instruction->operands(4)));
+
+ std::vector<llvm::Value*> args{
+ value, address, sublane_mask,
+ builder_.getInt32(*instruction->sublane_stride()), mask};
+ TF_ASSIGN_OR_RETURN(llvm::PointerType * pointer_type,
+ PointerTy(MemorySpace::kVmem));
+ llvm::Function* callee = llvm::Intrinsic::getDeclaration(
+ module(), llvm::Intrinsic::tpu_vst_strided,
+ {VectorIntTy(), pointer_type});
+ return CreateIntrinsicCallWithArgs(callee, {args.data(), args.size()});
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::FillScalarLoad(
+ const LloInstruction* instruction) {
+ TF_RET_CHECK(!instruction->PredicateOrNull().has_value());
+ TF_RET_CHECK(instruction->operands_size() == 2);
+ TF_ASSIGN_OR_RETURN(auto address, LlvmValue(instruction->operands(0)));
+ TF_ASSIGN_OR_RETURN(const auto displacement,
+ LlvmValue(instruction->operands(1)));
+ TF_ASSIGN_OR_RETURN(address, CreateScalarAddressCalculation(
+ address, displacement, MemorySpace::kSmem));
+ return builder_.CreateLoad(ScalarIntTy(), address);
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::FillVectorLoad(
+ const LloInstruction* instruction) {
+ TF_RET_CHECK(!instruction->PredicateOrNull().has_value());
+ TF_RET_CHECK(instruction->operands_size() == 3);
+ TF_ASSIGN_OR_RETURN(auto address, LlvmValue(instruction->operands(0)));
+ TF_ASSIGN_OR_RETURN(const auto displacement,
+ LlvmValue(instruction->operands(1)));
+ TF_ASSIGN_OR_RETURN(address, CreateScalarAddressCalculation(
+ address, displacement, MemorySpace::kVmem));
+ TF_ASSIGN_OR_RETURN(const auto sublane_mask,
+ LlvmValue(instruction->operands(2)));
+
+ if (instruction->sublane_stride() == 1 &&
+ llvm::isa<llvm::ConstantInt>(sublane_mask) &&
+ llvm::cast<llvm::ConstantInt>(sublane_mask)
+ ->equalsInt(target().AllSublanesMask())) {
+ return builder_.CreateLoad(VectorIntTy(), address);
+ }
+
+ // We have a sublane mask or stride, so represent the load as an intrinsic.
+ std::vector<llvm::Value*> args{
+ address, sublane_mask, builder_.getInt32(*instruction->sublane_stride())};
+ TF_ASSIGN_OR_RETURN(llvm::PointerType * pointer_type,
+ PointerTy(MemorySpace::kVmem));
+ llvm::Function* callee = llvm::Intrinsic::getDeclaration(
+ module(), llvm::Intrinsic::tpu_vld_strided,
+ {VectorIntTy(), pointer_type});
+ return CreateIntrinsicCallWithArgs(callee, {args.data(), args.size()});
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::FillVectorLoadIndexed(
+ const LloInstruction* instruction) {
+ TF_RET_CHECK(!instruction->PredicateOrNull().has_value());
+ TF_RET_CHECK(instruction->operands_size() == 3);
+ TF_ASSIGN_OR_RETURN(auto address, LlvmValue(instruction->operands(0)));
+ TF_ASSIGN_OR_RETURN(const auto displacement,
+ LlvmValue(instruction->operands(1)));
+ TF_ASSIGN_OR_RETURN(address, CreateScalarAddressCalculation(
+ address, displacement, MemorySpace::kVmem));
+ TF_ASSIGN_OR_RETURN(const auto sublane_mask,
+ LlvmValue(instruction->operands(2)));
+
+ TF_RET_CHECK(last_issued_load_iar_token_ != nullptr)
+ << "Were not able to track the last issued load iar token";
+ std::vector<llvm::Value*> args{
+ address, sublane_mask, builder_.getInt32(*instruction->sublane_stride()),
+ last_issued_load_iar_token_};
+ TF_ASSIGN_OR_RETURN(llvm::PointerType * pointer_type,
+ PointerTy(MemorySpace::kVmem));
+ llvm::Function* callee = llvm::Intrinsic::getDeclaration(
+ module(), llvm::Intrinsic::tpu_vld_replicate_evenodd_sublanes,
+ {VectorIntTy(), pointer_type});
+ return CreateIntrinsicCallWithArgs(callee, {args.data(), args.size()});
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::FillVectorLoadSublaneShuffle(
+ const LloInstruction* instruction) {
+ TF_RET_CHECK(!instruction->PredicateOrNull().has_value());
+ TF_RET_CHECK(instruction->operands_size() == 4);
+ TF_RET_CHECK(instruction->sublane_stride() == 1);
+ TF_ASSIGN_OR_RETURN(auto address, LlvmValue(instruction->operands(0)));
+ TF_ASSIGN_OR_RETURN(auto pattern, LlvmValue(instruction->operands(1)));
+ TF_ASSIGN_OR_RETURN(const auto displacement,
+ LlvmValue(instruction->operands(2)));
+ TF_ASSIGN_OR_RETURN(address, CreateScalarAddressCalculation(
+ address, displacement, MemorySpace::kVmem));
+ TF_ASSIGN_OR_RETURN(const auto sublane_mask,
+ LlvmValue(instruction->operands(3)));
+
+ std::vector<llvm::Value*> args{address, sublane_mask, pattern};
+ TF_ASSIGN_OR_RETURN(llvm::PointerType * pointer_type,
+ PointerTy(MemorySpace::kVmem));
+ llvm::Function* callee = llvm::Intrinsic::getDeclaration(
+ module(), llvm::Intrinsic::tpu_vld_shuffle,
+ {VectorIntTy(), pointer_type});
+ return CreateIntrinsicCallWithArgs(callee, {args.data(), args.size()});
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::FillPhi(
+ const LloInstruction* instruction) {
+ TF_RET_CHECK(!instruction->PredicateOrNull().has_value());
+ TF_RET_CHECK(instruction->IsPhi());
+ TF_RET_CHECK(instruction->operands_size() == 2);
+ TF_ASSIGN_OR_RETURN(const auto type, LlvmType(instruction));
+
+ if (instruction->operands(1) == instruction ||
+ instruction->operands(0) == instruction->operands(1)) {
+ // These are special cases of PHIs that XLA sometimes creates which are
+ // no-ops and can be removed. We do this mostly because in such cases the
+ // code below would generate invalid LLVM IR, and removing them here avoids
+ // a more complex workaround.
+ TF_ASSIGN_OR_RETURN(auto init_value, LlvmValue(instruction->operands(0)));
+ return init_value;
+ }
+
+ // Set the insertion point to the first non-phi instruction of the block.
+ const auto current_block = builder_.GetInsertBlock();
+ TF_RET_CHECK(builder_.GetInsertPoint() == current_block->end());
+ const auto first_non_phi = current_block->getFirstNonPHI();
+ if (first_non_phi != nullptr) {
+ builder_.SetInsertPoint(first_non_phi);
+ }
+ const auto result = builder_.CreatePHI(type, 2);
+ phis_to_fix_up_.emplace_back(instruction, result);
+ // Set insertion point to the end of the current block.
+ builder_.SetInsertPoint(current_block);
+ return result;
+}
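+// Note: the PHI created in FillPhi has no incoming (value, block) pairs yet;
+// they are attached later, after all blocks have been emitted, when the
+// entries recorded in phis_to_fix_up_ are processed.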
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::FillInlinedCall(
+ const LloInstruction* instruction) {
+ TF_RET_CHECK(!instruction->PredicateOrNull().has_value());
+
+ std::vector<llvm::Value*> args;
+ std::vector<llvm::Type*> arg_types;
+
+ LloCompilationResult* compilation_result =
+ instruction->associated_compilation_result();
+ TF_RET_CHECK(compilation_result != nullptr);
+
+ // Add the rest of the operands.
+ for (int64_t i = 0; i < instruction->operands_size(); i++) {
+ TF_ASSIGN_OR_RETURN(auto arg, LlvmValue(instruction->operands(i)));
+ if (arg->getType()->isPointerTy()) {
+ // If the argument is a pointer, always pass it as i32. This is essential
+ // because we cannot infer the pointer type from an i32 when the value was
+ // just loaded from SMEM and passed as an argument.
+ TF_ASSIGN_OR_RETURN(arg, EnsureScalarInt(arg));
+ }
+ args.push_back(arg);
+ arg_types.push_back(arg->getType());
+ }
+
+ TF_ASSIGN_OR_RETURN(std::vector<llvm::Function*> hlo_functions,
+ GetOrCreateHloFunction(compilation_result, arg_types));
+ TF_RET_CHECK(!hlo_functions.empty());
+
+ llvm::Value* result = nullptr;
+ for (llvm::Function* function : hlo_functions) {
+ result = builder_.CreateCall(function, {args.data(), args.size()});
+ TF_RET_CHECK(result->getType()->isVoidTy());
+ }
+ return result;
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::FillInlinedCallOperand(
+ const LloInstruction* instruction) {
+ TF_RET_CHECK(!instruction->PredicateOrNull().has_value());
+ TF_RET_CHECK(instruction->operands_size() == 0);
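+  // The operand index is stored as a scalar constant; in TLP modules it is
+  // shifted past the reserved leading arguments (presumably the load/store
+  // iar tokens, see BuildMainModule).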
+ constexpr int64_t kReservedArguments = 2;
+ return LlvmArgument(instruction->scalar_constant_value() +
+ (is_tlp_module() ? kReservedArguments : 0));
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::FillCompare(
+ const LloInstruction* instruction) {
+ const auto build_float_comparison =
+ [this](ComparisonDirection direction, llvm::Value* lhs,
+ llvm::Value* rhs) -> StatusOr<llvm::Value*> {
+ TF_ASSIGN_OR_RETURN(lhs, EnsureScalarOrVectorFloat(lhs));
+ TF_ASSIGN_OR_RETURN(rhs, EnsureScalarOrVectorFloat(rhs));
+ switch (direction) {
+ case ComparisonDirection::kEq:
+ return builder_.CreateFCmpOEQ(lhs, rhs);
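+      // Note: kNe maps to an unordered comparison (UNE), which yields true
+      // when either operand is NaN; the other directions use ordered
+      // predicates, which yield false on NaN.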
+ case ComparisonDirection::kNe:
+ return builder_.CreateFCmpUNE(lhs, rhs);
+ case ComparisonDirection::kLe:
+ return builder_.CreateFCmpOLE(lhs, rhs);
+ case ComparisonDirection::kLt:
+ return builder_.CreateFCmpOLT(lhs, rhs);
+ case ComparisonDirection::kGe:
+ return builder_.CreateFCmpOGE(lhs, rhs);
+ case ComparisonDirection::kGt:
+ return builder_.CreateFCmpOGT(lhs, rhs);
+ }
+ };
+
+ const auto build_int_comparison =
+ [this](ComparisonDirection direction, llvm::Value* lhs,
+ llvm::Value* rhs) -> StatusOr<llvm::Value*> {
+ TF_ASSIGN_OR_RETURN(lhs, EnsureScalarOrVectorInt(lhs));
+ TF_ASSIGN_OR_RETURN(rhs, EnsureScalarOrVectorInt(rhs));
+ switch (direction) {
+ case ComparisonDirection::kEq:
+ return builder_.CreateICmpEQ(lhs, rhs);
+ case ComparisonDirection::kNe:
+ return builder_.CreateICmpNE(lhs, rhs);
+ case ComparisonDirection::kLe:
+ return builder_.CreateICmpSLE(lhs, rhs);
+ case ComparisonDirection::kLt:
+ return builder_.CreateICmpSLT(lhs, rhs);
+ case ComparisonDirection::kGe:
+ return builder_.CreateICmpSGE(lhs, rhs);
+ case ComparisonDirection::kGt:
+ return builder_.CreateICmpSGT(lhs, rhs);
+ }
+ };
+
+ TF_RET_CHECK(!instruction->PredicateOrNull().has_value());
+ TF_RET_CHECK(instruction->operands_size() == 2);
+ TF_ASSIGN_OR_RETURN(const auto lhs, LlvmValue(instruction->operands(0)));
+ TF_ASSIGN_OR_RETURN(const auto rhs, LlvmValue(instruction->operands(1)));
+
+ const auto comparison = instruction->comparison();
+ const bool is_float = comparison.GetType() == Comparison::Type::kFloat;
+ return is_float ? build_float_comparison(comparison.GetDirection(), lhs, rhs)
+ : build_int_comparison(comparison.GetDirection(), lhs, rhs);
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::FillSelect(
+ const LloInstruction* instruction) {
+ TF_RET_CHECK(!instruction->PredicateOrNull().has_value());
+ TF_RET_CHECK(instruction->operands_size() == 3);
+ TF_ASSIGN_OR_RETURN(const auto predicate,
+ LlvmValue(instruction->operands(0)));
+ TF_ASSIGN_OR_RETURN(auto if_true, LlvmValue(instruction->operands(1)));
+ TF_ASSIGN_OR_RETURN(auto if_false, LlvmValue(instruction->operands(2)));
+ TF_ASSIGN_OR_RETURN(if_true, EnsureScalarOrVectorInt(if_true));
+ TF_ASSIGN_OR_RETURN(if_false, EnsureScalarOrVectorInt(if_false));
+ return builder_.CreateSelect(predicate, if_true, if_false);
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::FillParameterAddress(
+ const LloInstruction* instruction) {
+ TF_RET_CHECK(instruction->operands_size() == 1);
+ TF_RET_CHECK(instruction->operands(0)->IsKnownConstant());
+ const int64_t param_number = instruction->operands(0)->GetConstantAsS32();
+ TF_RET_CHECK(param_number >= 0);
+ const int64_t offset = target().ParamPtrLocationWordOffset(param_number);
+ TF_ASSIGN_OR_RETURN(auto address,
+ EnsureSmemPointer(builder_.getInt32(offset)));
+ return builder_.CreateLoad(ScalarIntTy(), address);
+}
+
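+// Lowers a vmatpush: pushes a chunk of gains (with an optional mask) into the
+// gain staging FIFO (GSF) of the selected MXU, threading an explicit
+// dependency chain between consecutive pushes to preserve FIFO order.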
+StatusOr<llvm::Value*> LlvmModuleBuilder::FillVectorLatch(
+ const LloInstruction* instruction) {
+ TF_RET_CHECK(!instruction->PredicateOrNull().has_value());
+ const auto operands_size = instruction->operands_size();
+ TF_RET_CHECK(operands_size == 1 || operands_size == 2);
+
+ TF_ASSIGN_OR_RETURN(auto gain_chunk,
+ LlvmValue(instruction->operands(operands_size - 1)));
+ TF_ASSIGN_OR_RETURN(gain_chunk, EnsureVectorFloat(gain_chunk));
+ llvm::Value* mask;
+ if (operands_size == 2) {
+ TF_ASSIGN_OR_RETURN(mask, LlvmValue(instruction->operands(0)));
+ } else {
+ mask = builder_.CreateVectorSplat(vector_size_, LlvmTrue());
+ }
+
+ const auto mxu_id = instruction->unit_id();
+ const bool has_id = mxu_id.has_value();
+ TF_RET_CHECK(has_id);
+ const bool valid_id = *mxu_id >= 0 && *mxu_id < kMxuCount;
+ TF_RET_CHECK(valid_id);
+ MxuState& mxu = mxu_[*mxu_id];
+
+ // There must not be more than 16 items in GSF.
+ const bool valid_push_count = mxu.push_count < 16;
+ TF_RET_CHECK(valid_push_count);
+
+ if (mxu.push_chain == nullptr) {
+ const bool zero_push_count = mxu.push_count == 0;
+ TF_RET_CHECK(zero_push_count);
+ mxu.push_chain = llvm::UndefValue::get(builder_.getInt32Ty());
+ }
+
+ llvm::Intrinsic::ID intrinsic_id;
+ switch (instruction->opcode()) {
+ case LloOpcode::kVectorLatch:
+ case LloOpcode::kVectorLatchMsk: {
+ switch (instruction->latch_mode()) {
+ case GainLatchMode::kNoXposeF32:
+ intrinsic_id = llvm::Intrinsic::tpu_vmatpush_f32;
+ break;
+ case GainLatchMode::kXposeF32:
+ intrinsic_id = llvm::Intrinsic::tpu_vmatpush_xpose_f32;
+ break;
+ case GainLatchMode::kNoXposeHiF32:
+ intrinsic_id = llvm::Intrinsic::tpu_vmatpush_hi_f32;
+ break;
+ case GainLatchMode::kXposeHiF32:
+ intrinsic_id = llvm::Intrinsic::tpu_vmatpush_hi_xpose_f32;
+ break;
+ case GainLatchMode::kNoXposeLowF32:
+ intrinsic_id = llvm::Intrinsic::tpu_vmatpush_low_f32;
+ break;
+ case GainLatchMode::kXposeLowF32:
+ intrinsic_id = llvm::Intrinsic::tpu_vmatpush_low_xpose_f32;
+ break;
+ case GainLatchMode::kNoXposePackedBf16:
+ intrinsic_id = llvm::Intrinsic::tpu_vmatpush_packed_f32;
+ break;
+ case GainLatchMode::kXposePackedBf16:
+ intrinsic_id = llvm::Intrinsic::tpu_vmatpush_packed_xpose_f32;
+ break;
+ default:
+ return FailedPrecondition("unexpected GainLatchMode");
+ }
+ } break;
+ default:
+ return FailedPrecondition("unexpected vmatpush opcode");
+ }
+ TF_ASSIGN_OR_RETURN(
+ auto dependency,
+ GetFifoDependency(
+ instruction, [this](const LloValue& blocking_instruction) {
+ // A vmatpush instruction may depend on an earlier vmatpush.
+ return LloOpcodeIsVectorLatch(blocking_instruction.opcode())
+ ? LlvmValue(&blocking_instruction)
+ : nullptr;
+ }));
+ TF_RET_CHECK(mxu.push_chain == dependency);
+ llvm::Value* result = builder_.CreateCall(
+ llvm::Intrinsic::getDeclaration(module(), intrinsic_id),
+ {gain_chunk, mask, builder_.getInt32(*mxu_id), dependency});
+
+ // Mark new push.
+ mxu.push_count++;
+ mxu.push_chain = result;
+ return result;
+}
+
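+// Emits the vdwg ("done with gains") intrinsic for the given MXU, which moves
+// the gains previously pushed into the GSF into the GMR, and resets the push
+// bookkeeping.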
+Status LlvmModuleBuilder::LoadGainsToGmr(int64_t mxu_id) {
+ const bool valid_id = mxu_id >= 0 && mxu_id < kMxuCount;
+ TF_RET_CHECK(valid_id);
+ MxuState& mxu = mxu_[mxu_id];
+ const DoneWithGainsMode dwg_mode = mxu.next_dwg_mode;
+ TF_RET_CHECK(dwg_mode != DoneWithGainsMode::kNone);
+ mxu.push_count = 0;
+ if (mxu.push_chain == nullptr) {
+ mxu.push_chain = llvm::UndefValue::get(builder_.getInt32Ty());
+ }
+ llvm::Intrinsic::ID intrinsic_id;
+ switch (dwg_mode) {
+ case DoneWithGainsMode::kNone:
+      // Cannot actually be kNone due to the TF_RET_CHECK above; fall through.
+ case DoneWithGainsMode::kNormal:
+ intrinsic_id = llvm::Intrinsic::tpu_vdwg;
+ break;
+ case DoneWithGainsMode::kTransposed:
+ intrinsic_id = llvm::Intrinsic::tpu_vdwg_xpose;
+ break;
+ }
+ llvm::Value* vdwg = builder_.CreateCall(
+ llvm::Intrinsic::getDeclaration(module(), intrinsic_id),
+ {builder_.getInt32(mxu_id), mxu.push_chain});
+ mxu.mul_chain = vdwg;
+ mxu.next_dwg_mode = DoneWithGainsMode::kNormal;
+ mxu.push_chain = nullptr;
+ return OkStatus();
+}
+
+// Does bookkeeping but doesn't emit any LLVM instructions at this time. Returns
+// an undef value.
+StatusOr<llvm::Value*> LlvmModuleBuilder::FillVectorDoneWithGains(
+ const LloInstruction* instruction) {
+ TF_RET_CHECK(!instruction->PredicateOrNull().has_value());
+ TF_RET_CHECK(instruction->operands_size() == 0);
+
+ const auto mxu_id = instruction->unit_id();
+ const bool has_id = mxu_id.has_value();
+ TF_RET_CHECK(has_id);
+ const bool valid_id = *mxu_id >= 0 && *mxu_id < kMxuCount;
+ TF_RET_CHECK(valid_id);
+ MxuState& mxu = mxu_[*mxu_id];
+
+ const auto dwg_mode =
+ static_cast<DoneWithGainsMode>(instruction->done_with_gains_mode());
+ TF_RET_CHECK(dwg_mode != DoneWithGainsMode::kNone);
+ mxu.next_dwg_mode = dwg_mode;
+ mxu.gmrmt = true;
+ return llvm::UndefValue::get(builder_.getInt32Ty());
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::FillVectorMatmul(
+ const LloInstruction* instruction) {
+ TF_RET_CHECK(!instruction->PredicateOrNull().has_value());
+ const auto operands_size = instruction->operands_size();
+ TF_RET_CHECK(operands_size == 1 || operands_size == 2);
+
+ TF_ASSIGN_OR_RETURN(auto input,
+ LlvmValue(instruction->operands(operands_size - 1)));
+ TF_ASSIGN_OR_RETURN(input, EnsureVectorFloat(input));
+ llvm::Value* mask;
+ if (operands_size == 2) {
+ TF_ASSIGN_OR_RETURN(mask, LlvmValue(instruction->operands(0)));
+ } else {
+ mask = builder_.CreateVectorSplat(vector_size_, LlvmTrue());
+ }
+
+ const auto mxu_id = instruction->unit_id();
+ const bool has_id = mxu_id.has_value();
+ TF_RET_CHECK(has_id);
+ const bool valid_id = *mxu_id >= 0 && *mxu_id < kMxuCount;
+ TF_RET_CHECK(valid_id);
+ MxuState& mxu = mxu_[*mxu_id];
+
+ // Implicit GMR load.
+ const bool fifo_state_dwg = mxu.gmrmt;
+ const bool fifo_tracker_dwg =
+ fifo_tracker_.IsGmrmt(*mxu_id) && !fifo_tracker_post_.IsGmrmt(*mxu_id);
+ TF_RET_CHECK(fifo_state_dwg == fifo_tracker_dwg)
+ << "fifo_state_dwg=" << fifo_state_dwg
+ << " fifo_tracker_dwg=" << fifo_tracker_dwg
+ << " pre=" << fifo_tracker_.IsGmrmt(*mxu_id)
+ << " post=" << fifo_tracker_post_.IsGmrmt(*mxu_id)
+ << " inst=" << ToMnemonic(*instruction);
+ if (fifo_tracker_dwg) {
+ TF_RETURN_IF_ERROR(LoadGainsToGmr(*mxu_id));
+ }
+
+ llvm::Intrinsic::ID intrinsic_id;
+ switch (instruction->opcode()) {
+ case LloOpcode::kVectorMatmul:
+ case LloOpcode::kVectorMatmulMsk:
+ intrinsic_id = llvm::Intrinsic::tpu_vmatmul_f32;
+ break;
+ case LloOpcode::kVectorMatmulHigh:
+ case LloOpcode::kVectorMatmulHighMsk:
+ intrinsic_id = llvm::Intrinsic::tpu_vmatmul_hi_f32;
+ break;
+ case LloOpcode::kVectorMatmulLow:
+ case LloOpcode::kVectorMatmulLowMsk:
+ intrinsic_id = llvm::Intrinsic::tpu_vmatmul_low_f32;
+ break;
+ case LloOpcode::kVectorMatmulPacked:
+ case LloOpcode::kVectorMatmulPackedMsk:
+ intrinsic_id = llvm::Intrinsic::tpu_vmatmul_packed_f32;
+ break;
+ default:
+ return FailedPrecondition("unexpected vmatmul opcode");
+ }
+ llvm::Value* result = builder_.CreateCall(
+ llvm::Intrinsic::getDeclaration(module(), intrinsic_id),
+ {input, mask, builder_.getInt32(*mxu_id), mxu.mul_chain});
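+  // Record this matmul in the per-MXU result queue; the matching vmatres pops
+  // it and uses it as its FIFO dependency.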
+ mxu.muls.push(result);
+ mxu.gmrmt = false;
+ // Verify that vmatmul.dwg was decomposed into vmatmul plus vdwg.
+ TF_RET_CHECK(
+ static_cast<DoneWithGainsMode>(instruction->done_with_gains_mode()) ==
+ DoneWithGainsMode::kNone);
+ return result;
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::FillVectorMatres(
+ const LloInstruction* instruction) {
+ TF_RET_CHECK(!instruction->PredicateOrNull().has_value());
+ TF_RET_CHECK(instruction->operands_size() == 0);
+
+ const auto mxu_id = instruction->unit_id();
+ const bool has_id = mxu_id.has_value();
+ TF_RET_CHECK(has_id);
+ const bool valid_id = *mxu_id >= 0 && *mxu_id < kMxuCount;
+ TF_RET_CHECK(valid_id);
+ MxuState& mxu = mxu_[*mxu_id];
+
+ const bool has_muls = !mxu.muls.empty();
+ TF_RET_CHECK(has_muls);
+ TF_ASSIGN_OR_RETURN(
+ auto dependency,
+ GetFifoDependency(
+ instruction, [this](const LloValue& blocking_instruction) {
+ // A vmatres instruction may depend on an earlier vmatmul.
+ return LloOpcodeIsVectorMatmul(blocking_instruction.opcode())
+ ? LlvmValue(&blocking_instruction)
+ : nullptr;
+ }));
+ TF_RET_CHECK(dependency == mxu.muls.front());
+ llvm::Value* result =
+ builder_.CreateCall(llvm::Intrinsic::getDeclaration(
+ module(), llvm::Intrinsic::tpu_vmatres_f32),
+ {builder_.getInt32(*mxu_id), dependency});
+ mxu.muls.pop();
+ return result;
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::FillVectorTranspose(
+ const LloInstruction* instruction) {
+ llvm::Intrinsic::getDeclaration(module(), llvm::Intrinsic::tpu_tc_transpose,
+ {VectorIntTy()});
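+  // The declaration lookup above is not used directly; the intrinsic that is
+  // actually emitted is selected below based on whether this is the last
+  // chunk of the transpose sequence.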
+ TF_RET_CHECK(!instruction->PredicateOrNull().has_value());
+ TF_RET_CHECK(instruction->operands_size() == 2);
+ TF_ASSIGN_OR_RETURN(auto source, LlvmValue(instruction->operands(0)));
+ TF_ASSIGN_OR_RETURN(source, EnsureVectorInt(source));
+ TF_ASSIGN_OR_RETURN(auto width, LlvmValue(instruction->operands(1)));
+ const int32_t chunk_id = instruction->GetTransposeChunkId();
+ const int32_t number_of_chunks =
+ instruction->GetNumberOfChunksInTransposeSequence();
+ const int32_t source_bus = 0; // Default to source bus zero.
+ const int32_t height = 8 * number_of_chunks;
+ const bool is_end = chunk_id == number_of_chunks - 1;
+ llvm::Intrinsic::ID intrinsic_id;
+ // LLO doesn't currently model the segmented and/or packed versions.
+ if (is_end) {
+ intrinsic_id = llvm::Intrinsic::tpu_tc_transpose_end;
+ } else {
+ intrinsic_id = llvm::Intrinsic::tpu_tc_transpose;
+ }
+ const auto filter =
+ [this](const LloValue& blocking_instruction) -> StatusOr<llvm::Value*> {
+ // A vxpose instruction may depend on an earlier vxpose, but not vxpose.end.
+ if (blocking_instruction.opcode() == LloOpcode::kVectorTranspose) {
+ const auto instruction = LloInstruction::FromValue(&blocking_instruction);
+ const int32_t chunk_id = instruction->GetTransposeChunkId();
+ const int32_t number_of_chunks =
+ instruction->GetNumberOfChunksInTransposeSequence();
+ if (chunk_id != number_of_chunks - 1) {
+ return LlvmValue(&blocking_instruction);
+ }
+ }
+ return nullptr;
+ };
+ TF_ASSIGN_OR_RETURN(auto dependency, GetFifoDependency(instruction, filter));
+ llvm::Value* result = builder_.CreateCall(
+ llvm::Intrinsic::getDeclaration(module(), intrinsic_id, {VectorIntTy()}),
+ {source, width, builder_.getInt32(height), builder_.getInt32(source_bus),
+ dependency});
+ return result;
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::FillVectorTransposeResult(
+ const LloInstruction* instruction) {
+ const int32_t xlu_id = *instruction->unit_id();
+ TF_ASSIGN_OR_RETURN(
+ auto dependency,
+ GetFifoDependency(
+ instruction, [this](const LloValue& blocking_instruction) {
+ // A vxpose-result instruction may depend on an earlier vxpose.
+ return blocking_instruction.opcode() == LloOpcode::kVectorTranspose
+ ? LlvmValue(&blocking_instruction)
+ : nullptr;
+ }));
+ llvm::Value* result = builder_.CreateCall(
+ llvm::Intrinsic::getDeclaration(module(), llvm::Intrinsic::tpu_tc_vtrfpop,
+ {VectorIntTy()}),
+ {builder_.getInt32(xlu_id), dependency});
+ return result;
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::FillPfcBcVectorStore(
+ const LloInstruction* instruction) {
+ // See CodeGenerator::EmitInstruction(...) code for kBarnaCoreVectorStore, as
+ // well as implementation of
+ // PufferfishBarnaCoreChannelEmitter::EmitBarnaCoreVectorStore(...)
+
+ TF_RET_CHECK(instruction->operands_size() == 4);
+ // CodeGenerator::EmitInstruction(...) discards the first argument.
+ const LloValue* const llo_register_number = instruction->operands(1);
+ TF_RET_CHECK(llo_register_number->opcode() == LloOpcode::kScalarConstantU32);
+ TF_ASSIGN_OR_RETURN(const auto base_address_register,
+ LlvmArgument(llo_register_number->GetConstantAsS32()));
+ TF_ASSIGN_OR_RETURN(const auto feature_length_multiple,
+ LlvmValue(instruction->operands(2)));
+ TF_ASSIGN_OR_RETURN(const auto value, LlvmValue(instruction->operands(3)));
+
+ // Apparently EmitBarnaCoreVectorStore(...) always uses '1' as add_loop_index.
+ llvm::Value* const add_loop_index = builder_.getInt32(1);
+ std::vector<llvm::Value*> args{value, base_address_register, add_loop_index,
+ feature_length_multiple};
+ return CreateIntrinsicCallWithArgs(llvm::Intrinsic::tpu_bc_store_aliaddr_flm,
+ {args.data(), args.size()});
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::FillPfcBcVectorLoad(
+ const LloInstruction* instruction) {
+ // See CodeGenerator::EmitInstruction(...) code for kBarnaCoreVectorLoad, as
+ // well as implementation of
+ // PufferfishBarnaCoreChannelEmitter::EmitBarnaCoreVectorLoad(...)
+
+ TF_RET_CHECK(instruction->operands_size() == 3);
+ // CodeGenerator::EmitInstruction(...) discards the first argument.
+ const LloValue* const llo_register_number = instruction->operands(1);
+ TF_RET_CHECK(llo_register_number->opcode() == LloOpcode::kScalarConstantU32);
+ TF_ASSIGN_OR_RETURN(const auto base_address_register,
+ LlvmArgument(llo_register_number->GetConstantAsS32()));
+ TF_ASSIGN_OR_RETURN(const auto feature_length_multiple,
+ LlvmValue(instruction->operands(2)));
+
+ // Apparently EmitBarnaCoreVectorLoad(...) always uses '1' as add_loop_index.
+ llvm::Value* const add_loop_index = builder_.getInt32(1);
+ std::vector<llvm::Value*> args{base_address_register, add_loop_index,
+ feature_length_multiple};
+ return CreateIntrinsicCallWithArgs(llvm::Intrinsic::tpu_bc_load_aliaddr_flm,
+ {args.data(), args.size()});
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::FillPfcBcVectorLoadImmediateOffset(
+ const LloInstruction* instruction) {
+ // See CodeGenerator::EmitInstruction(...) code for
+ // kBarnaCoreVectorLoadImmediateOffset, as well as implementation of
+ // PufferfishBarnaCoreChannelEmitter::EmitBarnaCoreVectorLoadImmediateOffset(...).
+
+ TF_RET_CHECK(instruction->operands_size() == 1);
+ // CodeGenerator::EmitInstruction(...) discards the first argument.
+
+ TF_ASSIGN_OR_RETURN(llvm::PointerType * pointer_type,
+ PointerTy(MemorySpace::kBarnaCoreBmem));
+ TF_ASSIGN_OR_RETURN(
+ llvm::Value* const base_address_ptr,
+ CreateIntToPointer(
+ builder_.getInt32(instruction->scalar_constant_value()),
+ pointer_type));
+ return builder_.CreateLoad(VectorIntTy(), base_address_ptr);
+}
+
+Status LlvmModuleBuilder::FillInstruction(const LloInstruction* instruction) {
+ switch (instruction->opcode()) {
+ case LloOpcode::kHloEnd:
+ case LloOpcode::kHloStart:
+ case LloOpcode::kVectorNop:
+ return OkStatus();
+ case LloOpcode::kTuple:
+      // Tuple values are created by the compiler only to be passed to other
+      // *compiler* components as a collection of values. Unfortunately they
+      // are added to the module, and if DCE does not remove them they stay
+      // there. We can safely ignore tuple values during this transformation.
+ return OkStatus();
+ default:
+ break;
+ }
+
+ // Mark debug location for mapping into this instruction.
+ TF_ASSIGN_OR_RETURN(auto debug_location, CreateDebugLocation(instruction));
+ builder_.SetCurrentDebugLocation(debug_location);
+
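+  // Advance the post-instruction FIFO tracker before filling; fillers (e.g.
+  // FillVectorMatmul) compare it against fifo_tracker_, which is only advanced
+  // after filling, to detect implicit state transitions such as an implicit
+  // DWG.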
+ fifo_tracker_post_.TrackInstruction(const_cast<LloInstruction*>(instruction));
+ const auto map_value = [this](const LloValue* llo_value,
+ llvm::Value* llvm_value) -> Status {
+ TF_RET_CHECK(llvm_value != nullptr)
+ << "No LLVM result value for: " << ToMnemonic(*llo_value);
+ const auto it = value_mapping_.find(llo_value);
+ TF_RET_CHECK(it == value_mapping_.end());
+ value_mapping_[llo_value] = llvm_value;
+ return OkStatus();
+ };
+
+ const auto filler_it = fillers_.find(instruction->opcode());
+ TF_RET_CHECK(filler_it != fillers_.end())
+ << "LLO instruction is not supported " << ToMnemonic(*instruction);
+ TF_ASSIGN_OR_RETURN(auto result, (filler_it->second)(instruction));
+ TF_RET_CHECK(result != nullptr)
+ << "LLO instruction filler returned null " << ToMnemonic(*instruction);
+ TF_RETURN_IF_ERROR(map_value(instruction, result));
+ fifo_tracker_.TrackInstruction(const_cast<LloInstruction*>(instruction));
+ arch_register_tracker_.TrackInstruction(
+ const_cast<LloInstruction*>(instruction));
+
+ // Reset debug location.
+ builder_.SetCurrentDebugLocation(nullptr);
+
+ return OkStatus();
+}
+
+StatusOr<llvm::Type*> LlvmModuleBuilder::LlvmType(const LloValue* llo_value) {
+ switch (llo_value->opcode()) {
+ case LloOpcode::kIntToPtr:
+ if (llo_value->memory_space() == MemorySpace::kHib) {
+        // There is no such thing as a pointer to HIB memory, and we
+        // intentionally don't model it in LLVM. For the kIntToPtr LLO op in
+        // particular we just use i32* as a dummy replacement type.
+ return ScalarIntTy()->getPointerTo();
+ }
+ return PointerTy(llo_value->memory_space());
+
+ case LloOpcode::kAllocationAddress:
+ return PointerTy(llo_value->GetAllocation()->space());
+
+ case LloOpcode::kScalarMove:
+ return LlvmType(llo_value->operands(0));
+
+ case LloOpcode::kScalarPhi:
+      // Infer the type of a scalar phi from the type of its first operand.
+      // Note that we rely on the assumption that the first operand is the
+      // default/initial phi value, and thus don't check for recursion.
+ return LlvmType(llo_value->operands(0));
+
+ default:
+ if (llo_value->ProducesSreg()) {
+ return ScalarIntTy();
+ }
+ if (llo_value->ProducesPreg()) {
+ return PredicateTy();
+ }
+ if (llo_value->ProducesVreg()) {
+ return VectorIntTy();
+ }
+ if (llo_value->ProducesVmreg()) {
+ return VectorMaskTy();
+ }
+ if (llo_value->ProducesNothing()) {
+ return builder_.getVoidTy();
+ }
+ return InternalError("Return type is not yet supported: %s",
+ ToMnemonic(*llo_value));
+ }
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::LlvmValue(const LloValue* llo_value) {
+ TF_RET_CHECK(llo_value != nullptr);
+ if (llo_value->IsConstant()) {
+ return LlvmConstant(llo_value);
+ }
+ const auto it = value_mapping_.find(llo_value);
+ TF_RET_CHECK(it != value_mapping_.end())
+ << "Missing LLVM value for: " << ToMnemonic(*llo_value);
+ return it->second;
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::EnsurePointer(
+ llvm::Value* value, llvm::Type* expected_type) {
+ const auto value_type = value->getType();
+ if (value_type->isPointerTy()) {
+ TF_RET_CHECK(value_type == expected_type)
+ << ValueAsString(value) << " expected: " << TypeAsString(expected_type)
+ << " actual: " << TypeAsString(value_type);
+ return value;
+ }
+ TF_RET_CHECK(value_type->isIntegerTy(32)) << ValueAsString(value);
+ return CreateIntToPointer(value, expected_type);
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::EnsureScalarFloat(
+ llvm::Value* value) {
+ const auto value_type = value->getType();
+ TF_RET_CHECK(!value_type->isVectorTy()) << ValueAsString(value);
+ if (value_type->isFloatTy()) {
+ return value;
+ }
+ TF_RET_CHECK(value_type->isIntegerTy(32)) << ValueAsString(value);
+ return builder_.CreateBitCast(value, ScalarFloatTy());
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::EnsureScalarOrVectorFloat(
+ llvm::Value* value) {
+ const auto value_type = value->getType();
+ if (value_type->getScalarType()->isFloatTy()) {
+ return value;
+ }
+ TF_RET_CHECK(value_type->getScalarType()->isIntegerTy(32))
+ << ValueAsString(value);
+ return builder_.CreateBitCast(
+ value, value_type->isVectorTy() ? VectorFloatTy() : ScalarFloatTy());
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::EnsureVectorFloat(
+ llvm::Value* value) {
+ const auto value_type = value->getType();
+ TF_RET_CHECK(value_type->isVectorTy()) << ValueAsString(value);
+ if (value_type->getScalarType()->isFloatTy()) {
+ return value;
+ }
+ TF_RET_CHECK(value_type->getScalarType()->isIntegerTy(32))
+ << ValueAsString(value);
+ return builder_.CreateBitCast(value, VectorFloatTy());
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::EnsureScalarInt(llvm::Value* value) {
+ auto value_type = value->getType();
+ TF_RET_CHECK(!value_type->isVectorTy()) << ValueAsString(value);
+ if (value_type->isPointerTy()) {
+ TF_ASSIGN_OR_RETURN(value, CreatePointerToInt(value));
+ value_type = value->getType();
+ }
+ if (value_type->isIntegerTy(32)) {
+ return value;
+ }
+ TF_RET_CHECK(value_type->isFloatTy()) << ValueAsString(value);
+ return builder_.CreateBitCast(value, ScalarIntTy());
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::EnsureScalarOrVectorInt(
+ llvm::Value* value) {
+ auto value_type = value->getType();
+ if (value_type->isPointerTy()) {
+ TF_ASSIGN_OR_RETURN(value, CreatePointerToInt(value));
+ value_type = value->getType();
+ }
+ if (value_type->getScalarType()->isIntegerTy(32)) {
+ return value;
+ }
+ TF_RET_CHECK(value_type->getScalarType()->isFloatTy())
+ << TypeAsString(value_type);
+ return builder_.CreateBitCast(
+ value, value_type->isVectorTy() ? VectorIntTy() : ScalarIntTy());
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::EnsureVectorInt(llvm::Value* value) {
+ const auto value_type = value->getType();
+ TF_RET_CHECK(value_type->isVectorTy()) << ValueAsString(value);
+ TF_RET_CHECK(!value_type->isPointerTy()) << ValueAsString(value);
+ if (value_type->getScalarType()->isIntegerTy(32)) {
+ return value;
+ }
+ TF_RET_CHECK(value_type->getScalarType()->isFloatTy())
+ << ValueAsString(value);
+ return builder_.CreateBitCast(value, VectorIntTy());
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::LlvmAddress(const LloValue* address) {
+ TF_RET_CHECK(address->opcode() == LloOpcode::kAllocationAddress);
+ const int64_t word_offset_within_allocation =
+ LloConstant::FromValue(address)->word_offset_within_allocation();
+
+ const LloAllocation* allocation = address->GetAllocation();
+ if (allocation->space() == MemorySpace::kVmem ||
+ allocation->space() == MemorySpace::kSmem) {
+ auto it = allocation_mapping_.find(allocation);
+ TF_RET_CHECK(it != allocation_mapping_.end())
+ << "Missing LLVM value for: " << ToMnemonic(*address);
+ return CreateScalarAddressCalculation(
+ it->second, builder_.getInt32(word_offset_within_allocation),
+ allocation->space());
+ }
+
+ TF_RET_CHECK(allocation->has_offset());
+ const int64_t byte_offset =
+ GetAdjustedAllocationOffset(compilation_result(), allocation);
+ const int64_t word_offset = address_util::ConvertOffsetByteToWord(
+ allocation->space(), byte_offset, target());
+ TF_ASSIGN_OR_RETURN(llvm::PointerType * pointer_type,
+ PointerTy(allocation->space()));
+ return CreateIntToPointer(
+ builder_.getInt32(word_offset + word_offset_within_allocation),
+ pointer_type);
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::LlvmConstant(
+ const LloValue* llo_value) {
+ TF_RET_CHECK(llo_value->IsConstant());
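+  // Float constants are materialized as i32 values holding the f32 bit
+  // pattern; consumers bitcast them back to float as needed (see
+  // EnsureScalarFloat).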
+ const auto create_float_const = [&] {
+ float f32const = llo_value->GetConstantAsF32();
+ return builder_.getInt32(*reinterpret_cast<int32_t*>(&f32const));
+ };
+ switch (llo_value->opcode()) {
+ case LloOpcode::kAllocationAddress:
+ return LlvmAddress(llo_value);
+ case LloOpcode::kScalarConstantU32:
+ return builder_.getInt32(llo_value->GetConstantAsS32());
+ case LloOpcode::kVectorConstantU32:
+ return builder_.CreateVectorSplat(
+ vector_size_, builder_.getInt32(llo_value->GetConstantAsS32()));
+ case LloOpcode::kPredicateConstant:
+ return llo_value->GetConstantAsS32() == 0 ? LlvmFalse() : LlvmTrue();
+ case LloOpcode::kScalarConstantF32:
+ return create_float_const();
+ case LloOpcode::kVectorConstantF32:
+ return builder_.CreateVectorSplat(vector_size_, create_float_const());
+ case LloOpcode::kVectorMaskConstant:
+ return builder_.CreateVectorSplat(
+ vector_size_,
+ llo_value->GetConstantAsS32() == 0 ? LlvmFalse() : LlvmTrue());
+ default:
+ return InternalError("Constant is not supported yet: %s",
+ ToMnemonic(*llo_value));
+ }
+}
+
+StatusOr<llvm::Value*> LlvmModuleBuilder::LlvmArgument(int argno) {
+ TF_RET_CHECK(argno >= 0);
+ TF_RET_CHECK(argno < llvm_function()->getFunctionType()->getNumParams())
+ << "Asked for argument " << argno << " from function "
+ << TypeAsString(llvm_function()->getFunctionType());
+ return llvm_function()->getArg(argno);
+}
+
+Status LlvmModuleBuilder::FillBasicBlocks(LloModule* llo_module) {
+ const auto switch_block = [&](llvm::BasicBlock* next_block) -> Status {
+ llvm::BasicBlock* previous_block = builder_.GetInsertBlock();
+ TF_RET_CHECK(previous_block != nullptr);
+ VLOG(1) << "previous_block=" << previous_block->getName().str()
+ << " next_block=" << next_block->getName().str()
+ << " next_block.prev_node="
+ << next_block->getPrevNode()->getName().str();
+ TF_RET_CHECK(previous_block == next_block->getPrevNode());
+
+ if (next_branch_ == nullptr) {
+ builder_.CreateBr(next_block); // Just a regular fallthrough.
+ } else {
+ const LloInstruction* branch = next_branch_;
+ LloRegion* target_region = branch->branch_target_region();
+ TF_RET_CHECK(target_region != nullptr);
+ TF_ASSIGN_OR_RETURN(llvm::BasicBlock * target_block,
+ GetFirstRegionBlock(target_region));
+
+ const auto predication = branch->PredicateOrNull();
+ if (!predication.has_value()) {
+ value_mapping_[branch] =
+ builder_.CreateBr(target_block); // Unconditional branch.
+ } else {
+ const bool direct =
+ predication->polarity == PredicationPolarity::kPositive;
+ TF_ASSIGN_OR_RETURN(llvm::Value * predicate_value,
+ LlvmValue(predication->predicate));
+ value_mapping_[branch] = builder_.CreateCondBr(
+ predicate_value, direct ? target_block : next_block,
+ direct ? next_block : target_block);
+ }
+ }
+
+ builder_.SetInsertPoint(next_block);
+ next_branch_ = nullptr;
+ return OkStatus();
+ };
+
+ const auto pre_run_on_basic_block =
+ [&](const LloRegion* region, const LloRegionMember* member) -> Status {
+ if (region_filter_.contains(region)) {
+ return OkStatus();
+ }
+ TF_RETURN_IF_ERROR(fifo_tracker_.ValidateFifoStateEmpty(region));
+ if (member != nullptr) {
+ TF_ASSIGN_OR_RETURN(llvm::BasicBlock * next_block,
+ GetFirstRegionBlock(region));
+ TF_RET_CHECK(next_block->empty());
+ TF_RETURN_IF_ERROR(switch_block(next_block));
+ }
+ return OkStatus();
+ };
+
+ const auto post_run_on_basic_block =
+ [&](const LloRegion* region, const LloRegionMember* member) -> Status {
+ if (region_filter_.contains(region)) {
+ return OkStatus();
+ }
+ if (member != nullptr) {
+ TF_RETURN_IF_ERROR(fifo_tracker_.ValidateFifoStateEmpty(region));
+ last_region_to_block_[region] = builder_.GetInsertBlock();
+ }
+ return OkStatus();
+ };
+
+ const auto pre_run_on_region = [&](const LloRegion* region) -> Status {
+ if (region_filter_.contains(region)) {
+ return OkStatus();
+ }
+ current_region_ = region;
+
+ const auto* const allocation_maps = converter_->GetProgramAllocationMaps();
+ const auto it = allocation_maps->find(compilation_result());
+ TF_RET_CHECK(it != allocation_maps->end());
+ const ModuleAllocationMaps* allocation_map = it->second.get();
+
+ for (const auto& allocation : region->allocations()) {
+ if (allocation->space() != MemorySpace::kVmem &&
+ allocation->space() != MemorySpace::kSmem) {
+ continue;
+ }
+ const llvm::Intrinsic::TPUIntrinsics intrinsic_id =
+ allocation->space() == MemorySpace::kVmem
+ ? llvm::Intrinsic::tpu_allocate_vmem
+ : llvm::Intrinsic::tpu_allocate_smem;
+ TF_ASSIGN_OR_RETURN(
+ const MemRegion mem_region,
+ allocation_map->GetAllocationRegionInWords(allocation.get()));
+ std::vector<llvm::Value*> args{
+ builder_.getInt32(mem_region.second - mem_region.first),
+ builder_.getInt32(mem_region.first)};
+ TF_ASSIGN_OR_RETURN(llvm::Value * alloc,
+ CreateIntrinsicCallWithArgs(
+ intrinsic_id, {args.data(), args.size()}));
+ allocation_mapping_.emplace(allocation.get(), alloc);
+ }
+ return OkStatus();
+ };
+
+ const auto post_run_on_region = [&](const LloRegion* region) -> Status {
+ if (region_filter_.contains(region)) {
+ return OkStatus();
+ }
+ TF_RET_CHECK(next_branch_ == nullptr);
+ next_branch_ = region->has_branch() ? region->branch() : nullptr;
+ if (region == llo_module->top_region() &&
+ llo_module->comp_env().xla_llvm_generate_xla_compatible_dwg()) {
+      // XLA assumes that each HLO leaves the MXU in DWG state, essentially
+      // meaning that at the first matmul after the HLO a new set of gains
+      // will be loaded from the GSF into the GMR. In the LLVM world we emit
+      // DWG each time we know we have just pushed all gains into the GSF and
+      // before the first matmul, so we may leave the MXU in a state without
+      // the DWG flag set, knowing it will be set when needed. This does not
+      // work in mixed mode, where an XLA-generated HLO follows an
+      // LLVM-generated HLO; for those scenarios we always add an extra DWG to
+      // every HLO that uses the MXU, which is a no-op when one LLVM-generated
+      // HLO is followed by another LLVM-generated HLO.
+ for (int64_t mxu_id = 0; mxu_id < kMxuCount; mxu_id++) {
+ if (mxu_[mxu_id].mul_chain != nullptr) {
+ TF_RETURN_IF_ERROR(LoadGainsToGmr(mxu_id));
+ }
+ }
+ }
+ return OkStatus();
+ };
+
+ const auto run_on_instruction = [&](const LloRegionMember* member) -> Status {
+ if (region_filter_.contains(member->instruction()->parent())) {
+ return OkStatus();
+ }
+ return FillInstruction(member->instruction());
+ };
+
+ TF_RETURN_IF_ERROR(ConstLloRegionVisitor()
+ .WithPreRunOnBasicBlock(pre_run_on_basic_block)
+ .WithPreRunOnRegion(pre_run_on_region)
+ .WithPostRunOnRegion(post_run_on_region)
+ .WithPostRunOnBasicBlock(post_run_on_basic_block)
+ .WithRunOnInstruction(run_on_instruction)
+ .VisitRegion(llo_module->top_region()));
+
+  // If the module represents an HLO we need to add a return-void instruction
+  // to the last block created; otherwise the module would just fall through.
+ TF_RET_CHECK(builder_.GetInsertBlock() != nullptr);
+ if (builder_.GetInsertBlock()->getTerminator() == nullptr) {
+    // Each barna core channel program should end with a loop intrinsic
+    // branching back to the start of the loop. Currently we only support a
+    // single loop starting at the beginning of the program.
+ if (cpu_ == kCpuPfcBarnaCoreChannelController) {
+ llvm::BasicBlock* const current_block = builder_.GetInsertBlock();
+ llvm::BasicBlock* const next_block =
+ llvm::BasicBlock::Create(context(), "loop-out", llvm_function());
+
+ // Get first loop block.
+ TF_RET_CHECK(llvm::succ_size(&llvm_function()->getEntryBlock()) == 1)
+ << llvm::succ_size(&llvm_function()->getEntryBlock());
+ llvm::BasicBlock* loop_start_block =
+ *succ_begin(&llvm_function()->getEntryBlock());
+
+ // The loop-end intrinsic returns a pseudo-value which is true if the loop
+ // should repeat.
+ TF_ASSIGN_OR_RETURN(
+ auto loop_back,
+ CreateIntrinsicCallWithArgs(llvm::Intrinsic::tpu_bc_loop_end, {}));
+ builder_.CreateCondBr(loop_back, loop_start_block, next_block);
+
+ // Move the insertion point to the newly created block.
+ next_block->moveAfter(current_block);
+ builder_.SetInsertPoint(next_block);
+ }
+
+ builder_.CreateRetVoid();
+ }
+
+ LloPhiClassifier llo_phi_classifier;
+ llo_phi_classifier.Init(llo_module->top_region());
+ // Fixup phis after all values are converted.
+ for (auto [instruction, llvm_phi] : phis_to_fix_up_) {
+ const auto phi_type = llvm_phi->getType();
+ auto llo_source_regions = llo_phi_classifier.PhiSources(*instruction);
+ TF_RET_CHECK(llvm_phi->getNumOperands() == 0);
+ TF_RET_CHECK(instruction->operands_size() == llo_source_regions.size());
+ for (int64_t i = 0; i < llo_source_regions.size(); ++i) {
+ TF_RET_CHECK(last_region_to_block_.contains(llo_source_regions[i]));
+ llvm::BasicBlock* llvm_basic_block =
+ last_region_to_block_[llo_source_regions[i]];
+ builder_.SetInsertPoint(llvm_basic_block,
+ llvm_basic_block->getTerminator()->getIterator());
+ TF_ASSIGN_OR_RETURN(llvm::Value * llvm_value,
+ LlvmValue(instruction->operands(i)));
+ if (phi_type->isPointerTy()) {
+ TF_ASSIGN_OR_RETURN(llvm_value, EnsurePointer(llvm_value, phi_type));
+ }
+ llvm_phi->addIncoming(llvm_value, llvm_basic_block);
+ }
+ }
+ phis_to_fix_up_.clear();
+
+ return OkStatus();
+}
+
+Status LlvmModuleBuilder::BuildBasicBlocks(LloModule* llo_module) {
+ llvm::BasicBlock* entry_block =
+ llvm::BasicBlock::Create(context(), "entry", llvm_function());
+ builder_.SetInsertPoint(entry_block);
+
+ // Each barna core channel program should start with a loop intrinsic.
+ if (cpu_ == kCpuPfcBarnaCoreChannelController) {
+ llvm::BasicBlock* const next_block =
+ llvm::BasicBlock::Create(context(), "loop-start", llvm_function());
+
+ llvm::Value* args[] = {builder_.getInt32(0)};
+ TF_ASSIGN_OR_RETURN(
+ auto intrinsic_ret_value,
+ CreateIntrinsicCallWithArgs(llvm::Intrinsic::tpu_bc_loop_start, args));
+ (void)intrinsic_ret_value; // Unused.
+ builder_.CreateBr(next_block);
+
+ // Move the insertion point to the newly created block.
+ next_block->moveAfter(entry_block);
+ builder_.SetInsertPoint(next_block);
+ }
+ TF_RETURN_IF_ERROR(
+ ConstLloRegionVisitor()
+ .WithPreRunOnBasicBlock([&](const LloRegion* region,
+ const LloRegionMember* member) -> Status {
+ if (region_filter_.contains(region)) {
+ return OkStatus();
+ }
+ if (member == nullptr) {
+ first_region_to_block_[region] = nullptr;
+ } else {
+ TF_ASSIGN_OR_RETURN(llvm::BasicBlock * block,
+ CreateBasicBlock(absl::StrCat(
+ "llo-region-", region->ordinal())));
+ first_region_to_block_[region] = block;
+ VLOG(1) << "first_region_to_block_[$region" << region->ordinal()
+ << "] = "
+ << first_region_to_block_[region]->getName().str();
+ }
+ return OkStatus();
+ })
+ .VisitRegion(llo_module->top_region()));
+ return OkStatus();
+}
+
+Status LlvmModuleBuilder::ApplyBuildConfig() {
+ const HloBuildConfig* build_config = converter_->hlo_build_config_;
+ if (build_config == nullptr) {
+ return OkStatus();
+ }
+ TF_RET_CHECK(!this->is_tlp_module()) << "HLO build config on TLP!";
+ if (const auto* reshape_config = build_config->as<ReshapeBuildConfig>()) {
+ // For reshapes we filter out all regions specified in config.
+ TF_RET_CHECK(converter_->hlo_function_prototype_ != nullptr);
+ TF_RET_CHECK(
+ converter_->hlo_function_prototype_->getName().contains("reshape"));
+ for (const LloRegion* hidden_region : reshape_config->hidden_regions()) {
+ // Mark all regions under this one to be ignored.
+ TF_RETURN_IF_ERROR(
+ ConstLloRegionVisitor()
+ .WithPreRunOnRegion([&](const LloRegion* region) -> Status {
+ region_filter_.insert(region);
+ return OkStatus();
+ })
+ .VisitRegion(hidden_region));
+ }
+ }
+
+ return OkStatus();
+}
+
+StatusOr<std::vector<HloReference>> LlvmModuleBuilder::BuildMainModule() {
+ const LloCompilationResult* const main_compilation_result =
+ converter_->compilation_result_;
+
+ TF_RETURN_IF_ERROR(ApplyBuildConfig());
+
+ if (cpu_ != kCpuPfcBarnaCoreChannelController) {
+ TF_RETURN_IF_ERROR(ComputeSpillRegions(main_compilation_result));
+ }
+
+ // Build main function.
+ TF_RET_CHECK(llvm_function_ == nullptr);
+ TF_ASSIGN_OR_RETURN(llvm_function_, CreateMainFunction());
+
+ TF_RETURN_IF_ERROR(LloVerifier::Verify(*llo_module()->top_region()));
+ if (dumper() != nullptr) {
+ dumper()->DumpIfEnabled(
+ *llo_module()->top_region(),
+ NormalizeAsFileName(absl::StrCat("pre-llvm-lowering-",
+ llvm_function_->getName().str(), "-",
+ converter_->retry_)),
+ LloDumper::Category::kLlvm);
+ }
+
+  // For each HLO module, initialize the load/store iar tokens with the values
+  // passed in the first two parameters.
+ if (!is_tlp_module()) {
+ // When building an external HLO as a main module, we assume the undef
+ // default value at entry for both load and store iar tokens.
+ last_issued_load_iar_token_ = llvm::UndefValue::get(builder_.getInt32Ty());
+ last_issued_store_iar_token_ = llvm::UndefValue::get(builder_.getInt32Ty());
+ }
+ TF_RETURN_IF_ERROR(BuildBasicBlocks(llo_module()));
+ TF_RETURN_IF_ERROR(FillBasicBlocks(llo_module()));
+
+ // Need to finalize debug info for the function.
+ llvm::DISubprogram* function_scope = llvm_function_->getSubprogram();
+ TF_RET_CHECK(function_scope != nullptr);
+ di_builder_.finalizeSubprogram(function_scope);
+ // Build functions for HLOs.
+ std::vector<HloReference> hlo_references;
+ for (const auto& hlo : hlo_functions_) {
+ TF_RET_CHECK(hlo.function_or_null() != nullptr);
+ hlo_references.push_back(hlo);
+ }
+
+ di_builder_.finalize();
+
+ return hlo_references;
+}
+
+Status LlvmModuleBuilder::ComputeSpillRegions(
+ const LloCompilationResult* main_compilation_result) {
+ // Grab references to SMEM/VMEM spill region bounds.
+ auto& spill_regions = converter_->spill_regions_;
+ auto smem_spill_region_it = spill_regions.find(MemorySpace::kSmem);
+ TF_RET_CHECK(smem_spill_region_it != spill_regions.end());
+ auto vmem_spill_region_it = spill_regions.find(MemorySpace::kVmem);
+ TF_RET_CHECK(vmem_spill_region_it != spill_regions.end());
+
+ // Assumed memory start for SMEM and VMEM.
+ const int64_t smem_memory_start = target().SmemUserSpaceWordOffset();
+ const int64_t vmem_memory_start = 0;
+
+ const ProgramAllocationMaps& program_allocation_maps =
+ *converter_->GetProgramAllocationMaps();
+
+ if (is_tlp_module()) {
+ // In TLP modules for both VMEM and SMEM we expect 'spill_regions' to
+ // represent the whole range of the memory available for the program.
+ TF_RET_CHECK(smem_memory_start == smem_spill_region_it->second.first)
+ << "Invalid SMEM spill region start for TLP, expected: "
+ << smem_memory_start
+ << ", actual: " << smem_spill_region_it->second.first;
+ TF_RET_CHECK(vmem_memory_start == vmem_spill_region_it->second.first)
+ << "Invalid VMEM spill region start for TLP, expected: "
+ << vmem_memory_start
+ << ", actual: " << vmem_spill_region_it->second.first;
+
+    // For the TLP module we start by computing the biggest region that works
+    // for the TLP *and* all HLOs. Later we extend it the same way we extend
+    // the regions for HLOs, but doing this first should help us avoid wrongly
+    // choosing the biggest gap in TLP memory when it does not match the HLOs'
+    // memory gaps.
+ for (const auto& [compilation_result, maps] : program_allocation_maps) {
+ // Pack VMEM before computing spill regions for TLP.
+ if (cpu_ != kCpuPfcBarnaCoreChannelController) {
+ MemRegion vmem_bounds{
+ 0, address_util::ConvertOffsetByteToWord(
+ MemorySpace::kVmem, target().VmemSizeBytes(), target())};
+ TF_RETURN_IF_ERROR(maps->PackVmemAllocations(vmem_bounds));
+ }
+
+      // Note that passing *mem_spill_region as the bounds argument to the
+      // computation effectively works as an intersection, and also speeds up
+      // the computation by letting it ignore no-longer-relevant allocations.
+ TF_ASSIGN_OR_RETURN(
+ smem_spill_region_it->second,
+ maps->ComputeBestSpillRange(MemorySpace::kSmem,
+ smem_spill_region_it->second));
+ TF_ASSIGN_OR_RETURN(
+ vmem_spill_region_it->second,
+ maps->ComputeBestSpillRange(MemorySpace::kVmem,
+ vmem_spill_region_it->second));
+ }
+ } else {
+ // In HLO modules for both VMEM and SMEM we expect 'spill_regions' to
+ // represent the spill region used in TLP minus the part of it which is
+ // actually used for TLP spills across HLO call sites. So we don't need to
+ // do anything here.
+ }
+
+ {
+    // For HLO spills (both SMEM & VMEM) we use the following approach: knowing
+    // that HLO spills may only conflict with allocations within this HLO or
+    // the TLP, but not other HLOs, we keep the spill location limit the same
+    // but extend the start of the spill region to the max of the latest
+    // allocation in the TLP and in the current HLO.
+
+    // NOTE: we do the same region extension for the TLP. After the expansion
+    // we may spill into locations that are used for HLO allocations, but those
+    // spills only conflict with HLO allocations that are preserved across call
+    // sites, and even then we can still try re-lowering the HLO to minimize
+    // its memory use.
+
+ const auto adjust_spill_region = [&](MemorySpace space,
+ MemRegion* spill_region,
+ int64_t new_first) -> Status {
+ MemRegion candidate_spill_region{new_first, spill_region->second};
+
+ const auto adjust_spill_interval =
+ [&](const LloCompilationResult* compilation_result) -> Status {
+ TF_ASSIGN_OR_RETURN(
+ const auto& hlo_maps,
+ MaybeFind(program_allocation_maps, compilation_result));
+ TF_ASSIGN_OR_RETURN(candidate_spill_region,
+ hlo_maps.get()->ComputeBestSpillRange(
+ space, candidate_spill_region));
+ if (candidate_spill_region.second <= candidate_spill_region.first) {
+          // For an empty region we use a region whose start and limit both
+          // equal the limit of the original spill_region.
+ candidate_spill_region = {spill_region->second, spill_region->second};
+ }
+ return OkStatus();
+ };
+
+ if (is_tlp_module()) {
+ // Adjust lower bound to not interfere with TLP allocations.
+ TF_RETURN_IF_ERROR(adjust_spill_interval(main_compilation_result));
+ } else {
+ TF_RET_CHECK(converter_->tlp_compilation_result_ != nullptr);
+ // Adjust lower bound to not interfere with TLP allocations.
+ TF_RETURN_IF_ERROR(
+ adjust_spill_interval(converter_->tlp_compilation_result_));
+ // Same for HLO allocations.
+ TF_RETURN_IF_ERROR(adjust_spill_interval(main_compilation_result));
+
+      // Since the compilation of the TLP can now potentially overwrite HLO
+      // allocations, we need to check whether this has happened.
+ TF_RET_CHECK(converter_->tlp_used_spill_regions_ != nullptr);
+ TF_ASSIGN_OR_RETURN(
+ MemRegion tlp_region,
+ MaybeFind(*converter_->tlp_used_spill_regions_, space));
+ TF_ASSIGN_OR_RETURN(
+ const auto& hlo_maps,
+ MaybeFind(program_allocation_maps, main_compilation_result));
+ TF_ASSIGN_OR_RETURN(
+ MemRegion empty_region,
+ hlo_maps.get()->ComputeBestSpillRange(space, tlp_region));
+ if (empty_region != tlp_region) {
+ return space == MemorySpace::kVmem
+ ? ResourceExhausted(
+ "HLO %s allocations are overwritten with TLP "
+ "spill region [%d, %d)",
+ MemorySpaceToString(space), tlp_region.first,
+ tlp_region.second)
+ : Cancelled(
+ "HLO %s allocations are overwritten with TLP "
+ "spill region [%d, %d)",
+ MemorySpaceToString(space), tlp_region.first,
+ tlp_region.second);
+ }
+ }
+
+ TF_RET_CHECK(candidate_spill_region.second <= spill_region->second);
+ *spill_region = candidate_spill_region;
+ return OkStatus();
+ };
+
+ TF_RETURN_IF_ERROR(adjust_spill_region(MemorySpace::kSmem,
+ &smem_spill_region_it->second,
+ /*new_first=*/smem_memory_start));
+ TF_RETURN_IF_ERROR(adjust_spill_region(MemorySpace::kVmem,
+ &vmem_spill_region_it->second,
+ /*new_first=*/vmem_memory_start));
+ }
+
+ llvm::MDBuilder md_builder(context());
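+  // Publish the chosen spill ranges as named module-level metadata
+  // ("smem.spill.start" etc.), presumably consumed by the TPU backend when it
+  // assigns spill slots.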
+ const auto set_mem_range = [&](absl::string_view name_start,
+ absl::string_view name_limit,
+ const MemRegion& region) {
+ const auto set_meta = [&](absl::string_view name, int64_t value) {
+ llvm::NamedMDNode* named_md_node = module()->getOrInsertNamedMetadata(
+ llvm::StringRef(name.data(), name.size()));
+ named_md_node->addOperand(llvm::MDNode::get(
+ context(), md_builder.createConstant(llvm::ConstantInt::get(
+ builder_.getInt32Ty(), value))));
+ };
+
+ set_meta(name_start, region.first);
+ set_meta(name_limit, region.second);
+
+ VLOG(1) << "Spill region [" << name_start << "...[" << name_limit
+ << " (words): [" << region.first << ", " << region.second
+ << "):" << (region.second - region.first);
+ };
+
+ set_mem_range("smem.spill.start", "smem.spill.limit",
+ smem_spill_region_it->second);
+ set_mem_range("vmem.spill.start", "vmem.spill.limit",
+ vmem_spill_region_it->second);
+
+ return OkStatus();
+}
+
+std::string Converter::PrintAsLlvm() {
+ std::string tmp;
+ llvm::raw_string_ostream OS(tmp);
+ if (main_module_->comp_env().xla_jf_llvm_use_bitcode_dump()) {
+ llvm::WriteBitcodeToFile(module_, OS);
+ } else {
+ OS << module_;
+ OS.flush();
+ }
+ return tmp;
+}
+
+Status Converter::VerifyModule() {
+ SubTimer timer(assigned_timer_, "verify_module");
+ std::string error_str;
+ llvm::raw_string_ostream error_os(error_str);
+ if (llvm::verifyModule(module_, &error_os)) {
+ error_os.flush();
+ return InternalError("Failed to build a valid module from LLO:\n\n%s",
+ error_str);
+ }
+ return OkStatus();
+}
+
+StatusOr<std::string> Converter::OptimizeAndCodeGen(
+ PassPopulationHandler handler) {
+ SubTimer opt_and_codegen_timer(assigned_timer_, "optimize_and_codegen");
+
+ // Run TPU target passes on a module copy.
+ llvm::SmallVector<char, 0> stream_buffer;
+ llvm::raw_svector_ostream ostream(stream_buffer);
+ llvm::LLVMContext local_context;
+ std::unique_ptr<llvm::Module> local_module;
+ llvm::legacy::PassManager pass_manager;
+ const bool use_fast_opt = main_module_->comp_env().xla_jf_llvm_use_fast_opt();
+ // Turn on default opt passes to be able to compare code quality.
+ if (!use_fast_opt) {
+ SubTimer module_opt_timer(opt_and_codegen_timer, "llvm_non_fast_optimize");
+ llvm::LoopAnalysisManager lam;
+ llvm::FunctionAnalysisManager fam;
+ llvm::CGSCCAnalysisManager cgam;
+ llvm::ModuleAnalysisManager mam;
+
+ llvm::PassBuilder pb(&*target_machine_);
+ pb.registerModuleAnalyses(mam);
+ pb.registerCGSCCAnalyses(cgam);
+ pb.registerFunctionAnalyses(fam);
+ pb.registerLoopAnalyses(lam);
+ pb.crossRegisterProxies(lam, fam, cgam, mam);
+
+ llvm::ModulePassManager mpm =
+ pb.buildPerModuleDefaultPipeline(llvm::OptimizationLevel::O2);
+
+ mpm.run(module_, mam);
+
+ module_opt_timer.StopAndReport();
+    // Make a local copy of the LLVM module. This should NOT be required, but
+    // apparently LLVM stores some state on module_, and running target passes
+    // on module_ is not equivalent to running the same passes on a module
+    // copy created via serialization and deserialization. We keep this
+    // approach for now since it helps with reproducibility.
+    // TODO(b/148084247): investigate and remove unnecessary copy creation
+    // overhead.
+ SubTimer module_copy_timer(opt_and_codegen_timer, "module_copy");
+ std::string module_string = PrintAsLlvm();
+ if (dumper_ != nullptr) {
+ if (hlo_function_prototype_ == nullptr) {
+ dumper_->DumpTextIfEnabled(NormalizeAsFileName("llvm-for-llo-opt-tlp"),
+ module_string, LloDumper::Category::kLlvm);
+ } else {
+ dumper_->DumpTextIfEnabled(
+ NormalizeAsFileName(absl::StrCat(
+ "llvm-for-llo-opt-", hlo_function_prototype_->getName().data(),
+ "-", retry_)),
+ module_string, LloDumper::Category::kLlvm);
+ }
+ }
+ auto local_buffer = llvm::MemoryBuffer::getMemBuffer(module_string);
+ llvm::SMDiagnostic Err;
+ TF_RET_CHECK(local_buffer != nullptr);
+
+ local_module =
+ llvm::getLazyIRModule(std::move(local_buffer), Err, local_context);
+ module_copy_timer.StopAndReport();
+ TF_RET_CHECK(local_module != nullptr)
+ << "Error: " << Err.getMessage().str();
+ }
+ TF_RET_CHECK(target_machine_);
+ SubTimer run_codegen_timer(opt_and_codegen_timer, "run_codegen_passes");
+
+ // Initialize passes by calling handler.
+ TF_RETURN_IF_ERROR(handler(&pass_manager, &ostream));
+
+ // Run passes, return LLVM output.
+ pass_manager.run(local_module ? *local_module : module_);
+ run_codegen_timer.StopAndReport();
+ return std::string(stream_buffer.begin(), stream_buffer.end());
+}
+
+StatusOr<std::string> Converter::OptimizeAndCodeGen() {
+ return OptimizeAndCodeGen([this](llvm::legacy::PassManager* pass_manager,
+ llvm::raw_svector_ostream* ostream) {
+ if (target_machine_->addPassesToEmitFile(*pass_manager, *ostream, nullptr,
+ llvm::CGFT_AssemblyFile)) {
+ return InternalError("TargetMachine can't emit a file of this type");
+ }
+ return OkStatus();
+ });
+}
+
+StatusOrMcCode Converter::OptimizeAndCodeGen(const IsaProgramTarget& target) {
+ McCodeProvider mc_code_provider;
+ TF_ASSIGN_OR_RETURN(
+ auto output_to_be_ignored,
+ OptimizeAndCodeGen([&](llvm::legacy::PassManager* pass_manager,
+ llvm::raw_svector_ostream* ostream) {
+ TF_ASSIGN_OR_RETURN(
+ mc_code_provider,
+ AddPassesToEmitMcCode(target, target_machine_.get(), pass_manager));
+ return OkStatus();
+ }));
+ TF_RET_CHECK(mc_code_provider != nullptr);
+ return mc_code_provider();
+}
+
+Status Converter::BuildMainModule() {
+ SubTimer timer(assigned_timer_, "build_llvm_module");
+ // Initialize target machine, fail if already initialized.
+ std::string cpu = target_machine_->getTargetCPU().str();
+ LlvmModuleBuilder builder(this, cpu);
+
+ // Build the module, including the main function as well as all HLO functions.
+ TF_ASSIGN_OR_RETURN(hlo_references_, builder.BuildMainModule());
+
+  // Set the target triple and data layout.
+ TF_RET_CHECK(target_machine_);
+ module_.setTargetTriple(target_machine_->getTargetTriple().getTriple());
+ module_.setDataLayout(target_machine_->createDataLayout());
+
+ return OkStatus();
+}
+
+// Assumes the program assembly is printed with encoding information; searches
+// for encoding comments and parses them.
+StatusOr<std::vector<std::vector<uint8_t>>> ExtractEncodedBundles(
+ const std::string& program) {
+ std::vector<std::vector<uint8_t>> result;
+
+ // We expect the encoding information to be represented in the following form:
+ // "// encoding: [0x72,0x77,...,0x73]"
+ const auto bytes_pattern = RE2(R"(^.*\/\/.*encoding:\s*\[(.*)\].*$)",
+ RE2::CannedOptions::DefaultOptions);
+ const auto digits_pattern =
+ RE2(R"(^0x([0-9a-f]+)$)", RE2::CannedOptions::DefaultOptions);
+
+ absl::string_view program_text = program;
+ size_t program_cursor = 0;
+ const size_t program_end = program_text.size();
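+  // Scan the assembly line by line; each line that carries an encoding
+  // comment is split on commas and each 0x.. byte token is parsed.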
+ while (program_cursor < program_end) {
+ size_t nl_index = program_text.find('\n', program_cursor);
+ if (nl_index == absl::string_view::npos) {
+ nl_index = program_end;
+ }
+ if (program_cursor < nl_index) {
+ std::string bytes;
+ if (RE2::FullMatch(
+ program_text.substr(program_cursor, nl_index - program_cursor),
+ bytes_pattern, &bytes)) {
+ result.emplace_back();
+
+ size_t bytes_cursor = 0;
+ const size_t bytes_end = bytes.size();
+ while (bytes_cursor < bytes_end) {
+ size_t comma_index = bytes.find(',', bytes_cursor);
+ if (comma_index == absl::string_view::npos) {
+ comma_index = bytes_end;
+ }
+ if (bytes_cursor < comma_index) {
+ std::string hex;
+ if (!RE2::FullMatch(
+ bytes.substr(bytes_cursor, comma_index - bytes_cursor),
+ digits_pattern, &hex)) {
+ return InternalError(
+ "Failed to process byte hex code: %s",
+ bytes.substr(bytes_cursor, comma_index - bytes_cursor));
+ }
+
+ uint64_t value = ParseLeadingHex64Value(hex, 256);
+ TF_RET_CHECK(value < 256) << value;
+ result.back().push_back(static_cast<uint8_t>(value));
+ }
+ bytes_cursor = comma_index + 1;
+ }
+ }
+ }
+ program_cursor = nl_index + 1;
+ }
+ return result;
+}
+
+StatusOr<
+ std::unique_ptr<asic_sw::deepsea::pxc::pfc::isa::BarnaCoreChannelProgram>>
+BarnaCoreChannelProgramFromEncoded(
+ const std::vector<std::vector<uint8_t>>& encoded_bundles) {
+ auto program = std::make_unique<
+ asic_sw::deepsea::pxc::pfc::isa::BarnaCoreChannelProgram>();
+ asic_sw::deepsea::pxc::pfc::isa::BarnaCoreChannelCodec codec;
+ for (const auto& encoded_bundle : encoded_bundles) {
+ TF_ASSIGN_OR_RETURN(
+ asic_sw::deepsea::pxc::pfc::isa::BarnaCoreChannelBundle bundle,
+ FromUtilStatusOr(codec.Decode(encoded_bundle)));
+ *program->add_bundles() = std::move(bundle);
+ }
+ return std::move(program);
+}
+
+// Used to simplify LloModule equality tests. The current implementation is
+// based on a SHA256 digest of a textual representation, which can be improved
+// later.
+class LloModuleFingerprint {
+ public:
+ static LloModuleFingerprint Create(LloModule* module) {
+ std::string module_text = ToMnemonic(*module->top_region());
+ security::SHA256 data_hasher;
+ data_hasher.Update(module_text);
+ return LloModuleFingerprint(data_hasher.Digest());
+ }
+
+ bool IsSameAs(const LloModuleFingerprint& other) const {
+ return fingerprint_ == other.fingerprint_;
+ }
+
+ private:
+ explicit LloModuleFingerprint(std::string fingerprint)
+ : fingerprint_(fingerprint) {}
+
+ std::string fingerprint_;
+};
+
+StatusOrIsaProgram BuildProgramFromEncoding(Converter* converter,
+ LloModule* module,
+ LloDumper* dumper,
+ const SubTimer& timer) {
+ SubTimer encoding_timer(timer, "build_from_encodings");
+
+ TF_ASSIGN_OR_RETURN(std::string program_assembly,
+ converter->OptimizeAndCodeGen());
+ if (dumper != nullptr) {
+ dumper->DumpTextIfEnabled("llvm-target-encoded-assembly", program_assembly,
+ LloDumper::Category::kLlvm);
+ }
+
+ TF_ASSIGN_OR_RETURN(std::vector<std::vector<uint8_t>> bundle_encodings,
+ ExtractEncodedBundles(program_assembly));
+ if (dumper != nullptr) {
+ std::string encoded_bundles =
+ absl::StrJoin(bundle_encodings, "\n",
+ [](std::string* out, const std::vector<uint8_t>& values) {
+ out->append(absl::StrJoin(values, ","));
+ });
+ dumper->DumpTextIfEnabled("llvm-target-encoded-bundles", encoded_bundles,
+ LloDumper::Category::kLlvm);
+ }
+
+ TF_ASSIGN_OR_RETURN(
+ std::unique_ptr<asic_sw::deepsea::pxc::pfc::isa::BarnaCoreChannelProgram>
+ program,
+ BarnaCoreChannelProgramFromEncoded(bundle_encodings));
+ if (dumper != nullptr) {
+ dumper->DumpTextIfEnabled("llvm-barnacore-channel-program-proto",
+ program->DebugString(),
+ LloDumper::Category::kLlvm);
+ }
+
+ auto isa_program = std::make_unique<xla::IsaProgramProto>();
+ isa_program->set_allocated_pufferfish_barna_core_channel_program(
+ program.release());
+ isa_program->set_deepsea_version(TpuVersionToProto(TpuVersion::kPufferfish));
+ isa_program->set_core_type(
+ tpu::TpuCoreTypeToProto(tpu::TpuCoreTypeForSequencer(
+ TpuSequencerType::kBarnaCoreAddressHandler)));
+ return std::move(isa_program);
+}
+
+StatusOrIsaProgram BuildProgramFromMCCode(
+ Converter* converter, LloCompilationResult* tlp_compilation_result,
+ LloModule* module, LloDumper* dumper, int num_threads,
+ BundleMapping* bundle_instructions, const SubTimer& timer) {
+ SubTimer build_from_mc(timer, "build_program_from_mc");
+ const Target& target = module->target();
+ IsaProgramTarget program_target{module->SequencerType(), &target};
+ TF_RET_CHECK(program_target.sequencer ==
+ TpuSequencerType::kTensorCoreSequencer);
+
+ // Get TLP McCode. Do not resolve branches until we finish stitching.
+ TF_ASSIGN_OR_RETURN(std::unique_ptr<AbstractMcCode> tlp_mc_code,
+ converter->OptimizeAndCodeGen(program_target));
+
+ // Update the number of static throttling cycles inserted by the compiler.
+ tlp_compilation_result->static_throttling_cycle_count +=
+ tlp_mc_code->GetStaticInsertedThrottleCycles();
+
+  // Stores mappings from the debug locations used to track instructions to
+  // their original LloInstructions, combined across all LLO modules.
+ DebugLocationMapping combined_debug_location_mapping;
+ absl::Mutex mutex;
+ const auto merge_mapping = [&](const Converter& converter) {
+ absl::MutexLock lock(&mutex);
+ const auto& mappings = converter.GetUsedDebugLocations();
+ combined_debug_location_mapping.insert(mappings.begin(), mappings.end());
+ };
+ merge_mapping(*converter);
+
+ const auto get_spill_size = [&target](MemorySpace space) -> int64_t {
+ switch (space) {
+ case MemorySpace::kSmem:
+ return 1;
+ case MemorySpace::kVmem:
+ // See LloAllocationAssignment::AllocationAlignment().
+ return address_util::ConvertOffsetByteToWord(
+ space, target.ChunkSizeBytes(), target);
+ default:
+ LOG(FATAL) << "Unsupported memory space: " << space;
+ }
+ };
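+  // Illustrative example (not from the original CL; the numbers are
+  // assumptions): an SMEM spill slot is a single word, while a VMEM spill
+  // slot spans a whole chunk, so if target.ChunkSizeBytes() were 4096 bytes
+  // and a VMEM word were 4 bytes, get_spill_size(MemorySpace::kVmem) would
+  // return 4096 / 4 = 1024 words.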
+
+ TF_RETURN_IF_ERROR(tlp_mc_code->CheckForSpillRegionsOverflow(
+ converter->GetSpillRegions(), get_spill_size,
+ /*vmem_overflow_slots=*/nullptr));
+
+  // Check if stitching is needed, i.e. whether there are external HLO
+  // references via CALL pseudo-instructions that need to be fixed up.
+ TF_ASSIGN_OR_RETURN(std::set<std::string> referenced_hlos,
+ tlp_mc_code->GetHloReferences());
+
+  // We need to ensure all McCode objects we collect for externally compiled
+  // HLOs live until we have generated the final program for the TLP module,
+  // so we define the map at the top level, even though it is only used in the
+  // conditional below.
+ using HloInfo = std::pair<HloReference, StatusOrMcCode>;
+ using HloInfoMap = std::map<std::string, HloInfo>;
+ HloInfoMap hlo_info_by_name;
+
+ if (!referenced_hlos.empty()) {
+ // Adjust spill regions to subtract the space used for TLP spilling.
+ SpillRegionCollection hlo_spill_regions = converter->GetSpillRegions();
+ TF_RETURN_IF_ERROR(
+ tlp_mc_code->AdjustTlpSpillRegions(&hlo_spill_regions, get_spill_size));
+
+ // Compute spill regions actually used by TLP.
+ SpillRegionCollection tlp_spill_regions;
+ const auto compute_tlp_spill_region = [&](MemorySpace space) -> Status {
+ TF_ASSIGN_OR_RETURN(const MemRegion tlp_region,
+ MaybeFind(converter->GetSpillRegions(), space));
+ TF_ASSIGN_OR_RETURN(const MemRegion hlo_region,
+ MaybeFind(hlo_spill_regions, space));
+ TF_RET_CHECK(tlp_region.first == hlo_region.first);
+ TF_RET_CHECK(tlp_region.second >= hlo_region.second);
+ tlp_spill_regions.emplace(
+ space, MemRegion(hlo_region.second, tlp_region.second));
+ return OkStatus();
+ };
+ TF_RETURN_IF_ERROR(compute_tlp_spill_region(MemorySpace::kVmem));
+ TF_RETURN_IF_ERROR(compute_tlp_spill_region(MemorySpace::kSmem));
+
+    // Fill in the map so that all records are created up front. This should
+    // ensure that building McCode in separate threads is safe.
+ for (const HloReference& hlo_reference : converter->GetHloReferences()) {
+ llvm::Function* llvm_function = hlo_reference.function_or_null();
+ if (llvm_function == nullptr) {
+        // The LLVM function must have been optimized out and deleted.
+        continue;
+ }
+ std::string hlo_name = llvm_function->getName().str();
+ if (referenced_hlos.erase(hlo_name) == 0) {
+ continue; // HLO is not referenced from TLP.
+ }
+ const auto it = hlo_info_by_name.insert(
+ {hlo_name, std::make_pair(hlo_reference,
+ StatusOrMcCode(InternalError(
+ "McCode for HLO is not built yet.")))});
+ TF_RET_CHECK(it.second) << hlo_name;
+ }
+ TF_RET_CHECK(referenced_hlos.empty())
+ << "Failed to find HLO function(s) referenced from TLP";
+
+ // Compile referenced HLOs into McCode in parallel. Basically: create a
+ // module, optimize it, build McCode and resolve the branches (stitching
+ // assumes all branches in the inlinee are relative).
+ const auto build_hlo_mc_code =
+ [&](const HloReference& hlo, int64_t retry,
+ const SubTimer& timer) -> StatusOrMcCode {
+ hlo.compilation_result()->extra_vmem_memory_used_bytes = 0;
+ TF_RETURN_IF_ERROR(hlo.compilation_result()->status);
+ TF_RET_CHECK(hlo.function_or_null() != nullptr);
+ const std::string hlo_name = hlo.function_or_null()->getName().str();
+ SubTimer hlo_compilation_timer(timer,
+ absl::StrCat("compilation_retry_", retry));
+ Converter hlo_converter(
+ "LloHloModule_" + hlo_name, hlo.compilation_result(), hlo.config(),
+ hlo.compilation_result()->module.get(), dumper,
+ converter->GetProgramAllocationMaps(), hlo_spill_regions,
+ converter->compilation_result(), &tlp_spill_regions,
+ hlo.function_or_null(), retry, hlo_compilation_timer);
+
+ TF_RETURN_IF_ERROR(hlo_converter.BuildMainModule());
+ if (dumper != nullptr && dumper->IsDumpEnabled()) {
+ dumper->DumpTextIfEnabled(NormalizeAsFileName(absl::StrCat(
+ "llvm-for-llo-", hlo_name, "-", retry)),
+ hlo_converter.PrintAsLlvm(),
+ LloDumper::Category::kLlvm);
+ }
+ TF_RETURN_IF_ERROR(hlo_converter.VerifyModule());
+
+ TF_ASSIGN_OR_RETURN(std::unique_ptr<AbstractMcCode> hlo_mc_code,
+ hlo_converter.OptimizeAndCodeGen(program_target));
+
+ // Update the number of static throttling cycles inserted by the compiler.
+ hlo.compilation_result()->static_throttling_cycle_count +=
+ hlo_mc_code->GetStaticInsertedThrottleCycles();
+ TF_RETURN_IF_ERROR(hlo_mc_code->FinalizeAsInlinee());
+
+ int64_t vmem_overflow_slots = 0;
+ Status status = hlo_mc_code->CheckForSpillRegionsOverflow(
+ hlo_converter.GetSpillRegions(), get_spill_size,
+ &vmem_overflow_slots);
+ if (tensorflow::errors::IsResourceExhausted(status)) {
+ hlo.compilation_result()->extra_vmem_memory_used_bytes =
+ vmem_overflow_slots * target.ChunkSizeBytes();
+ }
+ if (!status.ok()) {
+ return status;
+ }
+
+ merge_mapping(hlo_converter);
+ return std::move(hlo_mc_code);
+ };
+
+ thread::TreeOptions options;
+ options.set_parallelism(num_threads);
+ std::unique_ptr<thread::Fiber> tree = thread::NewTree(options, [&] {
+ thread::Bundle bundle;
+ for (auto& pair : hlo_info_by_name) {
+ auto work_unit = [&] {
+ const std::string& hlo_name = pair.first;
+ SubTimer hlo_timer(build_from_mc, hlo_name);
+ HloInfo& hlo_info = pair.second;
+ const HloReference& hlo_reference = hlo_info.first;
+ StatusOrMcCode& status_or_mc_code = hlo_info.second;
+ LloCompilationResult* const compilation_result =
+ hlo_reference.compilation_result();
+ if (!compilation_result->status.ok()) {
+ status_or_mc_code = compilation_result->status;
+ LOG(WARNING) << "HLO " << hlo_name
+ << " compilation result already failed with : "
+ << compilation_result->status.error_message();
+ return;
+ }
+
+ // Build the code the first time.
+ status_or_mc_code =
+ build_hlo_mc_code(hlo_reference, /*retry=*/0, hlo_timer);
+
+ // Try retrying HLO lowering if needed/supported.
+ if (compilation_result->lower_with_retries &&
+ (hlo_reference.config() == nullptr ||
+ hlo_reference.config()->supports_retry())) {
+ SubTimer retry_logic_timer(hlo_timer, "retry_logic");
+ const int64_t starting_retry_count =
+ compilation_result->retry_count;
+ compilation_result->status = status_or_mc_code.status();
+
+              // After we've built the mc-code and know its spill usage and
+              // bundle count, we can run it through lower_with_retries, so
+              // code exceeding one overlay or having too many spills can be
+              // re-lowered.
+ const auto result_bundle_count = [&status_or_mc_code] {
+ return status_or_mc_code.ok()
+ ? status_or_mc_code.value()->GetBundleCount()
+ : 0;
+ };
+
+ int64_t retry_attempt = 0;
+ LloModuleFingerprint last_module_fingerprint =
+ LloModuleFingerprint::Create(
+ hlo_reference.compilation_result()->module.get());
+
+ const auto post_process = [&] {
+ if (!compilation_result->status.ok()) {
+ status_or_mc_code = compilation_result->status;
+ return;
+ }
+
+                LloModuleFingerprint current_module_fingerprint =
+ LloModuleFingerprint::Create(
+ hlo_reference.compilation_result()->module.get());
+
+                if (current_module_fingerprint.IsSameAs(
+ last_module_fingerprint)) {
+ LOG(WARNING) << "Retry attempt #" << (++retry_attempt)
+ << " for HLO " << hlo_name
+ << ": skipping LLVM construction since there was "
+ "no change in LLO module.";
+ compilation_result->status = status_or_mc_code.status();
+ return;
+ }
+
+                last_module_fingerprint = std::move(current_module_fingerprint);
+
+                // At this point, the LLO module is completely re-lowered.
+                // While the HLO's allocations need to be completely
+                // re-computed, the adjustments do not, since they are computed
+                // based on TLP allocations only, and those didn't change.
+ ModuleAllocationMaps* hlo_allocation_maps =
+ FindOrDie(*converter->GetProgramAllocationMaps(),
+ compilation_result)
+ .get();
+ {
+ hlo_allocation_maps->Reset();
+
+ const auto pre_run_on_region =
+ [hlo_allocation_maps](const LloRegion* region) -> Status {
+ for (const auto& allocation : region->allocations()) {
+ if (allocation->space() == MemorySpace::kSmem ||
+ allocation->space() == MemorySpace::kVmem) {
+ TF_RETURN_IF_ERROR(
+ hlo_allocation_maps->RegisterAllocation(
+ allocation.get()));
+ }
+ }
+ return OkStatus();
+ };
+ compilation_result->status =
+ ConstLloRegionVisitor()
+ .WithPreRunOnRegion(pre_run_on_region)
+ .VisitRegion(compilation_result->module->top_region());
+ if (!compilation_result->status.ok()) {
+ status_or_mc_code = compilation_result->status;
+ return;
+ }
+
+ // Re-pack allocations.
+ MemRegion vmem_bounds{
+ 0, address_util::ConvertOffsetByteToWord(
+ MemorySpace::kVmem, target.VmemSizeBytes(), target)};
+ compilation_result->status =
+ hlo_allocation_maps->PackVmemAllocations(vmem_bounds);
+ if (!compilation_result->status.ok()) {
+ status_or_mc_code = compilation_result->status;
+ return;
+ }
+ }
+
+                // If we don't break TLP spills, we can try to recompile mc-code.
+ LOG(WARNING) << "Retry attempt #" << (++retry_attempt)
+ << " for HLO " << hlo_name;
+ status_or_mc_code =
+ build_hlo_mc_code(hlo_reference, retry_attempt, hlo_timer);
+ if (!status_or_mc_code.ok()) {
+ LOG(ERROR) << "Retry attempt #" << retry_attempt << " for HLO "
+ << hlo_name << " failed with error: "
+ << status_or_mc_code.status().ToString();
+ }
+ compilation_result->status = status_or_mc_code.status();
+ };
+
+ compilation_result->lower_with_retries(
+ /*already_lowered=*/true, result_bundle_count, post_process);
+
+ const int64_t actual_retries =
+ compilation_result->retry_count - starting_retry_count;
+
+ if (actual_retries > 0) {
+ LOG(WARNING) << (compilation_result->status.ok() ? "Successful"
+ : "Unsuccessful")
+ << " retry compilation of " << hlo_name
+ << " after extra llvm retry count: "
+ << actual_retries;
+ }
+ }
+
+ if (!status_or_mc_code.ok()) {
+            // Report HLO compilation errors here as warnings; the inliner will
+            // only report a compilation error when it tries to inline the
+            // failed inlinee.
+ LOG(WARNING) << "HLO " << hlo_name
+ << " compilation failed with error: "
+ << status_or_mc_code.status().error_message();
+ }
+ };
+ if (num_threads > 1) {
+ bundle.Add(work_unit);
+ } else {
+ work_unit();
+ }
+ }
+ bundle.JoinAll();
+ });
+ tree->Join();
+ }
+
+ SubTimer inlining_timer(build_from_mc, "perform_inlining");
+ // By this time we have McCode for all HLOs we need, so we can do inlining.
+ TF_ASSIGN_OR_RETURN(
+ std::unique_ptr<AbstractMcCode> final_mc_code,
+ tlp_mc_code->PerformInlining([&hlo_info_by_name](const std::string& name)
+ -> StatusOr<AbstractMcCode*> {
+ auto it = hlo_info_by_name.find(name);
+ if (it == hlo_info_by_name.end()) {
+ return InternalError("Cannot find McCode for HLO: %s", name);
+ }
+ TF_RETURN_IF_ERROR(it->second.second.status());
+ return it->second.second.value().get();
+ }));
+ inlining_timer.StopAndReport();
+
+ SubTimer finalize_timer(build_from_mc, "finalize_as_tlp");
+
+ std::vector<SourceLocationList> source_locations;
+ TF_ASSIGN_OR_RETURN(std::unique_ptr<IsaProgramProto> program,
+ final_mc_code->FinalizeAsTlpAndConsume(
+ num_threads, &source_locations, module));
+
+ // If asked to fill in bundle --> LLO instruction mapping, do it.
+ if (bundle_instructions != nullptr) {
+ TF_RET_CHECK(bundle_instructions->empty());
+ for (int64_t bundle_number = 0; bundle_number < source_locations.size();
+ bundle_number++) {
+ std::vector<const LloInstruction*> instructions;
+ for (const SourceLocation& location : source_locations[bundle_number]) {
+ if (location.first != 0 && location.second != 0) {
+ const auto it = combined_debug_location_mapping.find(location);
+ TF_RET_CHECK(it != combined_debug_location_mapping.end());
+ if (it->second->opcode() == LloOpcode::kVectorTrace) {
+          // This is a tricky case: we don't want to confuse XPROF with
+          // several different HLO symbols associated with the same bundle, so
+          // we just ignore the rest of the instructions in this case.
+ instructions.clear();
+ instructions.push_back(it->second);
+ break;
+ }
+ instructions.push_back(it->second);
+ }
+ }
+
+ auto [it, inserted] = bundle_instructions->insert({bundle_number, {}});
+ for (const LloInstruction* instruction : instructions) {
+ it->second.push_back(instruction);
+ }
+ }
+ }
+ return std::move(program);
+}
+
+// Performs HLO allocation adjustments and builds allocation maps for VMEM/SMEM
+// allocations for the main program module as well as all HLO modules.
+StatusOr<std::unique_ptr<ProgramAllocationMaps>> CollectProgramAllocationMaps(
+ const LloCompilationResult* main_compilation_result,
+ LloModule* main_module) {
+  // Adjust allocations inside deduplicated HLO modules to account for TLP
+  // allocations. Note that VMEM adjustments are expected to be zero since the
+  // TLP module is assumed to never have live allocations across HLO calls.
+ {
+ const ProgramMemoryAllocator& main_allocator =
+ *main_compilation_result->memory_requirement;
+
+ const auto run_on_instruction =
+ [&](const LloRegionMember* member) -> Status {
+ const LloInstruction* instruction = member->instruction();
+ TF_RET_CHECK(instruction->opcode() != LloOpcode::kCall)
+ << instruction->ToString();
+ if (instruction->opcode() == LloOpcode::kInlinedCall) {
+ LloCompilationResult* compilation_result =
+ instruction->associated_compilation_result();
+ const ProgramMemoryAllocator& callee_allocator =
+ *compilation_result->memory_requirement;
+ const MemorySizes current_adjustment_bytes =
+ main_allocator.ComputeAdjustmentAtInlinedCall(callee_allocator,
+ *instruction);
+ MemorySizes& hlo_adjustment_bytes =
+ compilation_result->allocation_offset_adjustment_bytes;
+ hlo_adjustment_bytes =
+ hlo_adjustment_bytes.Maximum(current_adjustment_bytes);
+
+ // Enforce zero adjustments for VMEM.
+ for (int j = 0; j < kNumberMemorySubSpaces; ++j) {
+ TF_RET_CHECK(0 ==
+ hlo_adjustment_bytes[MemorySpace::kVmem]
+ [static_cast<MemorySubSpace>(j)])
+ << hlo_adjustment_bytes.SizeString();
+ }
+
+      // Enforce 2-word (max) alignment for SMEM.
+      //
+      // Note that the call to ComputeAdjustmentAtInlinedCall above should
+      // take HLO allocation alignments into account, but we only call it once
+      // here and don't recompute this info later. If the HLO is re-lowered
+      // later during LLVM retries, its alignments might change, so instead of
+      // recomputing the allocations after each retry we use the maximum
+      // possible SMEM alignment here to make sure we cover all potential
+      // re-lowering outcomes.
+ constexpr int64_t kSmemAlignmentWords = 2;
+ constexpr int64_t kSmemAlignmentBytes = 4 * kSmemAlignmentWords;
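+      // For illustration (not from the original CL): with kSmemAlignmentBytes
+      // == 8, an adjustment of 10 bytes is rounded up to 16 below, while
+      // adjustments of 0 or 16 are left unchanged.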
+ for (int j = 0; j < kNumberMemorySubSpaces; ++j) {
+ int64_t& adjustment_bytes =
+ hlo_adjustment_bytes[MemorySpace::kSmem]
+ [static_cast<MemorySubSpace>(j)];
+ adjustment_bytes =
+ MathUtil::RoundUpTo(adjustment_bytes, kSmemAlignmentBytes);
+ }
+ }
+ return OkStatus();
+ };
+
+ TF_RETURN_IF_ERROR(ConstLloRegionVisitor()
+ .WithRunOnInstruction(run_on_instruction)
+ .VisitRegion(main_module->top_region()));
+ }
+
+ auto allocation_maps = std::make_unique<ProgramAllocationMaps>();
+
+  // Collect allocation maps for the TLP module as well as all HLO modules.
+ {
+    // Creates an allocation map if it does not exist yet; if it already
+    // exists, returns nullptr to indicate that.
+ const auto create_new_allocation_maps =
+ [&](const LloCompilationResult* compilation_result)
+ -> ModuleAllocationMaps* {
+ CHECK(compilation_result != nullptr);
+ const auto it = allocation_maps->find(compilation_result);
+ if (it != allocation_maps->end()) {
+ return nullptr;
+ }
+ const auto inserted = allocation_maps->emplace(
+ compilation_result, std::make_unique<ModuleAllocationMaps>(
+ main_module->target(), compilation_result));
+ return inserted.first->second.get();
+ };
+
+ // Returns a region processor collecting VMEM/SMEM allocations in the
+ // captured module allocation maps.
+ const auto pre_run_on_region =
+ [](ModuleAllocationMaps* module_allocation_maps)
+ -> std::function<Status(const LloRegion*)> {
+ CHECK(module_allocation_maps != nullptr);
+ return [module_allocation_maps](const LloRegion* region) -> Status {
+ for (const auto& allocation : region->allocations()) {
+ if (allocation->space() == MemorySpace::kSmem ||
+ allocation->space() == MemorySpace::kVmem) {
+ TF_RETURN_IF_ERROR(
+ module_allocation_maps->RegisterAllocation(allocation.get()));
+ }
+ }
+ return OkStatus();
+ };
+ };
+
+    // Runs on TLP instructions and collects the allocations of called HLOs.
+ const auto run_on_instruction =
+ [&](const LloRegionMember* member) -> Status {
+ if (member->kind() == LloRegionMember::kInstruction &&
+ member->instruction()->opcode() == LloOpcode::kInlinedCall) {
+ const LloCompilationResult* hlo_compilation_result =
+ member->instruction()->associated_compilation_result();
+ TF_RET_CHECK(hlo_compilation_result != nullptr);
+ if (ModuleAllocationMaps* hlo_allocation_maps =
+ create_new_allocation_maps(hlo_compilation_result);
+ hlo_allocation_maps != nullptr) {
+        // Only collect allocations once per HLO. Note: no instruction handler
+        // is needed since we don't expect nested HLO calls.
+ TF_RETURN_IF_ERROR(
+ ConstLloRegionVisitor()
+ .WithPreRunOnRegion(pre_run_on_region(hlo_allocation_maps))
+ .VisitRegion(hlo_compilation_result->module->top_region()));
+ }
+ }
+ return OkStatus();
+ };
+
+ TF_RETURN_IF_ERROR(
+ ConstLloRegionVisitor()
+ .WithPreRunOnRegion(pre_run_on_region(
+ create_new_allocation_maps(main_compilation_result)))
+ .WithRunOnInstruction(run_on_instruction)
+ .VisitRegion(main_module->top_region()));
+ }
+
+ return std::move(allocation_maps);
+}
+
+} // namespace
+
+StatusOrIsaProgram GenerateProgramWithLlvmTarget(
+ LloCompilationResult* compilation_result, LloModule* main_module,
+ LloDumper* dumper, int num_threads, BundleMapping* bundle_instructions) {
+ TF_RET_CHECK(main_module != nullptr);
+ TF_RET_CHECK(compilation_result->module.get() == nullptr ||
+ compilation_result->module.get() == main_module);
+
+ std::unique_ptr<TimingInfo> time_info;
+  // Collect timing information only when dumping is enabled or the flag is
+  // explicitly enabled. If time_info is unset, every timing operation becomes
+  // a no-op.
+ if (main_module->comp_env().xla_jf_llo2llvm_timing_info()) {
+ time_info = std::make_unique<TimingInfo>("generate_program_with_llvm");
+ }
+ SubTimer whole_program_timer(time_info.get(), "whole_program");
+
+ SubTimer alloc_timing(whole_program_timer, "allocation_map_collection");
+ // Build allocation maps for main module and all HLO modules.
+ TF_ASSIGN_OR_RETURN(
+ std::unique_ptr<ProgramAllocationMaps> allocation_maps,
+ CollectProgramAllocationMaps(compilation_result, main_module));
+ alloc_timing.StopAndReport();
+
+ // Main program module converter.
+ Converter converter("LloTlpModule", compilation_result, main_module, dumper,
+ allocation_maps.get(), whole_program_timer);
+
+ TF_RETURN_IF_ERROR(converter.BuildMainModule());
+ if (dumper != nullptr && dumper->IsDumpEnabled()) {
+ dumper->DumpTextIfEnabled("llvm-for-llo-tlp", converter.PrintAsLlvm(),
+ LloDumper::Category::kLlvm);
+ }
+
+ TF_RETURN_IF_ERROR(converter.VerifyModule());
+
+ StatusOr<std::unique_ptr<IsaProgramProto>> return_status;
+ switch (main_module->SequencerType()) {
+ case TpuSequencerType::kTensorCoreSequencer: {
+ return_status = BuildProgramFromMCCode(
+ &converter, compilation_result, main_module, dumper, num_threads,
+ bundle_instructions, whole_program_timer);
+ break;
+ }
+ case TpuSequencerType::kBarnaCoreAddressHandler:
+ return_status = BuildProgramFromEncoding(&converter, main_module, dumper,
+ whole_program_timer);
+ break;
+ case TpuSequencerType::kBarnaCoreSequencer:
+ case TpuSequencerType::kSparseCoreSequencer:
+ case TpuSequencerType::kSparseCoreTileAccessSequencer:
+ case TpuSequencerType::kSparseCoreTileExecuteSequencer:
+ return_status = InternalError(
+ "Unsupported sequencer type in GenerateProgramWithLlvmTarget.");
+ break;
+ }
+ whole_program_timer.StopAndReport();
+
+ if (main_module->comp_env().xla_jf_llo2llvm_timing_info()) {
+    // If we have a dumper available, dump it there.
+ if (dumper != nullptr && dumper->IsDumpEnabled()) {
+ dumper->DumpTextIfEnabled("llo2llvm-timing-info",
+ time_info->GetTimingReport(),
+ LloDumper::Category::kLlvm);
+ } else {
+      // Otherwise dump it to the log, printing line by line to avoid
+      // logging limits.
+ std::istringstream istring(time_info->GetTimingReport());
+ std::string line;
+ while (std::getline(istring, line)) {
+ LOG(INFO) << line;
+ }
+ }
+ }
+ if (dumper != nullptr) {
+ std::string time_passes_info = GetAndResetLlvmTimingInfo();
+ // Avoid creating an empty report file.
+ if (!time_passes_info.empty()) {
+ dumper->DumpTextIfEnabled("llvm-timing-info", time_passes_info,
+ LloDumper::Category::kLlvm);
+ }
+ }
+ return return_status;
+}
+
+} // namespace jellyfish
+} // namespace xla
diff --git a/tpu_recision/platforms/xla/service/jellyfish/llvm_code_generator.h b/tpu_recision/platforms/xla/service/jellyfish/llvm_code_generator.h
new file mode 100644
index 0000000..1c73e85
--- /dev/null
+++ b/tpu_recision/platforms/xla/service/jellyfish/llvm_code_generator.h
@@ -0,0 +1,24 @@
+#ifndef PLATFORMS_XLA_SERVICE_JELLYFISH_LLVM_CODE_GENERATOR_H_
+#define PLATFORMS_XLA_SERVICE_JELLYFISH_LLVM_CODE_GENERATOR_H_
+
+#include <memory>
+
+#include "platforms/xla/service/jellyfish/isa_program.proto.h"
+#include "platforms/xla/service/jellyfish/llo_dumper.h"
+#include "platforms/xla/service/jellyfish/llo_module.h"
+#include "third_party/tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+namespace jellyfish {
+
+using LloInstructionList = absl::InlinedVector<const LloInstruction*, 3>;
+using BundleMapping = absl::flat_hash_map<int64_t, LloInstructionList>;
+
+StatusOr<std::unique_ptr<IsaProgramProto>> GenerateProgramWithLlvmTarget(
+ LloCompilationResult* compilation_result, LloModule* main_module,
+ LloDumper* dumper, int num_threads, BundleMapping* bundle_instructions);
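+
+// Illustrative usage sketch (not part of the original CL; compilation_result,
+// main_module, dumper, and the thread count are assumptions):
+//
+//   BundleMapping bundle_instructions;
+//   TF_ASSIGN_OR_RETURN(
+//       std::unique_ptr<IsaProgramProto> program,
+//       GenerateProgramWithLlvmTarget(compilation_result, main_module, dumper,
+//                                     /*num_threads=*/8,
+//                                     &bundle_instructions));
+//   // bundle_instructions now maps each bundle number to the LLO
+//   // instructions it was generated from.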
+
+} // namespace jellyfish
+} // namespace xla
+
+#endif // PLATFORMS_XLA_SERVICE_JELLYFISH_LLVM_CODE_GENERATOR_H_
diff --git a/tpu_recision/platforms/xla/service/jellyfish/llvm_code_generator_hlo_test.cc b/tpu_recision/platforms/xla/service/jellyfish/llvm_code_generator_hlo_test.cc
new file mode 100644
index 0000000..afb9959
--- /dev/null
+++ b/tpu_recision/platforms/xla/service/jellyfish/llvm_code_generator_hlo_test.cc
@@ -0,0 +1,220 @@
+#include <memory>
+#include <random>
+#include <string>
+#include <string_view>
+#include <utility>
+
+#include "platforms/deepsea/executor/deepsea_platform.h"
+#include "platforms/xla/service/jellyfish/llo_execution_tests/llo_execution_test.h"
+#include "platforms/xla/service/jellyfish/tpu_compilation_environment.proto.h"
+#include "platforms/xla/tools/tuning.proto.h"
+#include "testing/base/public/gunit.h"
+#include "third_party/tensorflow/compiler/xla/error_spec.h"
+#include "third_party/tensorflow/compiler/xla/service/hlo_parser.h"
+#include "third_party/tensorflow/compiler/xla/service/hlo_runner.h"
+#include "third_party/tensorflow/compiler/xla/service/platform_util.h"
+#include "third_party/tensorflow/compiler/xla/status_macros.h"
+#include "third_party/tensorflow/compiler/xla/stream_executor/lib/statusor.h"
+#include "third_party/tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "third_party/tensorflow/compiler/xla/tests/test_utils.h"
+#include "third_party/tensorflow/core/lib/core/status_test_util.h"
+
+ABSL_DECLARE_FLAG(bool, xla_use_llvm_backend);
+ABSL_DECLARE_FLAG(bool, xla_llvm_generate_xla_compatible_dwg);
+
+namespace xla {
+namespace jellyfish {
+namespace {
+
+constexpr auto kDeepseaTestPlatform = "deepsea";
+
+constexpr double kAbsErrorBound = 0.01;
+constexpr double kRelErrorBound = 0.1;
+
+bool ExpectMatch(absl::Span<const Literal> input_data,
+ const Literal& test_result, const Literal& reference_result,
+ const HloModule& test_module, const std::string& test_platform,
+ const std::string& reference_platform) {
+ ErrorSpec error_spec(kAbsErrorBound, kRelErrorBound);
+ ::testing::AssertionResult matched =
+ LiteralTestUtil::Near(/*expected=*/reference_result,
+ /*actual=*/test_result,
+ /*error_spec=*/error_spec,
+ /*detailed_message=*/true);
+ if (matched) {
+ return true;
+ }
+
+  // EXPECT(matched) would emit a messy failure because the assertion result
+  // from LiteralTestUtil::Near carries a very long error message. LOG(ERROR)
+  // the details instead and fail with a generic message.
+ ADD_FAILURE() << "Value mismatch between test (" << test_platform
+ << ") and reference (" << reference_platform << ")";
+ LOG(ERROR) << matched.message();
+ LOG(ERROR) << "Test module:\n" << test_module.ToString();
+ return false;
+}
+
+StatusOr<Literal> RunModule(absl::Span<const Literal> input_data,
+ Executable* const executable,
+ HloRunner* const runner) {
+  // TODO(cheshire): Instead, we are supposed to use the higher-level
+  // abstraction which does the transfer for us.
+ TF_RET_CHECK(runner != nullptr);
+
+ // Format and transfer the input data once instead of once per run.
+ TF_ASSIGN_OR_RETURN(std::vector<ScopedShapedBuffer> device_input_data,
+ runner->TransferLiteralsToDevice(input_data));
+
+ TF_ASSIGN_OR_RETURN(
+ ExecutionOutput device_output,
+ runner->ExecuteWithDeviceBuffers(executable,
+ /*arguments=*/device_input_data,
+ /*profile=*/nullptr));
+
+ TF_ASSIGN_OR_RETURN(Literal output, runner->TransferLiteralFromDevice(
+ device_output.Result()));
+
+ return std::move(output);
+}
+
+StatusOr<std::unique_ptr<Executable>> CompileModule(
+ const HloModule& module, TpuCompilationEnvironment& comp_env,
+ HloRunner* const runner, bool use_llvm) {
+ TF_RET_CHECK(runner != nullptr);
+  // CreateExecutable expects a unique pointer to a module.
+ std::unique_ptr<HloModule> cloned_module = module.Clone(module.config(), "");
+ TF_RET_CHECK(module.entry_computation()
+ ->root_instruction()
+ ->raw_backend_config_string() ==
+ cloned_module->entry_computation()
+ ->root_instruction()
+ ->raw_backend_config_string());
+ comp_env.set_xla_llvm_generate_xla_compatible_dwg(use_llvm);
+ TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
+ runner->CreateExecutable(std::move(cloned_module),
+ /*run_hlo_passes=*/false));
+ return std::move(executable);
+}
+
+class LlvmCodegenHloTest : public LloExecutionTest {
+ protected:
+ // Runs HLO test for the module provided.
+ Status RunTest(std::string_view module_string, int64_t run_count = 25);
+};
+
+Status LlvmCodegenHloTest::RunTest(std::string_view module_string,
+ int64_t run_count) {
+ auto engine = std::make_unique<std::minstd_rand0>();
+ // Enforce compat mode.
+ comp_env().set_xla_llvm_generate_xla_compatible_dwg(true);
+
+ // Load module.
+ TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
+ ParseAndReturnUnverifiedModule(module_string, {}));
+
+ // Create test runner.
+ TF_ASSIGN_OR_RETURN(se::Platform * platform,
+ PlatformUtil::GetPlatform(kDeepseaTestPlatform));
+ HloRunner runner(platform);
+
+ TF_ASSIGN_OR_RETURN(
+ std::unique_ptr<Executable> reference_executable,
+ CompileModule(*module, comp_env(), &runner, /*use_llvm=*/false));
+ TF_ASSIGN_OR_RETURN(
+ std::unique_ptr<Executable> llvm_executable,
+ CompileModule(*module, comp_env(), &runner, /*use_llvm=*/true));
+
+ for (int64_t i = 0; i < run_count; i++) {
+ // Create input data.
+ TF_ASSIGN_OR_RETURN(std::vector<Literal> input_data,
+ MakeFakeArguments(module.get(), engine.get()));
+
+ TF_ASSIGN_OR_RETURN(
+ Literal reference_output,
+ RunModule(input_data, reference_executable.get(), &runner));
+ TF_ASSIGN_OR_RETURN(Literal llvm_output,
+ RunModule(input_data, llvm_executable.get(), &runner));
+
+ if (!ExpectMatch(
+ input_data, llvm_output, reference_output, *module,
+ absl::StrCat(runner.backend().platform()->Name(), ", LLVM"),
+ absl::StrCat(runner.backend().platform()->Name(), ", XLA"))) {
+ return OkStatus();
+ }
+ }
+ return OkStatus();
+}
+
+TEST_F(LlvmCodegenHloTest, DISABLED_FusionWithConvolution01) {
+ TF_ASSERT_OK(RunTest(R"(
+HloModule fusion.78, is_scheduled=true
+
+%scalar_add_computation.7.. (scalar_lhs.7: f32[], scalar_rhs.7: f32[]) -> f32[] {
+ %scalar_lhs.7 = f32[]{:T(256)} parameter(0)
+ %scalar_rhs.7 = f32[]{:T(256)} parameter(1)
+ ROOT %add = f32[]{:T(256)} add(f32[]{:T(256)} %scalar_lhs.7, f32[]{:T(256)} %scalar_rhs.7)
+}
+
+%fused_computation.78.clone.clone (param_0.484: f32[], param_1.414: f32[8,14,11,4], param_2.343: f32[1,1,4,4]) -> (f32[4], f32[8,7,6,4], f32[8,7,6,4]) {
+ %param_1.414 = f32[8,14,11,4]{3,0,2,1:T(8,128)} parameter(1)
+ %param_2.343 = f32[1,1,4,4]{3,2,1,0:T(8,128)} parameter(2)
+ %convolution = f32[8,7,6,4]{3,0,2,1:T(8,128)} convolution(f32[8,14,11,4]{3,0,2,1:T(8,128)} %param_1.414, f32[1,1,4,4]{3,2,1,0:T(8,128)} %param_2.343), window={size=1x1 stride=2x2 pad=0_-1x0_0}, dim_labels=b01f_01io->b01f, metadata={op_type="Conv2D" op_name="while/conv2d_1/Conv2D"}
+ %param_0.484 = f32[]{:T(256)} parameter(0)
+ %reduce = f32[4]{0:T(256)} reduce(f32[8,7,6,4]{3,0,2,1:T(8,128)} %convolution, f32[]{:T(256)} %param_0.484), dimensions={0,1,2}, to_apply=%scalar_add_computation.7.., metadata={op_type="FusedBatchNormV3" op_name="while/batch_normalization_1/FusedBatchNormV3"}
+ %multiply = f32[8,7,6,4]{3,0,2,1:T(8,128)} multiply(f32[8,7,6,4]{3,0,2,1:T(8,128)} %convolution, f32[8,7,6,4]{3,0,2,1:T(8,128)} %convolution), metadata={op_type="FusedBatchNormV3" op_name="while/batch_normalization_1/FusedBatchNormV3"}
+ ROOT %tuple = (f32[4]{0:T(256)}, f32[8,7,6,4]{3,0,2,1:T(8,128)}, f32[8,7,6,4]{3,0,2,1:T(8,128)}) tuple(f32[4]{0:T(256)} %reduce, f32[8,7,6,4]{3,0,2,1:T(8,128)} %multiply, f32[8,7,6,4]{3,0,2,1:T(8,128)} %convolution)
+}
+
+ENTRY %fusion.78.. (parameter.0: f32[], parameter.1: f32[8,14,11,4], parameter.2: f32[1,1,4,4]) -> (f32[4], f32[8,7,6,4], f32[8,7,6,4]) {
+ %parameter.0 = f32[]{:T(256)} parameter(0)
+ %parameter.1 = f32[8,14,11,4]{3,0,2,1:T(8,128)} parameter(1)
+ %parameter.2 = f32[1,1,4,4]{3,2,1,0:T(8,128)} parameter(2)
+ ROOT %fusion = (f32[4]{0:T(256)}, f32[8,7,6,4]{3,0,2,1:T(8,128)}, f32[8,7,6,4]{3,0,2,1:T(8,128)}) fusion(f32[]{:T(256)} %parameter.0, f32[8,14,11,4]{3,0,2,1:T(8,128)} %parameter.1, f32[1,1,4,4]{3,2,1,0:T(8,128)} %parameter.2), kind=kOutput, calls=%fused_computation.78.clone.clone, metadata={op_type="Conv2D" op_name="while/conv2d_1/Conv2D"}, backend_config="{window_config: {\"kernel_window_bounds\":[\"1\",\"1\",\"8\",\"2\"],\"output_window_bounds\":[\"1\",\"28\",\"2\",\"2\"],\"input_window_bounds\":[],\"estimated_cycles\":\"48579\"}}"
+}
+ )"));
+}
+
+TEST_F(LlvmCodegenHloTest, DISABLED_FusionWithConvolution02) {
+ TF_ASSERT_OK(RunTest(R"(
+HloModule fusion.952, is_scheduled=true
+
+%scalar_add_computation.1.. (scalar_lhs.1: f32[], scalar_rhs.1: f32[]) -> f32[] {
+ %scalar_lhs.1 = f32[]{:T(256)} parameter(0)
+ %scalar_rhs.1 = f32[]{:T(256)} parameter(1)
+ ROOT %add = f32[]{:T(256)} add(f32[]{:T(256)} %scalar_lhs.1, f32[]{:T(256)} %scalar_rhs.1)
+}
+
+%fused_computation.952.clone.clone (param_0.954: f32[], param_1.954: f32[8,8,8,256], param_2.581: f32[1,1,256,128]) -> (f32[128], f32[8,8,8,128]) {
+ %param_1.954 = f32[8,8,8,256]{3,0,2,1:T(8,128)} parameter(1)
+ %param_2.581 = f32[1,1,256,128]{3,2,1,0:T(8,128)} parameter(2)
+ %convolution = f32[8,8,8,128]{3,0,2,1:T(8,128)} convolution(f32[8,8,8,256]{3,0,2,1:T(8,128)} %param_1.954, f32[1,1,256,128]{3,2,1,0:T(8,128)} %param_2.581), window={size=1x1}, dim_labels=b01f_01io->b01f, metadata={op_type="Conv2D" op_name="gid_Conv_BatchNorm_Relu_1/conv_2d/convolution"}
+ %param_0.954 = f32[]{:T(256)} parameter(0)
+ %reduce = f32[128]{0:T(256)} reduce(f32[8,8,8,128]{3,0,2,1:T(8,128)} %convolution, f32[]{:T(256)} %param_0.954), dimensions={0,1,2}, to_apply=%scalar_add_computation.1.., metadata={op_type="FusedBatchNormV3" op_name="gid_Conv_BatchNorm_Relu_1/batch_norm/batch_norm"}
+ ROOT %tuple = (f32[128]{0:T(256)}, f32[8,8,8,128]{3,0,2,1:T(8,128)}) tuple(f32[128]{0:T(256)} %reduce, f32[8,8,8,128]{3,0,2,1:T(8,128)} %convolution)
+}
+
+ENTRY %fusion.952.. (parameter.0: f32[], parameter.1: f32[8,8,8,256], parameter.2: f32[1,1,256,128]) -> (f32[128], f32[8,8,8,128]) {
+ %parameter.0 = f32[]{:T(256)} parameter(0)
+ %parameter.1 = f32[8,8,8,256]{3,0,2,1:T(8,128)} parameter(1)
+ %parameter.2 = f32[1,1,256,128]{3,2,1,0:T(8,128)} parameter(2)
+ ROOT %fusion = (f32[128]{0:T(256)}, f32[8,8,8,128]{3,0,2,1:T(8,128)}) fusion(f32[]{:T(256)} %parameter.0, f32[8,8,8,256]{3,0,2,1:T(8,128)} %parameter.1, f32[1,1,256,128]{3,2,1,0:T(8,128)} %parameter.2), kind=kOutput, calls=%fused_computation.952.clone.clone, metadata={op_type="Conv2D" op_name="gid_Conv_BatchNorm_Relu_1/conv_2d/convolution"}, backend_config="{window_config: {\"kernel_window_bounds\":[\"1\",\"1\",\"32\",\"1\"],\"output_window_bounds\":[\"4\",\"8\",\"1\",\"1\"],\"input_window_bounds\":[],\"estimated_cycles\":\"3631\"}}"
+}
+ )"));
+}
+
+// TODO(b/152446304): LLVM backend returns different results for CustomCall HLO.
+TEST_F(LlvmCodegenHloTest, DISABLED_CustomCall01) {
+ TF_ASSERT_OK(RunTest(R"(
+HloModule custom-call.25, is_scheduled=true
+
+ENTRY %custom-call.25.. (parameter.0: f32[4,64,64,16]) -> f32[4,32,32,16] {
+ %parameter.0 = f32[4,64,64,16]{0,3,2,1:T(8,128)} parameter(0)
+ ROOT %custom-call = f32[4,32,32,16]{0,3,2,1:T(8,128)} custom-call(f32[4,64,64,16]{0,3,2,1:T(8,128)} %parameter.0), custom_call_target="ResizeBilinearGrad", metadata={op_type="ResizeBilinearGrad" op_name="gradients/Decoder/resize_4/ResizeBilinear_grad/ResizeBilinearGrad"}, backend_config="{integer_config: {integer: \"10\"}}"
+}
+ )"));
+}
+
+} // namespace
+} // namespace jellyfish
+} // namespace xla
diff --git a/tpu_recision/platforms/xla/service/jellyfish/llvm_mc_program_processor.cc b/tpu_recision/platforms/xla/service/jellyfish/llvm_mc_program_processor.cc
new file mode 100644
index 0000000..495da50
--- /dev/null
+++ b/tpu_recision/platforms/xla/service/jellyfish/llvm_mc_program_processor.cc
@@ -0,0 +1,4033 @@
+#include "platforms/xla/service/jellyfish/llvm_mc_program_processor.h"
+
+#include <algorithm>
+#include <array>
+#include <functional>
+#include <map>
+#include <memory>
+#include <optional>
+#include <set>
+#include <string>
+#include <utility>
+#include <variant>
+#include <vector>
+
+#include "learning/brain/tpu/runtime/tpu_version.h"
+#include "platforms/deepsea/software/jfc/mnemonics/parser_factory.h"
+#include "platforms/xla/service/jellyfish/bundle_requirement.h"
+#include "platforms/xla/service/jellyfish/dma_strides.h"
+#include "platforms/xla/service/jellyfish/isa_emitter_factory.h"
+#include "platforms/xla/service/jellyfish/llo_constant.h"
+#include "platforms/xla/service/jellyfish/llo_module.h"
+#include "platforms/xla/service/jellyfish/llo_opcode_helpers.h"
+#include "platforms/xla/service/jellyfish/llo_verifier.h"
+#include "platforms/xla/service/jellyfish/matmul_data_format.h"
+#include "platforms/xla/service/jellyfish/memory_space_enum.h"
+#include "platforms/xla/service/jellyfish/metadata/llo_opcode.h"
+#include "platforms/xla/service/jellyfish/vpack_format.h"
+#include "platforms/xla/service/jellyfish/vxpose_mode.h"
+#include "third_party/absl/container/flat_hash_map.h"
+#include "third_party/absl/container/node_hash_map.h"
+#include "third_party/absl/strings/str_cat.h"
+#include "third_party/absl/strings/str_replace.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/Analysis/TargetTransformInfo.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/CodeGen/AsmPrinter.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/CodeGen/MachineModuleInfo.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/CodeGen/Passes.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/CodeGen/TargetPassConfig.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/IR/DiagnosticInfo.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/IR/DiagnosticPrinter.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/IR/IRBuilder.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/IR/LLVMContext.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/IR/LegacyPassManager.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/IR/MDBuilder.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/MC/MCAsmBackend.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/MC/MCAsmInfo.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/MC/MCCodeEmitter.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/MC/MCContext.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/MC/MCExpr.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/MC/MCInstPrinter.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/MC/MCObjectFileInfo.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/MC/MCParser/MCAsmParser.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/MC/MCStreamer.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/MC/MCSubtargetInfo.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/MC/MCValue.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/MC/TargetRegistry.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/Support/Casting.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/Support/SourceMgr.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/Transforms/IPO.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "third_party/llvm/llvm/lib/Target/GoogleTPU/TPU.h"
+#include "third_party/tensorflow/compiler/xla/status_macros.h"
+#include "third_party/tensorflow/compiler/xla/stream_executor/lib/statusor.h"
+#include "third_party/tensorflow/core/platform/errors.h"
+#include "thread/fiber/bundle.h"
+#include "util/regexp/re2/re2.h"
+
+using RangeSpec = xla::jellyfish::RangeSpec;
+
+namespace xla {
+namespace jellyfish {
+
+using StatusOrIsaProgram = StatusOr<std::unique_ptr<IsaProgramProto>>;
+
+using SpillLocation = std::pair<MemorySpace, int64_t>;
+
+// Some native instructions are not handled by IsaEmitter because they lack a
+// corresponding LloOpcode. For these instructions, we use an opcode that is
+// certain to be bogus as a placeholder.
+constexpr auto kInvalid = LloOpcode::kVectorMaskPhi;
+
+struct ScalarUnOpDesc {
+ LloOpcode llo_opcode;
+};
+
+struct ScalarBinOpDesc {
+ LloOpcode llo_opcode;
+ bool reverse_sx_sy = false;
+};
+
+struct VectorUnOpDesc {
+ LloOpcode llo_opcode;
+};
+
+struct VectorUnOpNoDstDesc {
+ LloOpcode llo_opcode;
+ bool has_extra_src = true;
+};
+
+struct VectorBinOpDesc {
+ LloOpcode llo_opcode;
+ bool is_scalar_vy = false;
+ bool reverse_vx_vy = false;
+};
+
+struct VectorPackOpDesc {
+ VpackFormat format;
+ bool is_scalar_vy = false;
+};
+
+struct VectorUnpackOpDesc {
+ VpackFormat format;
+ int sublane_id;
+};
+
+struct ScalarCompareDesc {
+ ComparisonDirection comparison_direction;
+ Comparison::Type comparison_type;
+};
+
+struct VectorCompareDesc {
+ ComparisonDirection comparison_direction;
+ Comparison::Type comparison_type;
+ bool is_scalar_vy = false;
+};
+
+struct VectorLoadDesc {
+ bool has_displacement = false;
+ bool has_sublane_mask = false;
+ bool has_sublane_stride = false;
+ bool has_sublane_shuffle = false;
+ int iar_regno = -1;
+};
+
+struct VectorStoreDesc {
+ bool has_displacement = false;
+ bool has_sublane_mask = false;
+ bool has_sublane_stride = false;
+ bool has_vmask = false;
+ int iar_regno = -1;
+};
+
+struct MatmulDesc {
+ int mxu_id;
+ LloOpcode llo_opcode;
+ bool is_masked;
+};
+
+struct LatchDesc {
+ int mxu_id;
+ GainLatchMode latch_mode;
+ bool is_masked;
+};
+
+struct VectorWaitDesc {
+ LloOpcode llo_opcode;
+};
+
+struct DmaDesc {
+ LloOpcode llo_opcode;
+ bool is_strided = false;
+ bool is_general = false;
+ bool has_dst_address = true; // false for HIB destination
+};
+
+struct VectorReduceDesc {
+ LloOpcode llo_opcode;
+ int bus_id;
+ bool is_segmented = false;
+};
+
+struct ReadRegisterDesc {
+ // The IsaEmitter implementations are not set up to handle most registers, so
+ // provide a bogus default value.
+ LloOpcode llo_opcode = kInvalid;
+};
+
+using MCInstDesc =
+ std::variant<std::monostate, ScalarUnOpDesc, ScalarBinOpDesc,
+ VectorUnOpDesc, VectorUnpackOpDesc, VectorUnOpNoDstDesc,
+ VectorBinOpDesc, VectorPackOpDesc, ScalarCompareDesc,
+ VectorCompareDesc, VectorLoadDesc, VectorStoreDesc, MatmulDesc,
+ LatchDesc, VectorWaitDesc, DmaDesc, VectorReduceDesc,
+ ReadRegisterDesc>;
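+
+// Illustrative dispatch sketch (not part of the original CL; the handler
+// behavior described in the comments is an assumption):
+//
+//   const MCInstDesc desc = GetMCInstDesc(inst.getOpcode());
+//   if (const auto* bin_op = std::get_if<ScalarBinOpDesc>(&desc)) {
+//     // Emit the LLO scalar binop bin_op->llo_opcode, swapping the sx/sy
+//     // operands when bin_op->reverse_sx_sy is set.
+//   } else if (std::holds_alternative<std::monostate>(desc)) {
+//     // No descriptor: this opcode is handled by a dedicated code path.
+//   }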
+
+std::string PrintInst(const llvm::MCInst& inst) {
+ std::string printed_inst;
+ {
+ llvm::raw_string_ostream os(printed_inst);
+ inst.print(os);
+ }
+ return printed_inst;
+}
+
+// Get string representation for LLVM construct.
+template <typename T>
+std::string LlvmAsString(const T& t) {
+ std::string s;
+ llvm::raw_string_ostream os(s);
+ os << t;
+ return os.str();
+}
+
+bool IsCall(const llvm::MCInst& inst) {
+ return inst.getOpcode() == llvm::TPU::CALL;
+}
+
+bool IsBranch(const llvm::MCInst& inst) {
+ return inst.getOpcode() == llvm::TPU::BR ||
+ inst.getOpcode() == llvm::TPU::BRcond ||
+ inst.getOpcode() == llvm::TPU::BRrel;
+}
+
+bool IsScalarHalt(const llvm::MCInst& inst) {
+ return inst.getOpcode() == llvm::TPU::HALT;
+}
+
+// Returns true if the instruction uses a scalar slot and may contain an
+// immediate operand.
+bool IsScalarImmInst(const llvm::MCInst& inst) {
+ switch (inst.getOpcode()) {
+ case llvm::TPU::ADDri:
+ case llvm::TPU::ADDri_S0:
+ case llvm::TPU::ADDri_S1:
+ case llvm::TPU::ADDri_SM:
+ case llvm::TPU::ANDri:
+ case llvm::TPU::ANDri_S0:
+ case llvm::TPU::ANDri_S1:
+ case llvm::TPU::ANDri_SM:
+ case llvm::TPU::CARRYOUTri:
+ case llvm::TPU::CARRYOUTri_S0:
+ case llvm::TPU::CARRYOUTri_S1:
+ case llvm::TPU::CARRYOUTri_SM:
+ case llvm::TPU::CMPUGEri:
+ case llvm::TPU::CMPUGEri_S0:
+ case llvm::TPU::CMPUGEri_S1:
+ case llvm::TPU::CMPUGEri_SM:
+ case llvm::TPU::CMPUGTri:
+ case llvm::TPU::CMPUGTri_S0:
+ case llvm::TPU::CMPUGTri_S1:
+ case llvm::TPU::CMPUGTri_SM:
+ case llvm::TPU::CMPULEri:
+ case llvm::TPU::CMPULEri_S0:
+ case llvm::TPU::CMPULEri_S1:
+ case llvm::TPU::CMPULEri_SM:
+ case llvm::TPU::CMPULTri:
+ case llvm::TPU::CMPULTri_S0:
+ case llvm::TPU::CMPULTri_S1:
+ case llvm::TPU::CMPULTri_SM:
+ case llvm::TPU::CMPEQri:
+ case llvm::TPU::CMPEQri_S0:
+ case llvm::TPU::CMPEQri_S1:
+ case llvm::TPU::CMPEQri_SM:
+ case llvm::TPU::CMPGEri:
+ case llvm::TPU::CMPGEri_S0:
+ case llvm::TPU::CMPGEri_S1:
+ case llvm::TPU::CMPGEri_SM:
+ case llvm::TPU::CMPGTri:
+ case llvm::TPU::CMPGTri_S0:
+ case llvm::TPU::CMPGTri_S1:
+ case llvm::TPU::CMPGTri_SM:
+ case llvm::TPU::CMPLEri:
+ case llvm::TPU::CMPLEri_S0:
+ case llvm::TPU::CMPLEri_S1:
+ case llvm::TPU::CMPLEri_SM:
+ case llvm::TPU::CMPLTri:
+ case llvm::TPU::CMPLTri_S0:
+ case llvm::TPU::CMPLTri_S1:
+ case llvm::TPU::CMPLTri_SM:
+ case llvm::TPU::CMPNEri:
+ case llvm::TPU::CMPNEri_S0:
+ case llvm::TPU::CMPNEri_S1:
+ case llvm::TPU::CMPNEri_SM:
+ case llvm::TPU::FADDri:
+ case llvm::TPU::FCMPEQri:
+ case llvm::TPU::FCMPEQri_S0:
+ case llvm::TPU::FCMPEQri_S1:
+ case llvm::TPU::FCMPGEri:
+ case llvm::TPU::FCMPGEri_S0:
+ case llvm::TPU::FCMPGEri_S1:
+ case llvm::TPU::FCMPGTri:
+ case llvm::TPU::FCMPGTri_S0:
+ case llvm::TPU::FCMPGTri_S1:
+ case llvm::TPU::FCMPLEri:
+ case llvm::TPU::FCMPLEri_S0:
+ case llvm::TPU::FCMPLEri_S1:
+ case llvm::TPU::FCMPLTri:
+ case llvm::TPU::FCMPLTri_S0:
+ case llvm::TPU::FCMPLTri_S1:
+ case llvm::TPU::FCMPNEri:
+ case llvm::TPU::FCMPNEri_S0:
+ case llvm::TPU::FCMPNEri_S1:
+ case llvm::TPU::FIMM:
+ case llvm::TPU::FIMM_S0:
+ case llvm::TPU::FIMM_S1:
+ case llvm::TPU::FMAXri:
+ case llvm::TPU::FMAXri_S0:
+ case llvm::TPU::FMAXri_S1:
+ case llvm::TPU::FMINri:
+ case llvm::TPU::FMINri_S0:
+ case llvm::TPU::FMINri_S1:
+ case llvm::TPU::FMULri:
+ case llvm::TPU::FPTOSIri:
+ case llvm::TPU::FSUBir:
+ case llvm::TPU::IMM:
+ case llvm::TPU::IMM_S0:
+ case llvm::TPU::IMM_S1:
+ case llvm::TPU::IMM_SM:
+ case llvm::TPU::MUL24ri:
+ case llvm::TPU::MULri:
+ case llvm::TPU::ORri:
+ case llvm::TPU::ORri_S0:
+ case llvm::TPU::ORri_S1:
+ case llvm::TPU::ORri_SM:
+ case llvm::TPU::SHLri:
+ case llvm::TPU::SHLri_S0:
+ case llvm::TPU::SHLri_S1:
+ case llvm::TPU::SHLri_SM:
+ case llvm::TPU::SITOFPi:
+ case llvm::TPU::SLDi:
+ case llvm::TPU::SLDri:
+ case llvm::TPU::SRAri:
+ case llvm::TPU::SRAri_S0:
+ case llvm::TPU::SRAri_S1:
+ case llvm::TPU::SRAri_SM:
+ case llvm::TPU::SRLri:
+ case llvm::TPU::SRLri_S0:
+ case llvm::TPU::SRLri_S1:
+ case llvm::TPU::SRLri_SM:
+ case llvm::TPU::SSTi:
+ case llvm::TPU::SUBir:
+ case llvm::TPU::SUBir_S0:
+ case llvm::TPU::SUBir_S1:
+ case llvm::TPU::SUBir_SM:
+ case llvm::TPU::XORri:
+ case llvm::TPU::XORri_S0:
+ case llvm::TPU::XORri_S1:
+ case llvm::TPU::XORri_SM:
+ return true;
+ default:
+ return false;
+ }
+}
+
+MCInstDesc GetMCInstDesc(int opcode) {
+ switch (opcode) {
+ // Scalar unop instructions.
+ case llvm::TPU::IMM:
+ case llvm::TPU::IMM_S0:
+ case llvm::TPU::IMM_S1:
+ case llvm::TPU::FIMM:
+ case llvm::TPU::FIMM_S0:
+ case llvm::TPU::FIMM_S1:
+ case llvm::TPU::MOV:
+ case llvm::TPU::MOV_S0:
+ case llvm::TPU::MOV_S1:
+ return ScalarUnOpDesc{LloOpcode::kScalarMove};
+ case llvm::TPU::SITOFPi:
+ case llvm::TPU::SITOFPr:
+ return ScalarUnOpDesc{LloOpcode::kScalarConvertS32ToF32};
+ case llvm::TPU::CLZ:
+ case llvm::TPU::CLZ_S0:
+ case llvm::TPU::CLZ_S1:
+ return ScalarUnOpDesc{LloOpcode::kScalarCountLeadingZeros};
+
+ // Scalar binop instructions.
+ case llvm::TPU::ADDri:
+ case llvm::TPU::ADDri_S0:
+ case llvm::TPU::ADDri_S1:
+ case llvm::TPU::ADDrr:
+ case llvm::TPU::ADDrr_S0:
+ case llvm::TPU::ADDrr_S1:
+ return ScalarBinOpDesc{LloOpcode::kScalarAddS32};
+ case llvm::TPU::ANDri:
+ case llvm::TPU::ANDri_S0:
+ case llvm::TPU::ANDri_S1:
+ case llvm::TPU::ANDrr:
+ case llvm::TPU::ANDrr_S0:
+ case llvm::TPU::ANDrr_S1:
+ return ScalarBinOpDesc{LloOpcode::kScalarBitwiseAnd};
+ case llvm::TPU::FADDri:
+ case llvm::TPU::FADDrr:
+ return ScalarBinOpDesc{LloOpcode::kScalarAddF32};
+ case llvm::TPU::FPTOSIri:
+ case llvm::TPU::FPTOSIrr:
+ return ScalarBinOpDesc{LloOpcode::kScalarConvertF32ToS32WithProbRounding};
+ case llvm::TPU::FMAXri:
+ case llvm::TPU::FMAXri_S0:
+ case llvm::TPU::FMAXri_S1:
+ case llvm::TPU::FMAXrr:
+ case llvm::TPU::FMAXrr_S0:
+ case llvm::TPU::FMAXrr_S1:
+ return ScalarBinOpDesc{LloOpcode::kScalarMaximumF32};
+ case llvm::TPU::FMINri:
+ case llvm::TPU::FMINri_S0:
+ case llvm::TPU::FMINri_S1:
+ case llvm::TPU::FMINrr:
+ case llvm::TPU::FMINrr_S0:
+ case llvm::TPU::FMINrr_S1:
+ return ScalarBinOpDesc{LloOpcode::kScalarMinimumF32};
+ case llvm::TPU::FMULri:
+ case llvm::TPU::FMULrr:
+ return ScalarBinOpDesc{LloOpcode::kScalarMultiplyF32};
+ case llvm::TPU::FSUBir:
+ case llvm::TPU::FSUBrr:
+ return ScalarBinOpDesc{LloOpcode::kScalarSubtractF32, true};
+ case llvm::TPU::MUL24ri:
+ case llvm::TPU::MUL24rr:
+ return ScalarBinOpDesc{LloOpcode::kScalarMultiplyU24};
+ case llvm::TPU::MULri:
+ case llvm::TPU::MULrr:
+ return ScalarBinOpDesc{LloOpcode::kScalarMultiplyU32};
+ case llvm::TPU::ORri:
+ case llvm::TPU::ORri_S0:
+ case llvm::TPU::ORri_S1:
+ case llvm::TPU::ORrr:
+ case llvm::TPU::ORrr_S0:
+ case llvm::TPU::ORrr_S1:
+ return ScalarBinOpDesc{LloOpcode::kScalarBitwiseOr};
+ case llvm::TPU::SHLri:
+ case llvm::TPU::SHLri_S0:
+ case llvm::TPU::SHLri_S1:
+ case llvm::TPU::SHLrr:
+ case llvm::TPU::SHLrr_S0:
+ case llvm::TPU::SHLrr_S1:
+ return ScalarBinOpDesc{LloOpcode::kScalarShll};
+ case llvm::TPU::SRAri:
+ case llvm::TPU::SRAri_S0:
+ case llvm::TPU::SRAri_S1:
+ case llvm::TPU::SRArr:
+ case llvm::TPU::SRArr_S0:
+ case llvm::TPU::SRArr_S1:
+ return ScalarBinOpDesc{LloOpcode::kScalarShra};
+ case llvm::TPU::SRLri:
+ case llvm::TPU::SRLri_S0:
+ case llvm::TPU::SRLri_S1:
+ case llvm::TPU::SRLrr:
+ case llvm::TPU::SRLrr_S0:
+ case llvm::TPU::SRLrr_S1:
+ return ScalarBinOpDesc{LloOpcode::kScalarShrl};
+ case llvm::TPU::SUBir:
+ case llvm::TPU::SUBir_S0:
+ case llvm::TPU::SUBir_S1:
+ case llvm::TPU::SUBrr:
+ case llvm::TPU::SUBrr_S0:
+ case llvm::TPU::SUBrr_S1:
+ return ScalarBinOpDesc{LloOpcode::kScalarSubtractS32, true};
+ case llvm::TPU::XORri:
+ case llvm::TPU::XORri_S0:
+ case llvm::TPU::XORri_S1:
+ case llvm::TPU::XORrr:
+ case llvm::TPU::XORrr_S0:
+ case llvm::TPU::XORrr_S1:
+ return ScalarBinOpDesc{LloOpcode::kScalarBitwiseXor};
+
+ // Vector unop instructions.
+ case llvm::TPU::VROTDOWNr:
+ case llvm::TPU::VROTDOWNr_V0:
+ case llvm::TPU::VROTDOWNr_V1:
+ return VectorUnOpDesc{LloOpcode::kVectorSublaneRotateTZ};
+ case llvm::TPU::VSIGNIFICANDr:
+ case llvm::TPU::VSIGNIFICANDr_V0:
+ case llvm::TPU::VSIGNIFICANDr_V1:
+ return VectorUnOpDesc{LloOpcode::kVectorExtractSignificand};
+ case llvm::TPU::VSITOFPr:
+ return VectorUnOpDesc{LloOpcode::kVectorConvertS32ToF32};
+ case llvm::TPU::VCLZr:
+ case llvm::TPU::VCLZr_V0:
+ case llvm::TPU::VCLZr_V1:
+ return VectorUnOpDesc{LloOpcode::kVectorCountLeadingZeros};
+ case llvm::TPU::VPOPCNTr:
+ case llvm::TPU::VPOPCNTr_V0:
+ case llvm::TPU::VPOPCNTr_V1:
+ return VectorUnOpDesc{LloOpcode::kVectorPopCount};
+ case llvm::TPU::VEXPONENTr:
+ case llvm::TPU::VEXPONENTr_V0:
+ case llvm::TPU::VEXPONENTr_V1:
+ return VectorUnOpDesc{LloOpcode::kVectorExtractExponent};
+ case llvm::TPU::VUNPACKL:
+ case llvm::TPU::VUNPACKL_V0:
+ case llvm::TPU::VUNPACKL_V1:
+ return VectorUnpackOpDesc{VpackFormat::kCompressedBf16, 0};
+ case llvm::TPU::VUNPACKU:
+ case llvm::TPU::VUNPACKU_V0:
+ case llvm::TPU::VUNPACKU_V1:
+ return VectorUnpackOpDesc{VpackFormat::kCompressedBf16, 1};
+
+    // Vector unop instructions with no destination.
+ case llvm::TPU::SetRngSeed:
+ return VectorUnOpNoDstDesc{LloOpcode::kVectorSetRngSeed, false};
+ case llvm::TPU::VLOG2:
+ case llvm::TPU::VLOG2_V0:
+ case llvm::TPU::VLOG2_V1:
+ case llvm::TPU::VLOG2_V2:
+ return VectorUnOpNoDstDesc{LloOpcode::kVectorLog2F32};
+ case llvm::TPU::VPOW2:
+ case llvm::TPU::VPOW2_V0:
+ case llvm::TPU::VPOW2_V1:
+ case llvm::TPU::VPOW2_V2:
+ return VectorUnOpNoDstDesc{LloOpcode::kVectorPow2F32};
+ case llvm::TPU::VRCP:
+ case llvm::TPU::VRCP_V0:
+ case llvm::TPU::VRCP_V1:
+ case llvm::TPU::VRCP_V2:
+ return VectorUnOpNoDstDesc{LloOpcode::kVectorReciprocalF32};
+ case llvm::TPU::VRSQRT:
+ case llvm::TPU::VRSQRT_V0:
+ case llvm::TPU::VRSQRT_V1:
+ case llvm::TPU::VRSQRT_V2:
+ return VectorUnOpNoDstDesc{LloOpcode::kVectorRsqrtF32};
+ case llvm::TPU::VTANH:
+ case llvm::TPU::VTANH_V0:
+ case llvm::TPU::VTANH_V1:
+ case llvm::TPU::VTANH_V2:
+ return VectorUnOpNoDstDesc{LloOpcode::kVectorTanhF32};
+
+ // Vector binop instructions.
+ case llvm::TPU::VADDri:
+ case llvm::TPU::VADDri_V0:
+ case llvm::TPU::VADDri_V1:
+ case llvm::TPU::VADDri_V2:
+ case llvm::TPU::VADDrr:
+ case llvm::TPU::VADDrr_V0:
+ case llvm::TPU::VADDrr_V1:
+ case llvm::TPU::VADDrr_V2:
+ return VectorBinOpDesc{LloOpcode::kVectorAddS32};
+ case llvm::TPU::VADDrs:
+ case llvm::TPU::VADDrs_V0:
+ case llvm::TPU::VADDrs_V1:
+ case llvm::TPU::VADDrs_V2:
+ return VectorBinOpDesc{LloOpcode::kVectorAddS32, true};
+ case llvm::TPU::VANDri:
+ case llvm::TPU::VANDri_V0:
+ case llvm::TPU::VANDri_V1:
+ case llvm::TPU::VANDri_V2:
+ case llvm::TPU::VANDrr:
+ case llvm::TPU::VANDrr_V0:
+ case llvm::TPU::VANDrr_V1:
+ case llvm::TPU::VANDrr_V2:
+ return VectorBinOpDesc{LloOpcode::kVectorAndU32};
+ case llvm::TPU::VANDrs:
+ case llvm::TPU::VANDrs_V0:
+ case llvm::TPU::VANDrs_V1:
+ case llvm::TPU::VANDrs_V2:
+ return VectorBinOpDesc{LloOpcode::kVectorAndU32, true};
+ case llvm::TPU::VCLAMPSri:
+ case llvm::TPU::VCLAMPSri_V0:
+ case llvm::TPU::VCLAMPSri_V1:
+ case llvm::TPU::VCLAMPSri_V2:
+ case llvm::TPU::VCLAMPSrr:
+ case llvm::TPU::VCLAMPSrr_V0:
+ case llvm::TPU::VCLAMPSrr_V1:
+ case llvm::TPU::VCLAMPSrr_V2:
+ return VectorBinOpDesc{LloOpcode::kVectorClampSymmetricF32};
+ case llvm::TPU::VCLAMPSrs:
+ case llvm::TPU::VCLAMPSrs_V0:
+ case llvm::TPU::VCLAMPSrs_V1:
+ case llvm::TPU::VCLAMPSrs_V2:
+ return VectorBinOpDesc{LloOpcode::kVectorClampSymmetricF32, true};
+ case llvm::TPU::VCLAMPZri:
+ case llvm::TPU::VCLAMPZri_V0:
+ case llvm::TPU::VCLAMPZri_V1:
+ case llvm::TPU::VCLAMPZri_V2:
+ case llvm::TPU::VCLAMPZrr:
+ case llvm::TPU::VCLAMPZrr_V0:
+ case llvm::TPU::VCLAMPZrr_V1:
+ case llvm::TPU::VCLAMPZrr_V2:
+ return VectorBinOpDesc{LloOpcode::kVectorClampGezF32};
+ case llvm::TPU::VCLAMPZrs:
+ case llvm::TPU::VCLAMPZrs_V0:
+ case llvm::TPU::VCLAMPZrs_V1:
+ case llvm::TPU::VCLAMPZrs_V2:
+ return VectorBinOpDesc{LloOpcode::kVectorClampGezF32, true};
+ case llvm::TPU::VCOMPOSEir:
+ case llvm::TPU::VCOMPOSEir_V0:
+ case llvm::TPU::VCOMPOSEir_V1:
+ case llvm::TPU::VCOMPOSEir_V2:
+ case llvm::TPU::VCOMPOSErr:
+ case llvm::TPU::VCOMPOSErr_V0:
+ case llvm::TPU::VCOMPOSErr_V1:
+ case llvm::TPU::VCOMPOSErr_V2:
+ return VectorBinOpDesc{LloOpcode::kVectorComposeF32, false, true};
+ case llvm::TPU::VCOMPOSEsr:
+ case llvm::TPU::VCOMPOSEsr_V0:
+ case llvm::TPU::VCOMPOSEsr_V1:
+ case llvm::TPU::VCOMPOSEsr_V2:
+ return VectorBinOpDesc{LloOpcode::kVectorComposeF32, true, true};
+ case llvm::TPU::VFADDri:
+ case llvm::TPU::VFADDri_V1:
+ case llvm::TPU::VFADDrr:
+ case llvm::TPU::VFADDrr_V1:
+ return VectorBinOpDesc{LloOpcode::kVectorAddF32};
+ case llvm::TPU::VFADDrs:
+ return VectorBinOpDesc{LloOpcode::kVectorAddF32, true};
+ case llvm::TPU::VFMAXri:
+ case llvm::TPU::VFMAXri_V0:
+ case llvm::TPU::VFMAXri_V1:
+ case llvm::TPU::VFMAXri_V2:
+ case llvm::TPU::VFMAXrr:
+ case llvm::TPU::VFMAXrr_V0:
+ case llvm::TPU::VFMAXrr_V1:
+ case llvm::TPU::VFMAXrr_V2:
+ return VectorBinOpDesc{LloOpcode::kVectorMaximumF32};
+ case llvm::TPU::VFMAXrs:
+ case llvm::TPU::VFMAXrs_V0:
+ case llvm::TPU::VFMAXrs_V1:
+ case llvm::TPU::VFMAXrs_V2:
+ return VectorBinOpDesc{LloOpcode::kVectorMaximumF32, true};
+ case llvm::TPU::VFMINri:
+ case llvm::TPU::VFMINri_V0:
+ case llvm::TPU::VFMINri_V1:
+ case llvm::TPU::VFMINri_V2:
+ case llvm::TPU::VFMINrr:
+ case llvm::TPU::VFMINrr_V0:
+ case llvm::TPU::VFMINrr_V1:
+ case llvm::TPU::VFMINrr_V2:
+ return VectorBinOpDesc{LloOpcode::kVectorMinimumF32};
+ case llvm::TPU::VFMINrs:
+ case llvm::TPU::VFMINrs_V0:
+ case llvm::TPU::VFMINrs_V1:
+ case llvm::TPU::VFMINrs_V2:
+ return VectorBinOpDesc{LloOpcode::kVectorMinimumF32, true};
+ case llvm::TPU::VFMULri:
+ case llvm::TPU::VFMULri_V0:
+ case llvm::TPU::VFMULrr:
+ case llvm::TPU::VFMULrr_V0:
+ return VectorBinOpDesc{LloOpcode::kVectorMultiplyF32};
+ case llvm::TPU::VFMULrs:
+ return VectorBinOpDesc{LloOpcode::kVectorMultiplyF32, true};
+ case llvm::TPU::VFPTOSIri:
+ case llvm::TPU::VFPTOSIrr:
+ return VectorBinOpDesc{LloOpcode::kVectorConvertF32ToS32WithProbRounding};
+ case llvm::TPU::VFPTOSIrs:
+ return VectorBinOpDesc{LloOpcode::kVectorConvertF32ToS32WithProbRounding,
+ true};
+ case llvm::TPU::VFSUBir:
+ case llvm::TPU::VFSUBir_V1:
+ case llvm::TPU::VFSUBrr:
+ case llvm::TPU::VFSUBrr_V1:
+ return VectorBinOpDesc{LloOpcode::kVectorSubtractF32, false, true};
+ case llvm::TPU::VFSUBsr:
+ return VectorBinOpDesc{LloOpcode::kVectorSubtractF32, true, true};
+ case llvm::TPU::VORri:
+ case llvm::TPU::VORri_V0:
+ case llvm::TPU::VORri_V1:
+ case llvm::TPU::VORri_V2:
+ case llvm::TPU::VORrr:
+ case llvm::TPU::VORrr_V0:
+ case llvm::TPU::VORrr_V1:
+ case llvm::TPU::VORrr_V2:
+ return VectorBinOpDesc{LloOpcode::kVectorOrU32};
+ case llvm::TPU::VORrs:
+ case llvm::TPU::VORrs_V0:
+ case llvm::TPU::VORrs_V1:
+ case llvm::TPU::VORrs_V2:
+ return VectorBinOpDesc{LloOpcode::kVectorOrU32, true};
+ case llvm::TPU::VPACKCir:
+ case llvm::TPU::VPACKCir_V0:
+ case llvm::TPU::VPACKCir_V1:
+ case llvm::TPU::VPACKCir_V2:
+ case llvm::TPU::VPACKCrr:
+ case llvm::TPU::VPACKCrr_V0:
+ case llvm::TPU::VPACKCrr_V1:
+ case llvm::TPU::VPACKCrr_V2:
+ return VectorPackOpDesc{VpackFormat::kCompressedBf16, false};
+ case llvm::TPU::VPACKCsr:
+ case llvm::TPU::VPACKCsr_V0:
+ case llvm::TPU::VPACKCsr_V1:
+ case llvm::TPU::VPACKCsr_V2:
+ return VectorPackOpDesc{VpackFormat::kCompressedBf16, true};
+ case llvm::TPU::VPACKir:
+ case llvm::TPU::VPACKir_V0:
+ case llvm::TPU::VPACKir_V1:
+ case llvm::TPU::VPACKir_V2:
+ case llvm::TPU::VPACKrr:
+ case llvm::TPU::VPACKrr_V0:
+ case llvm::TPU::VPACKrr_V1:
+ case llvm::TPU::VPACKrr_V2:
+ return VectorPackOpDesc{VpackFormat::kInterleavedBf16, false};
+ case llvm::TPU::VPACKsr:
+ case llvm::TPU::VPACKsr_V0:
+ case llvm::TPU::VPACKsr_V1:
+ case llvm::TPU::VPACKsr_V2:
+ return VectorPackOpDesc{VpackFormat::kInterleavedBf16, true};
+ case llvm::TPU::VSHLri:
+ case llvm::TPU::VSHLri_V0:
+ case llvm::TPU::VSHLri_V1:
+ case llvm::TPU::VSHLri_V2:
+ case llvm::TPU::VSHLrr:
+ case llvm::TPU::VSHLrr_V0:
+ case llvm::TPU::VSHLrr_V1:
+ case llvm::TPU::VSHLrr_V2:
+ return VectorBinOpDesc{LloOpcode::kVectorShiftLeftLogical};
+ case llvm::TPU::VSHLrs:
+ case llvm::TPU::VSHLrs_V0:
+ case llvm::TPU::VSHLrs_V1:
+ case llvm::TPU::VSHLrs_V2:
+ return VectorBinOpDesc{LloOpcode::kVectorShiftLeftLogical, true};
+ case llvm::TPU::VSRAri:
+ case llvm::TPU::VSRAri_V0:
+ case llvm::TPU::VSRAri_V1:
+ case llvm::TPU::VSRAri_V2:
+ case llvm::TPU::VSRArr:
+ case llvm::TPU::VSRArr_V0:
+ case llvm::TPU::VSRArr_V1:
+ case llvm::TPU::VSRArr_V2:
+ return VectorBinOpDesc{LloOpcode::kVectorShiftRightArithmetic};
+ case llvm::TPU::VSRArs:
+ case llvm::TPU::VSRArs_V0:
+ case llvm::TPU::VSRArs_V1:
+ case llvm::TPU::VSRArs_V2:
+ return VectorBinOpDesc{LloOpcode::kVectorShiftRightArithmetic, true};
+ case llvm::TPU::VSRLri:
+ case llvm::TPU::VSRLri_V0:
+ case llvm::TPU::VSRLri_V1:
+ case llvm::TPU::VSRLri_V2:
+ case llvm::TPU::VSRLrr:
+ case llvm::TPU::VSRLrr_V0:
+ case llvm::TPU::VSRLrr_V1:
+ case llvm::TPU::VSRLrr_V2:
+ return VectorBinOpDesc{LloOpcode::kVectorShiftRightLogical};
+ case llvm::TPU::VSRLrs:
+ case llvm::TPU::VSRLrs_V0:
+ case llvm::TPU::VSRLrs_V1:
+ case llvm::TPU::VSRLrs_V2:
+ return VectorBinOpDesc{LloOpcode::kVectorShiftRightLogical, true};
+ case llvm::TPU::VSUBir:
+ case llvm::TPU::VSUBir_V0:
+ case llvm::TPU::VSUBir_V1:
+ case llvm::TPU::VSUBir_V2:
+ case llvm::TPU::VSUBrr:
+ case llvm::TPU::VSUBrr_V0:
+ case llvm::TPU::VSUBrr_V1:
+ case llvm::TPU::VSUBrr_V2:
+ return VectorBinOpDesc{LloOpcode::kVectorSubtractS32, false, true};
+ case llvm::TPU::VSUBsr:
+ case llvm::TPU::VSUBsr_V0:
+ case llvm::TPU::VSUBsr_V1:
+ case llvm::TPU::VSUBsr_V2:
+ return VectorBinOpDesc{LloOpcode::kVectorSubtractS32, true, true};
+ case llvm::TPU::VXORri:
+ case llvm::TPU::VXORri_V0:
+ case llvm::TPU::VXORri_V1:
+ case llvm::TPU::VXORri_V2:
+ case llvm::TPU::VXORrr:
+ case llvm::TPU::VXORrr_V0:
+ case llvm::TPU::VXORrr_V1:
+ case llvm::TPU::VXORrr_V2:
+ return VectorBinOpDesc{LloOpcode::kVectorXorU32};
+ case llvm::TPU::VXORrs:
+ case llvm::TPU::VXORrs_V0:
+ case llvm::TPU::VXORrs_V1:
+ case llvm::TPU::VXORrs_V2:
+ return VectorBinOpDesc{LloOpcode::kVectorXorU32, true};
+
+ // Scalar compare instructions.
+ case llvm::TPU::CMPEQri:
+ case llvm::TPU::CMPEQri_S0:
+ case llvm::TPU::CMPEQri_S1:
+ case llvm::TPU::CMPEQrr:
+ case llvm::TPU::CMPEQrr_S0:
+ case llvm::TPU::CMPEQrr_S1:
+ return ScalarCompareDesc{ComparisonDirection::kEq,
+ Comparison::Type::kSigned};
+ case llvm::TPU::CMPGEri:
+ case llvm::TPU::CMPGEri_S0:
+ case llvm::TPU::CMPGEri_S1:
+ case llvm::TPU::CMPGErr:
+ case llvm::TPU::CMPGErr_S0:
+ case llvm::TPU::CMPGErr_S1:
+ return ScalarCompareDesc{ComparisonDirection::kGe,
+ Comparison::Type::kSigned};
+ case llvm::TPU::CMPGTri:
+ case llvm::TPU::CMPGTri_S0:
+ case llvm::TPU::CMPGTri_S1:
+ case llvm::TPU::CMPGTrr:
+ case llvm::TPU::CMPGTrr_S0:
+ case llvm::TPU::CMPGTrr_S1:
+ return ScalarCompareDesc{ComparisonDirection::kGt,
+ Comparison::Type::kSigned};
+ case llvm::TPU::CMPLEri:
+ case llvm::TPU::CMPLEri_S0:
+ case llvm::TPU::CMPLEri_S1:
+ case llvm::TPU::CMPLErr:
+ case llvm::TPU::CMPLErr_S0:
+ case llvm::TPU::CMPLErr_S1:
+ return ScalarCompareDesc{ComparisonDirection::kLe,
+ Comparison::Type::kSigned};
+ case llvm::TPU::CMPLTri:
+ case llvm::TPU::CMPLTri_S0:
+ case llvm::TPU::CMPLTri_S1:
+ case llvm::TPU::CMPLTrr:
+ case llvm::TPU::CMPLTrr_S0:
+ case llvm::TPU::CMPLTrr_S1:
+ return ScalarCompareDesc{ComparisonDirection::kLt,
+ Comparison::Type::kSigned};
+ case llvm::TPU::CMPUGEri:
+ case llvm::TPU::CMPUGEri_S0:
+ case llvm::TPU::CMPUGEri_S1:
+ case llvm::TPU::CMPUGErr:
+ case llvm::TPU::CMPUGErr_S0:
+ case llvm::TPU::CMPUGErr_S1:
+ return ScalarCompareDesc{ComparisonDirection::kGe,
+ Comparison::Type::kUnsigned};
+ case llvm::TPU::CMPUGTri:
+ case llvm::TPU::CMPUGTri_S0:
+ case llvm::TPU::CMPUGTri_S1:
+ case llvm::TPU::CMPUGTrr:
+ case llvm::TPU::CMPUGTrr_S0:
+ case llvm::TPU::CMPUGTrr_S1:
+ return ScalarCompareDesc{ComparisonDirection::kGt,
+ Comparison::Type::kUnsigned};
+ case llvm::TPU::CMPULEri:
+ case llvm::TPU::CMPULEri_S0:
+ case llvm::TPU::CMPULEri_S1:
+ case llvm::TPU::CMPULErr:
+ case llvm::TPU::CMPULErr_S0:
+ case llvm::TPU::CMPULErr_S1:
+ return ScalarCompareDesc{ComparisonDirection::kLe,
+ Comparison::Type::kUnsigned};
+ case llvm::TPU::CMPULTri:
+ case llvm::TPU::CMPULTri_S0:
+ case llvm::TPU::CMPULTri_S1:
+ case llvm::TPU::CMPULTrr:
+ case llvm::TPU::CMPULTrr_S0:
+ case llvm::TPU::CMPULTrr_S1:
+ return ScalarCompareDesc{ComparisonDirection::kLt,
+ Comparison::Type::kUnsigned};
+ case llvm::TPU::CMPNEri:
+ case llvm::TPU::CMPNEri_S0:
+ case llvm::TPU::CMPNEri_S1:
+ case llvm::TPU::CMPNErr:
+ case llvm::TPU::CMPNErr_S0:
+ case llvm::TPU::CMPNErr_S1:
+ return ScalarCompareDesc{ComparisonDirection::kNe,
+ Comparison::Type::kSigned};
+ case llvm::TPU::FCMPEQri:
+ case llvm::TPU::FCMPEQri_S0:
+ case llvm::TPU::FCMPEQri_S1:
+ case llvm::TPU::FCMPEQrr:
+ case llvm::TPU::FCMPEQrr_S0:
+ case llvm::TPU::FCMPEQrr_S1:
+ return ScalarCompareDesc{ComparisonDirection::kEq,
+ Comparison::Type::kFloat};
+ case llvm::TPU::FCMPGEri:
+ case llvm::TPU::FCMPGEri_S0:
+ case llvm::TPU::FCMPGEri_S1:
+ case llvm::TPU::FCMPGErr:
+ case llvm::TPU::FCMPGErr_S0:
+ case llvm::TPU::FCMPGErr_S1:
+ return ScalarCompareDesc{ComparisonDirection::kGe,
+ Comparison::Type::kFloat};
+ case llvm::TPU::FCMPGTri:
+ case llvm::TPU::FCMPGTri_S0:
+ case llvm::TPU::FCMPGTri_S1:
+ case llvm::TPU::FCMPGTrr:
+ case llvm::TPU::FCMPGTrr_S0:
+ case llvm::TPU::FCMPGTrr_S1:
+ return ScalarCompareDesc{ComparisonDirection::kGt,
+ Comparison::Type::kFloat};
+ case llvm::TPU::FCMPLEri:
+ case llvm::TPU::FCMPLEri_S0:
+ case llvm::TPU::FCMPLEri_S1:
+ case llvm::TPU::FCMPLErr:
+ case llvm::TPU::FCMPLErr_S0:
+ case llvm::TPU::FCMPLErr_S1:
+ return ScalarCompareDesc{ComparisonDirection::kLe,
+ Comparison::Type::kFloat};
+ case llvm::TPU::FCMPLTri:
+ case llvm::TPU::FCMPLTri_S0:
+ case llvm::TPU::FCMPLTri_S1:
+ case llvm::TPU::FCMPLTrr:
+ case llvm::TPU::FCMPLTrr_S0:
+ case llvm::TPU::FCMPLTrr_S1:
+ return ScalarCompareDesc{ComparisonDirection::kLt,
+ Comparison::Type::kFloat};
+ case llvm::TPU::FCMPNEri:
+ case llvm::TPU::FCMPNEri_S0:
+ case llvm::TPU::FCMPNEri_S1:
+ case llvm::TPU::FCMPNErr:
+ case llvm::TPU::FCMPNErr_S0:
+ case llvm::TPU::FCMPNErr_S1:
+ return ScalarCompareDesc{ComparisonDirection::kNe,
+ Comparison::Type::kFloat};
+
+ // Vector compare instructions.
+ case llvm::TPU::VCMPEQri:
+ case llvm::TPU::VCMPEQri_V0:
+ case llvm::TPU::VCMPEQri_V1:
+ case llvm::TPU::VCMPEQri_V2:
+ case llvm::TPU::VCMPEQrr:
+ case llvm::TPU::VCMPEQrr_V0:
+ case llvm::TPU::VCMPEQrr_V1:
+ case llvm::TPU::VCMPEQrr_V2:
+ return VectorCompareDesc{ComparisonDirection::kEq,
+ Comparison::Type::kSigned};
+ case llvm::TPU::VCMPEQrs:
+ case llvm::TPU::VCMPEQrs_V0:
+ case llvm::TPU::VCMPEQrs_V1:
+ case llvm::TPU::VCMPEQrs_V2:
+ return VectorCompareDesc{ComparisonDirection::kEq,
+ Comparison::Type::kSigned, true};
+ case llvm::TPU::VCMPGEri:
+ case llvm::TPU::VCMPGEri_V0:
+ case llvm::TPU::VCMPGEri_V1:
+ case llvm::TPU::VCMPGEri_V2:
+ case llvm::TPU::VCMPGErr:
+ case llvm::TPU::VCMPGErr_V0:
+ case llvm::TPU::VCMPGErr_V1:
+ case llvm::TPU::VCMPGErr_V2:
+ return VectorCompareDesc{ComparisonDirection::kGe,
+ Comparison::Type::kSigned};
+ case llvm::TPU::VCMPGErs:
+ case llvm::TPU::VCMPGErs_V0:
+ case llvm::TPU::VCMPGErs_V1:
+ case llvm::TPU::VCMPGErs_V2:
+ return VectorCompareDesc{ComparisonDirection::kGe,
+ Comparison::Type::kSigned, true};
+ case llvm::TPU::VCMPGTri:
+ case llvm::TPU::VCMPGTri_V0:
+ case llvm::TPU::VCMPGTri_V1:
+ case llvm::TPU::VCMPGTri_V2:
+ case llvm::TPU::VCMPGTrr:
+ case llvm::TPU::VCMPGTrr_V0:
+ case llvm::TPU::VCMPGTrr_V1:
+ case llvm::TPU::VCMPGTrr_V2:
+ return VectorCompareDesc{ComparisonDirection::kGt,
+ Comparison::Type::kSigned};
+ case llvm::TPU::VCMPGTrs:
+ case llvm::TPU::VCMPGTrs_V0:
+ case llvm::TPU::VCMPGTrs_V1:
+ case llvm::TPU::VCMPGTrs_V2:
+ return VectorCompareDesc{ComparisonDirection::kGt,
+ Comparison::Type::kSigned, true};
+ case llvm::TPU::VCMPLEri:
+ case llvm::TPU::VCMPLEri_V0:
+ case llvm::TPU::VCMPLEri_V1:
+ case llvm::TPU::VCMPLEri_V2:
+ case llvm::TPU::VCMPLErr:
+ case llvm::TPU::VCMPLErr_V0:
+ case llvm::TPU::VCMPLErr_V1:
+ case llvm::TPU::VCMPLErr_V2:
+ return VectorCompareDesc{ComparisonDirection::kLe,
+ Comparison::Type::kSigned};
+ case llvm::TPU::VCMPLErs:
+ case llvm::TPU::VCMPLErs_V0:
+ case llvm::TPU::VCMPLErs_V1:
+ case llvm::TPU::VCMPLErs_V2:
+ return VectorCompareDesc{ComparisonDirection::kLe,
+ Comparison::Type::kSigned, true};
+ case llvm::TPU::VCMPLTri:
+ case llvm::TPU::VCMPLTri_V0:
+ case llvm::TPU::VCMPLTri_V1:
+ case llvm::TPU::VCMPLTri_V2:
+ case llvm::TPU::VCMPLTrr:
+ case llvm::TPU::VCMPLTrr_V0:
+ case llvm::TPU::VCMPLTrr_V1:
+ case llvm::TPU::VCMPLTrr_V2:
+ return VectorCompareDesc{ComparisonDirection::kLt,
+ Comparison::Type::kSigned};
+ case llvm::TPU::VCMPLTrs:
+ case llvm::TPU::VCMPLTrs_V0:
+ case llvm::TPU::VCMPLTrs_V1:
+ case llvm::TPU::VCMPLTrs_V2:
+ return VectorCompareDesc{ComparisonDirection::kLt,
+ Comparison::Type::kSigned, true};
+ case llvm::TPU::VCMPNEri:
+ case llvm::TPU::VCMPNEri_V0:
+ case llvm::TPU::VCMPNEri_V1:
+ case llvm::TPU::VCMPNEri_V2:
+ case llvm::TPU::VCMPNErr:
+ case llvm::TPU::VCMPNErr_V0:
+ case llvm::TPU::VCMPNErr_V1:
+ case llvm::TPU::VCMPNErr_V2:
+ return VectorCompareDesc{ComparisonDirection::kNe,
+ Comparison::Type::kSigned};
+ case llvm::TPU::VCMPNErs:
+ case llvm::TPU::VCMPNErs_V0:
+ case llvm::TPU::VCMPNErs_V1:
+ case llvm::TPU::VCMPNErs_V2:
+ return VectorCompareDesc{ComparisonDirection::kNe,
+ Comparison::Type::kSigned, true};
+ case llvm::TPU::VFCMPEQri:
+ case llvm::TPU::VFCMPEQri_V0:
+ case llvm::TPU::VFCMPEQri_V1:
+ case llvm::TPU::VFCMPEQri_V2:
+ case llvm::TPU::VFCMPEQrr:
+ case llvm::TPU::VFCMPEQrr_V0:
+ case llvm::TPU::VFCMPEQrr_V1:
+ case llvm::TPU::VFCMPEQrr_V2:
+ return VectorCompareDesc{ComparisonDirection::kEq,
+ Comparison::Type::kFloat};
+ case llvm::TPU::VFCMPEQrs:
+ case llvm::TPU::VFCMPEQrs_V0:
+ case llvm::TPU::VFCMPEQrs_V1:
+ case llvm::TPU::VFCMPEQrs_V2:
+ return VectorCompareDesc{ComparisonDirection::kEq,
+ Comparison::Type::kFloat, true};
+ case llvm::TPU::VFCMPGEri:
+ case llvm::TPU::VFCMPGEri_V0:
+ case llvm::TPU::VFCMPGEri_V1:
+ case llvm::TPU::VFCMPGEri_V2:
+ case llvm::TPU::VFCMPGErr:
+ case llvm::TPU::VFCMPGErr_V0:
+ case llvm::TPU::VFCMPGErr_V1:
+ case llvm::TPU::VFCMPGErr_V2:
+ return VectorCompareDesc{ComparisonDirection::kGe,
+ Comparison::Type::kFloat};
+ case llvm::TPU::VFCMPGErs:
+ case llvm::TPU::VFCMPGErs_V0:
+ case llvm::TPU::VFCMPGErs_V1:
+ case llvm::TPU::VFCMPGErs_V2:
+ return VectorCompareDesc{ComparisonDirection::kGe,
+ Comparison::Type::kFloat, true};
+ case llvm::TPU::VFCMPGTri:
+ case llvm::TPU::VFCMPGTri_V0:
+ case llvm::TPU::VFCMPGTri_V1:
+ case llvm::TPU::VFCMPGTri_V2:
+ case llvm::TPU::VFCMPGTrr:
+ case llvm::TPU::VFCMPGTrr_V0:
+ case llvm::TPU::VFCMPGTrr_V1:
+ case llvm::TPU::VFCMPGTrr_V2:
+ return VectorCompareDesc{ComparisonDirection::kGt,
+ Comparison::Type::kFloat};
+ case llvm::TPU::VFCMPGTrs:
+ case llvm::TPU::VFCMPGTrs_V0:
+ case llvm::TPU::VFCMPGTrs_V1:
+ case llvm::TPU::VFCMPGTrs_V2:
+ return VectorCompareDesc{ComparisonDirection::kGt,
+ Comparison::Type::kFloat, true};
+ case llvm::TPU::VFCMPLEri:
+ case llvm::TPU::VFCMPLEri_V0:
+ case llvm::TPU::VFCMPLEri_V1:
+ case llvm::TPU::VFCMPLEri_V2:
+ case llvm::TPU::VFCMPLErr:
+ case llvm::TPU::VFCMPLErr_V0:
+ case llvm::TPU::VFCMPLErr_V1:
+ case llvm::TPU::VFCMPLErr_V2:
+ return VectorCompareDesc{ComparisonDirection::kLe,
+ Comparison::Type::kFloat};
+ case llvm::TPU::VFCMPLErs:
+ case llvm::TPU::VFCMPLErs_V0:
+ case llvm::TPU::VFCMPLErs_V1:
+ case llvm::TPU::VFCMPLErs_V2:
+ return VectorCompareDesc{ComparisonDirection::kLe,
+ Comparison::Type::kFloat, true};
+ case llvm::TPU::VFCMPLTri:
+ case llvm::TPU::VFCMPLTri_V0:
+ case llvm::TPU::VFCMPLTri_V1:
+ case llvm::TPU::VFCMPLTri_V2:
+ case llvm::TPU::VFCMPLTrr:
+ case llvm::TPU::VFCMPLTrr_V0:
+ case llvm::TPU::VFCMPLTrr_V1:
+ case llvm::TPU::VFCMPLTrr_V2:
+ return VectorCompareDesc{ComparisonDirection::kLt,
+ Comparison::Type::kFloat};
+ case llvm::TPU::VFCMPLTrs:
+ case llvm::TPU::VFCMPLTrs_V0:
+ case llvm::TPU::VFCMPLTrs_V1:
+ case llvm::TPU::VFCMPLTrs_V2:
+ return VectorCompareDesc{ComparisonDirection::kLt,
+ Comparison::Type::kFloat, true};
+ case llvm::TPU::VFCMPNEri:
+ case llvm::TPU::VFCMPNEri_V0:
+ case llvm::TPU::VFCMPNEri_V1:
+ case llvm::TPU::VFCMPNEri_V2:
+ case llvm::TPU::VFCMPNErr:
+ case llvm::TPU::VFCMPNErr_V0:
+ case llvm::TPU::VFCMPNErr_V1:
+ case llvm::TPU::VFCMPNErr_V2:
+ return VectorCompareDesc{ComparisonDirection::kNe,
+ Comparison::Type::kFloat};
+ case llvm::TPU::VFCMPNErs:
+ case llvm::TPU::VFCMPNErs_V0:
+ case llvm::TPU::VFCMPNErs_V1:
+ case llvm::TPU::VFCMPNErs_V2:
+ return VectorCompareDesc{ComparisonDirection::kNe,
+ Comparison::Type::kFloat, true};
+
+ // Vector load instructions.
+ case llvm::TPU::tcVLV_MaskIi:
+ case llvm::TPU::tcVLV_MaskRi:
+ return VectorLoadDesc{false, true};
+ case llvm::TPU::tcVLV_MaskIri:
+ case llvm::TPU::tcVLV_MaskRri:
+ return VectorLoadDesc{true, true};
+ case llvm::TPU::tcVLV_ShuffleI_MaskIi:
+ case llvm::TPU::tcVLV_ShuffleI_MaskRi:
+ case llvm::TPU::tcVLV_ShuffleR_MaskIi:
+ case llvm::TPU::tcVLV_ShuffleR_MaskRi:
+ return VectorLoadDesc{false, true, false, true};
+ case llvm::TPU::tcVLV_ShuffleI_MaskIri:
+ case llvm::TPU::tcVLV_ShuffleR_MaskIri:
+ case llvm::TPU::tcVLV_ShuffleI_MaskRri:
+ case llvm::TPU::tcVLV_ShuffleR_MaskRri:
+ return VectorLoadDesc{true, true, false, true};
+ case llvm::TPU::tcVLV_ShuffleIi:
+ case llvm::TPU::tcVLV_ShuffleRi:
+ return VectorLoadDesc{false, false, false, true};
+ case llvm::TPU::tcVLV_ShuffleIri:
+ case llvm::TPU::tcVLV_ShuffleRri:
+ return VectorLoadDesc{true, false, false, true};
+ case llvm::TPU::tcVLV_StrideI_MaskIi:
+ case llvm::TPU::tcVLV_StrideI_MaskRi:
+ case llvm::TPU::tcVLV_StrideR_MaskIi:
+ case llvm::TPU::tcVLV_StrideR_MaskRi:
+ return VectorLoadDesc{false, true, true};
+ case llvm::TPU::tcVLV_StrideI_MaskIri:
+ case llvm::TPU::tcVLV_StrideI_MaskRri:
+ case llvm::TPU::tcVLV_StrideR_MaskIri:
+ case llvm::TPU::tcVLV_StrideR_MaskRri:
+ return VectorLoadDesc{true, true, true};
+ case llvm::TPU::tcVLV_StrideIi:
+ case llvm::TPU::tcVLV_StrideRi:
+ return VectorLoadDesc{false, false, true};
+ case llvm::TPU::tcVLV_StrideIri:
+ case llvm::TPU::tcVLV_StrideRri:
+ return VectorLoadDesc{true, false, true};
+ case llvm::TPU::tcVLVi:
+ return VectorLoadDesc{};
+ case llvm::TPU::tcVLVri:
+ return VectorLoadDesc{true};
+
+ // Same as above with iar_regno=0, but no Shuffle.
+ case llvm::TPU::tcVLV_IAR0_MaskIi:
+ case llvm::TPU::tcVLV_IAR0_MaskRi:
+ return VectorLoadDesc{false, true, false, false, 0};
+ case llvm::TPU::tcVLV_IAR0_MaskIri:
+ case llvm::TPU::tcVLV_IAR0_MaskRri:
+ return VectorLoadDesc{true, true, false, false, 0};
+ case llvm::TPU::tcVLV_IAR0_StrideI_MaskIi:
+ case llvm::TPU::tcVLV_IAR0_StrideI_MaskRi:
+ case llvm::TPU::tcVLV_IAR0_StrideR_MaskIi:
+ case llvm::TPU::tcVLV_IAR0_StrideR_MaskRi:
+ return VectorLoadDesc{false, true, true, false, 0};
+ case llvm::TPU::tcVLV_IAR0_StrideI_MaskIri:
+ case llvm::TPU::tcVLV_IAR0_StrideI_MaskRri:
+ case llvm::TPU::tcVLV_IAR0_StrideR_MaskIri:
+ case llvm::TPU::tcVLV_IAR0_StrideR_MaskRri:
+ return VectorLoadDesc{true, true, true, false, 0};
+ case llvm::TPU::tcVLV_IAR0_StrideIi:
+ case llvm::TPU::tcVLV_IAR0_StrideRi:
+ return VectorLoadDesc{false, false, true, false, 0};
+ case llvm::TPU::tcVLV_IAR0_StrideIri:
+ case llvm::TPU::tcVLV_IAR0_StrideRri:
+ return VectorLoadDesc{true, false, true, false, 0};
+ case llvm::TPU::tcVLV_IAR0i:
+ return VectorLoadDesc{false, false, false, false, 0};
+ case llvm::TPU::tcVLV_IAR0ri:
+ return VectorLoadDesc{true, false, false, false, 0};
+
+ // Same as above with iar_regno=1.
+ case llvm::TPU::tcVLV_IAR1_MaskIi:
+ case llvm::TPU::tcVLV_IAR1_MaskRi:
+ return VectorLoadDesc{false, true, false, false, 1};
+ case llvm::TPU::tcVLV_IAR1_MaskIri:
+ case llvm::TPU::tcVLV_IAR1_MaskRri:
+ return VectorLoadDesc{true, true, false, false, 1};
+ case llvm::TPU::tcVLV_IAR1_StrideI_MaskIi:
+ case llvm::TPU::tcVLV_IAR1_StrideI_MaskRi:
+ case llvm::TPU::tcVLV_IAR1_StrideR_MaskIi:
+ case llvm::TPU::tcVLV_IAR1_StrideR_MaskRi:
+ return VectorLoadDesc{false, true, true, false, 1};
+ case llvm::TPU::tcVLV_IAR1_StrideI_MaskIri:
+ case llvm::TPU::tcVLV_IAR1_StrideI_MaskRri:
+ case llvm::TPU::tcVLV_IAR1_StrideR_MaskIri:
+ case llvm::TPU::tcVLV_IAR1_StrideR_MaskRri:
+ return VectorLoadDesc{true, true, true, false, 1};
+ case llvm::TPU::tcVLV_IAR1_StrideIi:
+ case llvm::TPU::tcVLV_IAR1_StrideRi:
+ return VectorLoadDesc{false, false, true, false, 1};
+ case llvm::TPU::tcVLV_IAR1_StrideIri:
+ case llvm::TPU::tcVLV_IAR1_StrideRri:
+ return VectorLoadDesc{true, false, true, false, 1};
+ case llvm::TPU::tcVLV_IAR1i:
+ return VectorLoadDesc{false, false, false, false, 1};
+ case llvm::TPU::tcVLV_IAR1ri:
+ return VectorLoadDesc{true, false, false, false, 1};
+
+ // Vector store instructions.
+ case llvm::TPU::tcVSV_MaskIi:
+ case llvm::TPU::tcVSV_MaskRi:
+ return VectorStoreDesc{false, true};
+ case llvm::TPU::tcVSV_MaskIri:
+ case llvm::TPU::tcVSV_MaskRri:
+ return VectorStoreDesc{true, true};
+ case llvm::TPU::tcVSV_StrideI_MaskIi:
+ case llvm::TPU::tcVSV_StrideI_MaskRi:
+ case llvm::TPU::tcVSV_StrideR_MaskIi:
+ case llvm::TPU::tcVSV_StrideR_MaskRi:
+ return VectorStoreDesc{false, true, true};
+ case llvm::TPU::tcVSV_StrideI_MaskIri:
+ case llvm::TPU::tcVSV_StrideI_MaskRri:
+ case llvm::TPU::tcVSV_StrideR_MaskIri:
+ case llvm::TPU::tcVSV_StrideR_MaskRri:
+ return VectorStoreDesc{true, true, true};
+ case llvm::TPU::tcVSV_StrideIi:
+ case llvm::TPU::tcVSV_StrideRi:
+ return VectorStoreDesc{false, false, true};
+ case llvm::TPU::tcVSV_StrideIri:
+ case llvm::TPU::tcVSV_StrideRri:
+ return VectorStoreDesc{true, false, true};
+ case llvm::TPU::tcVSVi:
+ return VectorStoreDesc{};
+ case llvm::TPU::tcVSVri:
+ return VectorStoreDesc{true};
+ case llvm::TPU::tcVSV_VMask_MaskIi:
+ case llvm::TPU::tcVSV_VMask_MaskRi:
+ return VectorStoreDesc{false, true, false, true};
+ case llvm::TPU::tcVSV_VMask_MaskIri:
+ case llvm::TPU::tcVSV_VMask_MaskRri:
+ return VectorStoreDesc{true, true, false, true};
+ case llvm::TPU::tcVSV_VMask_StrideI_MaskIi:
+ case llvm::TPU::tcVSV_VMask_StrideI_MaskRi:
+ case llvm::TPU::tcVSV_VMask_StrideR_MaskIi:
+ case llvm::TPU::tcVSV_VMask_StrideR_MaskRi:
+ return VectorStoreDesc{false, true, true, true};
+ case llvm::TPU::tcVSV_VMask_StrideI_MaskIri:
+ case llvm::TPU::tcVSV_VMask_StrideI_MaskRri:
+ case llvm::TPU::tcVSV_VMask_StrideR_MaskIri:
+ case llvm::TPU::tcVSV_VMask_StrideR_MaskRri:
+ return VectorStoreDesc{true, true, true, true};
+ case llvm::TPU::tcVSV_VMask_StrideIi:
+ case llvm::TPU::tcVSV_VMask_StrideRi:
+ return VectorStoreDesc{false, false, true, true};
+ case llvm::TPU::tcVSV_VMask_StrideIri:
+ case llvm::TPU::tcVSV_VMask_StrideRri:
+ return VectorStoreDesc{true, false, true, true};
+ case llvm::TPU::tcVSV_VMaski:
+ return VectorStoreDesc{false, false, false, true};
+ case llvm::TPU::tcVSV_VMaskri:
+ return VectorStoreDesc{true, false, false, true};
+
+ // Same as above with iar_regno=0.
+ case llvm::TPU::tcVSV_IAR0_MaskIi:
+ case llvm::TPU::tcVSV_IAR0_MaskRi:
+ return VectorStoreDesc{false, true, false, false, 0};
+ case llvm::TPU::tcVSV_IAR0_MaskIri:
+ case llvm::TPU::tcVSV_IAR0_MaskRri:
+ return VectorStoreDesc{true, true, false, false, 0};
+ case llvm::TPU::tcVSV_IAR0_StrideI_MaskIi:
+ case llvm::TPU::tcVSV_IAR0_StrideI_MaskRi:
+ case llvm::TPU::tcVSV_IAR0_StrideR_MaskIi:
+ case llvm::TPU::tcVSV_IAR0_StrideR_MaskRi:
+ return VectorStoreDesc{false, true, true, false, 0};
+ case llvm::TPU::tcVSV_IAR0_StrideI_MaskIri:
+ case llvm::TPU::tcVSV_IAR0_StrideI_MaskRri:
+ case llvm::TPU::tcVSV_IAR0_StrideR_MaskIri:
+ case llvm::TPU::tcVSV_IAR0_StrideR_MaskRri:
+ return VectorStoreDesc{true, true, true, false, 0};
+ case llvm::TPU::tcVSV_IAR0_StrideIi:
+ case llvm::TPU::tcVSV_IAR0_StrideRi:
+ return VectorStoreDesc{false, false, true, false, 0};
+ case llvm::TPU::tcVSV_IAR0_StrideIri:
+ case llvm::TPU::tcVSV_IAR0_StrideRri:
+ return VectorStoreDesc{true, false, true, false, 0};
+ case llvm::TPU::tcVSV_IAR0i:
+      return VectorStoreDesc{false, false, false, false, 0};
+ case llvm::TPU::tcVSV_IAR0ri:
+ return VectorStoreDesc{true, false, false, false, 0};
+ case llvm::TPU::tcVSV_IAR0_VMask_MaskIi:
+ case llvm::TPU::tcVSV_IAR0_VMask_MaskRi:
+ return VectorStoreDesc{false, true, false, true, 0};
+ case llvm::TPU::tcVSV_IAR0_VMask_MaskIri:
+ case llvm::TPU::tcVSV_IAR0_VMask_MaskRri:
+ return VectorStoreDesc{true, true, false, true, 0};
+ case llvm::TPU::tcVSV_IAR0_VMask_StrideI_MaskIi:
+ case llvm::TPU::tcVSV_IAR0_VMask_StrideI_MaskRi:
+ case llvm::TPU::tcVSV_IAR0_VMask_StrideR_MaskIi:
+ case llvm::TPU::tcVSV_IAR0_VMask_StrideR_MaskRi:
+ return VectorStoreDesc{false, true, true, true, 0};
+ case llvm::TPU::tcVSV_IAR0_VMask_StrideI_MaskIri:
+ case llvm::TPU::tcVSV_IAR0_VMask_StrideI_MaskRri:
+ case llvm::TPU::tcVSV_IAR0_VMask_StrideR_MaskIri:
+ case llvm::TPU::tcVSV_IAR0_VMask_StrideR_MaskRri:
+ return VectorStoreDesc{true, true, true, true, 0};
+ case llvm::TPU::tcVSV_IAR0_VMask_StrideIi:
+ case llvm::TPU::tcVSV_IAR0_VMask_StrideRi:
+ return VectorStoreDesc{false, false, true, true, 0};
+ case llvm::TPU::tcVSV_IAR0_VMask_StrideIri:
+ case llvm::TPU::tcVSV_IAR0_VMask_StrideRri:
+ return VectorStoreDesc{true, false, true, true, 0};
+ case llvm::TPU::tcVSV_IAR0_VMaski:
+ return VectorStoreDesc{false, false, false, true, 0};
+ case llvm::TPU::tcVSV_IAR0_VMaskri:
+ return VectorStoreDesc{true, false, false, true, 0};
+
+ // Same as above with iar_regno=1.
+ case llvm::TPU::tcVSV_IAR1_MaskIi:
+ case llvm::TPU::tcVSV_IAR1_MaskRi:
+ return VectorStoreDesc{false, true, false, false, 1};
+ case llvm::TPU::tcVSV_IAR1_MaskIri:
+ case llvm::TPU::tcVSV_IAR1_MaskRri:
+ return VectorStoreDesc{true, true, false, false, 1};
+ case llvm::TPU::tcVSV_IAR1_StrideI_MaskIi:
+ case llvm::TPU::tcVSV_IAR1_StrideI_MaskRi:
+ case llvm::TPU::tcVSV_IAR1_StrideR_MaskIi:
+ case llvm::TPU::tcVSV_IAR1_StrideR_MaskRi:
+ return VectorStoreDesc{false, true, true, false, 1};
+ case llvm::TPU::tcVSV_IAR1_StrideI_MaskIri:
+ case llvm::TPU::tcVSV_IAR1_StrideI_MaskRri:
+ case llvm::TPU::tcVSV_IAR1_StrideR_MaskIri:
+ case llvm::TPU::tcVSV_IAR1_StrideR_MaskRri:
+ return VectorStoreDesc{true, true, true, false, 1};
+ case llvm::TPU::tcVSV_IAR1_StrideIi:
+ case llvm::TPU::tcVSV_IAR1_StrideRi:
+ return VectorStoreDesc{false, false, true, false, 1};
+ case llvm::TPU::tcVSV_IAR1_StrideIri:
+ case llvm::TPU::tcVSV_IAR1_StrideRri:
+ return VectorStoreDesc{true, false, true, false, 1};
+ case llvm::TPU::tcVSV_IAR1i:
+      return VectorStoreDesc{false, false, false, false, 1};
+ case llvm::TPU::tcVSV_IAR1ri:
+ return VectorStoreDesc{true, false, false, false, 1};
+ case llvm::TPU::tcVSV_IAR1_VMask_MaskIi:
+ case llvm::TPU::tcVSV_IAR1_VMask_MaskRi:
+ return VectorStoreDesc{false, true, false, true, 1};
+ case llvm::TPU::tcVSV_IAR1_VMask_MaskIri:
+ case llvm::TPU::tcVSV_IAR1_VMask_MaskRri:
+ return VectorStoreDesc{true, true, false, true, 1};
+ case llvm::TPU::tcVSV_IAR1_VMask_StrideI_MaskIi:
+ case llvm::TPU::tcVSV_IAR1_VMask_StrideI_MaskRi:
+ case llvm::TPU::tcVSV_IAR1_VMask_StrideR_MaskIi:
+ case llvm::TPU::tcVSV_IAR1_VMask_StrideR_MaskRi:
+ return VectorStoreDesc{false, true, true, true, 1};
+ case llvm::TPU::tcVSV_IAR1_VMask_StrideI_MaskIri:
+ case llvm::TPU::tcVSV_IAR1_VMask_StrideI_MaskRri:
+ case llvm::TPU::tcVSV_IAR1_VMask_StrideR_MaskIri:
+ case llvm::TPU::tcVSV_IAR1_VMask_StrideR_MaskRri:
+ return VectorStoreDesc{true, true, true, true, 1};
+ case llvm::TPU::tcVSV_IAR1_VMask_StrideIi:
+ case llvm::TPU::tcVSV_IAR1_VMask_StrideRi:
+ return VectorStoreDesc{false, false, true, true, 1};
+ case llvm::TPU::tcVSV_IAR1_VMask_StrideIri:
+ case llvm::TPU::tcVSV_IAR1_VMask_StrideRri:
+ return VectorStoreDesc{true, false, true, true, 1};
+ case llvm::TPU::tcVSV_IAR1_VMaski:
+ return VectorStoreDesc{false, false, false, true, 1};
+ case llvm::TPU::tcVSV_IAR1_VMaskri:
+ return VectorStoreDesc{true, false, false, true, 1};
+
+ // Matmul instructions.
+ case llvm::TPU::tcMXU0MATMUL:
+ return MatmulDesc{0, LloOpcode::kVectorMatmul, false};
+ case llvm::TPU::tcMXU0MATMULm:
+ return MatmulDesc{0, LloOpcode::kVectorMatmul, true};
+ case llvm::TPU::tcMXU0MATMUL_HI:
+ return MatmulDesc{0, LloOpcode::kVectorMatmulHigh, false};
+ case llvm::TPU::tcMXU0MATMUL_HIm:
+ return MatmulDesc{0, LloOpcode::kVectorMatmulHigh, true};
+ case llvm::TPU::tcMXU0MATMUL_LOW:
+ return MatmulDesc{0, LloOpcode::kVectorMatmulLow, false};
+ case llvm::TPU::tcMXU0MATMUL_LOWm:
+ return MatmulDesc{0, LloOpcode::kVectorMatmulLow, true};
+ case llvm::TPU::tcMXU0MATMUL_PACKED:
+ return MatmulDesc{0, LloOpcode::kVectorMatmulPacked, false};
+ case llvm::TPU::tcMXU0MATMUL_PACKEDm:
+ return MatmulDesc{0, LloOpcode::kVectorMatmulPacked, true};
+
+ case llvm::TPU::tcMXU1MATMUL:
+ return MatmulDesc{1, LloOpcode::kVectorMatmul, false};
+ case llvm::TPU::tcMXU1MATMULm:
+ return MatmulDesc{1, LloOpcode::kVectorMatmul, true};
+ case llvm::TPU::tcMXU1MATMUL_HI:
+ return MatmulDesc{1, LloOpcode::kVectorMatmulHigh, false};
+ case llvm::TPU::tcMXU1MATMUL_HIm:
+ return MatmulDesc{1, LloOpcode::kVectorMatmulHigh, true};
+ case llvm::TPU::tcMXU1MATMUL_LOW:
+ return MatmulDesc{1, LloOpcode::kVectorMatmulLow, false};
+ case llvm::TPU::tcMXU1MATMUL_LOWm:
+ return MatmulDesc{1, LloOpcode::kVectorMatmulLow, true};
+ case llvm::TPU::tcMXU1MATMUL_PACKED:
+ return MatmulDesc{1, LloOpcode::kVectorMatmulPacked, false};
+ case llvm::TPU::tcMXU1MATMUL_PACKEDm:
+ return MatmulDesc{1, LloOpcode::kVectorMatmulPacked, true};
+
+ case llvm::TPU::tcMXU2MATMUL:
+ return MatmulDesc{2, LloOpcode::kVectorMatmul, false};
+ case llvm::TPU::tcMXU2MATMULm:
+ return MatmulDesc{2, LloOpcode::kVectorMatmul, true};
+ case llvm::TPU::tcMXU2MATMUL_HI:
+ return MatmulDesc{2, LloOpcode::kVectorMatmulHigh, false};
+ case llvm::TPU::tcMXU2MATMUL_HIm:
+ return MatmulDesc{2, LloOpcode::kVectorMatmulHigh, true};
+ case llvm::TPU::tcMXU2MATMUL_LOW:
+ return MatmulDesc{2, LloOpcode::kVectorMatmulLow, false};
+ case llvm::TPU::tcMXU2MATMUL_LOWm:
+ return MatmulDesc{2, LloOpcode::kVectorMatmulLow, true};
+ case llvm::TPU::tcMXU2MATMUL_PACKED:
+ return MatmulDesc{2, LloOpcode::kVectorMatmulPacked, false};
+ case llvm::TPU::tcMXU2MATMUL_PACKEDm:
+ return MatmulDesc{2, LloOpcode::kVectorMatmulPacked, true};
+
+ case llvm::TPU::tcMXU3MATMUL:
+ return MatmulDesc{3, LloOpcode::kVectorMatmul, false};
+ case llvm::TPU::tcMXU3MATMULm:
+ return MatmulDesc{3, LloOpcode::kVectorMatmul, true};
+ case llvm::TPU::tcMXU3MATMUL_HI:
+ return MatmulDesc{3, LloOpcode::kVectorMatmulHigh, false};
+ case llvm::TPU::tcMXU3MATMUL_HIm:
+ return MatmulDesc{3, LloOpcode::kVectorMatmulHigh, true};
+ case llvm::TPU::tcMXU3MATMUL_LOW:
+ return MatmulDesc{3, LloOpcode::kVectorMatmulLow, false};
+ case llvm::TPU::tcMXU3MATMUL_LOWm:
+ return MatmulDesc{3, LloOpcode::kVectorMatmulLow, true};
+ case llvm::TPU::tcMXU3MATMUL_PACKED:
+ return MatmulDesc{3, LloOpcode::kVectorMatmulPacked, false};
+ case llvm::TPU::tcMXU3MATMUL_PACKEDm:
+ return MatmulDesc{3, LloOpcode::kVectorMatmulPacked, true};
+
+ // Matpush instructions.
+ case llvm::TPU::tcMXU0MATPUSH:
+ return LatchDesc{0, GainLatchMode::kNoXposeF32, false};
+ case llvm::TPU::tcMXU0MATPUSHm:
+ return LatchDesc{0, GainLatchMode::kNoXposeF32, true};
+ case llvm::TPU::tcMXU0MATPUSH_HI:
+ return LatchDesc{0, GainLatchMode::kNoXposeHiF32, false};
+ case llvm::TPU::tcMXU0MATPUSH_HIm:
+ return LatchDesc{0, GainLatchMode::kNoXposeHiF32, true};
+ case llvm::TPU::tcMXU0MATPUSH_HI_XPOS:
+ case llvm::TPU::tcMXU0MATPUSH_HI_XPOS_JF:
+ return LatchDesc{0, GainLatchMode::kXposeHiF32, false};
+ case llvm::TPU::tcMXU0MATPUSH_HI_XPOSm:
+ case llvm::TPU::tcMXU0MATPUSH_HI_XPOS_JFm:
+ return LatchDesc{0, GainLatchMode::kXposeHiF32, true};
+ case llvm::TPU::tcMXU0MATPUSH_LOW:
+ return LatchDesc{0, GainLatchMode::kNoXposeLowF32, false};
+ case llvm::TPU::tcMXU0MATPUSH_LOWm:
+ return LatchDesc{0, GainLatchMode::kNoXposeLowF32, true};
+ case llvm::TPU::tcMXU0MATPUSH_LOW_XPOS:
+ case llvm::TPU::tcMXU0MATPUSH_LOW_XPOS_JF:
+ return LatchDesc{0, GainLatchMode::kXposeLowF32, false};
+ case llvm::TPU::tcMXU0MATPUSH_LOW_XPOSm:
+ case llvm::TPU::tcMXU0MATPUSH_LOW_XPOS_JFm:
+ return LatchDesc{0, GainLatchMode::kXposeLowF32, true};
+ case llvm::TPU::tcMXU0MATPUSH_PACKED:
+ return LatchDesc{0, GainLatchMode::kNoXposePackedBf16, false};
+ case llvm::TPU::tcMXU0MATPUSH_PACKEDm:
+ return LatchDesc{0, GainLatchMode::kNoXposePackedBf16, true};
+ case llvm::TPU::tcMXU0MATPUSH_PACKED_XPOS:
+ case llvm::TPU::tcMXU0MATPUSH_PACKED_XPOS_JF:
+ return LatchDesc{0, GainLatchMode::kXposePackedBf16, false};
+ case llvm::TPU::tcMXU0MATPUSH_PACKED_XPOSm:
+ case llvm::TPU::tcMXU0MATPUSH_PACKED_XPOS_JFm:
+ return LatchDesc{0, GainLatchMode::kXposePackedBf16, true};
+ case llvm::TPU::tcMXU0MATPUSH_XPOS:
+ case llvm::TPU::tcMXU0MATPUSH_XPOS_JF:
+ return LatchDesc{0, GainLatchMode::kXposeF32, false};
+ case llvm::TPU::tcMXU0MATPUSH_XPOSm:
+ case llvm::TPU::tcMXU0MATPUSH_XPOS_JFm:
+ return LatchDesc{0, GainLatchMode::kXposeF32, true};
+
+ case llvm::TPU::tcMXU1MATPUSH:
+ return LatchDesc{1, GainLatchMode::kNoXposeF32, false};
+ case llvm::TPU::tcMXU1MATPUSHm:
+ return LatchDesc{1, GainLatchMode::kNoXposeF32, true};
+ case llvm::TPU::tcMXU1MATPUSH_HI:
+ return LatchDesc{1, GainLatchMode::kNoXposeHiF32, false};
+ case llvm::TPU::tcMXU1MATPUSH_HIm:
+ return LatchDesc{1, GainLatchMode::kNoXposeHiF32, true};
+ case llvm::TPU::tcMXU1MATPUSH_HI_XPOS:
+ case llvm::TPU::tcMXU1MATPUSH_HI_XPOS_JF:
+ return LatchDesc{1, GainLatchMode::kXposeHiF32, false};
+ case llvm::TPU::tcMXU1MATPUSH_HI_XPOSm:
+ case llvm::TPU::tcMXU1MATPUSH_HI_XPOS_JFm:
+ return LatchDesc{1, GainLatchMode::kXposeHiF32, true};
+ case llvm::TPU::tcMXU1MATPUSH_LOW:
+ return LatchDesc{1, GainLatchMode::kNoXposeLowF32, false};
+ case llvm::TPU::tcMXU1MATPUSH_LOWm:
+ return LatchDesc{1, GainLatchMode::kNoXposeLowF32, true};
+ case llvm::TPU::tcMXU1MATPUSH_LOW_XPOS:
+ case llvm::TPU::tcMXU1MATPUSH_LOW_XPOS_JF:
+ return LatchDesc{1, GainLatchMode::kXposeLowF32, false};
+ case llvm::TPU::tcMXU1MATPUSH_LOW_XPOSm:
+ case llvm::TPU::tcMXU1MATPUSH_LOW_XPOS_JFm:
+ return LatchDesc{1, GainLatchMode::kXposeLowF32, true};
+ case llvm::TPU::tcMXU1MATPUSH_PACKED:
+ return LatchDesc{1, GainLatchMode::kNoXposePackedBf16, false};
+ case llvm::TPU::tcMXU1MATPUSH_PACKEDm:
+ return LatchDesc{1, GainLatchMode::kNoXposePackedBf16, true};
+ case llvm::TPU::tcMXU1MATPUSH_PACKED_XPOS:
+ case llvm::TPU::tcMXU1MATPUSH_PACKED_XPOS_JF:
+ return LatchDesc{1, GainLatchMode::kXposePackedBf16, false};
+ case llvm::TPU::tcMXU1MATPUSH_PACKED_XPOSm:
+ case llvm::TPU::tcMXU1MATPUSH_PACKED_XPOS_JFm:
+ return LatchDesc{1, GainLatchMode::kXposePackedBf16, true};
+ case llvm::TPU::tcMXU1MATPUSH_XPOS:
+ case llvm::TPU::tcMXU1MATPUSH_XPOS_JF:
+ return LatchDesc{1, GainLatchMode::kXposeF32, false};
+ case llvm::TPU::tcMXU1MATPUSH_XPOSm:
+ case llvm::TPU::tcMXU1MATPUSH_XPOS_JFm:
+ return LatchDesc{1, GainLatchMode::kXposeF32, true};
+
+ case llvm::TPU::tcMXU2MATPUSH:
+ return LatchDesc{2, GainLatchMode::kNoXposeF32, false};
+ case llvm::TPU::tcMXU2MATPUSHm:
+ return LatchDesc{2, GainLatchMode::kNoXposeF32, true};
+ case llvm::TPU::tcMXU2MATPUSH_HI:
+ return LatchDesc{2, GainLatchMode::kNoXposeHiF32, false};
+ case llvm::TPU::tcMXU2MATPUSH_HIm:
+ return LatchDesc{2, GainLatchMode::kNoXposeHiF32, true};
+ case llvm::TPU::tcMXU2MATPUSH_HI_XPOS:
+ case llvm::TPU::tcMXU2MATPUSH_HI_XPOS_JF:
+ return LatchDesc{2, GainLatchMode::kXposeHiF32, false};
+ case llvm::TPU::tcMXU2MATPUSH_HI_XPOSm:
+ case llvm::TPU::tcMXU2MATPUSH_HI_XPOS_JFm:
+ return LatchDesc{2, GainLatchMode::kXposeHiF32, true};
+ case llvm::TPU::tcMXU2MATPUSH_LOW:
+ return LatchDesc{2, GainLatchMode::kNoXposeLowF32, false};
+ case llvm::TPU::tcMXU2MATPUSH_LOWm:
+ return LatchDesc{2, GainLatchMode::kNoXposeLowF32, true};
+ case llvm::TPU::tcMXU2MATPUSH_LOW_XPOS:
+ case llvm::TPU::tcMXU2MATPUSH_LOW_XPOS_JF:
+ return LatchDesc{2, GainLatchMode::kXposeLowF32, false};
+ case llvm::TPU::tcMXU2MATPUSH_LOW_XPOSm:
+ case llvm::TPU::tcMXU2MATPUSH_LOW_XPOS_JFm:
+ return LatchDesc{2, GainLatchMode::kXposeLowF32, true};
+ case llvm::TPU::tcMXU2MATPUSH_PACKED:
+ return LatchDesc{2, GainLatchMode::kNoXposePackedBf16, false};
+ case llvm::TPU::tcMXU2MATPUSH_PACKEDm:
+ return LatchDesc{2, GainLatchMode::kNoXposePackedBf16, true};
+ case llvm::TPU::tcMXU2MATPUSH_PACKED_XPOS:
+ case llvm::TPU::tcMXU2MATPUSH_PACKED_XPOS_JF:
+ return LatchDesc{2, GainLatchMode::kXposePackedBf16, false};
+ case llvm::TPU::tcMXU2MATPUSH_PACKED_XPOSm:
+ case llvm::TPU::tcMXU2MATPUSH_PACKED_XPOS_JFm:
+ return LatchDesc{2, GainLatchMode::kXposePackedBf16, true};
+ case llvm::TPU::tcMXU2MATPUSH_XPOS:
+ case llvm::TPU::tcMXU2MATPUSH_XPOS_JF:
+ return LatchDesc{2, GainLatchMode::kXposeF32, false};
+ case llvm::TPU::tcMXU2MATPUSH_XPOSm:
+ case llvm::TPU::tcMXU2MATPUSH_XPOS_JFm:
+ return LatchDesc{2, GainLatchMode::kXposeF32, true};
+
+ case llvm::TPU::tcMXU3MATPUSH:
+ return LatchDesc{3, GainLatchMode::kNoXposeF32, false};
+ case llvm::TPU::tcMXU3MATPUSHm:
+ return LatchDesc{3, GainLatchMode::kNoXposeF32, true};
+ case llvm::TPU::tcMXU3MATPUSH_HI:
+ return LatchDesc{3, GainLatchMode::kNoXposeHiF32, false};
+ case llvm::TPU::tcMXU3MATPUSH_HIm:
+ return LatchDesc{3, GainLatchMode::kNoXposeHiF32, true};
+ case llvm::TPU::tcMXU3MATPUSH_HI_XPOS:
+ case llvm::TPU::tcMXU3MATPUSH_HI_XPOS_JF:
+ return LatchDesc{3, GainLatchMode::kXposeHiF32, false};
+ case llvm::TPU::tcMXU3MATPUSH_HI_XPOSm:
+ case llvm::TPU::tcMXU3MATPUSH_HI_XPOS_JFm:
+ return LatchDesc{3, GainLatchMode::kXposeHiF32, true};
+ case llvm::TPU::tcMXU3MATPUSH_LOW:
+ return LatchDesc{3, GainLatchMode::kNoXposeLowF32, false};
+ case llvm::TPU::tcMXU3MATPUSH_LOWm:
+ return LatchDesc{3, GainLatchMode::kNoXposeLowF32, true};
+ case llvm::TPU::tcMXU3MATPUSH_LOW_XPOS:
+ case llvm::TPU::tcMXU3MATPUSH_LOW_XPOS_JF:
+ return LatchDesc{3, GainLatchMode::kXposeLowF32, false};
+ case llvm::TPU::tcMXU3MATPUSH_LOW_XPOSm:
+ case llvm::TPU::tcMXU3MATPUSH_LOW_XPOS_JFm:
+ return LatchDesc{3, GainLatchMode::kXposeLowF32, true};
+ case llvm::TPU::tcMXU3MATPUSH_PACKED:
+ return LatchDesc{3, GainLatchMode::kNoXposePackedBf16, false};
+ case llvm::TPU::tcMXU3MATPUSH_PACKEDm:
+ return LatchDesc{3, GainLatchMode::kNoXposePackedBf16, true};
+ case llvm::TPU::tcMXU3MATPUSH_PACKED_XPOS:
+ case llvm::TPU::tcMXU3MATPUSH_PACKED_XPOS_JF:
+ return LatchDesc{3, GainLatchMode::kXposePackedBf16, false};
+ case llvm::TPU::tcMXU3MATPUSH_PACKED_XPOSm:
+ case llvm::TPU::tcMXU3MATPUSH_PACKED_XPOS_JFm:
+ return LatchDesc{3, GainLatchMode::kXposePackedBf16, true};
+ case llvm::TPU::tcMXU3MATPUSH_XPOS:
+ case llvm::TPU::tcMXU3MATPUSH_XPOS_JF:
+ return LatchDesc{3, GainLatchMode::kXposeF32, false};
+ case llvm::TPU::tcMXU3MATPUSH_XPOSm:
+ case llvm::TPU::tcMXU3MATPUSH_XPOS_JFm:
+ return LatchDesc{3, GainLatchMode::kXposeF32, true};
+
+ case llvm::TPU::tcWAITEQii:
+ case llvm::TPU::tcWAITEQir:
+ case llvm::TPU::tcWAITEQri:
+ case llvm::TPU::tcWAITEQrr:
+ return VectorWaitDesc{LloOpcode::kVectorWaitEq};
+ case llvm::TPU::tcWAITNEii:
+ case llvm::TPU::tcWAITNEir:
+ case llvm::TPU::tcWAITNEri:
+ case llvm::TPU::tcWAITNErr:
+ return VectorWaitDesc{LloOpcode::kVectorWaitNe};
+ case llvm::TPU::tcWAITGTii:
+ case llvm::TPU::tcWAITGTir:
+ case llvm::TPU::tcWAITGTri:
+ case llvm::TPU::tcWAITGTrr:
+ return VectorWaitDesc{LloOpcode::kVectorWaitGt};
+ case llvm::TPU::tcWAITGEii:
+ case llvm::TPU::tcWAITGEir:
+ case llvm::TPU::tcWAITGEri:
+ case llvm::TPU::tcWAITGErr:
+ return VectorWaitDesc{LloOpcode::kVectorWaitGe};
+ case llvm::TPU::tcWAITLTii:
+ case llvm::TPU::tcWAITLTir:
+ case llvm::TPU::tcWAITLTri:
+ case llvm::TPU::tcWAITLTrr:
+      return VectorWaitDesc{LloOpcode::kVectorWaitLt};
+ case llvm::TPU::tcWAITLEii:
+ case llvm::TPU::tcWAITLEir:
+ case llvm::TPU::tcWAITLEri:
+ case llvm::TPU::tcWAITLErr:
+ return VectorWaitDesc{LloOpcode::kVectorWaitLe};
+
+ case llvm::TPU::DMA_HBM_TO_SMEMriri:
+ case llvm::TPU::DMA_HBM_TO_SMEMrirr:
+ case llvm::TPU::DMA_HBM_TO_SMEMrrri:
+ case llvm::TPU::DMA_HBM_TO_SMEMrrrr:
+ return DmaDesc{LloOpcode::kDmaHbmToSmem};
+ case llvm::TPU::DMA_HBM_TO_SMEM_GENERALii:
+ case llvm::TPU::DMA_HBM_TO_SMEM_GENERALir:
+ case llvm::TPU::DMA_HBM_TO_SMEM_GENERALri:
+ case llvm::TPU::DMA_HBM_TO_SMEM_GENERALrr:
+ return DmaDesc{LloOpcode::kDmaHbmToSmem, false, true};
+ case llvm::TPU::DMA_HBM_TO_SMEM_STRIDEDii:
+ case llvm::TPU::DMA_HBM_TO_SMEM_STRIDEDir:
+ case llvm::TPU::DMA_HBM_TO_SMEM_STRIDEDri:
+ case llvm::TPU::DMA_HBM_TO_SMEM_STRIDEDrr:
+ return DmaDesc{LloOpcode::kDmaHbmToSmem, true, false};
+ case llvm::TPU::DMA_HBM_TO_HIBriri:
+ case llvm::TPU::DMA_HBM_TO_HIBrirr:
+ case llvm::TPU::DMA_HBM_TO_HIBrrri:
+ case llvm::TPU::DMA_HBM_TO_HIBrrrr:
+ return DmaDesc{LloOpcode::kDmaHbmToHib, false, false, false};
+ case llvm::TPU::DMA_HBM_TO_VMEMriri:
+ case llvm::TPU::DMA_HBM_TO_VMEMrirr:
+ case llvm::TPU::DMA_HBM_TO_VMEMrrri:
+ case llvm::TPU::DMA_HBM_TO_VMEMrrrr:
+ return DmaDesc{LloOpcode::kDmaHbmToVmem};
+ case llvm::TPU::DMA_HBM_TO_VMEM_GENERALii:
+ case llvm::TPU::DMA_HBM_TO_VMEM_GENERALir:
+ case llvm::TPU::DMA_HBM_TO_VMEM_GENERALri:
+ case llvm::TPU::DMA_HBM_TO_VMEM_GENERALrr:
+ return DmaDesc{LloOpcode::kDmaHbmToVmem, false, true};
+ case llvm::TPU::DMA_HBM_TO_VMEM_STRIDEDii:
+ case llvm::TPU::DMA_HBM_TO_VMEM_STRIDEDir:
+ case llvm::TPU::DMA_HBM_TO_VMEM_STRIDEDri:
+ case llvm::TPU::DMA_HBM_TO_VMEM_STRIDEDrr:
+ return DmaDesc{LloOpcode::kDmaHbmToVmem, true, false};
+ case llvm::TPU::DMA_HBM_TO_VMEM_HIB_UPDATEriri:
+ case llvm::TPU::DMA_HBM_TO_VMEM_HIB_UPDATErirr:
+ case llvm::TPU::DMA_HBM_TO_VMEM_HIB_UPDATErrri:
+ case llvm::TPU::DMA_HBM_TO_VMEM_HIB_UPDATErrrr:
+ return DmaDesc{LloOpcode::kDmaHbmToVmemWithHibUpdate};
+ case llvm::TPU::DMA_SMEM_TO_HBMriri:
+ case llvm::TPU::DMA_SMEM_TO_HBMrirr:
+ case llvm::TPU::DMA_SMEM_TO_HBMrrri:
+ case llvm::TPU::DMA_SMEM_TO_HBMrrrr:
+ return DmaDesc{LloOpcode::kDmaSmemToHbm};
+ case llvm::TPU::DMA_SMEM_TO_HBM_GENERALii:
+ case llvm::TPU::DMA_SMEM_TO_HBM_GENERALir:
+ case llvm::TPU::DMA_SMEM_TO_HBM_GENERALri:
+ case llvm::TPU::DMA_SMEM_TO_HBM_GENERALrr:
+ return DmaDesc{LloOpcode::kDmaSmemToHbm, false, true};
+ case llvm::TPU::DMA_SMEM_TO_HBM_STRIDEDii:
+ case llvm::TPU::DMA_SMEM_TO_HBM_STRIDEDir:
+ case llvm::TPU::DMA_SMEM_TO_HBM_STRIDEDri:
+ case llvm::TPU::DMA_SMEM_TO_HBM_STRIDEDrr:
+ return DmaDesc{LloOpcode::kDmaSmemToHbm, true, false};
+ case llvm::TPU::DMA_VMEM_TO_HBMriri:
+ case llvm::TPU::DMA_VMEM_TO_HBMrirr:
+ case llvm::TPU::DMA_VMEM_TO_HBMrrri:
+ case llvm::TPU::DMA_VMEM_TO_HBMrrrr:
+ return DmaDesc{LloOpcode::kDmaVmemToHbm};
+ case llvm::TPU::DMA_VMEM_TO_HBM_GENERALii:
+ case llvm::TPU::DMA_VMEM_TO_HBM_GENERALir:
+ case llvm::TPU::DMA_VMEM_TO_HBM_GENERALri:
+ case llvm::TPU::DMA_VMEM_TO_HBM_GENERALrr:
+ return DmaDesc{LloOpcode::kDmaVmemToHbm, false, true};
+ case llvm::TPU::DMA_VMEM_TO_HBM_STRIDEDii:
+ case llvm::TPU::DMA_VMEM_TO_HBM_STRIDEDir:
+ case llvm::TPU::DMA_VMEM_TO_HBM_STRIDEDri:
+ case llvm::TPU::DMA_VMEM_TO_HBM_STRIDEDrr:
+ return DmaDesc{LloOpcode::kDmaVmemToHbm, true, false};
+
+ case llvm::TPU::tcXLU0B0XLANE_ADD:
+ case llvm::TPU::tcXLU1B0XLANE_ADD:
+ return VectorReduceDesc{LloOpcode::kVectorAddReduceF32, 0};
+ case llvm::TPU::tcXLU0B1XLANE_ADD:
+ case llvm::TPU::tcXLU1B1XLANE_ADD:
+ return VectorReduceDesc{LloOpcode::kVectorAddReduceF32, 1};
+ case llvm::TPU::tcXLU0B0XLANE_ADD_SEGMENTED:
+ case llvm::TPU::tcXLU1B0XLANE_ADD_SEGMENTED:
+ return VectorReduceDesc{LloOpcode::kVectorAddSegmentReduceF32, 0, true};
+ case llvm::TPU::tcXLU0B1XLANE_ADD_SEGMENTED:
+ case llvm::TPU::tcXLU1B1XLANE_ADD_SEGMENTED:
+ return VectorReduceDesc{LloOpcode::kVectorAddSegmentReduceF32, 1, true};
+ case llvm::TPU::tcXLU0B0XLANE_MAX:
+ case llvm::TPU::tcXLU1B0XLANE_MAX:
+ return VectorReduceDesc{LloOpcode::kVectorMaxReduceF32, 0};
+ case llvm::TPU::tcXLU0B1XLANE_MAX:
+ case llvm::TPU::tcXLU1B1XLANE_MAX:
+ return VectorReduceDesc{LloOpcode::kVectorMaxReduceF32, 1};
+ case llvm::TPU::tcXLU0B0XLANE_MAX_SEGMENTED:
+ case llvm::TPU::tcXLU1B0XLANE_MAX_SEGMENTED:
+ return VectorReduceDesc{LloOpcode::kVectorMaxSegmentReduceF32, 0, true};
+ case llvm::TPU::tcXLU0B1XLANE_MAX_SEGMENTED:
+ case llvm::TPU::tcXLU1B1XLANE_MAX_SEGMENTED:
+ return VectorReduceDesc{LloOpcode::kVectorMaxSegmentReduceF32, 1, true};
+ case llvm::TPU::tcXLU0B0XLANE_MAXINDEX:
+ case llvm::TPU::tcXLU1B0XLANE_MAXINDEX:
+ return VectorReduceDesc{LloOpcode::kVectorMaxIndexReduceF32, 0};
+ case llvm::TPU::tcXLU0B1XLANE_MAXINDEX:
+ case llvm::TPU::tcXLU1B1XLANE_MAXINDEX:
+ return VectorReduceDesc{LloOpcode::kVectorMaxIndexReduceF32, 1};
+ case llvm::TPU::tcXLU0B0XLANE_MAXINDEX_SEGMENTED:
+ case llvm::TPU::tcXLU1B0XLANE_MAXINDEX_SEGMENTED:
+ return VectorReduceDesc{kInvalid, 0, true};
+ case llvm::TPU::tcXLU0B1XLANE_MAXINDEX_SEGMENTED:
+ case llvm::TPU::tcXLU1B1XLANE_MAXINDEX_SEGMENTED:
+ return VectorReduceDesc{kInvalid, 1, true};
+ case llvm::TPU::tcXLU0B0XLANE_MIN:
+ case llvm::TPU::tcXLU1B0XLANE_MIN:
+ return VectorReduceDesc{LloOpcode::kVectorMinReduceF32, 0};
+ case llvm::TPU::tcXLU0B1XLANE_MIN:
+ case llvm::TPU::tcXLU1B1XLANE_MIN:
+ return VectorReduceDesc{LloOpcode::kVectorMinReduceF32, 1};
+ case llvm::TPU::tcXLU0B0XLANE_MIN_SEGMENTED:
+ case llvm::TPU::tcXLU1B0XLANE_MIN_SEGMENTED:
+ return VectorReduceDesc{LloOpcode::kVectorMinSegmentReduceF32, 0, true};
+ case llvm::TPU::tcXLU0B1XLANE_MIN_SEGMENTED:
+ case llvm::TPU::tcXLU1B1XLANE_MIN_SEGMENTED:
+ return VectorReduceDesc{LloOpcode::kVectorMinSegmentReduceF32, 1, true};
+ case llvm::TPU::tcXLU0B0XLANE_MININDEX:
+ case llvm::TPU::tcXLU1B0XLANE_MININDEX:
+ return VectorReduceDesc{LloOpcode::kVectorMinIndexReduceF32, 0};
+ case llvm::TPU::tcXLU0B1XLANE_MININDEX:
+ case llvm::TPU::tcXLU1B1XLANE_MININDEX:
+ return VectorReduceDesc{LloOpcode::kVectorMinIndexReduceF32, 1};
+ case llvm::TPU::tcXLU0B0XLANE_MININDEX_SEGMENTED:
+ case llvm::TPU::tcXLU1B0XLANE_MININDEX_SEGMENTED:
+ return VectorReduceDesc{kInvalid, 0, true};
+ case llvm::TPU::tcXLU0B1XLANE_MININDEX_SEGMENTED:
+ case llvm::TPU::tcXLU1B1XLANE_MININDEX_SEGMENTED:
+ return VectorReduceDesc{kInvalid, 1, true};
+
+ case llvm::TPU::SRDREG_LCCHI:
+ case llvm::TPU::SRDREG_LCCHI_S0:
+ case llvm::TPU::SRDREG_LCCHI_S1:
+ return ReadRegisterDesc{LloOpcode::kScalarReadCycleHigh};
+ case llvm::TPU::SRDREG_LCCLO:
+ case llvm::TPU::SRDREG_LCCLO_S0:
+ case llvm::TPU::SRDREG_LCCLO_S1:
+ return ReadRegisterDesc{LloOpcode::kScalarReadCycleLow};
+ case llvm::TPU::SRDREG_BTR:
+ case llvm::TPU::SRDREG_BTR_S0:
+ case llvm::TPU::SRDREG_BTR_S1:
+ case llvm::TPU::SRDREG_CRRHI:
+ case llvm::TPU::SRDREG_CRRHI_S0:
+ case llvm::TPU::SRDREG_CRRHI_S1:
+ case llvm::TPU::SRDREG_CRRLO:
+ case llvm::TPU::SRDREG_CRRLO_S0:
+ case llvm::TPU::SRDREG_CRRLO_S1:
+ case llvm::TPU::SRDREG_GTCHI:
+ case llvm::TPU::SRDREG_GTCHI_S0:
+ case llvm::TPU::SRDREG_GTCHI_S1:
+ case llvm::TPU::SRDREG_GTCLO:
+ case llvm::TPU::SRDREG_GTCLO_S0:
+ case llvm::TPU::SRDREG_GTCLO_S1:
+ case llvm::TPU::SRDREG_TAG:
+ case llvm::TPU::SRDREG_TAG_S0:
+ case llvm::TPU::SRDREG_TAG_S1:
+ case llvm::TPU::SRDREG_TM:
+ case llvm::TPU::SRDREG_TM_S0:
+ case llvm::TPU::SRDREG_TM_S1:
+ return ReadRegisterDesc{};
+
+ default:
+ return MCInstDesc();
+ }
+} // NOLINT(readability/fn_size)
+
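+// Builds a table of instruction descriptors indexed by LLVM MC opcode by
+// evaluating GetMCInstDesc for every opcode, so EmitOneInstruction can look
+// up how to translate an MCInst with a single array access instead of
+// re-running the switch above.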
+std::array<MCInstDesc, llvm::TPU::INSTRUCTION_LIST_END> MakeInstDescArray() {
+ std::array<MCInstDesc, llvm::TPU::INSTRUCTION_LIST_END> result;
+ for (int i = 0; i < result.size(); ++i) {
+ result[i] = GetMCInstDesc(i);
+ }
+ return result;
+}
+
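+// Returns true if `inst` carries an integer immediate that does not fit in
+// the target's immediate bit width and therefore needs a full 32-bit
+// immediate slot.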
+bool Has32bImmOperand(const llvm::MCInst& inst, uint32_t immediate_bit_width) {
+ for (const llvm::MCOperand& opnd : inst) {
+ if (opnd.isImm() &&
+ !ImmValue::ValueIsNbits(opnd.getImm(), immediate_bit_width)) {
+ return true;
+ }
+ }
+ return false;
+}
+
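+// Returns true if `inst` has any immediate operand (integer or
+// floating-point), ignoring the trailing predicate-register/polarity
+// operand pair.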
+bool HasImmOperand(const llvm::MCInst& inst) {
+ for (int i = 0; i < inst.getNumOperands() - 2; ++i) {
+ if (inst.getOperand(i).isImm() || inst.getOperand(i).isDFPImm()) {
+ return true;
+ }
+ }
+ return false;
+}
+
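+// Returns the callee symbol name from a call instruction, whose first
+// operand must be a symbol-reference expression.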
+StatusOr<std::string> GetCalleeSymbolName(const llvm::MCInst& inst) {
+ TF_RET_CHECK(IsCall(inst));
+ TF_RET_CHECK(inst.getNumOperands() >= 1);
+ const auto& Op0 = inst.getOperand(0);
+ TF_RET_CHECK(Op0.isExpr());
+ TF_RET_CHECK(Op0.getExpr()->getKind() == llvm::MCExpr::SymbolRef);
+ return LlvmAsString(*Op0.getExpr());
+}
+
+ProgramGenerator::~ProgramGenerator() {}
+
+Status ProgramGenerator::AppendBundleFromText(int64_t bundle_number,
+ std::string* bundle_text) {
+ return OkStatus();
+}
+
+Status ProgramGenerator::CopyBundle(int64_t from_bundle_number,
+ int64_t to_bundle_number) {
+ return OkStatus();
+}
+
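+// Decodes an instruction's predication from its predicate-register and
+// polarity operand pair. A negative `operand_index` means the pair occupies
+// the last two operands; `Palways` maps to always/never execute depending on
+// the polarity immediate.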
+StatusOr<Predication> ProgramGenerator::ToPredication(const llvm::MCInst& inst,
+ int operand_index) {
+ const int operand_count = inst.getNumOperands();
+ if (operand_index < 0) {
+ TF_RET_CHECK(operand_count >= 2);
+ operand_index = operand_count - 2;
+ }
+ TF_RET_CHECK(operand_index + 2 <= operand_count);
+ const PredicationPolarity polarity =
+ inst.getOperand(operand_index + 1).getImm() == 0
+ ? PredicationPolarity::kPositive
+ : PredicationPolarity::kNegative;
+ if (inst.getOperand(operand_index).getReg() == llvm::TPU::Palways) {
+ return polarity == PredicationPolarity::kPositive
+ ? Predication::kAlwaysExecute
+ : Predication::kNeverExecute;
+ }
+ TF_ASSIGN_OR_RETURN(auto pregno, ToPregno(inst.getOperand(operand_index)));
+ return Predication(pregno, polarity);
+}
+
+StatusOr<Pregno> ProgramGenerator::ToPregno(const llvm::MCOperand& operand) {
+ TF_RET_CHECK(operand.isReg());
+ const unsigned regno = operand.getReg();
+ TF_RET_CHECK(regno >= llvm::TPU::P0);
+ TF_RET_CHECK(regno <= llvm::TPU::P14);
+ return Pregno(regno - llvm::TPU::P0);
+}
+
+StatusOr<Sregno> ProgramGenerator::ToSregno(const llvm::MCOperand& operand) {
+ TF_RET_CHECK(operand.isReg());
+ const unsigned regno = operand.getReg();
+ TF_RET_CHECK(regno >= llvm::TPU::S0);
+ TF_RET_CHECK(regno <= llvm::TPU::S31);
+ return Sregno(regno - llvm::TPU::S0);
+}
+
+StatusOr<Vregno> ProgramGenerator::ToVregno(const llvm::MCOperand& operand) {
+ TF_RET_CHECK(operand.isReg());
+ const unsigned regno = operand.getReg();
+ TF_RET_CHECK(regno >= llvm::TPU::V0);
+ TF_RET_CHECK(regno <= llvm::TPU::V31);
+ return Vregno(regno - llvm::TPU::V0);
+}
+
+StatusOr<Vmregno> ProgramGenerator::ToVmregno(const llvm::MCOperand& operand) {
+ TF_RET_CHECK(operand.isReg());
+ const unsigned regno = operand.getReg();
+ TF_RET_CHECK(regno >= llvm::TPU::M0);
+ TF_RET_CHECK(regno <= llvm::TPU::M15);
+ return Vmregno(regno - llvm::TPU::M0);
+}
+
+StatusOr<SregnoOrImm> ProgramGenerator::ToSregnoOrImm(
+ const llvm::MCOperand& operand) {
+ if (operand.isImm()) {
+ return SregnoOrImm::MakeImm(operand.getImm());
+ }
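+  // The DFP immediate holds the bit pattern of a double; narrow it to f32
+  // and reinterpret those bits as a 32-bit immediate.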
+ if (operand.isDFPImm()) {
+ return SregnoOrImm::MakeImm(absl::bit_cast<uint32_t>(
+ static_cast<float>(absl::bit_cast<double>(operand.getDFPImm()))));
+ }
+ if (operand.isReg()) {
+ TF_ASSIGN_OR_RETURN(auto sregno, ToSregno(operand));
+ return SregnoOrImm::MakeRegno(sregno);
+ }
+ return FailedPrecondition("operand is neither imm nor reg");
+}
+
+StatusOr<VregnoOrImm> ProgramGenerator::ToVregnoOrImm(
+ const llvm::MCOperand& operand) {
+ if (operand.isImm()) {
+ return VregnoOrImm::MakeImm(operand.getImm());
+ }
+ if (operand.isDFPImm()) {
+ return VregnoOrImm::MakeImm(absl::bit_cast<uint32_t>(
+ static_cast<float>(absl::bit_cast<double>(operand.getDFPImm()))));
+ }
+ if (operand.isReg()) {
+ TF_ASSIGN_OR_RETURN(auto vregno, ToVregno(operand));
+ return VregnoOrImm::MakeRegno(vregno);
+ }
+ return FailedPrecondition("operand is neither imm nor reg");
+}
+
+StatusOr<ImmValue> ProgramGenerator::ToImmValue(
+ const llvm::MCOperand& operand) {
+ if (operand.isImm()) {
+ return ImmValue(operand.getImm());
+ }
+ if (operand.isDFPImm()) {
+ return ImmValue(absl::bit_cast<uint32_t>(
+ static_cast<float>(absl::bit_cast<double>(operand.getDFPImm()))));
+ }
+ return FailedPrecondition("operand is not imm");
+}
+
+using StatusOrProgramGenerator = StatusOr<std::unique_ptr<ProgramGenerator>>;
+
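+// Creates the program generator registered for the target: the ISS generator
+// for ISS targets, otherwise the generator registered under the target's TPU
+// version name.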
+StatusOrProgramGenerator ProgramGenerator::Create(
+ const IsaProgramTarget& target, LloModule* module) {
+ TF_RET_CHECK(target.sequencer == TpuSequencerType::kTensorCoreSequencer);
+ std::unique_ptr<ProgramGenerator> generator;
+ if (target.xla_target->IsIss()) {
+ generator = std::unique_ptr<ProgramGenerator>(
+ ProgramGeneratorRegisterer::CreateByNameOrDie(
+ ISS_PROGRAM_GENERATOR, target,
+ RangeSpec(module->comp_env().xla_llvm_isa_emitter_bundles()),
+ module));
+ } else {
+ const std::string& name =
+ ProgramGeneratorAliasRegisterer::GetNameByAliasOrDie(
+ tpu::TpuVersionToString(target.xla_target->DeepseaVersion()));
+ generator = std::unique_ptr<ProgramGenerator>(
+ ProgramGeneratorRegisterer::CreateByNameOrDie(
+ name, target,
+ RangeSpec(module->comp_env().xla_llvm_isa_emitter_bundles()),
+ module));
+ }
+ return generator;
+}
+
+Status ProgramGenerator::CreateProgramProto(int64_t bundle_count) {
+ TF_RET_CHECK(top_isa_emitter_ == nullptr);
+ top_isa_emitter_ = IsaEmitterFactory::Create(
+ target_.xla_target, /*compiler_metadata=*/nullptr, target_.sequencer);
+ TF_RET_CHECK(program() != nullptr);
+ program()->set_deepsea_version(
+ TpuVersionToProto(target_.xla_target->DeepseaVersion()));
+ program()->set_core_type(tpu::TPU_CORE_TYPE_TENSOR_CORE);
+ top_isa_emitter_->AddBundles(bundle_count);
+ return OkStatus();
+}
+
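+// Lazily creates one IsaEmitter per thread (annotations disabled, attached to
+// the top-level emitter), so each worker thread emits into its own instance.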
+StatusOr<IsaEmitter*> ProgramGenerator::GetThreadLocalIsaEmitter() {
+ if (thread_local_isa_emitter_.get() == nullptr) {
+ TF_RET_CHECK(top_isa_emitter_);
+ *thread_local_isa_emitter_.pointer() = IsaEmitterFactory::Create(
+ target_.xla_target, /*compiler_metadata=*/nullptr, target_.sequencer,
+ /*emit_annotations=*/false, top_isa_emitter_.get());
+ }
+ return thread_local_isa_emitter_.get().get();
+}
+
+StatusOrIsaProgram ProgramGenerator::FinalizeProgramProto() {
+ TF_RET_CHECK(program() != nullptr);
+ return top_isa_emitter_->ConsumeProgram();
+}
+
+void ProgramGenerator::PatchBeforeParsing(std::string* str) {
+ // Replace more invalid instructions.
+ absl::StrReplaceAll(
+ {
+ {"dma.desc", " dma"},
+ // ISS doesn't support snop.
+ {"_ = \tsnop", "vnop"},
+ // ISS doesn't support sdelay.
+ {"\tsdelay", " vdelay"},
+ // ISS's parser is picky about tabs when parsing events.
+ {"\tevent", " event"},
+          // ISS parser expects gsf and not gsfn for vdwg.
+ {"vdwg.f16 (gsfn", "vdwg.f16 (gsf"},
+ {"vperm.0", "vperm"},
+ {"vrot.0", "vrot"},
+ {"xlane.0", "xlane.f32"},
+ {"vxpose.0", "vxpose"},
+ },
+ str);
+}
+
+Status ProgramGenerator::AppendBundleFromMcBundle(int64_t bundle_number,
+ const llvm::MCInst& bundle) {
+ if (!module_->comp_env().xla_llvm_isa_emitter()) {
+ return Unimplemented("Direct IsaProgram proto generation is disabled");
+ }
+ if (!bundles_to_handle_.Match(bundle_number)) {
+ return Unimplemented("Rejecting bundle %lu", bundle_number);
+ }
+ VLOG(3) << "bundle: " << PrintInst(bundle);
+ TF_RET_CHECK(bundle.getOpcode() == llvm::TPU::BUNDLE);
+
+ TF_ASSIGN_OR_RETURN(IsaEmitter * isa_emitter, GetThreadLocalIsaEmitter());
+ isa_emitter->set_current_bundle_number(EmittedBundleNumber(bundle_number));
+
+ const uint32_t immediate_bit_width = target_.xla_target->ImmediateBitWidth();
+ auto score = [immediate_bit_width](const llvm::MCInst* inst) -> int {
+ // Sort the bundle instructions in priority order:
+ // scalar branch which requires S0/IMM0
+ // scalar 32-bit-immediate-using instructions
+ // scalar 16-bit-immediate-using instructions
+ // vector 32-bit-immediate-using instructions
+ // vector 16-bit-immediate-using instructions
+ // remaining instructions
+ // Lowest score means highest priority.
+ // Zero means no immediates or "don't care".
+ //
+ // Note that we don't have to be absolutely precise in the
+ // score/classification. The goal is to make a good initial guess that
+ // almost always works, and when it doesn't, we go through all the
+ // permutations to find a solution.
+ if (IsBranch(*inst)) {
+ return -900;
+ }
+ if (IsScalarImmInst(*inst)) {
+ return Has32bImmOperand(*inst, immediate_bit_width) ? -800 : -700;
+ }
+ if (Has32bImmOperand(*inst, immediate_bit_width)) {
+ return -600;
+ }
+ return HasImmOperand(*inst) ? -500 : 0;
+ };
+ std::vector<InstAndWeight> sorted_insts;
+ // inst_num is used to make the weights unique even when score() is the same.
+ int inst_num = 0;
+ // non_zero_weight_count tracks the number of instructions whose score() is
+ // negative, i.e. the number of instructions with at least one immediate
+ // operand to place. Those are sorted to the front, and we only bother
+ // checking permutations of that set.
+ int non_zero_weight_count = 0;
+ for (const llvm::MCOperand& bundle_inst : bundle) {
+ TF_RET_CHECK(bundle_inst.isInst());
+ int weight = score(bundle_inst.getInst());
+ if (weight != 0) {
+ weight += inst_num;
+ ++non_zero_weight_count;
+ }
+ sorted_insts.emplace_back(bundle_inst.getInst(), weight);
+ ++inst_num;
+ }
+ const auto comparator = [](const InstAndWeight& a, const InstAndWeight& b) {
+ return a.second < b.second;
+ };
+ absl::c_stable_sort(sorted_insts, comparator);
+
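+  // Try the priority order first; if immediate packing fails with
+  // ResourceExhausted, reset the bundle and retry with the next permutation
+  // of the immediate-carrying instructions at the front of the list.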
+ do {
+ Status status = EmitOneBundle(sorted_insts);
+ if (status.ok()) {
+ break;
+ }
+    // An unlucky permutation may result in a ResourceExhausted status; any
+    // other status is returned as-is.
+ if (status.code() != tensorflow::error::RESOURCE_EXHAUSTED) {
+ return status;
+ }
+ // Reset the bundle before trying the next permutation.
+ isa_emitter->ResetBundle(EmittedBundleNumber(bundle_number));
+ } while (std::next_permutation(sorted_insts.begin(),
+ sorted_insts.begin() + non_zero_weight_count,
+ comparator));
+
+ // At this point, we successfully packed all instructions.
+ isa_emitter->FinalizeCurrentBundle();
+ return OkStatus();
+}
+
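+// Emits the instructions of one bundle in the given order; any error the
+// emitter records (e.g. an immediate-packing failure) is returned so the
+// caller can retry with another permutation.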
+Status ProgramGenerator::EmitOneBundle(
+ const std::vector<InstAndWeight>& inst_list) {
+ TF_ASSIGN_OR_RETURN(IsaEmitter * isa_emitter, GetThreadLocalIsaEmitter());
+ for (auto [instp, weight] : inst_list) {
+ TF_RETURN_IF_ERROR(EmitOneInstruction(*instp));
+ // If the emitter reported a new error, such as failure to pack immediates,
+ // return the error status and reset the emitter's error state.
+ if (Status emitter_status = isa_emitter->GetStatus();
+ !emitter_status.ok()) {
+ isa_emitter->ClearError();
+ return emitter_status;
+ }
+ }
+
+ return OkStatus();
+}
+
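+// Translates a single MCInst into the corresponding IsaEmitter call by
+// looking up its descriptor in the opcode-indexed table and dispatching on
+// the descriptor's variant type.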
+Status ProgramGenerator::EmitOneInstruction(const llvm::MCInst& inst) {
+ static std::array<MCInstDesc, llvm::TPU::INSTRUCTION_LIST_END> inst_descs =
+ MakeInstDescArray();
+
+ TF_ASSIGN_OR_RETURN(Predication instruction_predication, ToPredication(inst));
+ TF_ASSIGN_OR_RETURN(IsaEmitter * isa_emitter, GetThreadLocalIsaEmitter());
+ auto emitter = isa_emitter->WithPredication(instruction_predication);
+ VLOG(3) << "opcode=" << inst.getOpcode();
+
+ TF_RET_CHECK(inst.getOpcode() < inst_descs.size());
+ const auto& inst_desc = inst_descs[inst.getOpcode()];
+
+ // Handle vector unops.
+ if (auto desc = std::get_if<VectorUnOpDesc>(&inst_desc)) {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ TF_ASSIGN_OR_RETURN(auto vdst, ToVregno(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto vsrc0, ToVregno(inst.getOperand(1)));
+ emitter->EmitVectorUnop(vdst, desc->llo_opcode, vsrc0);
+ return OkStatus();
+ }
+ if (auto desc = std::get_if<VectorUnpackOpDesc>(&inst_desc)) {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ TF_ASSIGN_OR_RETURN(auto vdst, ToVregno(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto vsrc0, ToVregno(inst.getOperand(1)));
+ emitter->EmitVectorUnpackop(desc->sublane_id, desc->format, vdst, vsrc0);
+ return OkStatus();
+ }
+
+ // Handle vector unops with no destination.
+ if (auto desc = std::get_if<VectorUnOpNoDstDesc>(&inst_desc)) {
+ TF_RET_CHECK(inst.getNumOperands() == (desc->has_extra_src ? 4 : 3));
+ TF_ASSIGN_OR_RETURN(auto vsrc,
+ ToVregno(inst.getOperand(desc->has_extra_src ? 1 : 0)));
+ emitter->EmitVectorUnopNoDst(desc->llo_opcode, vsrc);
+ return OkStatus();
+ }
+
+ // Handle vector binops.
+ if (auto desc = std::get_if<VectorBinOpDesc>(&inst_desc)) {
+ TF_RET_CHECK(inst.getNumOperands() == 5);
+ const int x_index = desc->reverse_vx_vy ? 2 : 1;
+ const int y_index = desc->reverse_vx_vy ? 1 : 2;
+ TF_ASSIGN_OR_RETURN(auto vdst, ToVregno(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto vx, ToVregno(inst.getOperand(x_index)));
+ std::variant<VregnoOrImm, Sregno> vy_variant;
+ if (desc->is_scalar_vy) {
+ TF_ASSIGN_OR_RETURN(vy_variant, ToSregno(inst.getOperand(y_index)));
+ } else {
+ TF_ASSIGN_OR_RETURN(vy_variant, ToVregnoOrImm(inst.getOperand(y_index)));
+ }
+ emitter->EmitVectorBinop(vdst, desc->llo_opcode, vx, vy_variant);
+ return OkStatus();
+ }
+
+ if (auto desc = std::get_if<VectorPackOpDesc>(&inst_desc)) {
+ TF_RET_CHECK(inst.getNumOperands() == 5);
+ const int x_index = 2;
+ const int y_index = 1;
+ TF_ASSIGN_OR_RETURN(auto vdst, ToVregno(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto vx, ToVregno(inst.getOperand(x_index)));
+ std::variant<VregnoOrImm, Sregno> vy_variant;
+ if (desc->is_scalar_vy) {
+ TF_ASSIGN_OR_RETURN(vy_variant, ToSregno(inst.getOperand(y_index)));
+ } else {
+ TF_ASSIGN_OR_RETURN(vy_variant, ToVregnoOrImm(inst.getOperand(y_index)));
+ }
+ emitter->EmitVectorPackop(desc->format, vdst, vx, vy_variant);
+ return OkStatus();
+ }
+
+ // Handle scalar unops.
+ if (auto desc = std::get_if<ScalarUnOpDesc>(&inst_desc)) {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ TF_ASSIGN_OR_RETURN(auto sdst, ToSregno(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto sy, ToSregnoOrImm(inst.getOperand(1)));
+ emitter->EmitScalarUnop(sdst, desc->llo_opcode, sy);
+ return OkStatus();
+ }
+
+ // Handle scalar binops.
+ if (auto desc = std::get_if<ScalarBinOpDesc>(&inst_desc)) {
+ TF_RET_CHECK(inst.getNumOperands() == 5);
+ const int x_index = desc->reverse_sx_sy ? 2 : 1;
+ const int y_index = desc->reverse_sx_sy ? 1 : 2;
+ TF_ASSIGN_OR_RETURN(auto sdst, ToSregno(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto sx, ToSregno(inst.getOperand(x_index)));
+ TF_ASSIGN_OR_RETURN(auto sy, ToSregnoOrImm(inst.getOperand(y_index)));
+ emitter->EmitScalarBinop(sdst, desc->llo_opcode, sx, sy);
+ return OkStatus();
+ }
+
+ // Handle vector comparisons.
+ if (auto desc = std::get_if<VectorCompareDesc>(&inst_desc)) {
+ TF_RET_CHECK(inst.getNumOperands() == 5);
+ TF_ASSIGN_OR_RETURN(auto vmdst, ToVmregno(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto vx, ToVregno(inst.getOperand(1)));
+ std::variant<VregnoOrImm, Sregno> vy_variant;
+ if (desc->is_scalar_vy) {
+ TF_ASSIGN_OR_RETURN(vy_variant, ToSregno(inst.getOperand(2)));
+ } else {
+ TF_ASSIGN_OR_RETURN(vy_variant, ToVregnoOrImm(inst.getOperand(2)));
+ }
+ emitter->EmitVectorCompare(
+ vmdst, Comparison(desc->comparison_direction, desc->comparison_type),
+ vx, vy_variant);
+ return OkStatus();
+ }
+
+ // Handle scalar comparisons.
+ if (auto desc = std::get_if<ScalarCompareDesc>(&inst_desc)) {
+ TF_RET_CHECK(inst.getNumOperands() == 5);
+ TF_ASSIGN_OR_RETURN(auto pdst, ToPregno(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto sx, ToSregno(inst.getOperand(1)));
+ TF_ASSIGN_OR_RETURN(auto sy, ToSregnoOrImm(inst.getOperand(2)));
+ emitter->EmitScalarCompare(
+ pdst, Comparison(desc->comparison_direction, desc->comparison_type), sx,
+ sy);
+ return OkStatus();
+ }
+
+ // Handle vector loads.
+ if (auto desc = std::get_if<VectorLoadDesc>(&inst_desc)) {
+ // Order is:
+ // (vdst, base, offset, mask, [stride|shuffle])
+ // minus the operands that aren't provided for that opcode.
+ int cur = 0;
+ TF_ASSIGN_OR_RETURN(auto vdst, ToVregno(inst.getOperand(cur++)));
+ TF_ASSIGN_OR_RETURN(auto sbase, ToSregnoOrImm(inst.getOperand(cur++)));
+ ImmValue displacement(0);
+ if (desc->has_displacement) {
+ TF_ASSIGN_OR_RETURN(displacement, ToImmValue(inst.getOperand(cur++)));
+ }
+ std::optional<SregnoOrImm> sublane_mask;
+ if (desc->has_sublane_mask) {
+ TF_ASSIGN_OR_RETURN(sublane_mask, ToSregnoOrImm(inst.getOperand(cur++)));
+ }
+ SregnoOrImm ssublane_stride = SregnoOrImm::MakeImm(1);
+ if (desc->has_sublane_stride) {
+ TF_ASSIGN_OR_RETURN(ssublane_stride,
+ ToSregnoOrImm(inst.getOperand(cur++)));
+ }
+ if (desc->has_sublane_shuffle) {
+ TF_RET_CHECK(desc->iar_regno == -1);
+ TF_ASSIGN_OR_RETURN(auto sshfl_pattern,
+ ToSregnoOrImm(inst.getOperand(cur++)));
+ emitter->EmitVectorLoadSublaneShuffle(vdst, sbase, sshfl_pattern,
+ displacement, sublane_mask);
+ } else {
+ std::optional<uint32_t> iar;
+ if (desc->iar_regno != -1) {
+ iar = static_cast<uint32_t>(desc->iar_regno);
+ }
+ emitter->EmitVectorLoad(vdst, ssublane_stride, sbase, displacement,
+ sublane_mask, iar);
+ }
+ return OkStatus();
+ }
+
+ // Handle vector stores.
+ if (auto desc = std::get_if<VectorStoreDesc>(&inst_desc)) {
+ // Order is:
+ // (vsrc, base, offset, sublane_mask, stride, vmask)
+ // minus the operands that aren't provided for that opcode.
+ int cur = 0;
+ TF_ASSIGN_OR_RETURN(auto vsrc, ToVregno(inst.getOperand(cur++)));
+ TF_ASSIGN_OR_RETURN(auto sbase, ToSregnoOrImm(inst.getOperand(cur++)));
+ ImmValue displacement(0);
+ if (desc->has_displacement) {
+ TF_ASSIGN_OR_RETURN(displacement, ToImmValue(inst.getOperand(cur++)));
+ }
+ std::optional<SregnoOrImm> sublane_mask;
+ if (desc->has_sublane_mask) {
+ TF_ASSIGN_OR_RETURN(sublane_mask, ToSregnoOrImm(inst.getOperand(cur++)));
+ }
+ auto ssublane_stride = SregnoOrImm::MakeImm(1);
+ if (desc->has_sublane_stride) {
+ TF_ASSIGN_OR_RETURN(ssublane_stride,
+ ToSregnoOrImm(inst.getOperand(cur++)));
+ }
+    // TODO(b/207140934): Add support for indexed vector stores in LLVM. The
+    // store for EvenOdd sublanes appears to be tied to the IAR number rather
+    // than to the opcode itself.
+ std::optional<uint32_t> iar;
+ if (desc->iar_regno != -1) {
+ iar = static_cast<uint32_t>(desc->iar_regno);
+ }
+ if (desc->has_vmask) {
+ TF_ASSIGN_OR_RETURN(auto vmsrc, ToVmregno(inst.getOperand(cur++)));
+ emitter->EmitVectorStoreMasked(ssublane_stride, sbase, displacement,
+ vmsrc, vsrc, sublane_mask, iar);
+ } else {
+ emitter->EmitVectorStore(ssublane_stride, sbase, displacement, vsrc,
+ sublane_mask, iar);
+ }
+ return OkStatus();
+ }
+
+ // Handle vmatmul variants.
+ if (auto desc = std::get_if<MatmulDesc>(&inst_desc)) {
+ TF_RET_CHECK(!desc->is_masked) << "not yet implemented";
+ TF_RET_CHECK(inst.getNumOperands() == 5);
+ TF_ASSIGN_OR_RETURN(auto vsrc, ToVregno(inst.getOperand(1)));
+ const int32_t mxu_id = desc->mxu_id;
+ const auto dwg_mode = DoneWithGainsMode::kNone;
+    // Note: LloOpcodes such as kVectorMatmul and kVectorMatmulMsk do not have
+    // a one-to-one mapping to MatmulDataFormat; multiple MatmulDataFormats can
+    // map to the same LloOpcode. This mapping would produce incorrect results
+    // if extended to the newer VxC data formats beyond kF32 and kBf16.
+ auto opcode_to_matmul_data_format =
+ [&](LloOpcode opcode) -> MatmulDataFormat {
+ switch (opcode) {
+ case LloOpcode::kVectorMatmul:
+ case LloOpcode::kVectorMatmulMsk:
+ case LloOpcode::kVectorMatmulHigh:
+ case LloOpcode::kVectorMatmulHighMsk:
+ case LloOpcode::kVectorMatmulLow:
+ case LloOpcode::kVectorMatmulLowMsk:
+ return MatmulDataFormat::kF32;
+ case LloOpcode::kVectorMatmulPacked:
+ case LloOpcode::kVectorMatmulPackedMsk:
+ return MatmulDataFormat::kBf16;
+ default:
+ LOG(FATAL) << "Unexpected opcode: " << opcode;
+ }
+ };
+
+ emitter->EmitVectorMatmul(desc->llo_opcode, vsrc, mxu_id, dwg_mode,
+ opcode_to_matmul_data_format(desc->llo_opcode));
+ return OkStatus();
+ }
+
+ // Handle vmatpush variants.
+ if (auto desc = std::get_if<LatchDesc>(&inst_desc)) {
+ TF_ASSIGN_OR_RETURN(auto vsrc, ToVregno(inst.getOperand(1)));
+ TF_RET_CHECK(!desc->is_masked) << "implemented but not tested";
+ if (desc->is_masked) {
+ TF_RET_CHECK(inst.getNumOperands() == 6);
+ TF_ASSIGN_OR_RETURN(auto vmsrc, ToVmregno(inst.getOperand(2)));
+ emitter->EmitVectorLatchMsk(LloOpcode::kVectorLatchMsk, vmsrc, vsrc,
+ desc->latch_mode, desc->mxu_id);
+ } else {
+ TF_RET_CHECK(inst.getNumOperands() == 5);
+ emitter->EmitVectorLatch(LloOpcode::kVectorLatch, vsrc, desc->latch_mode,
+ desc->mxu_id);
+ }
+ return OkStatus();
+ }
+
+ // Handle vwait instructions.
+ if (auto desc = std::get_if<VectorWaitDesc>(&inst_desc)) {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ TF_ASSIGN_OR_RETURN(auto sync_flag, ToSregnoOrImm(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto value, ToSregnoOrImm(inst.getOperand(1)));
+ emitter->EmitVectorWait(desc->llo_opcode, sync_flag, value);
+ return OkStatus();
+ }
+
+ // Handle DMA instructions.
+ if (auto desc = std::get_if<DmaDesc>(&inst_desc)) {
+ TF_RET_CHECK(!desc->is_strided); // verify the operand ordering
+ TF_RET_CHECK(!desc->is_general); // verify the operand ordering
+ TF_RET_CHECK(inst.getNumOperands() == (desc->has_dst_address ? 6 : 5));
+ int cur = 0;
+ Sregno dst_address;
+ if (desc->has_dst_address) {
+ TF_ASSIGN_OR_RETURN(dst_address, ToSregno(inst.getOperand(cur++)));
+ }
+ TF_ASSIGN_OR_RETURN(auto dst_sync_flagno,
+ ToSregnoOrImm(inst.getOperand(cur++)));
+ TF_ASSIGN_OR_RETURN(auto src_address, ToSregno(inst.getOperand(cur++)));
+ TF_ASSIGN_OR_RETURN(auto dma_length, ToSregnoOrImm(inst.getOperand(cur++)));
+ if (!desc->has_dst_address) {
+ // The MCInst doesn't contain a dst_address, but EmitDmaLocal() needs one,
+ // so we fake it.
+ dst_address = src_address;
+ }
+ auto stride = std::nullopt;
+ emitter->EmitDmaLocal(
+ desc->llo_opcode, SregnoOrVregno::MakeSregno(src_address), dma_length,
+ SregnoOrVregno::MakeSregno(dst_address), dst_sync_flagno, stride,
+ /*is_prefetch=*/false, /*is_vmem_broadcast=*/false);
+ return OkStatus();
+ }
+
+ // Handle vector reduce instructions.
+ if (auto desc = std::get_if<VectorReduceDesc>(&inst_desc)) {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ TF_ASSIGN_OR_RETURN(auto vsrc0, ToVregno(inst.getOperand(1)));
+ TF_RET_CHECK(inst.getOperand(0).isReg());
+ const int32_t trf_id = inst.getOperand(0).getReg() - llvm::TPU::TRF0;
+ const int32_t source_bus_id = desc->bus_id;
+ if (desc->is_segmented) {
+ emitter->EmitVectorSegmentedReduce(desc->llo_opcode, vsrc0, trf_id,
+ source_bus_id);
+ } else {
+ emitter->EmitVectorCrossLaneReduce(desc->llo_opcode, vsrc0, trf_id,
+ source_bus_id);
+ }
+ return OkStatus();
+ }
+
+ // Handle read register instructions.
+ if (auto desc = std::get_if<ReadRegisterDesc>(&inst_desc)) {
+ TF_RET_CHECK(inst.getNumOperands() == 3);
+ TF_ASSIGN_OR_RETURN(auto sdst, ToSregno(inst.getOperand(0)));
+ emitter->EmitScalarReadCycle(sdst, desc->llo_opcode);
+ return OkStatus();
+ }
+
+ TF_RET_CHECK(std::holds_alternative<std::monostate>(inst_desc));
+
+ // Handle everything else.
+ switch (inst.getOpcode()) {
+ case llvm::TPU::BRrel:
+ TF_RET_CHECK(inst.getNumOperands() == 3);
+ emitter->EmitScalarBranch(inst.getOperand(0).getImm(),
+ /*pc_relative=*/true);
+ break;
+ case llvm::TPU::CARRYOUTri:
+ case llvm::TPU::CARRYOUTri_S0:
+ case llvm::TPU::CARRYOUTri_S1:
+ case llvm::TPU::CARRYOUTrr:
+ case llvm::TPU::CARRYOUTrr_S0:
+ case llvm::TPU::CARRYOUTrr_S1: {
+ TF_RET_CHECK(inst.getNumOperands() == 5);
+ TF_ASSIGN_OR_RETURN(auto pdst, ToPregno(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto sx, ToSregno(inst.getOperand(1)));
+ TF_ASSIGN_OR_RETURN(auto sy, ToSregnoOrImm(inst.getOperand(2)));
+ emitter->EmitScalarAddCarryU32(pdst, sx, sy);
+ break;
+ }
+ case llvm::TPU::DMADesci:
+ case llvm::TPU::DMADescr: {
+ TF_RET_CHECK(inst.getNumOperands() == 3);
+ TF_ASSIGN_OR_RETURN(auto ssrc, ToSregnoOrImm(inst.getOperand(0)));
+ emitter->EmitDma(ssrc);
+ break;
+ }
+ case llvm::TPU::GetRngSeed: {
+ TF_RET_CHECK(inst.getNumOperands() == 3);
+ TF_ASSIGN_OR_RETURN(auto vdst, ToVregno(inst.getOperand(0)));
+ emitter->EmitVectorNullop(vdst, LloOpcode::kVectorGetRngSeed);
+ break;
+ }
+ case llvm::TPU::HALT:
+ TF_RET_CHECK(inst.getNumOperands() == 2);
+ if (instruction_predication.is_always_execute()) {
+ isa_emitter->EmitScalarHalt();
+ } else {
+ emitter->EmitScalarHaltOnError(/*message=*/"");
+ }
+ break;
+ case llvm::TPU::IAR0_SET_RAW:
+ case llvm::TPU::IAR1_SET_RAW: {
+ TF_ASSIGN_OR_RETURN(auto vsrc, ToVregno(inst.getOperand(1)));
+ const uint32_t iar = inst.getOpcode() == llvm::TPU::IAR0_SET_RAW ? 0 : 1;
+ emitter->EmitVectorSetIar(IsaEmitter::SetIarType::RAW, vsrc, iar);
+ } break;
+ case llvm::TPU::tcMXU0DWGN:
+ case llvm::TPU::tcMXU1DWGN:
+ case llvm::TPU::tcMXU2DWGN:
+ case llvm::TPU::tcMXU3DWGN: {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ const auto done_with_gains_mode = DoneWithGainsMode::kNormal;
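+      // The tcMXU<i>DWGN opcodes are assumed to be evenly spaced, so the MXU
+      // id is the opcode's offset from tcMXU0DWGN divided by the per-MXU
+      // stride.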
+ const int32_t mxu_id = (inst.getOpcode() - llvm::TPU::tcMXU0DWGN) /
+ (llvm::TPU::tcMXU1DWGN - llvm::TPU::tcMXU0DWGN);
+ emitter->EmitVectorDoneWithGains(done_with_gains_mode, mxu_id);
+ break;
+ }
+ case llvm::TPU::tcMXU0MATPOP:
+ case llvm::TPU::tcMXU1MATPOP:
+ case llvm::TPU::tcMXU2MATPOP:
+ case llvm::TPU::tcMXU3MATPOP: {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ TF_ASSIGN_OR_RETURN(auto vdst, ToVregno(inst.getOperand(0)));
+ const int32_t mxu_id =
+ (inst.getOpcode() - llvm::TPU::tcMXU0MATPOP) /
+ (llvm::TPU::tcMXU1MATPOP - llvm::TPU::tcMXU0MATPOP);
+ emitter->EmitVectorMatres(vdst, mxu_id);
+ break;
+ }
+ case llvm::TPU::PMOV: {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ TF_ASSIGN_OR_RETURN(auto pdst, ToPregno(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto psrc, ToPregno(inst.getOperand(1)));
+ emitter->EmitPredicateMove(pdst, psrc);
+ break;
+ }
+ case llvm::TPU::POR: {
+ TF_RET_CHECK(inst.getNumOperands() == 7);
+ TF_ASSIGN_OR_RETURN(auto pdst, ToPregno(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto plhs, ToPredication(inst, /*operand_index=*/1));
+ TF_ASSIGN_OR_RETURN(auto prhs, ToPredication(inst, /*operand_index=*/3));
+ emitter->EmitPredicateOr(pdst, plhs, prhs);
+ break;
+ }
+ case llvm::TPU::PORii: {
+ TF_RET_CHECK(inst.getNumOperands() == 5);
+ TF_ASSIGN_OR_RETURN(auto pdst, ToPregno(inst.getOperand(0)));
+ TF_RET_CHECK(inst.getOperand(1).isImm());
+ TF_RET_CHECK(inst.getOperand(2).isImm());
+ const bool value =
+ inst.getOperand(1).getImm() != 0 || inst.getOperand(2).getImm() != 0;
+ emitter->EmitPredicateImmediate(pdst, value);
+ break;
+ }
+ case llvm::TPU::SFENCE:
+ TF_RET_CHECK(inst.getNumOperands() == 2);
+ emitter->EmitScalarFence();
+ break;
+ case llvm::TPU::SLDi: {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ TF_ASSIGN_OR_RETURN(auto sdst, ToSregno(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto base, ToSregnoOrImm(inst.getOperand(1)));
+ emitter->EmitScalarLoad(sdst, base);
+ break;
+ }
+ case llvm::TPU::SLDri:
+ case llvm::TPU::SLDrr: {
+ TF_RET_CHECK(inst.getNumOperands() == 5);
+ TF_ASSIGN_OR_RETURN(auto sdst, ToSregno(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto base, ToSregnoOrImm(inst.getOperand(2)));
+ TF_ASSIGN_OR_RETURN(auto offset, ToSregno(inst.getOperand(1)));
+ emitter->EmitScalarLoad(sdst, base, offset);
+ break;
+ }
+ case llvm::TPU::SPOP_V2SF: {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ TF_ASSIGN_OR_RETURN(auto sdst, ToSregno(inst.getOperand(0)));
+ emitter->EmitScalarV2SPop(sdst);
+ break;
+ }
+ case llvm::TPU::SSTi:
+ case llvm::TPU::SSTr: {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ TF_ASSIGN_OR_RETURN(auto ssrc, ToSregno(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto adst, ToSregnoOrImm(inst.getOperand(1)));
+ emitter->EmitScalarStore(adst, ssrc);
+ break;
+ }
+ case llvm::TPU::tcSYNCADD_REMOTEii:
+ case llvm::TPU::tcSYNCADD_REMOTEir:
+ case llvm::TPU::tcSYNCADD_REMOTEri:
+ case llvm::TPU::tcSYNCADD_REMOTErr: {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ TF_ASSIGN_OR_RETURN(auto sync_flag_no, ToSregnoOrImm(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto addend, ToSregnoOrImm(inst.getOperand(1)));
+ emitter->EmitVectorSyncFlagAddRemote(sync_flag_no, addend);
+ break;
+ }
+ case llvm::TPU::tcSYNCADDii:
+ case llvm::TPU::tcSYNCADDir:
+ case llvm::TPU::tcSYNCADDri:
+ case llvm::TPU::tcSYNCADDrr: {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ TF_ASSIGN_OR_RETURN(auto sync_flag_no, ToSregnoOrImm(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto addend, ToSregnoOrImm(inst.getOperand(1)));
+ emitter->EmitVectorSyncFlagAdd(sync_flag_no, addend);
+ break;
+ }
+ case llvm::TPU::tcSYNCSET_REMOTEii:
+ case llvm::TPU::tcSYNCSET_REMOTEir:
+ case llvm::TPU::tcSYNCSET_REMOTEri:
+ case llvm::TPU::tcSYNCSET_REMOTErr: {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ TF_ASSIGN_OR_RETURN(auto sync_flag_no, ToSregnoOrImm(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto value, ToSregnoOrImm(inst.getOperand(1)));
+ emitter->EmitVectorSyncFlagSetRemote(sync_flag_no, value);
+ break;
+ }
+ case llvm::TPU::tcSYNCSETii:
+ case llvm::TPU::tcSYNCSETir:
+ case llvm::TPU::tcSYNCSETri:
+ case llvm::TPU::tcSYNCSETrr: {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ TF_ASSIGN_OR_RETURN(auto sync_flag_no, ToSregnoOrImm(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto value, ToSregnoOrImm(inst.getOperand(1)));
+ emitter->EmitVectorSyncFlagSet(sync_flag_no, value,
+ /*update_done_to=*/{});
+ break;
+ }
+ case llvm::TPU::tcVSETTMi:
+ case llvm::TPU::tcVSETTMr: {
+ TF_RET_CHECK(inst.getNumOperands() == 3);
+ TF_ASSIGN_OR_RETURN(auto value, ToSregnoOrImm(inst.getOperand(0)));
+ emitter->EmitVectorSetTracemark(value);
+ break;
+ }
+ case llvm::TPU::tcVTRACEi:
+ case llvm::TPU::tcVTRACEr: {
+ TF_RET_CHECK(inst.getNumOperands() == 3);
+ TF_ASSIGN_OR_RETURN(auto value, ToSregnoOrImm(inst.getOperand(0)));
+ emitter->EmitVectorTrace(value);
+ break;
+ }
+ case llvm::TPU::VCARRYOUTri:
+ case llvm::TPU::VCARRYOUTri_V0:
+ case llvm::TPU::VCARRYOUTri_V1:
+ case llvm::TPU::VCARRYOUTri_V2:
+ case llvm::TPU::VCARRYOUTrr:
+ case llvm::TPU::VCARRYOUTrr_V0:
+ case llvm::TPU::VCARRYOUTrr_V1:
+ case llvm::TPU::VCARRYOUTrr_V2: {
+ TF_RET_CHECK(inst.getNumOperands() == 5);
+ TF_ASSIGN_OR_RETURN(auto vmdst, ToVmregno(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto vx, ToVregno(inst.getOperand(1)));
+ TF_ASSIGN_OR_RETURN(auto vy_variant, ToVregnoOrImm(inst.getOperand(2)));
+ emitter->EmitVectorAddCarryU32(vmdst, vx, vy_variant);
+ break;
+ }
+ case llvm::TPU::VCARRYOUTrs:
+ case llvm::TPU::VCARRYOUTrs_V0:
+ case llvm::TPU::VCARRYOUTrs_V1:
+ case llvm::TPU::VCARRYOUTrs_V2: {
+ TF_RET_CHECK(inst.getNumOperands() == 5);
+ TF_ASSIGN_OR_RETURN(auto vmdst, ToVmregno(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto vx, ToVregno(inst.getOperand(1)));
+ TF_ASSIGN_OR_RETURN(auto vy_variant, ToSregno(inst.getOperand(2)));
+ emitter->EmitVectorAddCarryU32(vmdst, vx, vy_variant);
+ break;
+ }
+ case llvm::TPU::VDELAY:
+ case llvm::TPU::VDELAY_LONG: {
+ TF_RET_CHECK(inst.getNumOperands() == 3);
+ TF_ASSIGN_OR_RETURN(auto ssrc, ToSregnoOrImm(inst.getOperand(0)));
+ emitter->EmitVectorDelay(ssrc);
+ break;
+ }
+ case llvm::TPU::VIMMF:
+ case llvm::TPU::VIMMF_V0:
+ case llvm::TPU::VIMMF_V1:
+ case llvm::TPU::VIMMF_V2: {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ TF_ASSIGN_OR_RETURN(auto vdst, ToVregno(inst.getOperand(0)));
+ emitter->EmitVectorImmediateF32(
+ vdst, absl::bit_cast<double>(inst.getOperand(1).getDFPImm()));
+ break;
+ }
+ case llvm::TPU::VIMMI:
+ case llvm::TPU::VIMMI_V0:
+ case llvm::TPU::VIMMI_V1:
+ case llvm::TPU::VIMMI_V2: {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ TF_ASSIGN_OR_RETURN(auto vdst, ToVregno(inst.getOperand(0)));
+ emitter->EmitVectorImmediateU32(vdst, inst.getOperand(1).getImm());
+ break;
+ }
+ case llvm::TPU::VLANESEQ:
+ case llvm::TPU::VLANESEQ_V0:
+ case llvm::TPU::VLANESEQ_V1: {
+ TF_RET_CHECK(inst.getNumOperands() == 3);
+ TF_ASSIGN_OR_RETURN(auto vdst, ToVregno(inst.getOperand(0)));
+ emitter->EmitVectorNullop(vdst, LloOpcode::kVectorLaneSequence);
+ break;
+ }
+ case llvm::TPU::VMAND: {
+ TF_RET_CHECK(inst.getNumOperands() == 5);
+ TF_ASSIGN_OR_RETURN(auto vmdst, ToVmregno(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto vmx, ToVmregno(inst.getOperand(1)));
+ TF_ASSIGN_OR_RETURN(auto vmy, ToVmregno(inst.getOperand(2)));
+ emitter->EmitVectorMaskBinop(vmdst, LloOpcode::kVectorMaskAnd, vmx, vmy);
+ break;
+ }
+ case llvm::TPU::VMMOV: {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ TF_ASSIGN_OR_RETURN(auto vmdst, ToVmregno(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto vmsrc, ToVmregno(inst.getOperand(1)));
+ emitter->EmitVectorMaskUnop(vmdst, LloOpcode::kVectorMaskMove, vmsrc);
+ break;
+ }
+ case llvm::TPU::VMOR: {
+ TF_RET_CHECK(inst.getNumOperands() == 5);
+ TF_ASSIGN_OR_RETURN(auto vmdst, ToVmregno(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto vmx, ToVmregno(inst.getOperand(1)));
+ TF_ASSIGN_OR_RETURN(auto vmy, ToVmregno(inst.getOperand(2)));
+ emitter->EmitVectorMaskBinop(vmdst, LloOpcode::kVectorMaskOr, vmx, vmy);
+ break;
+ }
+ case llvm::TPU::VMOVr:
+ case llvm::TPU::VMOVr_V0:
+ case llvm::TPU::VMOVr_V1:
+ case llvm::TPU::VMOVr_V2: {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ TF_ASSIGN_OR_RETURN(auto vdst, ToVregno(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto vsrc, ToVregnoOrImm(inst.getOperand(1)));
+ emitter->EmitVectorMove(vdst, vsrc);
+ break;
+ }
+ case llvm::TPU::VMOVs:
+ case llvm::TPU::VMOVs_V0:
+ case llvm::TPU::VMOVs_V1:
+ case llvm::TPU::VMOVs_V2: {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ TF_ASSIGN_OR_RETURN(auto vdst, ToVregno(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto ssrc, ToSregno(inst.getOperand(1)));
+ emitter->EmitScalarToVector(vdst, ssrc);
+ break;
+ }
+ case llvm::TPU::VMXOR: {
+ TF_RET_CHECK(inst.getNumOperands() == 5);
+ TF_ASSIGN_OR_RETURN(auto vmdst, ToVmregno(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto vmx, ToVmregno(inst.getOperand(1)));
+ TF_ASSIGN_OR_RETURN(auto vmy, ToVmregno(inst.getOperand(2)));
+ emitter->EmitVectorMaskBinop(vmdst, LloOpcode::kVectorMaskXor, vmx, vmy);
+ break;
+ }
+ case llvm::TPU::VNOP:
+ TF_RET_CHECK(inst.getNumOperands() == 2);
+ emitter->EmitVectorNop();
+ break;
+ case llvm::TPU::VPUSH: {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ TF_ASSIGN_OR_RETURN(auto vsrc, ToVregno(inst.getOperand(1)));
+ emitter->EmitVectorToScalarPush(vsrc);
+ break;
+ }
+ case llvm::TPU::VRES_EUP_VRES0:
+ case llvm::TPU::VRES_EUP_VRES0_V0:
+ case llvm::TPU::VRES_EUP_VRES0_V1:
+ case llvm::TPU::VRES_EUP_VRES0_VAUX:
+ case llvm::TPU::VRES_EUP_VRES0_VLD: {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ TF_ASSIGN_OR_RETURN(auto vdst, ToVregno(inst.getOperand(0)));
+ emitter->EmitVectorNullop(vdst, LloOpcode::kVectorEupResult);
+ break;
+ }
+ case llvm::TPU::VRng: {
+ TF_RET_CHECK(inst.getNumOperands() == 3);
+ TF_ASSIGN_OR_RETURN(auto vdst, ToVregno(inst.getOperand(0)));
+ emitter->EmitVectorNullop(vdst, LloOpcode::kVectorPrng);
+ break;
+ }
+ case llvm::TPU::VSELir:
+ case llvm::TPU::VSELir_V0:
+ case llvm::TPU::VSELir_V1:
+ case llvm::TPU::VSELir_V2:
+ case llvm::TPU::VSELrr:
+ case llvm::TPU::VSELrr_V0:
+ case llvm::TPU::VSELrr_V1:
+ case llvm::TPU::VSELrr_V2: {
+ TF_RET_CHECK(inst.getNumOperands() == 6);
+ TF_ASSIGN_OR_RETURN(auto vdst, ToVregno(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto vmsrc, ToVmregno(inst.getOperand(1)));
+ TF_ASSIGN_OR_RETURN(auto vtrue_variant,
+ ToVregnoOrImm(inst.getOperand(2)));
+ TF_ASSIGN_OR_RETURN(auto vfalse, ToVregno(inst.getOperand(3)));
+ emitter->EmitVectorSelect(vdst, vmsrc, vtrue_variant, vfalse);
+ break;
+ }
+ case llvm::TPU::VSELsr:
+ case llvm::TPU::VSELsr_V0:
+ case llvm::TPU::VSELsr_V1:
+ case llvm::TPU::VSELsr_V2: {
+ TF_RET_CHECK(inst.getNumOperands() == 6);
+ TF_ASSIGN_OR_RETURN(auto vdst, ToVregno(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto vmsrc, ToVmregno(inst.getOperand(1)));
+ TF_ASSIGN_OR_RETURN(auto vtrue_variant, ToSregno(inst.getOperand(2)));
+ TF_ASSIGN_OR_RETURN(auto vfalse, ToVregno(inst.getOperand(3)));
+ emitter->EmitVectorSelect(vdst, vmsrc, vtrue_variant, vfalse);
+ break;
+ }
+ case llvm::TPU::VSUBLANE_MASK: {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ TF_ASSIGN_OR_RETURN(auto vmdst, ToVmregno(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto vy, ToVregnoOrImm(inst.getOperand(1)));
+ emitter->EmitVectorCreateMask(vmdst, LloOpcode::kVectorCreateSublaneMask,
+ vy);
+ break;
+ }
+ case llvm::TPU::VSYNCMOVEi:
+ case llvm::TPU::VSYNCMOVEr: {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ TF_ASSIGN_OR_RETURN(auto ssf, ToSregnoOrImm(inst.getOperand(1)));
+ emitter->EmitSyncFlagToScalarPush(ssf);
+ break;
+ }
+ case llvm::TPU::VWEIRD: {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ TF_ASSIGN_OR_RETURN(auto vmdst, ToVmregno(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto vx, ToVregno(inst.getOperand(1)));
+ emitter->EmitVectorWeird(vmdst, vx, PrimitiveType::F32);
+ break;
+ }
+ case llvm::TPU::WEIRD:
+ case llvm::TPU::WEIRD_S0:
+ case llvm::TPU::WEIRD_S1: {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ TF_ASSIGN_OR_RETURN(auto pdst, ToPregno(inst.getOperand(0)));
+ TF_ASSIGN_OR_RETURN(auto sx, ToSregno(inst.getOperand(1)));
+ emitter->EmitScalarWeird(pdst, sx);
+ break;
+ }
+ case llvm::TPU::tcXLU0B0PERMUTE: {
+ TF_RET_CHECK(inst.getNumOperands() == 5);
+ TF_ASSIGN_OR_RETURN(auto vsrc, ToVregno(inst.getOperand(1)));
+      TF_RET_CHECK(inst.getOperand(2).isReg());
+      const int32_t unit_id = inst.getOperand(2).getReg() - llvm::TPU::PCR0;
+ const int32_t source_bus_id = 0; // B0
+ emitter->EmitVectorPermute(vsrc, unit_id, source_bus_id);
+ break;
+ }
+ case llvm::TPU::tcXLU0B0ROTATEi:
+ case llvm::TPU::tcXLU0B0ROTATEr: {
+ TF_RET_CHECK(inst.getNumOperands() == 5);
+ TF_ASSIGN_OR_RETURN(auto vsrc, ToVregno(inst.getOperand(1)));
+ TF_ASSIGN_OR_RETURN(auto specifier, ToSregnoOrImm(inst.getOperand(2)));
+ TF_RET_CHECK(inst.getOperand(0).isReg());
+ const int32_t trf_id = inst.getOperand(0).getReg() - llvm::TPU::TRF0;
+ const int32_t source_bus_id = 0; // B0
+ emitter->EmitVectorRotate(vsrc, specifier, jellyfish::BitDataFormat::kB32,
+ trf_id, source_bus_id);
+ break;
+ }
+ case llvm::TPU::tcXLU0B0SETPERMUTE_SUBLANE: {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ TF_ASSIGN_OR_RETURN(auto vsrc, ToVregno(inst.getOperand(1)));
+ TF_RET_CHECK(inst.getOperand(0).isReg());
+ const int32_t pcr_id = inst.getOperand(0).getReg() - llvm::TPU::PCR0;
+ const int32_t source_bus_id = 0; // B0
+ emitter->EmitVectorSetPermutePattern(vsrc, SetPermuteMode::kOneSublane,
+ pcr_id, source_bus_id);
+ break;
+ }
+ case llvm::TPU::tcXLU0B0TRANSPOSE:
+ case llvm::TPU::tcXLU0B0TRANSPOSE_END: {
+ TF_RET_CHECK(inst.getNumOperands() == 7);
+ TF_ASSIGN_OR_RETURN(auto vsrc, ToVregno(inst.getOperand(1)));
+ TF_RET_CHECK(inst.getOperand(0).isReg());
+ const int32_t trf_id = inst.getOperand(0).getReg() - llvm::TPU::TRF0;
+ const int32_t source_bus_id = 0; // B0
+ TF_RET_CHECK(inst.getOperand(2).isImm());
+ const int32_t width = inst.getOperand(2).getImm();
+ const bool start = false;
+ const bool end = inst.getOpcode() == llvm::TPU::tcXLU0B0TRANSPOSE_END;
+ emitter->EmitVectorTranspose(vsrc, jellyfish::VxposeMode::kB32, trf_id,
+ source_bus_id, width, start, end);
+ break;
+ }
+ case llvm::TPU::tcXLU0Pop: {
+ TF_RET_CHECK(inst.getNumOperands() == 4);
+ TF_RET_CHECK(inst.getOperand(1).isReg());
+ const int32_t trf_id = inst.getOperand(1).getReg() - llvm::TPU::TRF0;
+ TF_ASSIGN_OR_RETURN(auto vdst, ToVregno(inst.getOperand(0)));
+ emitter->EmitVectorXlures(vdst, trf_id);
+ break;
+ }
+ default:
+      // Access the predication; otherwise the emitter object's dtor will
+      // explicitly CHECK-fail.
+ emitter->predication();
+ return Unimplemented("Unhandled opcode in llvm TPU program processor %d",
+ inst.getOpcode());
+ }
+
+ return OkStatus();
+} // NOLINT(readability/fn_size)
+
+namespace {
+
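+// Scope of reported spill slots: kWholeProgram spills are checked against the
+// maximum available spill regions (see CheckForSpillRegionsOverflow), while
+// kCrossCalls spills shrink the top-level program's spill regions (see
+// AdjustTlpSpillRegions).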
+enum class SpillScope { kWholeProgram, kCrossCalls };
+using MemorySpaceAndScope = std::pair<MemorySpace, SpillScope>;
+
+std::string SpillScopeToString(SpillScope scope) {
+ switch (scope) {
+ case SpillScope::kCrossCalls:
+ return "cross-call";
+ case SpillScope::kWholeProgram:
+ return "whole-program";
+ }
+}
+
+// McCodeBase is the abstract base class shared by the McCode classes below.
+class McCodeBase : public AbstractMcCode {
+ public:
+ using McBundle = std::pair<llvm::MCInst, SourceLocationList>;
+
+ McCodeBase(std::vector<std::pair<llvm::MCSymbol*, int64_t>>&& symbols,
+ std::map<int64_t, int64_t>&& long_branches)
+ : symbols_(std::move(symbols)),
+ long_branches_(std::move(long_branches)) {}
+
+ protected:
+ using BundleWithIndexHandler = std::function<Status(int64_t, McBundle*)>;
+ virtual Status ForEachBundleWithIndex(
+ const BundleWithIndexHandler& handler) = 0;
+
+ // Resolves and materializes branches.
+ Status ResolveBranches();
+
+ virtual llvm::MCContext* context() = 0;
+
+ // All symbols with their absolute offsets.
+ std::vector<std::pair<llvm::MCSymbol*, int64_t>> symbols_;
+
+  // Contains branches whose relative offset lies outside the int16_t range,
+  // so that this information can later be reflected in the IsaProgram. The
+  // int16_t range is somewhat arbitrary, but it is what the XLA compiler and
+  // other components working with IsaProgram use. Collected in
+  // ResolveBranches() and consumed in FinalizeAsTlpAndConsume().
+ std::map<int64_t, int64_t> long_branches_;
+};
+
+// McCode represents a program as a sequence of MC instructions, allowing
+// analysis, transformation, and final conversion into a TPU ISA program. It
+// owns the MC context and the other objects used for the program.
+class McCode : public McCodeBase {
+ public:
+ friend class CompositeMcCode;
+ friend class BundleFiller;
+
+ McCode(llvm::TargetMachine* target_machine, const IsaProgramTarget& target);
+
+ ~McCode() override;
+
+ Status EmitInstruction(const llvm::MCInst& inst);
+ Status EmitLabel(llvm::MCSymbol* symbol);
+ Status EmitSpillSlots(MemorySpaceAndScope space_and_scope, int64_t slots);
+
+ Status AdjustTlpSpillRegions(
+ SpillRegionCollection* spill_regions,
+ const std::function<int64_t(MemorySpace)>& get_spill_size) override;
+ int64_t GetBundleCount() const override { return bundles_.size(); }
+ uint32_t GetStaticInsertedThrottleCycles() const override {
+ return inserted_throttle_cycles_;
+ }
+ void SetStaticInsertedThrottleCycles(uint32_t value) {
+ inserted_throttle_cycles_ = value;
+ }
+ Status CheckForSpillRegionsOverflow(
+ const SpillRegionCollection& max_spill_regions,
+ const std::function<int64_t(MemorySpace)>& get_slot_size,
+ int64_t* vmem_overflow_slots) override;
+
+ StatusOr<std::set<std::string>> GetHloReferences() override;
+ Status FinalizeAsInlinee() override;
+ StatusOrMcCode PerformInlining(HloMcCodeProvider mc_code_provider) override;
+ StatusOrIsaProgram FinalizeAsTlpAndConsume(
+ int64_t num_threads, std::vector<SourceLocationList>* source_locations,
+ LloModule* module) override {
+ LOG(FATAL) << "Unreachable";
+ }
+
+ llvm::MCContext* context() override { return context_.get(); }
+ Status ForEachBundleWithIndex(const BundleWithIndexHandler& handler) override;
+
+ void ClearBundle(llvm::MCInst* bundle);
+
+ private:
+  // Deep-copies bundles and symbols into the MC context associated with this
+  // MC code.
+ StatusOr<llvm::MCInst> DeepCopyBundle(const llvm::MCInst& bundle,
+ SourceLocationList* locations);
+ StatusOr<llvm::MCInst*> DeepCopyInstr(const llvm::MCInst* inst,
+ SourceLocationList* locations);
+ Status DeepCopyOperand(llvm::MCOperand* operand);
+ StatusOr<const llvm::MCExpr*> DeepCopyExpr(const llvm::MCExpr*);
+ StatusOr<llvm::MCSymbol*> DeepCopySymbol(const llvm::MCSymbol*);
+
+ Status RemoveReturnScalarHalt();
+
+ Status ForEachMcInst(
+ const std::function<Status(const llvm::MCInst&)>& handler);
+
+ const IsaProgramTarget target_;
+ llvm::TargetMachine* const target_machine_;
+ std::unique_ptr<llvm::MCObjectFileInfo> object_file_info_;
+ llvm::SourceMgr source_mgr_;
+ std::unique_ptr<llvm::MCContext> context_;
+
+ std::vector<McBundle> bundles_;
+
+  // Number of emitted spill slots per memory space and scope, if reported.
+ absl::flat_hash_map<MemorySpaceAndScope, int64_t> spill_slots_;
+
+  // Number of compiler-inserted static throttle cycles.
+ uint32_t inserted_throttle_cycles_ = 0;
+
+  // Mapping from symbols coming from the streamer to their clones in the MC
+  // code context.
+ absl::flat_hash_map<const llvm::MCSymbol*, llvm::MCSymbol*> symbol_map_;
+};
+
+// CompositeMcCode represents a program as a sequence of fragments from other
+// McCode objects; it is used to efficiently represent the result of inlining.
+class CompositeMcCode : public McCodeBase {
+ public:
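+  // A fragment references the half-open range [first, second) of bundles
+  // owned by a McCode object.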
+ using McBundleRange = std::pair<int64_t, int64_t>;
+ using McCodeFragment = std::pair<McCode*, McBundleRange>;
+ using McCodeFragmentList = std::vector<McCodeFragment>;
+
+ CompositeMcCode(McCode* tlp_mc_code, llvm::TargetMachine* target_machine,
+ const IsaProgramTarget& target,
+ McCodeFragmentList&& fragments,
+ std::vector<std::pair<llvm::MCSymbol*, int64_t>>&& symbols,
+ std::map<int64_t, int64_t>&& long_branches)
+ : McCodeBase(std::move(symbols), std::move(long_branches)),
+ target_(target),
+ target_machine_(target_machine),
+ tlp_mc_code_(tlp_mc_code),
+ fragments_(std::move(fragments)) {}
+
+ int64_t GetBundleCount() const override {
+ int64_t size = 0;
+ for (const auto& fragment : fragments_) {
+ size += fragment.second.second - fragment.second.first;
+ }
+ return size;
+ }
+
+ Status AdjustTlpSpillRegions(
+ SpillRegionCollection* spill_regions,
+ const std::function<int64_t(MemorySpace)>& get_spill_size) override {
+ LOG(FATAL) << "Unreachable";
+ }
+
+ Status CheckForSpillRegionsOverflow(
+ const SpillRegionCollection& max_spill_regions,
+ const std::function<int64_t(MemorySpace)>& get_slot_size,
+ int64_t* vmem_overflow_slots) override {
+ LOG(FATAL) << "Unreachable";
+ }
+
+ StatusOr<std::set<std::string>> GetHloReferences() override {
+ LOG(FATAL) << "Unreachable";
+ }
+
+ Status FinalizeAsInlinee() override { LOG(FATAL) << "Unreachable"; }
+
+ StatusOrMcCode PerformInlining(HloMcCodeProvider mc_code_provider) override {
+ LOG(FATAL) << "Unreachable";
+ }
+
+ uint32_t GetStaticInsertedThrottleCycles() const override {
+ LOG(FATAL) << "Unreachable";
+ }
+
+ StatusOrIsaProgram FinalizeAsTlpAndConsume(
+ int64_t num_threads, std::vector<SourceLocationList>* source_locations,
+ LloModule* module) override;
+
+ llvm::MCContext* context() override { return tlp_mc_code_->context(); }
+
+ Status ForEachBundleWithIndex(const BundleWithIndexHandler& handler) override;
+
+ private:
+  // Create the ISA program bundles from the MC instructions (directly or via
+  // the parser fallback).
+ Status CreateProgramBundles(ProgramGenerator* generator,
+ llvm::MCInstPrinter* inst_printer,
+ int64_t num_threads);
+
+ const IsaProgramTarget target_;
+ llvm::TargetMachine* const target_machine_;
+
+ McCode* const tlp_mc_code_;
+
+ McCodeFragmentList fragments_;
+};
+
+McCode::McCode(llvm::TargetMachine* target_machine,
+ const IsaProgramTarget& target)
+ : McCodeBase({}, {}),
+ target_(target),
+ target_machine_(target_machine),
+ context_(std::make_unique<llvm::MCContext>(
+ target_machine->getTargetTriple(), target_machine->getMCAsmInfo(),
+ target_machine->getMCRegisterInfo(),
+ target_machine_->getMCSubtargetInfo(), &source_mgr_)) {
+ object_file_info_.reset(target_machine_->getTarget().createMCObjectFileInfo(
+ *context_, /*PIC=*/false));
+ context_->setObjectFileInfo(object_file_info_.get());
+ // Use DWARF-5 so we can use the extended .file directive to embed source.
+ context_->setDwarfVersion(5);
+ context_->setUseNamesOnTempLabels(true);
+ source_mgr_.AddNewSourceBuffer(
+ llvm::MemoryBuffer::getMemBuffer(/*InputData=*/"", /*BufferName=*/"",
+ /*RequiresNullTerminator=*/false),
+ llvm::SMLoc());
+}
+
+void McCode::ClearBundle(llvm::MCInst* bundle) {
+ for (auto& operand : *bundle) {
+ CHECK(operand.isInst());
+ delete operand.getInst();
+ }
+ bundle->clear();
+}
+
+McCode::~McCode() {
+ for (auto& [bundle, locations] : bundles_) {
+ ClearBundle(&bundle);
+ }
+}
+
+Status McCode::ForEachMcInst(
+ const std::function<Status(const llvm::MCInst&)>& handler) {
+ for (const auto& [bundle, location] : bundles_) {
+ TF_RET_CHECK(bundle.getOpcode() == llvm::TPU::BUNDLE);
+ for (const llvm::MCOperand& inst : bundle) {
+ TF_RET_CHECK(inst.isInst());
+ TF_RETURN_IF_ERROR(handler(*inst.getInst()));
+ }
+ }
+ return OkStatus();
+}
+
+Status McCode::AdjustTlpSpillRegions(
+ SpillRegionCollection* spill_regions,
+ const std::function<int64_t(MemorySpace)>& get_spill_size) {
+ TF_RET_CHECK(spill_regions != nullptr);
+
+ constexpr SpillScope kScope = SpillScope::kCrossCalls;
+
+ for (auto& [space, region] : *spill_regions) {
+ TF_RET_CHECK(space == MemorySpace::kVmem || space == MemorySpace::kSmem);
+ const auto it = spill_slots_.find({space, kScope});
+ TF_RET_CHECK(it != spill_slots_.end())
+ << "Spill count for scope "
+ << SpillScopeToString(SpillScope::kCrossCalls) << " in memory space "
+ << space << " not found.";
+ const int64_t used_words = it->second * get_spill_size(space);
+ TF_RET_CHECK(region.first <= region.second - used_words)
+ << "Reported count of " << it->second << " slots in memory space "
+ << space << " does not fit into spill region [" << region.first << ".."
+ << region.second << ")";
+ region.second -= used_words;
+ }
+
+ return OkStatus();
+}
+
+Status McCode::CheckForSpillRegionsOverflow(
+ const SpillRegionCollection& max_spill_regions,
+ const std::function<int64_t(MemorySpace)>& get_slot_size,
+ int64_t* vmem_overflow_slots) {
+ if (vmem_overflow_slots != nullptr) {
+ *vmem_overflow_slots = 0;
+ }
+ for (const auto& [space_and_scope, count] : spill_slots_) {
+ const auto& [space, scope] = space_and_scope;
+ if (count > 0 && scope == SpillScope::kWholeProgram) {
+ const auto it = max_spill_regions.find(space);
+ if (it == max_spill_regions.end()) {
+ return Cancelled("%d spills detected in %s with no region available",
+ count, MemorySpaceToString(space));
+ }
+ const int64_t lower_bound = it->second.first,
+ upper_bound = it->second.second;
+ const int64_t slots_available =
+ (upper_bound - lower_bound) / get_slot_size(space);
+ if (count > slots_available) {
+ if (space == MemorySpace::kVmem) {
+ if (vmem_overflow_slots != nullptr) {
+ *vmem_overflow_slots = count - slots_available;
+ }
+ return ResourceExhausted(
+ "%d spills detected in %s while only %d available, region [%d, "
+ "%d)",
+ count, MemorySpaceToString(space), slots_available, lower_bound,
+ upper_bound);
+ }
+ return Cancelled(
+ "%d spills detected in %s while only %d available, region [%d, "
+ "%d)",
+ count, MemorySpaceToString(space), slots_available, lower_bound,
+ upper_bound);
+ }
+ }
+ }
+ return OkStatus();
+}
+
+StatusOr<std::set<std::string>> McCode::GetHloReferences() {
+ std::set<std::string> result;
+ TF_RETURN_IF_ERROR(ForEachMcInst([&](const llvm::MCInst& inst) -> Status {
+ if (IsCall(inst)) {
+ TF_ASSIGN_OR_RETURN(std::string name, GetCalleeSymbolName(inst));
+ result.insert(name);
+ }
+ return OkStatus();
+ }));
+ return result;
+}
+
+Status McCode::EmitInstruction(const llvm::MCInst& inst) {
+ SourceLocationList locations;
+ TF_ASSIGN_OR_RETURN(llvm::MCInst cloned_bundle,
+ DeepCopyBundle(inst, &locations));
+ bundles_.push_back({cloned_bundle, std::move(locations)});
+ return OkStatus();
+}
+
+Status McCode::EmitLabel(llvm::MCSymbol* symbol) {
+ TF_ASSIGN_OR_RETURN(llvm::MCSymbol * local_symbol, DeepCopySymbol(symbol));
+ symbols_.emplace_back(local_symbol, bundles_.size());
+ return OkStatus();
+}
+
+Status McCode::EmitSpillSlots(MemorySpaceAndScope space_and_scope,
+ int64_t slots) {
+ const auto result = spill_slots_.emplace(space_and_scope, slots);
+ TF_RET_CHECK(result.second)
+ << "Scope " << SpillScopeToString(space_and_scope.second)
+ << " in memory space " << space_and_scope.first << " is already reported";
+ return OkStatus();
+}
+
+StatusOrIsaProgram CompositeMcCode::FinalizeAsTlpAndConsume(
+ int64_t num_threads, std::vector<SourceLocationList>* source_locations,
+ LloModule* module) {
+ constexpr unsigned kSyntaxVariantWithoutSlotAssignments = 1;
+ // Create printer to be used.
+ std::unique_ptr<llvm::MCInstPrinter> inst_printer(
+ target_machine_->getTarget().createMCInstPrinter(
+ target_machine_->getTargetTriple(),
+ kSyntaxVariantWithoutSlotAssignments,
+ *target_machine_->getMCAsmInfo(), *target_machine_->getMCInstrInfo(),
+ *target_machine_->getMCRegisterInfo()));
+ TF_RET_CHECK(static_cast<bool>(inst_printer));
+
+ TF_RETURN_IF_ERROR(ResolveBranches());
+
+ TF_ASSIGN_OR_RETURN(auto generator,
+ ProgramGenerator::Create(target_, module));
+ TF_RETURN_IF_ERROR(
+ CreateProgramBundles(generator.get(), inst_printer.get(), num_threads));
+ TF_ASSIGN_OR_RETURN(auto program, generator->FinalizeProgramProto());
+
+ // Set up long branches.
+ auto* const long_branch_targets = program->mutable_branch_target_map();
+ for (const auto [source, target] : long_branches_) {
+ TF_RET_CHECK(long_branch_targets->find(source) ==
+ long_branch_targets->end());
+ (*long_branch_targets)[source] = target;
+ }
+
+ TF_RETURN_IF_ERROR(ForEachBundleWithIndex(
+ [&](int64_t bundle_number, McBundle* mc_bundle) -> Status {
+ (*source_locations).push_back(mc_bundle->second);
+ return OkStatus();
+ }));
+ return std::move(program);
+}
+
+// Class responsible for creating ISA bundle(s) from the MCInst identified by
+// an MC code object and an instruction index.
+class BundleFiller {
+ public:
+ BundleFiller(McCode* mc_code, int32_t index)
+ : mc_code_(mc_code), index_(index) {}
+
+ StatusOr<bool> AddBundle(ProgramGenerator* generator,
+ llvm::MCInstPrinter* inst_printer,
+ const llvm::MCSubtargetInfo* subtarget_info,
+ int64_t bundle_number,
+ const std::function<Status()>& bundle_cleanup) {
+ {
+ absl::MutexLock lock(&mutex_);
+ if (bundle_number_ < 0) {
+ // Convert MCInst into bundle just once.
+
+ Status st = generator->AppendBundleFromMcBundle(
+ bundle_number, mc_code_->bundles_[index_].first);
+ if (!st.ok()) {
+ if (generator->module()->comp_env().xla_llvm_isa_emitter() &&
+ generator->module()->comp_env().xla_llvm_isa_emitter_force()) {
+ return st;
+ }
+ // Fall back to parser-based implementation.
+ VLOG(3) << "Failed to generate proto bundle from MC bundle: "
+ << st.ToString();
+
+          // Print the resolved bundle to an assembly string.
+ std::string printed_bundle;
+ {
+ llvm::raw_string_ostream os(printed_bundle);
+ inst_printer->printInst(&mc_code_->bundles_[index_].first,
+ bundle_number,
+ /*Annot=*/"", *subtarget_info, os);
+ }
+
+ // Parse the bundle.
+ TF_RETURN_IF_ERROR(
+ generator->AppendBundleFromText(bundle_number, &printed_bundle));
+ }
+
+        // Since we only convert the bundle once, we can clear it now.
+ TF_RETURN_IF_ERROR(bundle_cleanup());
+
+        // Store the number of the bundle that was actually created first
+        // (either directly or via the print/parse roundtrip); all following
+        // calls copy from it.
+ bundle_number_ = bundle_number;
+ }
+ }
+ TF_RET_CHECK(bundle_number_ != -1);
+
+ if (bundle_number_ != bundle_number) {
+    // If this is not the bundle that was created first, we need to copy it.
+ TF_RETURN_IF_ERROR(generator->CopyBundle(bundle_number_, bundle_number));
+ }
+ return --usages_ == 0;
+ }
+
+ void MarkUsage() { usages_++; }
+
+ McCode* mc_code() const { return mc_code_; }
+ int32_t index() const { return index_; }
+
+ private:
+ McCode* const mc_code_;
+ const int32_t index_;
+ int32_t bundle_number_ = -1;
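+  // Remaining number of program positions that still need this bundle; the
+  // caller destroys the filler once this reaches zero.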
+ std::atomic<int32_t> usages_ = 0;
+ absl::Mutex mutex_;
+};
+
+Status CompositeMcCode::CreateProgramBundles(ProgramGenerator* generator,
+ llvm::MCInstPrinter* inst_printer,
+ int64_t num_threads) {
+ const int64_t instr_count = GetBundleCount();
+ TF_RETURN_IF_ERROR(generator->CreateProgramProto(instr_count));
+
+ // Status may be updated in different threads.
+ absl::Mutex mutex;
+ Status result = OkStatus();
+ const auto still_ok = [&] {
+ absl::ReaderMutexLock lock(&mutex);
+ return result.ok();
+ };
+ const auto update_status = [&](Status new_status) {
+ if (!new_status.ok()) {
+ absl::MutexLock lock(&mutex);
+ if (result.ok()) {
+ result = new_status;
+ }
+ }
+ };
+
+  // The map contains one filler per MC code bundle; each filler may be used in
+  // one or more places of the final program. Fillers guarantee that each
+  // MCInst bundle is converted only once, after which it is cleared to free
+  // memory. Once all usages of a bundle have been filled, the filler itself is
+  // destroyed.
+ absl::node_hash_map<std::pair<const McCode*, int32_t>, BundleFiller> fillers;
+ // For each output bundle stores a filler reference to be used.
+ std::vector<BundleFiller*> bundle_fillers;
+ bundle_fillers.reserve(instr_count);
+  // Mutexes (one per MC code) used to synchronize parallel bundle cleanup.
+ absl::node_hash_map<const McCode*, absl::Mutex> bundle_cleanup_mutex;
+
+ // Initialize bundle fillers.
+ for (const auto& fragment : fragments_) {
+ McCode* const mc_code = fragment.first;
+ bundle_cleanup_mutex.try_emplace(mc_code);
+ for (int32_t i = fragment.second.first; i < fragment.second.second; i++) {
+ auto it = fillers.try_emplace({mc_code, i}, mc_code, i);
+ bundle_fillers.push_back(&it.first->second);
+ bundle_fillers.back()->MarkUsage();
+ }
+ }
+
+ const llvm::MCSubtargetInfo* const subtarget_info =
+ target_machine_->getMCSubtargetInfo();
+
+ absl::Mutex cleanup_mutex;
+ const auto convert_bundle = [&](int64_t bundle_number) -> Status {
+ BundleFiller* const filler = bundle_fillers[bundle_number];
+ McCode* const mc_code = filler->mc_code();
+ const int32_t index = filler->index();
+ TF_ASSIGN_OR_RETURN(
+ const bool done,
+ filler->AddBundle(
+ generator, inst_printer, subtarget_info, bundle_number,
+ [&]() -> Status {
+ absl::MutexLock lock(&bundle_cleanup_mutex[mc_code]);
+ filler->mc_code()->ClearBundle(&mc_code->bundles_[index].first);
+ return OkStatus();
+ }));
+ if (done) {
+ absl::MutexLock lock(&cleanup_mutex);
+ TF_RET_CHECK(fillers.erase({mc_code, index}) == 1);
+ }
+ return OkStatus();
+ };
+
+ thread::TreeOptions options;
+ options.set_parallelism(num_threads);
+ std::unique_ptr<thread::Fiber> tree = thread::NewTree(options, [&] {
+ thread::Bundle thread_bundle;
+
+    // Instructions are processed in chunks of kChunkSize bundles.
+ constexpr int64_t kChunkSize = 50000;
+ for (int64_t start = 0; start < instr_count; start += kChunkSize) {
+ auto work_unit = [&, start] {
+ const int64_t limit = std::min(instr_count, start + kChunkSize);
+ for (int64_t bundle_number = start; bundle_number < limit && still_ok();
+ ++bundle_number) {
+ update_status(convert_bundle(bundle_number));
+ }
+ };
+ if (num_threads > 1) {
+ thread_bundle.Add(work_unit);
+ } else {
+ work_unit();
+ }
+ }
+ thread_bundle.JoinAll();
+ });
+ tree->Join();
+ return result;
+}
+
+StatusOrMcCode McCode::PerformInlining(HloMcCodeProvider mc_code_provider) {
+  // For each bundle number X (the sorted map key), holds the total number of
+  // extra bundles inserted *before* that bundle.
+ std::map<int64_t, int64_t> fixups;
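+  // Seed with a sentinel entry: no extra bundles precede bundle 0.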
+ fixups.insert({0LL, 0LL});
+
+ int64_t composite_total_bundles = 0;
+ CompositeMcCode::McCodeFragmentList composite_fragments;
+ std::map<int64_t, int64_t> composite_long_branches;
+
+ // Starting TLP fragment.
+ composite_fragments.push_back({this, {0, 0}});
+
+  // Go through all bundles. If a bundle does not have a stitch point (a CALL
+  // pseudo-instruction), just copy it; otherwise copy the bundle (after
+  // removing the CALL instruction itself) and inject the callee bundles after
+  // it.
+ bool delay_slot = false;
+ for (int64_t index = 0; index < bundles_.size(); index++) {
+ McBundle& bundle = bundles_[index];
+
+ // Extend the current/last fragment to include this bundle.
+ composite_fragments.back().second.second = index + 1;
+ composite_total_bundles++;
+
+ // Analyze bundle.
+ auto call_it = bundle.first.end();
+ auto branch_it = bundle.first.end();
+ for (auto it = bundle.first.begin(); it != bundle.first.end(); it++) {
+ TF_RET_CHECK(it->isInst());
+ if (IsBranch(*it->getInst())) {
+ TF_RET_CHECK(branch_it == bundle.first.end());
+ TF_RET_CHECK(call_it == bundle.first.end());
+ branch_it = it;
+ } else if (IsCall(*it->getInst())) {
+ TF_ASSIGN_OR_RETURN(auto predication,
+ ProgramGenerator::ToPredication(*it->getInst()));
+ TF_RET_CHECK(predication.is_always_execute())
+ << "Conditional calls are not yet supported";
+ TF_RET_CHECK(!delay_slot);
+ TF_RET_CHECK(branch_it == bundle.first.end());
+ TF_RET_CHECK(call_it == bundle.first.end());
+ call_it = it;
+ }
+ }
+
+ // If there is no CALL, just copy the bundle.
+ if (call_it == bundle.first.end()) {
+ delay_slot = branch_it != bundle.first.end();
+ continue;
+ }
+
+ // The bundle contains a CALL instruction, get the callee McCode.
+ TF_ASSIGN_OR_RETURN(std::string hlo_name,
+ GetCalleeSymbolName(*call_it->getInst()));
+ TF_ASSIGN_OR_RETURN(AbstractMcCode * abstract_inlinee,
+ mc_code_provider(hlo_name));
+ TF_RET_CHECK(abstract_inlinee != nullptr) << "HLO inlinee: " << hlo_name;
+ auto* inlinee = down_cast<McCode*>(abstract_inlinee);
+
+    // Add the original bundle without the CALL instruction. NOTE: we do NOT
+    // skip the bundle if it is empty because it can be referenced by a branch;
+    // this may be improved later.
+ delete call_it->getInst();
+ bundle.first.erase(call_it);
+
+ const int64_t call_bundle_number = composite_total_bundles;
+ const int64_t inlinee_bundles = inlinee->bundles_.size();
+    // Add fixup info: all inlinee bundles are inserted after the current
+    // bundle.
+ const int64_t total_inserted_bundles =
+ fixups.rbegin()->second + inlinee_bundles;
+ fixups.insert({index + 1, total_inserted_bundles});
+
+ // Insert fragment representing inlinee bundles.
+ composite_total_bundles += inlinee_bundles;
+ composite_fragments.push_back({inlinee, {0, inlinee_bundles}});
+
+ // Then start the next TLP fragment.
+ composite_fragments.push_back({this, {index + 1, index + 1}});
+
+ // And add long branches from the inlinee.
+ for (const auto& long_branch : inlinee->long_branches_) {
+ const auto inserted = composite_long_branches.emplace(
+ call_bundle_number + long_branch.first,
+ call_bundle_number + long_branch.second);
+ TF_RET_CHECK(inserted.second);
+ }
+ }
+
+  // After stitching is finished, fix up all symbols that might have shifted
+  // because of the inserted bundles: each symbol is moved by the number of
+  // extra bundles inserted before its original offset.
+ std::vector<std::pair<llvm::MCSymbol*, int64_t>> composite_symbols;
+ for (auto& symbol_with_offset : symbols_) {
+ auto it = fixups.upper_bound(symbol_with_offset.second);
+ int64_t extra_bundles = (it == fixups.begin()) ? 0 : (--it)->second;
+ composite_symbols.push_back(
+ {symbol_with_offset.first, symbol_with_offset.second + extra_bundles});
+ }
+
+  // If continuations are enabled, we need to remove the final 'shalt' since
+  // the continuation program will be appended to the end of the TLP.
+ if (target_.xla_target->UsesContinuations(tpu::TpuCoreType::kTensorCore)) {
+ TF_RETURN_IF_ERROR(RemoveReturnScalarHalt());
+ }
+
+ return std::unique_ptr<AbstractMcCode>(std::make_unique<CompositeMcCode>(
+ this, target_machine_, target_, std::move(composite_fragments),
+ std::move(composite_symbols), std::move(composite_long_branches)));
+}
+
+Status McCode::RemoveReturnScalarHalt() {
+ TF_RET_CHECK(!bundles_.empty());
+ llvm::MCInst& last_bundle = bundles_.back().first;
+ auto shalt_it = last_bundle.end();
+ for (auto it = last_bundle.begin(); it != last_bundle.end(); it++) {
+ TF_RET_CHECK(it->isInst());
+ if (IsScalarHalt(*it->getInst())) {
+ TF_RET_CHECK(shalt_it == last_bundle.end());
+ TF_ASSIGN_OR_RETURN(auto predication,
+ ProgramGenerator::ToPredication(*it->getInst()));
+ TF_RET_CHECK(predication.is_always_execute())
+ << LlvmAsString(*it->getInst());
+ shalt_it = it;
+ }
+ }
+ TF_RET_CHECK(shalt_it != last_bundle.end()); // We expect 'shalt' to exist.
+ delete shalt_it->getInst();
+ last_bundle.erase(shalt_it);
+ return OkStatus();
+}
+
+Status McCode::FinalizeAsInlinee() {
+ TF_RET_CHECK(long_branches_.empty());
+ TF_RETURN_IF_ERROR(ResolveBranches());
+
+  // We require the last bundle to contain a 'shalt' instruction and remove it
+  // here. All other 'shalt's are left unchanged.
+ TF_RETURN_IF_ERROR(RemoveReturnScalarHalt());
+
+ return OkStatus();
+}
+
+StatusOr<llvm::MCSymbol*> McCode::DeepCopySymbol(const llvm::MCSymbol* symbol) {
+ TF_RET_CHECK(symbol != nullptr);
+ TF_RET_CHECK(!symbol->getName().empty());
+ if (const auto it = symbol_map_.find(symbol); it != symbol_map_.end()) {
+ return it->second;
+ }
+  // Create a new symbol. Note that we don't try to clone all of the various
+  // symbol attributes.
+ llvm::MCSymbol* copy;
+ if (symbol->isTemporary()) {
+ copy = context_->createTempSymbol(symbol->getName(),
+ /*AlwaysAddSuffix=*/false);
+ } else {
+ copy = context_->getOrCreateSymbol(symbol->getName());
+ }
+ symbol_map_.insert({symbol, copy});
+ return copy;
+}
+
+StatusOr<const llvm::MCExpr*> McCode::DeepCopyExpr(const llvm::MCExpr* expr) {
+ TF_RET_CHECK(expr != nullptr);
+ switch (expr->getKind()) {
+ case llvm::MCExpr::Constant: {
+ const llvm::MCConstantExpr* constant_expr =
+ llvm::cast<llvm::MCConstantExpr>(expr);
+ TF_RET_CHECK(constant_expr != nullptr);
+ return llvm::MCConstantExpr::create(constant_expr->getValue(), *context_,
+ constant_expr->useHexFormat(),
+ constant_expr->getSizeInBytes());
+ }
+ case llvm::MCExpr::Binary: {
+ const llvm::MCBinaryExpr* binary_expr =
+ llvm::cast<llvm::MCBinaryExpr>(expr);
+ TF_RET_CHECK(binary_expr != nullptr);
+ TF_ASSIGN_OR_RETURN(const llvm::MCExpr* lhs,
+ DeepCopyExpr(binary_expr->getLHS()));
+ TF_ASSIGN_OR_RETURN(const llvm::MCExpr* rhs,
+ DeepCopyExpr(binary_expr->getRHS()));
+ return llvm::MCBinaryExpr::create(binary_expr->getOpcode(), lhs, rhs,
+ *context_, binary_expr->getLoc());
+ }
+ case llvm::MCExpr::SymbolRef: {
+ const llvm::MCSymbolRefExpr* symbol_expr =
+ llvm::cast<llvm::MCSymbolRefExpr>(expr);
+ TF_RET_CHECK(symbol_expr != nullptr);
+ TF_ASSIGN_OR_RETURN(const llvm::MCSymbol* symbol,
+ DeepCopySymbol(&symbol_expr->getSymbol()));
+ return llvm::MCSymbolRefExpr::create(symbol, symbol_expr->getKind(),
+ *context_, symbol_expr->getLoc());
+ }
+ case llvm::MCExpr::Unary: {
+ const llvm::MCUnaryExpr* unary_expr = llvm::cast<llvm::MCUnaryExpr>(expr);
+ TF_RET_CHECK(unary_expr != nullptr);
+ TF_ASSIGN_OR_RETURN(const llvm::MCExpr* operand,
+ DeepCopyExpr(unary_expr->getSubExpr()));
+ return llvm::MCUnaryExpr::create(unary_expr->getOpcode(), operand,
+ *context_, unary_expr->getLoc());
+ }
+ case llvm::MCExpr::Target: {
+      // The TPU target defines three target-specific MCExprs: TPUMCImmExpr,
+      // TPUPCExpr, and TPUStringExpr. TPUStringExpr is only used for
+      // SparseCore, and we don't expect to see TPUMCImmExpr since immediate
+      // encoding collection is disabled for this scenario.
+
+      // Unfortunately TPUPCExpr is not exposed from the TPU backend, so we
+      // have to rely on the text representation for a sanity check.
+ TF_RET_CHECK(LlvmAsString(*expr) == ".")
+ << "Expected '.', got: '" << LlvmAsString(*expr) << "'";
+
+ // TPUPCExpr means "current bundle address", and we rewrite it into a
+ // temporary symbol here.
+ llvm::MCSymbol* temp_symbol = context_->createTempSymbol();
+ symbols_.emplace_back(temp_symbol, bundles_.size());
+ return llvm::MCSymbolRefExpr::create(
+ temp_symbol, llvm::MCSymbolRefExpr::VK_None, *context_);
+ }
+ }
+
+  return Unimplemented("Unexpected MCExpr kind: %d",
+                       static_cast<int>(expr->getKind()));
+}
+
+Status McCode::DeepCopyOperand(llvm::MCOperand* operand) {
+ if (!operand->isValid()) {
+    // Apparently there are invalid/uninitialized operands; keep them as-is
+    // until they are fixed.
+ return OkStatus();
+ }
+ TF_RET_CHECK(!operand->isInst());
+ if (operand->isReg() || operand->isImm() || operand->isDFPImm()) {
+ return OkStatus(); // No copy needed.
+ }
+
+ TF_RET_CHECK(operand->isExpr());
+ const llvm::MCExpr* expr = operand->getExpr();
+ TF_RET_CHECK(expr != nullptr);
+ TF_ASSIGN_OR_RETURN(const llvm::MCExpr* copy, DeepCopyExpr(expr));
+ if (expr != copy) {
+ *operand = llvm::MCOperand::createExpr(copy);
+ }
+ return OkStatus();
+}
+
+StatusOr<llvm::MCInst*> McCode::DeepCopyInstr(const llvm::MCInst* inst,
+ SourceLocationList* locations) {
+ llvm::MCInst* copy = new llvm::MCInst(*inst);
+
+ const int64_t operand_count = copy->getNumOperands();
+
+  // If `locations` is provided, the last two operands are expected to be the
+  // line/column of the debug location representing the original LLO
+  // instruction; we convert them into a SourceLocation and remove them.
+ const bool has_source_location = locations != nullptr;
+ TF_RET_CHECK(operand_count >= 2 || !has_source_location);
+
+ const int64_t operands_to_copy =
+ has_source_location ? (operand_count - 2) : operand_count;
+ for (int64_t index = 0; index < operands_to_copy; ++index) {
+    TF_RETURN_IF_ERROR(DeepCopyOperand(&copy->getOperand(index)));
+ }
+
+ if (has_source_location) {
+ // Extract Debug location.
+ const auto eat_last_operand = [&]() -> StatusOr<unsigned> {
+ const auto last = copy->begin() + (copy->getNumOperands() - 1);
+ TF_RET_CHECK(last->isImm());
+ const unsigned value = last->getImm();
+ copy->erase(last);
+ return value;
+ };
+ TF_ASSIGN_OR_RETURN(const unsigned column, eat_last_operand());
+ TF_ASSIGN_OR_RETURN(const unsigned line, eat_last_operand());
+
+ if (line > 0 && column > 0) {
+ const auto it = std::find(locations->begin(), locations->end(),
+ SourceLocation{line, column});
+ if (it == locations->end()) {
+ locations->push_back({line, column});
+ }
+ }
+ }
+
+ if (IsCall(*copy)) {
+    // Apparently a call instruction may have trailing invalid/uninitialized
+    // operands; we remove them here.
+ while (copy->end() != copy->begin() && !(copy->end() - 1)->isValid()) {
+ copy->erase(copy->end() - 1);
+ }
+ }
+ return copy;
+}
+
+StatusOr<llvm::MCInst> McCode::DeepCopyBundle(const llvm::MCInst& bundle,
+ SourceLocationList* locations) {
+ llvm::MCInst copy = bundle;
+ for (llvm::MCOperand& slot_inst : copy) {
+ TF_RET_CHECK(slot_inst.isInst());
+ TF_ASSIGN_OR_RETURN(llvm::MCInst * inst,
+ DeepCopyInstr(slot_inst.getInst(), locations));
+ slot_inst = llvm::MCOperand::createInst(inst);
+ }
+ return copy;
+}
+
+Status McCode::ForEachBundleWithIndex(const BundleWithIndexHandler& handler) {
+ for (int64_t index = 0; index < bundles_.size(); index++) {
+ TF_RETURN_IF_ERROR(handler(index, &bundles_[index]));
+ }
+ return OkStatus();
+}
+
+Status CompositeMcCode::ForEachBundleWithIndex(
+ const BundleWithIndexHandler& handler) {
+ int64_t bundle_number = 0;
+ for (const auto& fragment : fragments_) {
+ McCode* const code = fragment.first;
+ const int64_t start = fragment.second.first;
+ const int64_t limit = fragment.second.second;
+ TF_RET_CHECK(0 <= start);
+ TF_RET_CHECK(start <= limit);
+ TF_RET_CHECK(limit <= code->bundles_.size());
+ for (int64_t index = start; index < limit; index++) {
+ TF_RETURN_IF_ERROR(handler(bundle_number++, &code->bundles_[index]));
+ }
+ }
+ return OkStatus();
+}
+
+Status McCodeBase::ResolveBranches() {
+ // Assign all symbols their values.
+ for (auto& symbol_with_offset : symbols_) {
+ symbol_with_offset.first->setVariableValue(
+ llvm::MCConstantExpr::create(symbol_with_offset.second, *context()));
+ }
+
+ // Go through all bundles in the program, if any instruction in the bundle is
+ // a branch, fix-up the branch target.
+ TF_RETURN_IF_ERROR(ForEachBundleWithIndex(
+ [this](int64_t bundle_number, McBundle* mc_bundle) -> Status {
+ llvm::MCInst& bundle = mc_bundle->first;
+ for (llvm::MCOperand& slot_inst : bundle) {
+ TF_RET_CHECK(slot_inst.isInst());
+ const llvm::MCInst& inst = *slot_inst.getInst();
+ if (!IsBranch(inst)) {
+ continue;
+ }
+ llvm::MCInst* lazy_copy = nullptr;
+
+ for (int64_t i = 0; i < inst.getNumOperands(); i++) {
+ const llvm::MCOperand& operand = inst.getOperand(i);
+ if (!operand.isExpr()) {
+ continue; // Must be part of predicate.
+ }
+ const llvm::MCExpr* expr = operand.getExpr();
+ TF_RET_CHECK(expr != nullptr);
+ TF_RET_CHECK(expr->getKind() == llvm::MCExpr::ExprKind::Binary);
+
+ llvm::MCValue relocatable_res;
+ TF_RET_CHECK(expr->evaluateAsRelocatable(relocatable_res,
+ /*Layout=*/nullptr,
+ /*Fixup=*/nullptr));
+ const int64_t branch_offset = relocatable_res.getConstant();
+
+ if (lazy_copy == nullptr) {
+ lazy_copy = new llvm::MCInst(inst);
+ }
+ lazy_copy->getOperand(i) =
+ llvm::MCOperand::createImm(branch_offset);
+
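+ // Branch offsets are encoded as signed 16-bit immediates; any offset
+ // outside [-32768, 32767] is recorded in long_branches_ (keyed by source
+ // bundle, valued by target bundle) for separate handling.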
+ if (branch_offset < std::numeric_limits<int16_t>::min() ||
+ branch_offset > std::numeric_limits<int16_t>::max()) {
+ auto result = long_branches_.emplace(
+ bundle_number, bundle_number + branch_offset);
+ TF_RET_CHECK(result.second);
+ }
+ }
+
+ if (lazy_copy != nullptr) {
+ delete &inst;
+ slot_inst = llvm::MCOperand::createInst(lazy_copy);
+ }
+ }
+ return OkStatus();
+ }));
+ return OkStatus();
+}
+
+// The TpuStreamer intercepts MCStreamer callbacks and converts them into the
+// McCode representation (see AddPassesToEmitMcCode below).
+class TpuStreamer : public llvm::MCStreamer {
+ public:
+ TpuStreamer(llvm::MCContext* context, McCode* mc_code, Status* status)
+ : llvm::MCStreamer(*context), status_(status), mc_code_(mc_code) {}
+
+ // MCStreamer interface
+ void emitInstruction(const llvm::MCInst& Inst,
+ const llvm::MCSubtargetInfo& STI) override {
+ // Note that we do NOT call the method of the base class.
+ if (state_ != State::kInsideCodeSection) {
+ MarkError(absl::StrCat("Instruction outside code section: ",
+ LlvmAsString(Inst)));
+ } else {
+ MarkStatus(mc_code_->EmitInstruction(Inst));
+ }
+ }
+
+ void emitLabel(llvm::MCSymbol* Symbol, llvm::SMLoc Loc) override {
+ MCStreamer::emitLabel(Symbol, Loc);
+ latest_label_in_section_ = LlvmAsString(*Symbol);
+ switch (state_) {
+ case State::kInsideMetadataSection:
+ break;
+ case State::kInsideCodeSection:
+ MarkStatus(mc_code_->EmitLabel(Symbol));
+ break;
+ case State::kOther:
+ // Intentionally drop labels in sections we chose to ignore.
+ break;
+ }
+ }
+
+ void emitAssignment(llvm::MCSymbol* Symbol,
+ const llvm::MCExpr* Value) override {
+ MarkError("EmitAssignment not implemented!");
+ MCStreamer::emitAssignment(Symbol, Value);
+ }
+
+ void switchSection(llvm::MCSection* Section,
+ const llvm::MCExpr* Subsection) override;
+
+ void emitFill(const llvm::MCExpr& NumBytes, uint64_t FillValue,
+ llvm::SMLoc Loc) override {
+ MarkError("emitFill not implemented!");
+ MCStreamer::emitFill(NumBytes, FillValue, Loc);
+ }
+ bool emitSymbolAttribute(llvm::MCSymbol* Symbol,
+ llvm::MCSymbolAttr Attribute) override {
+ // We can safely ignore symbol attributes.
+ return true;
+ }
+
+ void emitBytes(llvm::StringRef Bytes) override {
+ if (state_ != State::kInsideMetadataSection) {
+ if (state_ != State::kOther) {
+ MarkError("unexpected emitBytes(...) in NOT ignored section");
+ }
+ return;
+ }
+
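+ // Metadata values are emitted as little-endian integers of up to 8 bytes;
+ // e.g. the byte sequence {0x20, 0x00, 0x00, 0x00} decodes to 32.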
+ const auto get_value = [&] {
+ uint64_t value = 0;
+ if (Bytes.size() > 8) {
+ MarkError(
+ absl::StrCat("Too many bytes in spill number: ", Bytes.size()));
+ }
+ for (size_t i = 0; i < Bytes.size(); i++) {
+ value |= (static_cast<uint64_t>(static_cast<unsigned char>(Bytes[i]))
+           << (i * 8));
+ }
+ return static_cast<int64_t>(value);
+ };
+
+ if (latest_label_in_section_.empty()) {
+ MarkError("Metadata name is missing");
+ } else if (absl::StartsWith(latest_label_in_section_,
+ ".Lvector_spill_slots_num")) {
+ MarkStatus(mc_code_->EmitSpillSlots(
+ {MemorySpace::kVmem, SpillScope::kWholeProgram}, get_value()));
+ } else if (absl::StartsWith(latest_label_in_section_,
+ ".Lscalar_spill_slots_num")) {
+ MarkStatus(mc_code_->EmitSpillSlots(
+ {MemorySpace::kSmem, SpillScope::kWholeProgram}, get_value()));
+ } else if (absl::StartsWith(latest_label_in_section_,
+ ".Lmax_cross_call_vector_regs")) {
+ MarkStatus(mc_code_->EmitSpillSlots(
+ {MemorySpace::kVmem, SpillScope::kCrossCalls}, get_value()));
+ } else if (absl::StartsWith(latest_label_in_section_,
+ ".Lmax_cross_call_scalar_regs")) {
+ MarkStatus(mc_code_->EmitSpillSlots(
+ {MemorySpace::kSmem, SpillScope::kCrossCalls}, get_value()));
+ } else if (absl::StartsWith(latest_label_in_section_,
+ ".Lstatic_inserted_throttle_cycles")) {
+ mc_code_->SetStaticInsertedThrottleCycles(get_value());
+ } else {
+ MarkError(absl::StrCat("Metadata label is not recognized: ",
+ latest_label_in_section_));
+ }
+ MCStreamer::emitBytes(Bytes);
+ }
+
+ void emitCommonSymbol(llvm::MCSymbol* Symbol, uint64_t Size,
+ llvm::Align ByteAlignment) override {
+ MarkError("EmitCommonSymbol not implemented!");
+ }
+ void emitZerofill(llvm::MCSection* Section, llvm::MCSymbol* Symbol,
+ uint64_t Size, llvm::Align ByteAlignment,
+ llvm::SMLoc Loc) override {
+ MarkError("EmitZerofill not implemented!");
+ }
+
+ private:
+ enum class SectionKind {
+ kProgram, // Main program code section
+ kMetadata, // Main program metadata section
+ kOther,     // A section we know we can safely ignore.
+ };
+
+ enum class State { kInsideCodeSection, kInsideMetadataSection, kOther };
+
+ // Infer section kind.
+ SectionKind GetSectionKind(llvm::MCSection* Section);
+
+ void MarkError(absl::string_view error) { MarkStatus(Unknown("%s", error)); }
+
+ void MarkStatus(const Status& status) {
+ if (status_->ok() && !status.ok()) {
+ *status_ = status;
+ }
+ }
+
+ bool HasErrors() { return !status_->ok(); }
+
+ Status* const status_; // Holds the status in case an error occurs.
+ McCode* const mc_code_;
+
+ State state_ = State::kOther;
+ bool seen_program_ = false;
+ bool seen_metadata_ = false;
+ // Holds the most recently seen label in the current section, or the empty string.
+ std::string latest_label_in_section_;
+};
+
+TpuStreamer::SectionKind TpuStreamer::GetSectionKind(llvm::MCSection* Section) {
+ std::string name = Section->getBeginSymbol()->getName().str();
+ static const auto& kSectionRegexes =
+ *new const std::array<std::pair<SectionKind, RE2>, 4>{
+ {{SectionKind::kProgram, R"(\.text$)"},
+ {SectionKind::kMetadata, R"(function_metadata\..*$)"},
+ {SectionKind::kOther, R"(.note.GNU-stack)"},
+ {SectionKind::kOther, R"(.debug.*$)"}}};
+ for (const auto& kind_and_re : kSectionRegexes) {
+ if (RE2::FullMatch(name, kind_and_re.second)) {
+ return kind_and_re.first;
+ }
+ }
+ MarkError(absl::StrCat("Unknown section kind for \"", name, "\""));
+ return SectionKind::kOther;
+}
+
+void TpuStreamer::switchSection(llvm::MCSection* Section,
+ const llvm::MCExpr* Subsection) {
+ MCStreamer::switchSection(Section, Subsection);
+ if (HasErrors()) {
+ return;
+ }
+ switch (GetSectionKind(Section)) {
+ case SectionKind::kProgram:
+ if (seen_program_ && state_ != State::kInsideCodeSection) {
+ MarkError("Duplicated code section");
+ }
+ seen_program_ = true;
+ state_ = State::kInsideCodeSection;
+ break;
+ case SectionKind::kMetadata:
+ if (seen_metadata_) {
+ MarkError("Duplicated metadata section");
+ }
+ state_ = State::kInsideMetadataSection;
+ seen_metadata_ = true;
+ break;
+ case SectionKind::kOther:
+ state_ = State::kOther;
+ break;
+ }
+ latest_label_in_section_ = "";
+}
+
+} // namespace
+
+StatusOr<McCodeProvider> AddPassesToEmitMcCode(
+ const IsaProgramTarget& target, llvm::TargetMachine* target_machine,
+ llvm::PassManagerBase* pass_manager) {
+ // NOTE: implementation is based on LLVMTargetMachine::addPassesToEmitMC().
+ llvm::LLVMTargetMachine* const llvm_target_machine =
+ dynamic_cast<llvm::LLVMTargetMachine*>(target_machine);
+ TF_RET_CHECK(llvm_target_machine != nullptr);
+
+ // Set PassConfig options provided by TargetMachine.
+ llvm::TargetPassConfig* const pass_config =
+ llvm_target_machine->createPassConfig(*pass_manager);
+ TF_RET_CHECK(pass_config != nullptr);
+ pass_config->setDisableVerify(true);
+ pass_manager->add(pass_config);
+ auto* const machine_module_info_wrapper_pass =
+ new llvm::MachineModuleInfoWrapperPass(llvm_target_machine);
+ pass_manager->add(machine_module_info_wrapper_pass);
+ TF_RET_CHECK(!pass_config->addISelPasses());
+ pass_config->addMachinePasses();
+ pass_config->setInitialized();
+ TF_RET_CHECK(llvm::TargetPassConfig::willCompleteCodeGenPipeline());
+
+ llvm::MCContext* const original_context =
+ &machine_module_info_wrapper_pass->getMMI().getContext();
+ original_context->setDwarfVersion(5);
+ original_context->setUseNamesOnTempLabels(true);
+
+ // Note: we use a shared pointer so the state can be captured in the returned lambda.
+ using TpuStreamerState = std::pair<std::unique_ptr<McCode>, Status>;
+ const auto state = std::make_shared<TpuStreamerState>(
+ std::make_unique<McCode>(target_machine, target), OkStatus());
+
+ // Create the AsmPrinter, which takes ownership of the streamer if successful.
+ llvm::FunctionPass* const printer =
+ llvm_target_machine->getTarget().createAsmPrinter(
+ *llvm_target_machine,
+ std::unique_ptr<llvm::MCStreamer>(std::make_unique<TpuStreamer>(
+ original_context, state->first.get(), &state->second)));
+ TF_RET_CHECK(printer != nullptr);
+
+ pass_manager->add(printer);
+ pass_manager->add(llvm::createFreeMachineFunctionPass());
+
+ const McCodeProvider provider = [state]() -> StatusOrMcCode {
+ TF_RETURN_IF_ERROR(state->second);
+ return std::unique_ptr<AbstractMcCode>(std::move(state->first));
+ };
+ return provider;
+}
+
+} // namespace jellyfish
+} // namespace xla
diff --git a/tpu_recision/platforms/xla/service/jellyfish/llvm_mc_program_processor.h b/tpu_recision/platforms/xla/service/jellyfish/llvm_mc_program_processor.h
new file mode 100644
index 0000000..110999b
--- /dev/null
+++ b/tpu_recision/platforms/xla/service/jellyfish/llvm_mc_program_processor.h
@@ -0,0 +1,182 @@
+#ifndef PLATFORMS_XLA_SERVICE_JELLYFISH_LLVM_MC_PROGRAM_PROCESSOR_H_
+#define PLATFORMS_XLA_SERVICE_JELLYFISH_LLVM_MC_PROGRAM_PROCESSOR_H_
+
+#include <functional>
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "learning/brain/tpu/runtime/tpu_chip_enums.h"
+#include "learning/brain/tpu/runtime/tpu_version.h"
+#include "platforms/xla/port/util.h"
+#include "platforms/xla/service/jellyfish/isa_emitter_factory.h"
+#include "platforms/xla/service/jellyfish/isa_program.proto.h"
+#include "platforms/xla/service/jellyfish/llo_dumper.h"
+#include "platforms/xla/service/jellyfish/range_spec.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/MC/MCInst.h"
+#include "third_party/llvm/llvm-project/llvm/include/llvm/Target/TargetMachine.h"
+#include "third_party/tensorflow/compiler/xla/statusor.h"
+#include "thread/threadlocal.h"
+#include "util/registration/registerer.h"
+#include "util/tuple/struct.h"
+
+namespace xla {
+namespace jellyfish {
+
+using StatusOrIsaProgram = StatusOr<std::unique_ptr<IsaProgramProto>>;
+using ::tpu::TpuSequencerType;
+
+// Defines the target platform version and sequencer.
+struct IsaProgramTarget {
+ TUPLE_DEFINE_STRUCT(IsaProgramTarget, (), (tpu::TpuSequencerType, sequencer),
+ (const Target*, xla_target));
+};
+
+// Defines memory regions in the form offset/limit, in words.
+using MemRegion = std::pair<int64_t, int64_t>;
+using SpillRegionCollection = absl::flat_hash_map<MemorySpace, MemRegion>;
+
+// Defines the line/column from DWARF debug locations, used to identify the
+// source LLO instruction of a bundle.
+using SourceLocation = std::pair<unsigned, unsigned>;
+using SourceLocationList = absl::InlinedVector<SourceLocation, 2>;
+
+const char ISS_PROGRAM_GENERATOR[] = "IssProgramGenerator";
+
+// Abstract handler for MC code generated for an HLO or the top-level program.
+class AbstractMcCode {
+ public:
+ virtual ~AbstractMcCode() = default;
+
+ // Adjusts the passed TLP spill regions to exclude the part used for spills
+ // across call sites.
+ virtual Status AdjustTlpSpillRegions(
+ SpillRegionCollection* spill_regions,
+ const std::function<int64_t(MemorySpace)>& get_spill_size) = 0;
+
+ virtual int64_t GetBundleCount() const = 0;
+
+ // Returns the number of compiler inserted throttle cycles.
+ virtual uint32_t GetStaticInsertedThrottleCycles() const = 0;
+
+ // Checks if the spills overflow specified spill regions.
+ virtual Status CheckForSpillRegionsOverflow(
+ const SpillRegionCollection& max_spill_regions,
+ const std::function<int64_t(MemorySpace)>& get_spill_size,
+ int64_t* vmem_overflow_slots) = 0;
+
+ // Returns the external names of the HLOs. (Note: std::set is used for ordering.)
+ virtual StatusOr<std::set<std::string>> GetHloReferences() = 0;
+
+ // Resolves the branches and prepares the code to be inlined. After this
+ // step all branch instructions should have immediate operands and there is
+ // no 'shalt' instruction representing a return; control flow is expected to
+ // fall through the end of the HLO function. No calls are allowed at this phase.
+ virtual Status FinalizeAsInlinee() = 0;
+
+ // Inlines all call instructions and returns a new McCode representing the
+ // result of inlining. The McCode for a referenced HLO is expected to be
+ // provided by 'mc_code_provider'.
+ using HloMcCodeProvider =
+ std::function<StatusOr<AbstractMcCode*>(const std::string&)>;
+ virtual StatusOr<std::unique_ptr<AbstractMcCode>> PerformInlining(
+ HloMcCodeProvider mc_code_provider) = 0;
+
+ // Used for the TLP: resolves the branches and generates an ISA program
+ // along with per-bundle DWARF-derived source locations. No calls are
+ // allowed at this phase.
+ virtual StatusOr<std::unique_ptr<IsaProgramProto>> FinalizeAsTlpAndConsume(
+ int64_t num_threads, std::vector<SourceLocationList>* source_locations,
+ LloModule* module) = 0;
+};
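+
+// Typical lifecycle of these objects, shown as a minimal sketch; the names
+// `hlo_codes`, `tlp_code`, `num_threads`, `source_locations` and `module` are
+// illustrative placeholders, not part of this API:
+//
+//   for (auto& [name, code] : hlo_codes) {
+//     TF_RETURN_IF_ERROR(code->FinalizeAsInlinee());
+//   }
+//   TF_ASSIGN_OR_RETURN(
+//       std::unique_ptr<AbstractMcCode> inlined,
+//       tlp_code->PerformInlining([&](const std::string& hlo_name) {
+//         return hlo_codes.at(hlo_name).get();
+//       }));
+//   TF_ASSIGN_OR_RETURN(
+//       std::unique_ptr<IsaProgramProto> isa_program,
+//       inlined->FinalizeAsTlpAndConsume(num_threads, &source_locations,
+//                                        module));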
+
+// Generates MC code representing an HLO or the top-level program from the
+// LLVM asm printer output created for the specified target machine.
+using StatusOrMcCode = StatusOr<std::unique_ptr<AbstractMcCode>>;
+// A substitute for LLVMTargetMachine::addPassesToEmitMC() which installs a
+// special streamer capable of creating an McCode directly. Returns a lambda
+// which yields the McCode (or an error) after the LLVM passes have executed.
+using McCodeProvider = std::function<StatusOrMcCode()>;
+StatusOr<McCodeProvider> AddPassesToEmitMcCode(
+ const IsaProgramTarget& target, llvm::TargetMachine* target_machine,
+ llvm::PassManagerBase* pass_manager);
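+
+// A minimal usage sketch; the legacy PassManager setup and `llvm_module` are
+// assumptions made for illustration, not requirements of this API:
+//
+//   llvm::legacy::PassManager pass_manager;
+//   TF_ASSIGN_OR_RETURN(
+//       McCodeProvider provider,
+//       AddPassesToEmitMcCode(target, target_machine, &pass_manager));
+//   pass_manager.run(*llvm_module);  // Drives the asm printer and streamer.
+//   TF_ASSIGN_OR_RETURN(std::unique_ptr<AbstractMcCode> mc_code, provider());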
+
+// Defines a program generator interface hiding parsers for different platforms.
+class ProgramGenerator {
+ public:
+ virtual ~ProgramGenerator();
+ // Creates a program generator for a target; returns an error if the target
+ // configuration is not supported.
+ static StatusOr<std::unique_ptr<ProgramGenerator>> Create(
+ const IsaProgramTarget& target, LloModule* module);
+
+ ProgramGenerator(const IsaProgramTarget& target,
+ const RangeSpec& xla_llvm_isa_emitter_bundles,
+ LloModule* module)
+ : target_(target),
+ bundles_to_handle_(xla_llvm_isa_emitter_bundles),
+ module_(module) {}
+
+ ProgramGenerator(const ProgramGenerator& program_generator)
+ : target_(program_generator.target_),
+ bundles_to_handle_(program_generator.bundles_to_handle_),
+ module_(program_generator.module_) {}
+
+ Status CreateProgramProto(int64_t bundle_count);
+
+ // Returns the thread-local view of the main IsaEmitter, first making sure the
+ // thread-local copy is created.
+ StatusOr<IsaEmitter*> GetThreadLocalIsaEmitter();
+
+ // Note: the passed 'bundle_text' string will be clobbered.
+ virtual Status AppendBundleFromText(int64_t bundle_number,
+ std::string* bundle_text);
+ Status AppendBundleFromMcBundle(int64_t bundle_number,
+ const llvm::MCInst& bundle);
+ virtual Status CopyBundle(int64_t from_bundle_number,
+ int64_t to_bundle_number);
+ StatusOrIsaProgram FinalizeProgramProto();
+
+ static StatusOr<Predication> ToPredication(const llvm::MCInst& inst,
+ int operand_index = -1);
+ static StatusOr<Pregno> ToPregno(const llvm::MCOperand& operand);
+ static StatusOr<Sregno> ToSregno(const llvm::MCOperand& operand);
+ static StatusOr<SregnoOrImm> ToSregnoOrImm(const llvm::MCOperand& operand);
+ static StatusOr<Vregno> ToVregno(const llvm::MCOperand& operand);
+ static StatusOr<VregnoOrImm> ToVregnoOrImm(const llvm::MCOperand& operand);
+ static StatusOr<Vmregno> ToVmregno(const llvm::MCOperand& operand);
+ static StatusOr<ImmValue> ToImmValue(const llvm::MCOperand& operand);
+
+ LloModule* module() const { return module_; }
+
+ protected:
+ virtual void PatchBeforeParsing(std::string* str);
+
+ IsaProgramProto* program() { return top_isa_emitter_->GetProgram(); }
+
+ using InstAndWeight = std::pair<const llvm::MCInst*, int>;
+ Status EmitOneBundle(const std::vector<InstAndWeight>& inst_list);
+ Status EmitOneInstruction(const llvm::MCInst& inst);
+
+ const IsaProgramTarget target_;
+
+ // The IsaEmitter handles the details of writing all the various protobuf
+ // formats.
+ std::unique_ptr<IsaEmitter> top_isa_emitter_;
+ // Each thread gets its own view into top_isa_emitter_ so that there are no
+ // races on IsaEmitter current_bundle_number state.
+ ThreadLocal<std::unique_ptr<IsaEmitter>> thread_local_isa_emitter_;
+
+ RangeSpec bundles_to_handle_;
+
+ LloModule* const module_;
+};
+
+DEFINE_REGISTERER(ProgramGenerator, const IsaProgramTarget&, const RangeSpec&,
+ LloModule*);
+DEFINE_ALIAS_REGISTERER(ProgramGenerator);
+} // namespace jellyfish
+} // namespace xla
+
+#endif // PLATFORMS_XLA_SERVICE_JELLYFISH_LLVM_MC_PROGRAM_PROCESSOR_H_
diff --git a/tpu_recision/third_party/llvm/llvm/include/llvm/IR/IntrinsicsTPU.td b/tpu_recision/third_party/llvm/llvm/include/llvm/IR/IntrinsicsTPU.td
new file mode 100644
index 0000000..1d77a46
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/include/llvm/IR/IntrinsicsTPU.td
@@ -0,0 +1,2334 @@
+//===- IntrinsicsTPU.td - TPU intrinsics ---------------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines all of the TPU-specific intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+def llvm_v1024i32_ty : LLVMType<v1024i32>;
+def llvm_v1024f32_ty : LLVMType<v1024f32>;
+
+// i32 pointers to the various TPU address spaces.
+def llvm_smemptr_ty : LLVMQualPointerType<llvm_i32_ty, 0>;
+def llvm_smemanyptr_ty : LLVMQualPointerType<llvm_i32_ty, 212>;
+def llvm_simemptr_ty : LLVMQualPointerType<llvm_i32_ty, 215>;
+def llvm_timemptr_ty : LLVMQualPointerType<llvm_i32_ty, 214>;
+def llvm_tilespmemptr_ty : LLVMQualPointerType<llvm_i32_ty, 201>;
+def llvm_spmemptr_ty : LLVMQualPointerType<llvm_i32_ty, 202>;
+def llvm_hbmptr_ty : LLVMQualPointerType<llvm_i32_ty, 203>;
+def llvm_hbmanyptr_ty : LLVMQualPointerType<llvm_i32_ty, 213>;
+def llvm_sflagptr_ty : LLVMQualPointerType<llvm_i32_ty, 204>;
+def llvm_sflagotherptr_ty : LLVMQualPointerType<llvm_i32_ty, 210>;
+def llvm_sflaganyptr_ty : LLVMQualPointerType<llvm_i32_ty, 211>;
+def llvm_sflagtileptr_ty : LLVMQualPointerType<llvm_i32_ty, 217>;
+def llvm_tilespmemv8i32ptr_ty : LLVMQualPointerType<llvm_v8i32_ty, 201>;
+def llvm_tilespmemv8f32ptr_ty : LLVMQualPointerType<llvm_v8f32_ty, 201>;
+def llvm_spmemv8i32ptr_ty : LLVMQualPointerType<llvm_v8i32_ty, 202>;
+def llvm_spmemv8f32ptr_ty : LLVMQualPointerType<llvm_v8f32_ty, 202>;
+def llvm_vmemv1024i32ptr_ty : LLVMQualPointerType<llvm_v1024i32_ty, 205>;
+def llvm_vmemv1024f32ptr_ty : LLVMQualPointerType<llvm_v1024f32_ty, 205>;
+def llvm_bmemv8f32ptr_ty : LLVMQualPointerType<llvm_v8f32_ty, 207>;
+def llvm_dregptr_ty : LLVMQualPointerType<llvm_i32_ty, 208>;
+def llvm_iovaptr_ty : LLVMQualPointerType<llvm_i32_ty, 216>;
+
+// Low precision types.
+def llvm_v64i4_ty : LLVMType<v64i4>;
+def llvm_v128i2_ty : LLVMType<v128i2>;
+
+// All intrinsics start with "llvm.tpu."
+let TargetPrefix = "tpu" in {
+ def int_tpu_syncadd : ClangBuiltin<"__builtin_tpu_syncadd">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty], [IntrArgMemOnly],
+ "", [SDNPMemOperand]>;
+ def int_tpu_syncadd_done : ClangBuiltin<"__builtin_tpu_syncadd_done">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty], [IntrArgMemOnly],
+ "", [SDNPMemOperand]>;
+ def int_tpu_syncadd_notdone : ClangBuiltin<"__builtin_tpu_syncadd_notdone">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty], [IntrArgMemOnly],
+ "", [SDNPMemOperand]>;
+ def int_tpu_syncadd_remote : ClangBuiltin<"__builtin_tpu_syncadd_remote">,
+ Intrinsic<[], [llvm_sflaganyptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrArgMemOnly], "", [SDNPMemOperand]>;
+ def int_tpu_syncadd_remote_done : ClangBuiltin<"__builtin_tpu_syncadd_remote_done">,
+ Intrinsic<[], [llvm_sflaganyptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrArgMemOnly], "", [SDNPMemOperand]>;
+ def int_tpu_syncadd_remote_doneinv : ClangBuiltin<"__builtin_tpu_syncadd_remote_doneinv">,
+ Intrinsic<[], [llvm_sflaganyptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrArgMemOnly], "", [SDNPMemOperand]>;
+ def int_tpu_syncadd_other : ClangBuiltin<"__builtin_tpu_syncadd_other">,
+ Intrinsic<[], [llvm_sflagotherptr_ty, llvm_i32_ty], [IntrArgMemOnly],
+ "", [SDNPMemOperand]>;
+ def int_tpu_syncadd_both : ClangBuiltin<"__builtin_tpu_syncadd_both">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_sflagotherptr_ty, llvm_i32_ty], [IntrArgMemOnly],
+ "", [SDNPMemOperand]>;
+ def int_tpu_syncadd_tile : ClangBuiltin<"__builtin_tpu_syncadd_tile">,
+ Intrinsic<[], [llvm_sflagtileptr_ty, llvm_i32_ty],
+ [IntrArgMemOnly], "", [SDNPMemOperand]>;
+ def int_tpu_syncset_done : ClangBuiltin<"__builtin_tpu_syncset_done">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty], [IntrArgMemOnly],
+ "", [SDNPMemOperand]>;
+ def int_tpu_syncset_notdone : ClangBuiltin<"__builtin_tpu_syncset_notdone">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty], [IntrArgMemOnly],
+ "", [SDNPMemOperand]>;
+ def int_tpu_syncset_remote : ClangBuiltin<"__builtin_tpu_syncset_remote">,
+ Intrinsic<[], [llvm_sflaganyptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrArgMemOnly], "", [SDNPMemOperand]>;
+ def int_tpu_syncset_remote_done : ClangBuiltin<"__builtin_tpu_syncset_remote_done">,
+ Intrinsic<[], [llvm_sflaganyptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrArgMemOnly], "", [SDNPMemOperand]>;
+ def int_tpu_syncset_remote_doneinv : ClangBuiltin<"__builtin_tpu_syncset_remote_doneinv">,
+ Intrinsic<[], [llvm_sflaganyptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrArgMemOnly], "", [SDNPMemOperand]>;
+ def int_tpu_syncset_other_done : ClangBuiltin<"__builtin_tpu_syncset_other_done">,
+ Intrinsic<[], [llvm_sflagotherptr_ty, llvm_i32_ty], [IntrArgMemOnly],
+ "", [SDNPMemOperand]>;
+ def int_tpu_syncset_both : ClangBuiltin<"__builtin_tpu_syncset_both">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_sflagotherptr_ty, llvm_i32_ty], [IntrArgMemOnly],
+ "", [SDNPMemOperand]>;
+ def int_tpu_syncset_both_done : ClangBuiltin<"__builtin_tpu_syncset_both_done">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_sflagotherptr_ty, llvm_i32_ty], [IntrArgMemOnly],
+ "", [SDNPMemOperand]>;
+ def int_tpu_syncdonemov : ClangBuiltin<"__builtin_tpu_syncdonemov">,
+ Intrinsic<[llvm_i32_ty], [llvm_sflagptr_ty], [IntrArgMemOnly],
+ "", [SDNPMemOperand]>;
+ def int_tpu_syncpamov : ClangBuiltin<"__builtin_tpu_syncpamov">,
+ Intrinsic<[llvm_i32_ty], [llvm_sflagptr_ty], [IntrArgMemOnly],
+ "", [SDNPMemOperand]>;
+
+ // Wait instructions are barriers. No memory access should cross them.
+ def int_tpu_waiteq : ClangBuiltin<"__builtin_tpu_waiteq">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty], []>;
+ def int_tpu_waiteq_yieldable : ClangBuiltin<"__builtin_tpu_waiteq_yieldable">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty], []>;
+ def int_tpu_waiteqordone : ClangBuiltin<"__builtin_tpu_waiteqordone">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty], []>;
+ def int_tpu_waiteqordone_yieldable : ClangBuiltin<"__builtin_tpu_waiteqordone_yieldable">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty], []>;
+ def int_tpu_waitne : ClangBuiltin<"__builtin_tpu_waitne">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty], []>;
+ def int_tpu_waitne_yieldable : ClangBuiltin<"__builtin_tpu_waitne_yieldable">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty], []>;
+ def int_tpu_waitneordone : ClangBuiltin<"__builtin_tpu_waitneordone">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty], []>;
+ def int_tpu_waitneordone_yieldable : ClangBuiltin<"__builtin_tpu_waitneordone_yieldable">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty], []>;
+ def int_tpu_waitgt : ClangBuiltin<"__builtin_tpu_waitgt">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty], []>;
+ def int_tpu_waitgt_yieldable : ClangBuiltin<"__builtin_tpu_waitgt_yieldable">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty], []>;
+ def int_tpu_waitgtordone : ClangBuiltin<"__builtin_tpu_waitgtordone">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty], []>;
+ def int_tpu_waitgtordone_yieldable : ClangBuiltin<"__builtin_tpu_waitgtordone_yieldable">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty], []>;
+ def int_tpu_waitge : ClangBuiltin<"__builtin_tpu_waitge">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty], []>;
+ def int_tpu_waitge_yieldable : ClangBuiltin<"__builtin_tpu_waitge_yieldable">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty], []>;
+ def int_tpu_waitgeordone : ClangBuiltin<"__builtin_tpu_waitgeordone">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty], []>;
+ def int_tpu_waitgeordone_yieldable : ClangBuiltin<"__builtin_tpu_waitgeordone_yieldable">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty], []>;
+ def int_tpu_waitlt : ClangBuiltin<"__builtin_tpu_waitlt">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty], []>;
+ def int_tpu_waitlt_yieldable : ClangBuiltin<"__builtin_tpu_waitlt_yieldable">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty], []>;
+ def int_tpu_waitltordone : ClangBuiltin<"__builtin_tpu_waitltordone">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty], []>;
+ def int_tpu_waitltordone_yieldable : ClangBuiltin<"__builtin_tpu_waitltordone_yieldable">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty], []>;
+ def int_tpu_waitle : ClangBuiltin<"__builtin_tpu_waitle">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty], []>;
+ def int_tpu_waitle_yieldable : ClangBuiltin<"__builtin_tpu_waitle_yieldable">,
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty], []>;
+ // Note: "waitleordone" variants are not supported on any platform.
+ def int_tpu_waitdone : ClangBuiltin<"__builtin_tpu_waitdone">,
+ Intrinsic<[], [llvm_sflagptr_ty], []>;
+ def int_tpu_waitdone_yieldable : ClangBuiltin<"__builtin_tpu_waitdone_yieldable">,
+ Intrinsic<[], [llvm_sflagptr_ty], []>;
+ def int_tpu_waitnotdone : ClangBuiltin<"__builtin_tpu_waitnotdone">,
+ Intrinsic<[], [llvm_sflagptr_ty], []>;
+ def int_tpu_waitnotdone_yieldable : ClangBuiltin<"__builtin_tpu_waitnotdone_yieldable">,
+ Intrinsic<[], [llvm_sflagptr_ty], []>;
+ def int_tpu_nop : ClangBuiltin<"__builtin_tpu_nop">,
+ Intrinsic<[], [], [IntrInaccessibleMemOnly]>;
+
+ def int_tpu_loop_parallel : ClangBuiltin<"__builtin_tpu_loop_parallel">,
+ Intrinsic<[], [], []>;
+
+ def int_tpu_dma_hbm_to_smem :
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_anyptr_ty, llvm_smemptr_ty,
+ llvm_i32_ty], [IntrArgMemOnly], "", [SDNPMemOperand]>;
+ def int_tpu_dma_hbm_to_vmem :
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_anyptr_ty,
+ llvm_vmemv1024i32ptr_ty, llvm_i32_ty],
+ [IntrArgMemOnly], "", [SDNPMemOperand]>;
+ def int_tpu_dma_hbm_to_vmem_hib_update :
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_anyptr_ty,
+ llvm_vmemv1024i32ptr_ty, llvm_i32_ty],
+ [IntrArgMemOnly], "", [SDNPMemOperand]>;
+ def int_tpu_dma_hbm_to_timem :
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_anyptr_ty, llvm_timemptr_ty,
+ llvm_i32_ty], [IntrArgMemOnly], "", [SDNPMemOperand]>;
+ def int_tpu_dma_hbm_to_hbm :
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_anyptr_ty, llvm_anyptr_ty,
+ llvm_i32_ty], [IntrArgMemOnly], "", [SDNPMemOperand]>;
+ def int_tpu_dma_hbm_to_hib :
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_anyptr_ty,
+ llvm_i32_ty], [IntrArgMemOnly], "", [SDNPMemOperand]>;
+ def int_tpu_dma_smem_to_hbm :
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_smemptr_ty, llvm_anyptr_ty,
+ llvm_i32_ty], [IntrArgMemOnly], "", [SDNPMemOperand]>;
+ def int_tpu_dma_vmem_to_hbm :
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_vmemv1024i32ptr_ty, llvm_anyptr_ty,
+ llvm_i32_ty], [IntrArgMemOnly], "", [SDNPMemOperand]>;
+ def int_tpu_dma_timem_to_hbm :
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_timemptr_ty, llvm_anyptr_ty,
+ llvm_i32_ty], [IntrArgMemOnly], "", [SDNPMemOperand]>;
+
+ // TODO(b/175253540): Support more fine grained sfence AA analysis.
+ def int_tpu_sfence_stream_spmem :
+ ClangBuiltin<"__builtin_tpu_sfence_stream_spmem">,
+ Intrinsic<[], [], []>;
+ def int_tpu_sfence_stream_hbm :
+ ClangBuiltin<"__builtin_tpu_sfence_stream_hbm">,
+ Intrinsic<[], [], []>;
+
+ def int_tpu_sfence_sel :
+ ClangBuiltin<"__builtin_tpu_sfence_sel">,
+ Intrinsic<[], [llvm_i32_ty], []>;
+ def int_tpu_sfence_scmf :
+ ClangBuiltin<"__builtin_tpu_sfence_scmf">,
+ Intrinsic<[], [], []>;
+
+ // SparseCore instruction fence that behaves like an atomic_fence but
+ // also prevents instructions from moving into the branch delay slot,
+ // used to fence instruction memory.
+ def int_tpu_sfence_imem :
+ Intrinsic<[], [], []>;
+
+ // Initialize the stack pointers in top level functions.
+ def int_tpu_init_stack :
+ ClangBuiltin<"__builtin_tpu_init_stack">,
+ Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], []>;
+
+ // DMA based on the descriptor from SMem. It may access any memory.
+ def int_tpu_dma_descriptor :
+ ClangBuiltin<"__builtin_tpu_dma_descriptor">,
+ Intrinsic<[], [llvm_smemptr_ty], []>;
+
+ multiclass DmaType<LLVMType SrcPtrType, LLVMType DstPtrType> {
+ // single_strided: same as a plain DMA, with an extra destination stride
+ // and elements per stride.
+ def _single_strided :
+ Intrinsic<[], [llvm_sflagptr_ty, SrcPtrType, DstPtrType,
+ llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrArgMemOnly], "", [SDNPMemOperand]>;
+ // General DMA takes an extra source sync flag, stride level count and stride
+ // descriptor pointer.
+ def _general :
+ Intrinsic<[], [llvm_sflagptr_ty, SrcPtrType, DstPtrType,
+ /*length*/llvm_i32_ty, llvm_sflagptr_ty,
+ llvm_smemptr_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrArgMemOnly], "", [SDNPMemOperand]>;
+ }
+
+ defm int_tpu_dma_hbm_to_smem : DmaType<llvm_anyptr_ty, llvm_smemptr_ty>;
+ defm int_tpu_dma_hbm_to_vmem : DmaType<llvm_anyptr_ty, llvm_vmemv1024i32ptr_ty>;
+ defm int_tpu_dma_smem_to_hbm : DmaType<llvm_smemptr_ty, llvm_anyptr_ty>;
+ defm int_tpu_dma_vmem_to_hbm : DmaType<llvm_vmemv1024i32ptr_ty, llvm_anyptr_ty>;
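+
+  // Each defm above appends the multiclass suffixes to its name; e.g.
+  // int_tpu_dma_hbm_to_smem expands to int_tpu_dma_hbm_to_smem_single_strided
+  // and int_tpu_dma_hbm_to_smem_general, i.e. the intrinsics
+  // llvm.tpu.dma.hbm.to.smem.single.strided and llvm.tpu.dma.hbm.to.smem.general.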
+
+ // Local DMA for SparseCore.
+ multiclass SimpleDmaSc_<LLVMType SrcPtrType, LLVMType DstPtrType> {
+ def _sc_simple :
+ Intrinsic<[], [llvm_sflaganyptr_ty, SrcPtrType, DstPtrType,
+ /* Size */ llvm_i32_ty,
+ /* trace_en */ llvm_i32_ty],
+ [IntrArgMemOnly], "", [SDNPMemOperand]>;
+ }
+
+ defm int_tpu_dma_hbm_to_hbm : SimpleDmaSc_<llvm_hbmptr_ty, llvm_hbmptr_ty>;
+ defm int_tpu_dma_smem_to_smem : SimpleDmaSc_<llvm_smemptr_ty, llvm_smemanyptr_ty>;
+ defm int_tpu_dma_hbm_to_smem : SimpleDmaSc_<llvm_hbmptr_ty, llvm_smemptr_ty>;
+ defm int_tpu_dma_hbm_to_simem : SimpleDmaSc_<llvm_hbmptr_ty, llvm_simemptr_ty>;
+ defm int_tpu_dma_hbm_to_timem : SimpleDmaSc_<llvm_hbmptr_ty, llvm_timemptr_ty>;
+ defm int_tpu_dma_hbm_to_tilespmem : SimpleDmaSc_<llvm_hbmptr_ty, llvm_tilespmemptr_ty>;
+ defm int_tpu_dma_hbm_to_spmem : SimpleDmaSc_<llvm_hbmptr_ty, llvm_spmemptr_ty>;
+ defm int_tpu_dma_smem_to_hbm : SimpleDmaSc_<llvm_smemptr_ty, llvm_hbmptr_ty>;
+ defm int_tpu_dma_timem_to_hbm : SimpleDmaSc_<llvm_timemptr_ty, llvm_hbmptr_ty>;
+ defm int_tpu_dma_tilespmem_to_hbm : SimpleDmaSc_<llvm_tilespmemptr_ty, llvm_hbmptr_ty>;
+ defm int_tpu_dma_spmem_to_hbm : SimpleDmaSc_<llvm_spmemptr_ty, llvm_hbmptr_ty>;
+ defm int_tpu_dma_spmem_to_spmem : SimpleDmaSc_<llvm_spmemptr_ty, llvm_spmemptr_ty>;
+ defm int_tpu_dma_tilespmem_to_spmem : SimpleDmaSc_<llvm_tilespmemptr_ty, llvm_spmemptr_ty>;
+ defm int_tpu_dma_spmem_to_tilespmem : SimpleDmaSc_<llvm_spmemptr_ty, llvm_tilespmemptr_ty>;
+
+ // Local Host DMA for SparseCore.
+ multiclass SimpleHostDmaSc_<LLVMType SrcPtrType, LLVMType DstPtrType> {
+ def _sc_simple :
+ Intrinsic<[], [llvm_sflaganyptr_ty, SrcPtrType, DstPtrType,
+ /* Offset */ llvm_i32_ty, /* Size */ llvm_i32_ty,
+ /* trace_en */ llvm_i32_ty],
+ [IntrArgMemOnly], "", [SDNPMemOperand]>;
+ }
+
+ defm int_tpu_dma_hbm_to_iova : SimpleHostDmaSc_<llvm_hbmptr_ty, llvm_iovaptr_ty>;
+ defm int_tpu_dma_iova_to_hbm : SimpleHostDmaSc_<llvm_iovaptr_ty, llvm_hbmptr_ty>;
+
+ // General DMA for SparseCore.
+ multiclass GeneralDmaSc_<LLVMType SrcPtrType, LLVMType DstPtrType> {
+ def _sc_general :
+ Intrinsic<[], [llvm_sflaganyptr_ty, llvm_i32_ty, SrcPtrType, DstPtrType,
+ /*length*/llvm_i32_ty, llvm_sflagptr_ty, llvm_i32_ty,
+ llvm_dregptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrArgMemOnly], "", [SDNPMemOperand]>;
+ }
+
+ defm int_tpu_dma_hbm_to_hbm : GeneralDmaSc_<llvm_hbmptr_ty, llvm_hbmanyptr_ty>;
+ defm int_tpu_dma_smem_to_smem : GeneralDmaSc_<llvm_smemptr_ty, llvm_smemanyptr_ty>;
+ defm int_tpu_dma_hbm_to_smem : GeneralDmaSc_<llvm_hbmanyptr_ty, llvm_smemptr_ty>;
+ defm int_tpu_dma_hbm_to_timem : GeneralDmaSc_<llvm_hbmanyptr_ty, llvm_timemptr_ty>;
+ defm int_tpu_dma_hbm_to_spmem : GeneralDmaSc_<llvm_hbmanyptr_ty, llvm_spmemptr_ty>;
+ defm int_tpu_dma_smem_to_hbm : GeneralDmaSc_<llvm_smemptr_ty, llvm_hbmanyptr_ty>;
+ defm int_tpu_dma_timem_to_hbm : GeneralDmaSc_<llvm_timemptr_ty, llvm_hbmanyptr_ty>;
+ defm int_tpu_dma_spmem_to_hbm : GeneralDmaSc_<llvm_spmemptr_ty, llvm_hbmanyptr_ty>;
+ defm int_tpu_dma_spmem_to_spmem : GeneralDmaSc_<llvm_spmemptr_ty, llvm_spmemptr_ty>;
+
+ // Linear stream intrinsics
+
+ multiclass StreamLinearGatherIntr<LLVMType TileLocalMem,
+ LLVMType OffTileMem,
+ string tile_local_mem,
+ string off_tile_mem> {
+ def _#off_tile_mem#"_to_"#tile_local_mem :
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty, OffTileMem, TileLocalMem,
+ llvm_i32_ty, llvm_i32_ty],
+ [IntrArgMemOnly, IntrWillReturn, ReadOnly<ArgIndex<0>>,
+ ReadOnly<ArgIndex<2>>],
+ "", [SDNPMemOperand]>;
+ }
+
+ multiclass StreamLinearScatterIntr<LLVMType TileLocalMem,
+ LLVMType OffTileMem,
+ string tile_local_mem,
+ string off_tile_mem> {
+ def _#tile_local_mem#"_to_"#off_tile_mem :
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty, TileLocalMem, OffTileMem,
+ llvm_i32_ty, llvm_i32_ty],
+ [IntrArgMemOnly, IntrWillReturn, ReadOnly<ArgIndex<0>>,
+ WriteOnly<ArgIndex<3>>],
+ "", [SDNPMemOperand]>;
+ }
+
+ multiclass StreamLinearGatherIntrWithMod<LLVMType TileLocalMem,
+ LLVMType OffTileMem,
+ string tile_local_mem,
+ string off_tile_mem,
+ bit HasAddMod> {
+ defm _cb : StreamLinearGatherIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_upd : StreamLinearGatherIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ if !eq(HasAddMod, 1) then {
+ defm _add_s32 : StreamLinearGatherIntr<TileLocalMem, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _add_f32 : StreamLinearGatherIntr<TileLocalMem, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_add_s32 : StreamLinearGatherIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_add_f32 : StreamLinearGatherIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_upd_add_s32 : StreamLinearGatherIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_upd_add_f32 : StreamLinearGatherIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ }
+ }
+
+ multiclass StreamLinearScatterIntrWithMod<LLVMType TileLocalMem,
+ LLVMType OffTileMem,
+ string tile_local_mem,
+ string off_tile_mem,
+ bit HasAddMod> {
+ defm _cb : StreamLinearScatterIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_upd : StreamLinearScatterIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ if !eq(HasAddMod, 1) then {
+ defm _add_s32 : StreamLinearScatterIntr<TileLocalMem, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _add_f32 : StreamLinearScatterIntr<TileLocalMem, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_add_s32 : StreamLinearScatterIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_add_f32 : StreamLinearScatterIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_upd_add_s32 : StreamLinearScatterIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_upd_add_f32 : StreamLinearScatterIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ }
+ }
+
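+  // The concatenated def names encode the transfer direction; e.g. the first
+  // defm below produces int_tpu_stream_linear_gather_spmem_to_smem, i.e. the
+  // intrinsic llvm.tpu.stream.linear.gather.spmem.to.smem.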
+ defm int_tpu_stream_linear_gather : StreamLinearGatherIntr<llvm_smemptr_ty, llvm_spmemptr_ty, "smem", "spmem">;
+ defm int_tpu_stream_linear_gather : StreamLinearGatherIntr<llvm_smemptr_ty, llvm_tilespmemptr_ty, "smem", "tilespmem_tileN">;
+ defm int_tpu_stream_linear_gather : StreamLinearGatherIntr<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm">;
+ defm int_tpu_stream_linear_gather : StreamLinearGatherIntr<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm4b">;
+ defm int_tpu_stream_linear_gather : StreamLinearGatherIntr<llvm_tilespmemptr_ty, llvm_spmemptr_ty, "tilespmem", "spmem">;
+ defm int_tpu_stream_linear_gather : StreamLinearGatherIntr<llvm_tilespmemptr_ty, llvm_tilespmemptr_ty, "tilespmem", "tilespmem_tileN">;
+ // Stream gather intrinsics with all combinations of .add.s32, .add.f32, .cb, and .cb.upd.
+ // The .add variants are only generated with tile local memory "tilespmem" support.
+ defm int_tpu_stream_linear_gather : StreamLinearGatherIntrWithMod<llvm_smemptr_ty, llvm_spmemptr_ty, "smem", "spmem", 0>;
+ defm int_tpu_stream_linear_gather : StreamLinearGatherIntrWithMod<llvm_smemptr_ty, llvm_tilespmemptr_ty, "smem", "tilespmem_tileN", 0>;
+ defm int_tpu_stream_linear_gather : StreamLinearGatherIntrWithMod<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm", 1>;
+ defm int_tpu_stream_linear_gather : StreamLinearGatherIntrWithMod<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm4b", 1>;
+ defm int_tpu_stream_linear_gather : StreamLinearGatherIntrWithMod<llvm_tilespmemptr_ty, llvm_spmemptr_ty, "tilespmem", "spmem", 1>;
+ defm int_tpu_stream_linear_gather : StreamLinearGatherIntrWithMod<llvm_tilespmemptr_ty, llvm_tilespmemptr_ty, "tilespmem", "tilespmem_tileN", 1>;
+ defm int_tpu_stream_linear_scatter : StreamLinearScatterIntr<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm">;
+ defm int_tpu_stream_linear_scatter : StreamLinearScatterIntr<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm4b">;
+ defm int_tpu_stream_linear_scatter : StreamLinearScatterIntr<llvm_smemptr_ty, llvm_spmemptr_ty, "smem", "spmem">;
+ defm int_tpu_stream_linear_scatter : StreamLinearScatterIntr<llvm_smemptr_ty, llvm_tilespmemptr_ty, "smem", "tilespmem_tileN">;
+ defm int_tpu_stream_linear_scatter : StreamLinearScatterIntr<llvm_tilespmemptr_ty, llvm_spmemptr_ty, "tilespmem", "spmem">;
+ defm int_tpu_stream_linear_scatter : StreamLinearScatterIntr<llvm_tilespmemptr_ty, llvm_tilespmemptr_ty, "tilespmem", "tilespmem_tileN">;
+ // Stream scatter instructions with all combinations of .add.s32, .add.f32, .cb, and .cb.upd.
+ // The .add variants are only generated with off tile memory "spmem" and "tilespmem.tileN"
+ // support.
+ defm int_tpu_stream_linear_scatter : StreamLinearScatterIntrWithMod<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm", 0>;
+ defm int_tpu_stream_linear_scatter : StreamLinearScatterIntrWithMod<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm4b", 0>;
+ defm int_tpu_stream_linear_scatter : StreamLinearScatterIntrWithMod<llvm_smemptr_ty, llvm_spmemptr_ty, "smem", "spmem", 1>;
+ defm int_tpu_stream_linear_scatter : StreamLinearScatterIntrWithMod<llvm_smemptr_ty, llvm_tilespmemptr_ty, "smem", "tilespmem_tileN", 1>;
+ defm int_tpu_stream_linear_scatter : StreamLinearScatterIntrWithMod<llvm_tilespmemptr_ty, llvm_spmemptr_ty, "tilespmem", "spmem", 1>;
+ defm int_tpu_stream_linear_scatter : StreamLinearScatterIntrWithMod<llvm_tilespmemptr_ty, llvm_tilespmemptr_ty, "tilespmem", "tilespmem_tileN", 1>;
+
+ // Indirect stream intrinsics
+
+ multiclass StreamIndirectGatherIntr<LLVMType TileLocalMem,
+ LLVMType OffTileMem,
+ string tile_local_mem,
+ string off_tile_mem> {
+ def _#off_tile_mem#"_to_"#tile_local_mem :
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty, OffTileMem, TileLocalMem,
+ llvm_i32_ty, llvm_tilespmemptr_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrArgMemOnly, IntrWillReturn, ReadOnly<ArgIndex<0>>,
+ ReadOnly<ArgIndex<2>>],
+ "", [SDNPMemOperand]>;
+ }
+
+ multiclass StreamIndirectScatterIntr<LLVMType TileLocalMem,
+ LLVMType OffTileMem,
+ string tile_local_mem,
+ string off_tile_mem> {
+ def _#tile_local_mem#"_to_"#off_tile_mem :
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty, TileLocalMem, OffTileMem,
+ llvm_i32_ty, llvm_tilespmemptr_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrArgMemOnly, IntrWillReturn, ReadOnly<ArgIndex<0>>,
+ WriteOnly<ArgIndex<3>>],
+ "", [SDNPMemOperand]>;
+ }
+
+ multiclass StreamIndirectGatherIntrWithMod<LLVMType TileLocalMem,
+ LLVMType OffTileMem,
+ string tile_local_mem,
+ string off_tile_mem,
+ bit HasAddMod> {
+ defm _cb : StreamIndirectGatherIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_upd : StreamIndirectGatherIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ if !eq(HasAddMod, 1) then {
+ defm _add_s32 : StreamIndirectGatherIntr<TileLocalMem, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _add_f32 : StreamIndirectGatherIntr<TileLocalMem, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_add_s32 : StreamIndirectGatherIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_add_f32 : StreamIndirectGatherIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_upd_add_s32 : StreamIndirectGatherIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_upd_add_f32 : StreamIndirectGatherIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ }
+ }
+
+ multiclass StreamIndirectScatterIntrWithMod<LLVMType TileLocalMem,
+ LLVMType OffTileMem,
+ string tile_local_mem,
+ string off_tile_mem,
+ bit HasAddMod> {
+ defm _cb : StreamIndirectScatterIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_upd : StreamIndirectScatterIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ if !eq(HasAddMod, 1) then {
+ defm _add_s32 : StreamIndirectScatterIntr<TileLocalMem, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _add_f32 : StreamIndirectScatterIntr<TileLocalMem, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_add_s32 : StreamIndirectScatterIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_add_f32 : StreamIndirectScatterIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_upd_add_s32 : StreamIndirectScatterIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_upd_add_f32 : StreamIndirectScatterIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ }
+ }
+
+ defm int_tpu_stream_indirect_gather : StreamIndirectGatherIntr<llvm_smemptr_ty, llvm_spmemptr_ty, "smem", "spmem">;
+ defm int_tpu_stream_indirect_gather : StreamIndirectGatherIntr<llvm_smemptr_ty, llvm_tilespmemptr_ty, "smem", "tilespmem_tileN">;
+ defm int_tpu_stream_indirect_gather : StreamIndirectGatherIntr<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm">;
+ defm int_tpu_stream_indirect_gather : StreamIndirectGatherIntr<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm4b">;
+ defm int_tpu_stream_indirect_gather : StreamIndirectGatherIntr<llvm_tilespmemptr_ty, llvm_spmemptr_ty, "tilespmem", "spmem">;
+ defm int_tpu_stream_indirect_gather : StreamIndirectGatherIntr<llvm_tilespmemptr_ty, llvm_tilespmemptr_ty, "tilespmem", "tilespmem_tileN">;
+ // Stream gather intrinsics with all combinations of .add.s32, .add.f32, .cb, and .cb.upd.
+ // The .add variants are only generated with tile local memory "tilespmem" support.
+ defm int_tpu_stream_indirect_gather : StreamIndirectGatherIntrWithMod<llvm_smemptr_ty, llvm_spmemptr_ty, "smem", "spmem", 0>;
+ defm int_tpu_stream_indirect_gather : StreamIndirectGatherIntrWithMod<llvm_smemptr_ty, llvm_tilespmemptr_ty, "smem", "tilespmem_tileN", 0>;
+ defm int_tpu_stream_indirect_gather : StreamIndirectGatherIntrWithMod<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm", 1>;
+ defm int_tpu_stream_indirect_gather : StreamIndirectGatherIntrWithMod<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm4b", 1>;
+ defm int_tpu_stream_indirect_gather : StreamIndirectGatherIntrWithMod<llvm_tilespmemptr_ty, llvm_spmemptr_ty, "tilespmem", "spmem", 1>;
+ defm int_tpu_stream_indirect_gather : StreamIndirectGatherIntrWithMod<llvm_tilespmemptr_ty, llvm_tilespmemptr_ty, "tilespmem", "tilespmem_tileN", 1>;
+ defm int_tpu_stream_indirect_scatter : StreamIndirectScatterIntr<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm">;
+ defm int_tpu_stream_indirect_scatter : StreamIndirectScatterIntr<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm4b">;
+ defm int_tpu_stream_indirect_scatter : StreamIndirectScatterIntr<llvm_smemptr_ty, llvm_spmemptr_ty, "smem", "spmem">;
+ defm int_tpu_stream_indirect_scatter : StreamIndirectScatterIntr<llvm_smemptr_ty, llvm_tilespmemptr_ty, "smem", "tilespmem_tileN">;
+ defm int_tpu_stream_indirect_scatter : StreamIndirectScatterIntr<llvm_tilespmemptr_ty, llvm_spmemptr_ty, "tilespmem", "spmem">;
+ defm int_tpu_stream_indirect_scatter : StreamIndirectScatterIntr<llvm_tilespmemptr_ty, llvm_tilespmemptr_ty, "tilespmem", "tilespmem_tileN">;
+ // Stream scatter instructions with all combinations of .add.s32, .add.f32, .cb, and .cb.upd.
+ // The .add variants are only generated with off tile memory "spmem" and "tilespmem.tileN"
+ // support.
+ defm int_tpu_stream_indirect_scatter : StreamIndirectScatterIntrWithMod<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm", 0>;
+ defm int_tpu_stream_indirect_scatter : StreamIndirectScatterIntrWithMod<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm4b", 0>;
+ defm int_tpu_stream_indirect_scatter : StreamIndirectScatterIntrWithMod<llvm_smemptr_ty, llvm_spmemptr_ty, "smem", "spmem", 1>;
+ defm int_tpu_stream_indirect_scatter : StreamIndirectScatterIntrWithMod<llvm_smemptr_ty, llvm_tilespmemptr_ty, "smem", "tilespmem_tileN", 1>;
+ defm int_tpu_stream_indirect_scatter : StreamIndirectScatterIntrWithMod<llvm_tilespmemptr_ty, llvm_spmemptr_ty, "tilespmem", "spmem", 1>;
+ defm int_tpu_stream_indirect_scatter : StreamIndirectScatterIntrWithMod<llvm_tilespmemptr_ty, llvm_tilespmemptr_ty, "tilespmem", "tilespmem_tileN", 1>;
+
+ // Indirect vreg stream intrinsics
+
+ multiclass StreamIndirectVregGatherIntr<LLVMType TileLocalMem,
+ LLVMType OffTileMem,
+ string tile_local_mem,
+ string off_tile_mem> {
+ def _#off_tile_mem#"_to_"#tile_local_mem :
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty, OffTileMem, TileLocalMem,
+ llvm_i32_ty, llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty],
+ [IntrArgMemOnly, IntrWillReturn, ReadOnly<ArgIndex<0>>,
+ ReadOnly<ArgIndex<2>>],
+ "", [SDNPMemOperand]>;
+ }
+
+ multiclass StreamIndirectVregScatterIntr<LLVMType TileLocalMem,
+ LLVMType OffTileMem,
+ string tile_local_mem,
+ string off_tile_mem> {
+ def _#tile_local_mem#"_to_"#off_tile_mem :
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty, TileLocalMem, OffTileMem,
+ llvm_i32_ty, llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty],
+ [IntrArgMemOnly, IntrWillReturn, ReadOnly<ArgIndex<0>>,
+ WriteOnly<ArgIndex<3>>],
+ "", [SDNPMemOperand]>;
+ }
+
+ multiclass StreamIndirectVregGatherIntrWithMod<LLVMType TileLocalMem,
+ LLVMType OffTileMem,
+ string tile_local_mem,
+ string off_tile_mem,
+ bit HasAddMod> {
+ defm _cb : StreamIndirectVregGatherIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_upd : StreamIndirectVregGatherIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ if !eq(HasAddMod, 1) then {
+ defm _add_s32 : StreamIndirectVregGatherIntr<TileLocalMem, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _add_f32 : StreamIndirectVregGatherIntr<TileLocalMem, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_add_s32 : StreamIndirectVregGatherIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_add_f32 : StreamIndirectVregGatherIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_upd_add_s32 : StreamIndirectVregGatherIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_upd_add_f32 : StreamIndirectVregGatherIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ }
+ }
+
+ multiclass StreamIndirectVregScatterIntrWithMod<LLVMType TileLocalMem,
+ LLVMType OffTileMem,
+ string tile_local_mem,
+ string off_tile_mem,
+ bit HasAddMod> {
+ defm _cb : StreamIndirectVregScatterIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_upd : StreamIndirectVregScatterIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ if !eq(HasAddMod, 1) then {
+ defm _add_s32 : StreamIndirectVregScatterIntr<TileLocalMem, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _add_f32 : StreamIndirectVregScatterIntr<TileLocalMem, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_add_s32 : StreamIndirectVregScatterIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_add_f32 : StreamIndirectVregScatterIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_upd_add_s32 : StreamIndirectVregScatterIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_upd_add_f32 : StreamIndirectVregScatterIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ }
+ }
+
+ defm int_tpu_stream_indirect_vreg_gather : StreamIndirectVregGatherIntr<llvm_smemptr_ty, llvm_spmemptr_ty, "smem", "spmem">;
+ defm int_tpu_stream_indirect_vreg_gather : StreamIndirectVregGatherIntr<llvm_smemptr_ty, llvm_tilespmemptr_ty, "smem", "tilespmem_tileN">;
+ defm int_tpu_stream_indirect_vreg_gather : StreamIndirectVregGatherIntr<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm">;
+ defm int_tpu_stream_indirect_vreg_gather : StreamIndirectVregGatherIntr<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm4b">;
+ defm int_tpu_stream_indirect_vreg_gather : StreamIndirectVregGatherIntr<llvm_tilespmemptr_ty, llvm_spmemptr_ty, "tilespmem", "spmem">;
+ defm int_tpu_stream_indirect_vreg_gather : StreamIndirectVregGatherIntr<llvm_tilespmemptr_ty, llvm_tilespmemptr_ty, "tilespmem", "tilespmem_tileN">;
+ // Stream gather intrinsics with all combinations of .add.s32, .add.f32, .cb, and .cb.upd.
+ // The .add variants are only generated with tile local memory "tilespmem" support.
+ defm int_tpu_stream_indirect_vreg_gather : StreamIndirectVregGatherIntrWithMod<llvm_smemptr_ty, llvm_spmemptr_ty, "smem", "spmem", 0>;
+ defm int_tpu_stream_indirect_vreg_gather : StreamIndirectVregGatherIntrWithMod<llvm_smemptr_ty, llvm_tilespmemptr_ty, "smem", "tilespmem_tileN", 0>;
+ defm int_tpu_stream_indirect_vreg_gather : StreamIndirectVregGatherIntrWithMod<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm", 1>;
+ defm int_tpu_stream_indirect_vreg_gather : StreamIndirectVregGatherIntrWithMod<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm4b", 1>;
+ defm int_tpu_stream_indirect_vreg_gather : StreamIndirectVregGatherIntrWithMod<llvm_tilespmemptr_ty, llvm_spmemptr_ty, "tilespmem", "spmem", 1>;
+ defm int_tpu_stream_indirect_vreg_gather : StreamIndirectVregGatherIntrWithMod<llvm_tilespmemptr_ty, llvm_tilespmemptr_ty, "tilespmem", "tilespmem_tileN", 1>;
+ defm int_tpu_stream_indirect_vreg_scatter : StreamIndirectVregScatterIntr<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm">;
+ defm int_tpu_stream_indirect_vreg_scatter : StreamIndirectVregScatterIntr<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm4b">;
+ defm int_tpu_stream_indirect_vreg_scatter : StreamIndirectVregScatterIntr<llvm_smemptr_ty, llvm_spmemptr_ty, "smem", "spmem">;
+ defm int_tpu_stream_indirect_vreg_scatter : StreamIndirectVregScatterIntr<llvm_smemptr_ty, llvm_tilespmemptr_ty, "smem", "tilespmem_tileN">;
+ defm int_tpu_stream_indirect_vreg_scatter : StreamIndirectVregScatterIntr<llvm_tilespmemptr_ty, llvm_spmemptr_ty, "tilespmem", "spmem">;
+ defm int_tpu_stream_indirect_vreg_scatter : StreamIndirectVregScatterIntr<llvm_tilespmemptr_ty, llvm_tilespmemptr_ty, "tilespmem", "tilespmem_tileN">;
+ // Stream scatter instructions with all combinations of .add.s32, .add.f32, .cb, and .cb.upd.
+ // The .add variants are only generated with off tile memory "spmem" and "tilespmem.tileN"
+ // support.
+ defm int_tpu_stream_indirect_vreg_scatter : StreamIndirectVregScatterIntrWithMod<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm", 0>;
+ defm int_tpu_stream_indirect_vreg_scatter : StreamIndirectVregScatterIntrWithMod<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm4b", 0>;
+ defm int_tpu_stream_indirect_vreg_scatter : StreamIndirectVregScatterIntrWithMod<llvm_smemptr_ty, llvm_spmemptr_ty, "smem", "spmem", 1>;
+ defm int_tpu_stream_indirect_vreg_scatter : StreamIndirectVregScatterIntrWithMod<llvm_smemptr_ty, llvm_tilespmemptr_ty, "smem", "tilespmem_tileN", 1>;
+ defm int_tpu_stream_indirect_vreg_scatter : StreamIndirectVregScatterIntrWithMod<llvm_tilespmemptr_ty, llvm_spmemptr_ty, "tilespmem", "spmem", 1>;
+ defm int_tpu_stream_indirect_vreg_scatter : StreamIndirectVregScatterIntrWithMod<llvm_tilespmemptr_ty, llvm_tilespmemptr_ty, "tilespmem", "tilespmem_tileN", 1>;
+
+ // Strided stream intrinsics
+
+ multiclass StreamStridedGatherIntr<LLVMType TileLocalMem,
+ LLVMType OffTileMem,
+ string tile_local_mem,
+ string off_tile_mem> {
+ def _#off_tile_mem#"_to_"#tile_local_mem :
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty, OffTileMem, TileLocalMem,
+ llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrArgMemOnly, IntrWillReturn, ReadOnly<ArgIndex<0>>,
+ ReadOnly<ArgIndex<2>>],
+ "", [SDNPMemOperand]>;
+ }
+
+ multiclass StreamStridedScatterIntr<LLVMType TileLocalMem,
+ LLVMType OffTileMem,
+ string tile_local_mem,
+ string off_tile_mem> {
+ def _#tile_local_mem#"_to_"#off_tile_mem :
+ Intrinsic<[], [llvm_sflagptr_ty, llvm_i32_ty, TileLocalMem, OffTileMem,
+ llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrArgMemOnly, IntrWillReturn, ReadOnly<ArgIndex<0>>,
+ WriteOnly<ArgIndex<3>>],
+ "", [SDNPMemOperand]>;
+ }
+
+ multiclass StreamStridedGatherIntrWithMod<LLVMType TileLocalMem,
+ LLVMType OffTileMem,
+ string tile_local_mem,
+ string off_tile_mem,
+ bit HasAddMod> {
+ defm _cb : StreamStridedGatherIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_upd : StreamStridedGatherIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ if !eq(HasAddMod, 1) then {
+ defm _add_s32 : StreamStridedGatherIntr<TileLocalMem, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _add_f32 : StreamStridedGatherIntr<TileLocalMem, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_add_s32 : StreamStridedGatherIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_add_f32 : StreamStridedGatherIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_upd_add_s32 : StreamStridedGatherIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_upd_add_f32 : StreamStridedGatherIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ }
+ }
+
+ multiclass StreamStridedScatterIntrWithMod<LLVMType TileLocalMem,
+ LLVMType OffTileMem,
+ string tile_local_mem,
+ string off_tile_mem,
+ bit HasAddMod> {
+ defm _cb : StreamStridedScatterIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_upd : StreamStridedScatterIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ if !eq(HasAddMod, 1) then {
+ defm _add_s32 : StreamStridedScatterIntr<TileLocalMem, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _add_f32 : StreamStridedScatterIntr<TileLocalMem, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_add_s32 : StreamStridedScatterIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_add_f32 : StreamStridedScatterIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_upd_add_s32 : StreamStridedScatterIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ defm _cb_upd_add_f32 : StreamStridedScatterIntr<llvm_x86mmx_ty, OffTileMem, tile_local_mem, off_tile_mem>;
+ }
+ }
+
+ defm int_tpu_stream_strided_gather : StreamStridedGatherIntr<llvm_smemptr_ty, llvm_spmemptr_ty, "smem", "spmem">;
+ defm int_tpu_stream_strided_gather : StreamStridedGatherIntr<llvm_smemptr_ty, llvm_tilespmemptr_ty, "smem", "tilespmem_tileN">;
+ defm int_tpu_stream_strided_gather : StreamStridedGatherIntr<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm">;
+ defm int_tpu_stream_strided_gather : StreamStridedGatherIntr<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm4b">;
+ defm int_tpu_stream_strided_gather : StreamStridedGatherIntr<llvm_tilespmemptr_ty, llvm_spmemptr_ty, "tilespmem", "spmem">;
+ defm int_tpu_stream_strided_gather : StreamStridedGatherIntr<llvm_tilespmemptr_ty, llvm_tilespmemptr_ty, "tilespmem", "tilespmem_tileN">;
+ // Stream gather intrinsics with all combinations of .add.s32, .add.f32, .cb, and .cb.upd.
+ // The .add variants are only generated when the tile local memory is "tilespmem".
+ defm int_tpu_stream_strided_gather : StreamStridedGatherIntrWithMod<llvm_smemptr_ty, llvm_spmemptr_ty, "smem", "spmem", 0>;
+ defm int_tpu_stream_strided_gather : StreamStridedGatherIntrWithMod<llvm_smemptr_ty, llvm_tilespmemptr_ty, "smem", "tilespmem_tileN", 0>;
+ defm int_tpu_stream_strided_gather : StreamStridedGatherIntrWithMod<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm", 1>;
+ defm int_tpu_stream_strided_gather : StreamStridedGatherIntrWithMod<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm4b", 1>;
+ defm int_tpu_stream_strided_gather : StreamStridedGatherIntrWithMod<llvm_tilespmemptr_ty, llvm_spmemptr_ty, "tilespmem", "spmem", 1>;
+ defm int_tpu_stream_strided_gather : StreamStridedGatherIntrWithMod<llvm_tilespmemptr_ty, llvm_tilespmemptr_ty, "tilespmem", "tilespmem_tileN", 1>;
+ defm int_tpu_stream_strided_scatter : StreamStridedScatterIntr<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm">;
+ defm int_tpu_stream_strided_scatter : StreamStridedScatterIntr<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm4b">;
+ defm int_tpu_stream_strided_scatter : StreamStridedScatterIntr<llvm_smemptr_ty, llvm_spmemptr_ty, "smem", "spmem">;
+ defm int_tpu_stream_strided_scatter : StreamStridedScatterIntr<llvm_smemptr_ty, llvm_tilespmemptr_ty, "smem", "tilespmem_tileN">;
+ defm int_tpu_stream_strided_scatter : StreamStridedScatterIntr<llvm_tilespmemptr_ty, llvm_spmemptr_ty, "tilespmem", "spmem">;
+ defm int_tpu_stream_strided_scatter : StreamStridedScatterIntr<llvm_tilespmemptr_ty, llvm_tilespmemptr_ty, "tilespmem", "tilespmem_tileN">;
+ // Stream scatter intrinsics with all combinations of .add.s32, .add.f32, .cb, and .cb.upd.
+ // The .add variants are only generated when the off tile memory is "spmem" or
+ // "tilespmem.tileN".
+ defm int_tpu_stream_strided_scatter : StreamStridedScatterIntrWithMod<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm", 0>;
+ defm int_tpu_stream_strided_scatter : StreamStridedScatterIntrWithMod<llvm_tilespmemptr_ty, llvm_hbmptr_ty, "tilespmem", "hbm4b", 0>;
+ defm int_tpu_stream_strided_scatter : StreamStridedScatterIntrWithMod<llvm_smemptr_ty, llvm_spmemptr_ty, "smem", "spmem", 1>;
+ defm int_tpu_stream_strided_scatter : StreamStridedScatterIntrWithMod<llvm_smemptr_ty, llvm_tilespmemptr_ty, "smem", "tilespmem_tileN", 1>;
+ defm int_tpu_stream_strided_scatter : StreamStridedScatterIntrWithMod<llvm_tilespmemptr_ty, llvm_spmemptr_ty, "tilespmem", "spmem", 1>;
+ defm int_tpu_stream_strided_scatter : StreamStridedScatterIntrWithMod<llvm_tilespmemptr_ty, llvm_tilespmemptr_ty, "tilespmem", "tilespmem_tileN", 1>;
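To make the defm instantiations above concrete: TableGen pastes the memory-space strings into the record name, so the first strided-gather line expands to int_tpu_stream_strided_gather_spmem_to_smem (i.e. the intrinsic llvm.tpu.stream.strided.gather.spmem.to.smem), and the WithMod forms add _cb, _cb_upd and, when HasAddMod is 1, the _add_s32/_add_f32/_cb_add_* variants. What follows is a minimal, hypothetical C++ sketch of fetching one of these declarations; it is not part of the CL, and it assumes the usual TableGen-generated Intrinsic::tpu_* enumerators plus a pre-existing llvm::Module. Later sketches in this section assume the same includes.

#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Fetches the declaration produced by
//   defm int_tpu_stream_strided_gather :
//       StreamStridedGatherIntr<llvm_smemptr_ty, llvm_spmemptr_ty, "smem", "spmem">;
// which expands to int_tpu_stream_strided_gather_spmem_to_smem. The signature
// has no overloaded (any*) types, so no overload type list is needed.
Function *getStridedGatherDecl(Module &M) {
  return Intrinsic::getDeclaration(
      &M, Intrinsic::tpu_stream_strided_gather_spmem_to_smem);
}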
+
+ // Vector 1024 intrinsics:
+ // MXU intrinsics (for VF):
+ // MXU Pushes
+ def int_tpu_vmatpush_msra :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_if8_bf16_msra :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_bf16_msra :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_bf8_bf16_msra :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_u8_msra :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_s8_msra :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_u4_msra :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_s4_msra :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_msra_xpose :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_if8_bf16_msra_xpose :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_bf16_msra_xpose :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_bf8_bf16_msra_xpose :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_u8_msra_xpose :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_s8_msra_xpose :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_u4_msra_xpose :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_s4_msra_xpose :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_msrb :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_if8_bf16_msrb :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_bf16_msrb :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_bf8_bf16_msrb :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_u8_msrb :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_s8_msrb :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_u4_msrb :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_s4_msrb :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_msrb_xpose :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_if8_bf16_msrb_xpose :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_bf16_msrb_xpose :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_bf8_bf16_msrb_xpose :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_u8_msrb_xpose :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_s8_msrb_xpose :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_u4_msrb_xpose :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_s4_msrb_xpose :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+
+ // MXU Matmuls
+ def int_tpu_vmatmul_if8_bf16 :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatmul_bf16 :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatmul_bf8_bf16 :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatmul_u8 :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatmul_s8 :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatmul_u4 :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatmul_s4 :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+
+ // MXU Matmuls with Fused Load for PF
+ def int_tpu_vmatmul_f32_dwg_gsfn :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatmul_f32_dwg_gsft :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+
+ def int_tpu_vmatmul_low_f32_dwg_gsfn :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatmul_low_f32_dwg_gsft :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+
+ def int_tpu_vmatmul_hi_f32_dwg_gsfn :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatmul_hi_f32_dwg_gsft :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+
+ def int_tpu_vmatmul_packed_f32_dwg_gsfn :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatmul_packed_f32_dwg_gsft :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+
+ // MXU Matmuls with Fused Load for VF
+ def int_tpu_vmatmul_f32_lgmr_msra :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatmul_f32_lgmr_msrb :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+
+ def int_tpu_vmatmul_if8_bf16_lgmr_msra :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatmul_if8_bf16_lgmr_msrb :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+
+ def int_tpu_vmatmul_bf16_lgmr_msra :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatmul_bf16_lgmr_msrb :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+
+ def int_tpu_vmatmul_bf8_bf16_lgmr_msra :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatmul_bf8_bf16_lgmr_msrb :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+
+ def int_tpu_vmatmul_u8_lgmr_msra :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatmul_u8_lgmr_msrb :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+
+ def int_tpu_vmatmul_s8_lgmr_msra :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatmul_s8_lgmr_msrb :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+
+ def int_tpu_vmatmul_u4_lgmr_msra :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatmul_u4_lgmr_msrb :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+
+ def int_tpu_vmatmul_s4_lgmr_msra :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatmul_s4_lgmr_msrb :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+
+ // IF8, U8, or S8 from LMR.
+ def int_tpu_vmatmul_lmr :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatmul_lmr_lgmr_msra :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatmul_lmr_lgmr_msrb :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+
+ // BF16 from LMR.
+ def int_tpu_vmatmul_bf16_lmr :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatmul_bf16_lmr_lgmr_msra :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatmul_bf16_lmr_lgmr_msrb :
+ Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+
+ // Load Gain/Left-hand Side Matrix from MSRA/MSRB
+ def int_tpu_vlgmr_msra : ClangBuiltin<"__builtin_tpu_vlgmr_msra">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vllmr_msra : ClangBuiltin<"__builtin_tpu_vllmr_msra">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vlgmr_msrb : ClangBuiltin<"__builtin_tpu_vlgmr_msrb">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vllmr_msrb : ClangBuiltin<"__builtin_tpu_vllmr_msrb">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+
+ // MXU PF intrinsics:
+ // MXU Pushes
+ def int_tpu_vmatpush_f32 :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_low_f32 :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_hi_f32 :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_packed_f32 :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_xpose_f32 :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_low_xpose_f32 :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_hi_xpose_f32 :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatpush_packed_xpose_f32 :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+
+ // MXU Matmuls
+ def int_tpu_vmatmul_f32 :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatmul_low_f32 :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatmul_hi_f32 :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vmatmul_packed_f32 :
+ Intrinsic<[llvm_i32_ty], [llvm_v1024f32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+
+ // Done With Gain forces loading a new gain matrix before the next matmul (for PF).
+ def int_tpu_vdwg : ClangBuiltin<"__builtin_tpu_vdwg">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vdwg_xpose : ClangBuiltin<"__builtin_tpu_vdwg_xpose">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+
+ // MXU Pop result
+ def int_tpu_vmatres_f32 : ClangBuiltin<"__builtin_tpu_vmatres_f32">,
+ Intrinsic<[llvm_v1024f32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+
+ //===--------------------------------------------------------------------===//
+ // XLU Intrinsics
+ //===--------------------------------------------------------------------===//
+ // Transpose returns a TRF register. It takes a source, a width, and a height,
+ // followed by a bus index. The XLU index is inferred from the bus index
+ // (XLUIndex = BusIndex % 2).
+ def int_tpu_tc_transpose :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_anyvector_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
+ def int_tpu_tc_transpose_start :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_anyvector_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
+ def int_tpu_tc_transpose_end :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_anyvector_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
+ def int_tpu_tc_transpose_cont :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_anyvector_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
+ def int_tpu_tc_transpose_start_end :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_anyvector_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
+ def int_tpu_tc_transpose_start_segmented :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_anyvector_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
+ def int_tpu_tc_transpose_segmented :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_anyvector_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
+ def int_tpu_tc_transpose_end_segmented :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_anyvector_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
+ def int_tpu_tc_transpose_start_end_segmented :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_anyvector_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
+ // Packed intrinsics take a second source for the higher bits.
+ def int_tpu_tc_transpose_start_packed :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_anyvector_ty, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
+ def int_tpu_tc_transpose_packed :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_anyvector_ty, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
+ def int_tpu_tc_transpose_end_packed :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_anyvector_ty, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
+ def int_tpu_tc_transpose_start_end_packed :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_anyvector_ty, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
+ def int_tpu_tc_transpose_segmented_packed :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_anyvector_ty, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
+ def int_tpu_tc_transpose_start_segmented_packed :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_anyvector_ty, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
+ def int_tpu_tc_transpose_end_segmented_packed :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_anyvector_ty, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
+ def int_tpu_tc_transpose_start_end_segmented_packed :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_anyvector_ty, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
+
+ def int_tpu_set_permute :
+ ClangBuiltin<"__builtin_tpu_set_permute">,
+ Intrinsic<[llvm_i32_ty],
+ [llvm_v1024i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_set_permute_sublane :
+ ClangBuiltin<"__builtin_tpu_set_permute_sublane">,
+ Intrinsic<[llvm_i32_ty],
+ [llvm_v1024i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_set_permute_bytes :
+ ClangBuiltin<"__builtin_tpu_set_permute_bytes">,
+ Intrinsic<[llvm_i32_ty],
+ [llvm_v1024i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_set_spr :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_v1024i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+
+ // Permute takes a source, a PCR register, and an XLU index.
+ // FIXME(hgreving): clean up the intrinsic namespace.
+ def int_tpu_permute :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_anyvector_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_permute_packed :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_anyvector_ty, LLVMMatchType<0>,
+ llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vrotate :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_anyvector_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_vrotate_packed :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_anyvector_ty, LLVMMatchType<0>,
+ llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_bcast :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_anyvector_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_bcast_packed :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_anyvector_ty, LLVMMatchType<0>,
+ llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+
+ def int_tpu_xlane_add :
+ ClangBuiltin<"__builtin_tpu_xlane_add">,
+ Intrinsic<[llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_xlane_max :
+ ClangBuiltin<"__builtin_tpu_xlane_max">,
+ Intrinsic<[llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_xlane_min :
+ ClangBuiltin<"__builtin_tpu_xlane_min">,
+ Intrinsic<[llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_xlane_maxindex :
+ ClangBuiltin<"__builtin_tpu_xlane_maxindex">,
+ Intrinsic<[llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_xlane_minindex :
+ ClangBuiltin<"__builtin_tpu_xlane_minindex">,
+ Intrinsic<[llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ // Segmented versions of xlane take an extra SPR source.
+ def int_tpu_xlane_segmented_add :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_xlane_segmented_max :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_xlane_segmented_min :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_xlane_segmented_maxindex :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_xlane_segmented_minindex :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_v1024f32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_tc_vtrfpop :
+ Intrinsic<[llvm_anyvector_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+
+ def int_tpu_vrshra :
+ Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+ def int_tpu_vlaneseq :
+ Intrinsic<[llvm_anyvector_ty], [], [IntrNoMem]>;
+ def int_tpu_exponent :
+ Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+ def int_tpu_significand :
+ Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+ def int_tpu_compose :
+ Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+ def int_tpu_pack :
+ Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+
+ def int_tpu_packc :
+ Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+ def int_tpu_unpacku :
+ Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+ def int_tpu_unpackl :
+ Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+
+
+ // Deprecated untyped VFC pack/unpack intrinsics.
+
+ def int_tpu_deprecated_pack_i_bf16 :
+ Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+ def int_tpu_deprecated_pack_c_bf16 :
+ Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+ def int_tpu_deprecated_unpack_i_l_bf16 :
+ Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+ def int_tpu_deprecated_unpack_i_u_bf16 :
+ Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+ def int_tpu_deprecated_unpack_c_l_bf16 :
+ Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+ def int_tpu_deprecated_unpack_c_u_bf16 :
+ Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+
+ // ------------------------
+ // | Packed low precision |
+ // ------------------------
+
+ // FIXME(b/245600024): Generalize [8|16]xSIMD, all of below.
+ def int_tpu_pack_i_f32_bf16 :
+ Intrinsic<[llvm_v16bf16_ty], [llvm_v8f32_ty, llvm_v8f32_ty], [IntrNoMem]>;
+ def int_tpu_pack_i_b32_b16 :
+ Intrinsic<[llvm_v16i16_ty], [llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>;
+ def int_tpu_pack_i_b16_b8 :
+ Intrinsic<[llvm_v32i8_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>;
+ def int_tpu_pack_i_b8_b4 :
+ Intrinsic<[llvm_v64i4_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_pack_c_f32_bf16 :
+ Intrinsic<[llvm_v16bf16_ty], [llvm_v8f32_ty, llvm_v8f32_ty], [IntrNoMem]>;
+ def int_tpu_pack_c_b32_b16 :
+ Intrinsic<[llvm_v16i16_ty], [llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>;
+ def int_tpu_pack_c_b16_b8 :
+ Intrinsic<[llvm_v32i8_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>;
+ def int_tpu_pack_c_b8_b4 :
+ Intrinsic<[llvm_v64i4_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_pack_c_b4_b2 :
+ Intrinsic<[llvm_v128i2_ty], [llvm_v64i4_ty, llvm_v64i4_ty], [IntrNoMem]>;
+ def int_tpu_pack_c_b2_b1 :
+ Intrinsic<[llvm_v256i1_ty], [llvm_v128i2_ty, llvm_v128i2_ty], [IntrNoMem]>;
+ def int_tpu_pack_c_bf16_s8 :
+ Intrinsic<[llvm_v32i8_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty], [IntrNoMem]>;
+ def int_tpu_pack_c_bf16_u8 :
+ Intrinsic<[llvm_v32i8_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty], [IntrNoMem]>;
+ def int_tpu_pack_i_bf16_s8 :
+ Intrinsic<[llvm_v32i8_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty], [IntrNoMem]>;
+ def int_tpu_pack_i_bf16_u8 :
+ Intrinsic<[llvm_v32i8_ty], [llvm_v16bf16_ty, llvm_v16bf16_ty], [IntrNoMem]>;
+
+ // FIXME(b/245600024): Generalize [8|16]xSIMD, all of below.
+ def int_tpu_unpack_i_l_bf16_f32 :
+ Intrinsic<[llvm_v8f32_ty], [llvm_v16bf16_ty], [IntrNoMem]>;
+ def int_tpu_unpack_i_u_bf16_f32 :
+ Intrinsic<[llvm_v8f32_ty], [llvm_v16bf16_ty], [IntrNoMem]>;
+ def int_tpu_unpack_i_l_s16_s32 :
+ Intrinsic<[llvm_v8i32_ty], [llvm_v16i16_ty], [IntrNoMem]>;
+ def int_tpu_unpack_i_u_s16_s32 :
+ Intrinsic<[llvm_v8i32_ty], [llvm_v16i16_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_l_bf16_f32 :
+ Intrinsic<[llvm_v8f32_ty], [llvm_v16bf16_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_u_bf16_f32 :
+ Intrinsic<[llvm_v8f32_ty], [llvm_v16bf16_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_l_hf16_f32 :
+ Intrinsic<[llvm_v8f32_ty], [llvm_v16f16_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_u_hf16_f32 :
+ Intrinsic<[llvm_v8f32_ty], [llvm_v16f16_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_l_s16_s32 :
+ Intrinsic<[llvm_v8i32_ty], [llvm_v16i16_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_u_s16_s32 :
+ Intrinsic<[llvm_v8i32_ty], [llvm_v16i16_ty], [IntrNoMem]>;
+ // FIXME(hgreving, b/231595468): bf8, if8 are not supported. For now,
+ // model with i8. This may cause issues in the future.
+ def int_tpu_unpack_c_0_bf8_f32 :
+ Intrinsic<[llvm_v8f32_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_1_bf8_f32 :
+ Intrinsic<[llvm_v8f32_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_2_bf8_f32 :
+ Intrinsic<[llvm_v8f32_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_3_bf8_f32 :
+ Intrinsic<[llvm_v8f32_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_0_s8_s32 :
+ Intrinsic<[llvm_v8i32_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_1_s8_s32 :
+ Intrinsic<[llvm_v8i32_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_2_s8_s32 :
+ Intrinsic<[llvm_v8i32_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_3_s8_s32 :
+ Intrinsic<[llvm_v8i32_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_l_s4_s8 :
+ Intrinsic<[llvm_v32i8_ty], [llvm_v64i4_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_u_s4_s8 :
+ Intrinsic<[llvm_v32i8_ty], [llvm_v64i4_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_l_s2_s4 :
+ Intrinsic<[llvm_v64i4_ty], [llvm_v128i2_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_u_s2_s4 :
+ Intrinsic<[llvm_v64i4_ty], [llvm_v128i2_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_l_s1_s2 :
+ Intrinsic<[llvm_v128i2_ty], [llvm_v256i1_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_u_s1_s2 :
+ Intrinsic<[llvm_v128i2_ty], [llvm_v256i1_ty], [IntrNoMem]>;
+ // FIXME(hgreving, b/231595468): bf8, if8 are not supported. For now,
+ // model with i8. This may cause issues in the future.
+ def int_tpu_unpack_c_0_if8_f32 :
+ Intrinsic<[llvm_v8f32_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_1_if8_f32 :
+ Intrinsic<[llvm_v8f32_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_2_if8_f32 :
+ Intrinsic<[llvm_v8f32_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_3_if8_f32 :
+ Intrinsic<[llvm_v8f32_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_l_s8_bf16 :
+ Intrinsic<[llvm_v16bf16_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_u_s8_bf16 :
+ Intrinsic<[llvm_v16bf16_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_l_u8_bf16 :
+ Intrinsic<[llvm_v16bf16_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_u_u8_bf16 :
+ Intrinsic<[llvm_v16bf16_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_l_s8_s16 :
+ Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_u_s8_s16 :
+ Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_l_u8_u16 :
+ Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_c_u_u8_u16 :
+ Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_i_l_s8_s16 :
+ Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_i_u_s8_s16 :
+ Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_i_l_u8_u16 :
+ Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_i_u_u8_u16 :
+ Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_i_l_s8_bf16 :
+ Intrinsic<[llvm_v16bf16_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_i_u_s8_bf16 :
+ Intrinsic<[llvm_v16bf16_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_i_l_u8_bf16 :
+ Intrinsic<[llvm_v16bf16_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_i_u_u8_bf16 :
+ Intrinsic<[llvm_v16bf16_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_ic_l_s8_s16 :
+ Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_ic_u_s8_s16 :
+ Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_ic_l_u8_u16 :
+ Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_ic_u_u8_u16 :
+ Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_ic_l_s8_bf16 :
+ Intrinsic<[llvm_v16bf16_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_ic_u_s8_bf16 :
+ Intrinsic<[llvm_v16bf16_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_ic_l_u8_bf16 :
+ Intrinsic<[llvm_v16bf16_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_unpack_ic_u_u8_bf16 :
+ Intrinsic<[llvm_v16bf16_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+
+ // FIXME(b/245600024): Generalize [8|16]xSIMD, all of below.
+ def int_tpu_vcvt_s32_f32 :
+ Intrinsic<[llvm_v8f32_ty], [llvm_v8i32_ty], [IntrNoMem]>;
+ def int_tpu_vcvt_f32_s32 :
+ Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
+ // FIXME(hgreving, b/231595468): bf8, if8 are not supported. For now,
+ // model with i8. This may cause issues in the future.
+ def int_tpu_vcvt_f32_bf8 :
+ Intrinsic<[llvm_v32i8_ty], [llvm_v8f32_ty], [IntrNoMem]>;
+ def int_tpu_vcvt_f32_if8 :
+ Intrinsic<[llvm_v32i8_ty], [llvm_v8f32_ty], [IntrNoMem]>;
+ def int_tpu_vcvt_f32_bf16 :
+ Intrinsic<[llvm_v16bf16_ty], [llvm_v8f32_ty], [IntrNoMem]>;
+ def int_tpu_vcvt_f32_hf16 :
+ Intrinsic<[llvm_v16f16_ty], [llvm_v8f32_ty], [IntrNoMem]>;
+ // FIXME(hgreving, b/231595468): bf8, if8 are not supported. For now,
+ // model with i8. This may cause issues in the future.
+ def int_tpu_vcvt_sr_f32_bf8 :
+ Intrinsic<[llvm_v32i8_ty], [llvm_v8i32_ty, llvm_v8f32_ty], [IntrNoMem]>;
+ def int_tpu_vcvt_sr_f32_if8 :
+ Intrinsic<[llvm_v32i8_ty], [llvm_v8i32_ty, llvm_v8f32_ty], [IntrNoMem]>;
+ def int_tpu_vcvt_sr_f32_bf16 :
+ Intrinsic<[llvm_v16bf16_ty], [llvm_v8i32_ty, llvm_v8f32_ty], [IntrNoMem]>;
+ def int_tpu_vcvt_sr_f32_hf16 :
+ Intrinsic<[llvm_v16f16_ty], [llvm_v8i32_ty, llvm_v8f32_ty], [IntrNoMem]>;
+ def int_tpu_vcvt_bf16_s8 :
+ Intrinsic<[llvm_v32i8_ty], [llvm_v16bf16_ty], [IntrNoMem]>;
+ def int_tpu_vcvt_bf16_u8 :
+ Intrinsic<[llvm_v32i8_ty], [llvm_v16bf16_ty], [IntrNoMem]>;
+ def int_tpu_vcvt_bf16_s4 :
+ Intrinsic<[llvm_v64i4_ty], [llvm_v16bf16_ty], [IntrNoMem]>;
+ def int_tpu_vcvt_bf16_u4 :
+ Intrinsic<[llvm_v64i4_ty], [llvm_v16bf16_ty], [IntrNoMem]>;
+ def int_tpu_vcvt_s8_bf16 :
+ Intrinsic<[llvm_v16bf16_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_vcvt_u8_bf16 :
+ Intrinsic<[llvm_v16bf16_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+ def int_tpu_vcvt_s4_bf16 :
+ Intrinsic<[llvm_v16bf16_ty], [llvm_v64i4_ty], [IntrNoMem]>;
+ def int_tpu_vcvt_u4_bf16 :
+ Intrinsic<[llvm_v16bf16_ty], [llvm_v64i4_ty], [IntrNoMem]>;
+
+ // FIXME(b/245600024): Generalize [8|16]xSIMD, all of below.
+ def int_tpu_add_low_f32_bf16 : Intrinsic<[llvm_v16bf16_ty],
+ [llvm_v8f32_ty, llvm_v8f32_ty],
+ [IntrNoMem]>;
+ def int_tpu_add_high_f32_bf16 : Intrinsic<[llvm_v16bf16_ty],
+ [llvm_v8f32_ty, llvm_v8f32_ty],
+ [IntrNoMem]>;
+
+ def int_tpu_vlaneseq_c_bf16 : Intrinsic<[llvm_v16bf16_ty], [], [IntrNoMem]>;
+ def int_tpu_vlaneseq_i_bf16 : Intrinsic<[llvm_v16bf16_ty], [], [IntrNoMem]>;
+
+ // Mask type adapters for low precision masks
+
+ def int_tpu_16i1_to_8i1 : Intrinsic<[llvm_v8i1_ty], [llvm_v16i1_ty], [IntrNoMem]>;
+ def int_tpu_8i1_to_16i1 : Intrinsic<[llvm_v16i1_ty], [llvm_v8i1_ty], [IntrNoMem]>;
+ def int_tpu_16i1_to_32i1 : Intrinsic<[llvm_v32i1_ty], [llvm_v16i1_ty], [IntrNoMem]>;
+ def int_tpu_32i1_to_8i1 : Intrinsic<[llvm_v8i1_ty], [llvm_v32i1_ty], [IntrNoMem]>;
+ def int_tpu_8i1_to_32i1 : Intrinsic<[llvm_v32i1_ty], [llvm_v8i1_ty], [IntrNoMem]>;
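These adapters convert a predicate between lane widths (e.g. v8i1 to v16i1), so that, for example, a mask computed on 8 x f32 lanes can drive a packed 16 x bf16 operation. A minimal, hypothetical sketch, not part of the CL; it additionally assumes llvm/IR/IRBuilder.h, the generated Intrinsic::tpu_8i1_to_16i1 enumerator, and Mask8 as a placeholder <8 x i1> value.

// Widen an 8-lane predicate to a 16-lane one via llvm.tpu.8i1.to.16i1.
// The adapter intrinsics are not type-overloaded, so the overload list is empty.
Value *widenMask(IRBuilder<> &B, Value *Mask8) {
  return B.CreateIntrinsic(Intrinsic::tpu_8i1_to_16i1, /*Types=*/{}, {Mask8});
}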
+
+ // Vector 8 intrinsics
+
+ // Extension for packed low precision; the intrinsic is overloaded.
+ multiclass VldMskNp_ {
+ def "" : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty,
+ LLVMAnyPointerType<LLVMMatchType<0>>],
+ [IntrReadMem, IntrArgMemOnly], "", [SDNPMemOperand]>;
+ }
+ defm "int_tpu_vld_msk" : VldMskNp_;
+ // We could add a _np version here.
+
+ multiclass VldCbMskNp_ {
+ def "" : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_x86mmx_ty, llvm_i32_ty],
+ [IntrReadMem], "", [SDNPMemOperand]>;
+ def _strided : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_x86mmx_ty, llvm_i32_ty,
+ llvm_i32_ty],
+ [IntrReadMem], "", [SDNPMemOperand]>;
+ }
+ defm "int_tpu_vld_cb_msk" : VldCbMskNp_;
+ defm "int_tpu_vld_cb_upd_msk" : VldCbMskNp_;
+ // We could add a _np version here.
+
+ multiclass VldMskIdxNp_ {
+ def "" : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, LLVMAnyPointerType<LLVMMatchType<0>>,
+ LLVMScalarOrSameVectorWidth<1, llvm_i32_ty>],
+ [IntrReadMem, IntrArgMemOnly], "", [SDNPMemOperand]>;
+ }
+ defm "int_tpu_vld_msk_idx" : VldMskIdxNp_;
+ defm "int_tpu_vld_msk_idx_np" : VldMskIdxNp_;
+
+ multiclass VldCbMskIdxNp_ {
+ def "" : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_x86mmx_ty,
+ llvm_i32_ty, LLVMScalarOrSameVectorWidth<1, llvm_i32_ty>],
+ [IntrReadMem], "", [SDNPMemOperand]>;
+ }
+ defm "int_tpu_vld_cb_msk_idx" : VldCbMskIdxNp_;
+ defm "int_tpu_vld_cb_msk_idx_np" : VldCbMskIdxNp_;
+
+ multiclass VstIdxMskAddNp_ {
+ def "" : Intrinsic<[], [llvm_anyvector_ty, LLVMAnyPointerType<LLVMMatchType<3>>,
+ LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>, llvm_anyvector_ty],
+ [IntrArgMemOnly, IntrWillReturn], "", [SDNPMemOperand]>;
+ }
+ defm "int_tpu_vst_msk_idx_add" : VstIdxMskAddNp_;
+ defm "int_tpu_vst_msk_idx_add_np" : VstIdxMskAddNp_;
+
+ multiclass VstCbMskIdxAddNp_ {
+ def "" : Intrinsic<[], [llvm_anyvector_ty, llvm_x86mmx_ty, llvm_i32_ty,
+ LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>, llvm_anyvector_ty],
+ [IntrWriteMem, IntrWillReturn], "", [SDNPMemOperand]>;
+ }
+ defm "int_tpu_vst_cb_msk_idx_add" : VstCbMskIdxAddNp_;
+ defm "int_tpu_vst_cb_msk_idx_add_np" : VstCbMskIdxAddNp_;
+
+ // Extension for packed low precision; the intrinsic is overloaded.
+ def int_tpu_vst_msk :
+ Intrinsic<[], [llvm_anyvector_ty, LLVMAnyPointerType<LLVMMatchType<1>>,
+ llvm_anyvector_ty],
+ [IntrArgMemOnly, IntrWriteMem], "", [SDNPMemOperand]>;
+ // We could add a _np version here.
+
+ def int_tpu_vst_msk_add :
+ Intrinsic<[], [llvm_anyvector_ty, LLVMAnyPointerType<LLVMMatchType<2>>, llvm_anyvector_ty],
+ [IntrArgMemOnly], "", [SDNPMemOperand]>;
+
+ multiclass VstCbMskNp_ {
+ def "" : Intrinsic<[], [llvm_anyvector_ty, llvm_x86mmx_ty, llvm_i32_ty,
+ llvm_anyvector_ty],
+ [IntrWriteMem], "", [SDNPMemOperand]>;
+ def _strided : Intrinsic<[], [llvm_anyvector_ty, llvm_x86mmx_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_anyvector_ty],
+ [IntrWriteMem], "", [SDNPMemOperand]>;
+ def "_add" : Intrinsic<[], [llvm_anyvector_ty, llvm_x86mmx_ty, llvm_i32_ty,
+ llvm_anyvector_ty],
+ [IntrWriteMem], "", [SDNPMemOperand]>;
+ def "_add_strided" : Intrinsic<[], [llvm_anyvector_ty, llvm_x86mmx_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_anyvector_ty],
+ [IntrWriteMem], "", [SDNPMemOperand]>;
+ }
+ defm "int_tpu_vst_cb_msk" : VstCbMskNp_;
+ defm "int_tpu_vst_cb_upd_msk" : VstCbMskNp_;
+ // We could add a _np version here.
+
+ multiclass VstMskIdxNp_ {
+ def "" : Intrinsic<[], [llvm_anyvector_ty, LLVMAnyPointerType<LLVMMatchType<1>>,
+ LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>, llvm_anyvector_ty],
+ [IntrArgMemOnly, IntrWriteMem], "", [SDNPMemOperand]>;
+ }
+ defm "int_tpu_vst_msk_idx" : VstMskIdxNp_;
+ defm "int_tpu_vst_msk_idx_np" : VstMskIdxNp_;
+
+ multiclass VstCbMskIdxNp_ {
+ def "" : Intrinsic<[], [llvm_anyvector_ty, llvm_x86mmx_ty, llvm_i32_ty,
+ LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>, llvm_anyvector_ty],
+ [IntrWriteMem], "", [SDNPMemOperand]>;
+ }
+ defm "int_tpu_vst_cb_msk_idx" : VstCbMskIdxNp_;
+ // We could add a _np version here.
+
+ // Combined special intrinsics. These are expanded into the individual
+ // _np derivatives of int_tpu_[vld|vst]_idx[_add]i/f.
+ def int_tpu_vst_msk_idx_ret_add_np :
+ Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, LLVMAnyPointerType<LLVMMatchType<0>>,
+ LLVMScalarOrSameVectorWidth<1, llvm_i32_ty>, LLVMMatchType<0>],
+ [IntrArgMemOnly, IntrWillReturn], "", [SDNPMemOperand]>;
+
+ multiclass SegScan1xNSc_ {
+ def _scan1xNi : Intrinsic<[llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+ [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>]>;
+ def _scan1xNf : Intrinsic<[llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+ [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ LLVMScalarOrSameVectorWidth<0, llvm_float_ty>]>;
+ }
+
+ let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn] in {
+ defm int_tpu_add : SegScan1xNSc_;
+ defm int_tpu_min : SegScan1xNSc_;
+ defm int_tpu_max : SegScan1xNSc_;
+ defm int_tpu_min_index : SegScan1xNSc_;
+ defm int_tpu_max_index : SegScan1xNSc_;
+ defm int_tpu_add_seg : SegScan1xNSc_;
+ defm int_tpu_min_seg : SegScan1xNSc_;
+ defm int_tpu_max_seg : SegScan1xNSc_;
+ defm int_tpu_min_seg_index : SegScan1xNSc_;
+ defm int_tpu_max_seg_index : SegScan1xNSc_;
+ } // IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn]
+
+ // Packed low precision vector scan
+
+ multiclass SegScan2xNHalfSc_<int isindex> {
+ def _scan2xNbf16 : Intrinsic<[llvm_anyvector_ty, llvm_anyvector_ty],
+ [LLVMMatchType<1>, !if(isindex, llvm_anyvector_ty, LLVMMatchType<0>)]>;
+ }
+
+ multiclass SegScan2xNFullSc_ {
+ def _scan2xNbf16 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+ [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyvector_ty]>;
+ }
+
+ let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn] in {
+ defm int_tpu_add_half : SegScan2xNHalfSc_</*isindex=*/0>;
+ defm int_tpu_add_full : SegScan2xNFullSc_;
+ defm int_tpu_min : SegScan2xNHalfSc_</*isindex=*/0>;
+ defm int_tpu_max : SegScan2xNHalfSc_</*isindex=*/0>;
+ defm int_tpu_min_index : SegScan2xNHalfSc_</*isindex=*/1>;
+ defm int_tpu_max_index : SegScan2xNHalfSc_</*isindex=*/1>;
+ defm int_tpu_add_half_seg : SegScan2xNHalfSc_</*isindex=*/0>;
+ defm int_tpu_add_full_seg : SegScan2xNFullSc_;
+ defm int_tpu_min_seg : SegScan2xNHalfSc_</*isindex=*/0>;
+ defm int_tpu_max_seg : SegScan2xNHalfSc_</*isindex=*/0>;
+ defm int_tpu_min_seg_index : SegScan2xNHalfSc_</*isindex=*/1>;
+ defm int_tpu_max_seg_index : SegScan2xNHalfSc_</*isindex=*/1>;
+ } // IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn]
+
+ def int_tpu_mprefix :
+ Intrinsic<[llvm_anyvector_ty], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+ [IntrNoMem]>;
+ def int_tpu_vshift_insert :
+ Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
+ [IntrNoMem]>;
+
+ def int_tpu_sc_permute :
+ Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>],
+ [IntrNoMem]>;
+ def int_tpu_sc_permute_c2i :
+ Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>],
+ [IntrNoMem]>;
+ def int_tpu_sc_permute_i2c :
+ Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>],
+ [IntrNoMem]>;
+ def int_tpu_sc_mask_permute :
+ Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>],
+ [IntrNoMem]>;
+ def int_tpu_sc_mask_permute_c2i :
+ Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>],
+ [IntrNoMem]>;
+ def int_tpu_sc_mask_permute_i2c :
+ Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>],
+ [IntrNoMem]>;
+
+ // Sort intrinsic names include the ordering and the key datatype, but the value
+ // datatype must be specified by an additional suffix (e.g., v8i32, v8f32).
+
+ multiclass SortAscDsc_ {
+ def i : Intrinsic<[llvm_anyvector_ty, llvm_anyvector_ty,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+ [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>,
+ LLVMMatchType<1>]>;
+ def f : Intrinsic<[llvm_anyvector_ty, llvm_anyvector_ty,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+ [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>,
+ LLVMMatchType<1>]>;
+ }
+
+ let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn] in {
+ defm int_tpu_sort_ascd : SortAscDsc_;
+ defm int_tpu_sort_dscd : SortAscDsc_;
+ def int_tpu_deprecated_segreduce_addf :
+ Intrinsic<[llvm_v8f32_ty, llvm_v8i1_ty],
+ [llvm_v8i32_ty, llvm_v8f32_ty]>;
+ } // IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn]
+
+ let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn] in {
+ def int_tpu_dupcnti :
+ Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+ [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>]>;
+
+ def int_tpu_dupcntf :
+ Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+ [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ LLVMScalarOrSameVectorWidth<0, llvm_float_ty>]>;
+
+ def int_tpu_uniquei :
+ Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+ [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>]>;
+
+ def int_tpu_uniquef :
+ Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+ [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ LLVMScalarOrSameVectorWidth<0, llvm_float_ty>]>;
+ } // IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn]
+
+ def int_tpu_addcarry :
+ Intrinsic<[llvm_anyint_ty],
+ [LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>,
+ LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>],
+ [IntrNoMem, Commutative]>;
+
+ def int_tpu_vlt_to :
+ Intrinsic<[llvm_anyvector_ty],
+ [llvm_anyvector_ty, LLVMMatchType<1>],
+ [IntrNoMem]>;
+
+ def int_tpu_vle_to :
+ Intrinsic<[llvm_anyvector_ty],
+ [llvm_anyvector_ty, LLVMMatchType<1>],
+ [IntrNoMem]>;
+
+ def int_tpu_vclass :
+ Intrinsic<[llvm_anyvector_ty],
+ [llvm_anyvector_ty, LLVMMatchType<1>],
+ [IntrNoMem]>;
+
+ def int_tpu_byte_not_zero :
+ Intrinsic<[llvm_anyvector_ty],
+ [llvm_anyvector_ty],
+ [IntrNoMem]>;
+
+ // Halt program execution if the argument is true.
+ def int_tpu_halt_trap : ClangBuiltin<"__builtin_tpu_trap">,
+ Intrinsic<[], [llvm_i1_ty], []>;
+
+ // Special SparseCore trap instruction sequence.
+ def int_tpu_wait_trap : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i1_ty], []>;
+
+ def int_tpu_tileid : ClangBuiltin<"__builtin_tpu_tileid">,
+ Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>;
+
+ def int_tpu_barrier : ClangBuiltin<"__builtin_tpu_barrier">,
+ Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], []>;
+
+ def int_tpu_cvt_pr_fptosi :
+ Intrinsic<[llvm_anyint_ty],
+ [llvm_anyfloat_ty, LLVMMatchType<0>],
+ [IntrNoMem]>;
+ def int_tpu_cvt_fptosi :
+ Intrinsic<[llvm_anyint_ty],
+ [llvm_anyfloat_ty],
+ [IntrNoMem]>;
+ def int_tpu_vcvt_fptobf8 :
+ Intrinsic<[llvm_anyfloat_ty],
+ [LLVMScalarOrSameVectorWidth<0, llvm_float_ty>],
+ [IntrNoMem]>;
+ def int_tpu_vcvt_fptoif8 :
+ Intrinsic<[llvm_anyfloat_ty],
+ [LLVMScalarOrSameVectorWidth<0, llvm_float_ty>],
+ [IntrNoMem]>;
+ def int_tpu_vcvt_fptobf16 :
+ Intrinsic<[llvm_anyfloat_ty],
+ [LLVMScalarOrSameVectorWidth<0, llvm_float_ty>],
+ [IntrNoMem]>;
+ def int_tpu_vcvt_fptohf16 :
+ Intrinsic<[llvm_anyfloat_ty],
+ [LLVMScalarOrSameVectorWidth<0, llvm_float_ty>],
+ [IntrNoMem]>;
+ def int_tpu_vcvt_sr_fptobf8 :
+ Intrinsic<[llvm_anyfloat_ty],
+ [llvm_anyint_ty, llvm_anyfloat_ty],
+ [IntrNoMem]>;
+ def int_tpu_vcvt_sr_fptoif8 :
+ Intrinsic<[llvm_anyfloat_ty],
+ [llvm_anyint_ty, llvm_anyfloat_ty],
+ [IntrNoMem]>;
+ def int_tpu_vcvt_sr_fptobf16 :
+ Intrinsic<[llvm_anyfloat_ty],
+ [llvm_anyint_ty, llvm_anyfloat_ty],
+ [IntrNoMem]>;
+ def int_tpu_vcvt_sr_fptohf16 :
+ Intrinsic<[llvm_anyfloat_ty],
+ [llvm_anyint_ty, llvm_anyfloat_ty],
+ [IntrNoMem]>;
+
+ def int_tpu_clamp :
+ Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>,
+ LLVMMatchType<0>],
+ [IntrNoMem]>;
+ def int_tpu_clamp_symmetric :
+ Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem]>;
+
+ def int_tpu_vrot_sublane_down :
+ Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+ def int_tpu_vrot_sublane :
+ Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+ def int_tpu_vperm_sublane :
+ Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+
+ // TODO(thomasraoux): sublane_mask returns 2 bits per element. Currently we
+ // model it as 1024xi1 for simplicity, but we need a more accurate
+ // representation.
+ def int_tpu_sublane_mask :
+ Intrinsic<[llvm_v1024i1_ty], [llvm_v1024i32_ty], [IntrNoMem]>;
+ def int_tpu_lane_mask :
+ Intrinsic<[llvm_v1024i1_ty], [llvm_v1024i32_ty], [IntrNoMem]>;
+
+ // Special shift intrinsics, for when the user wants to use shift sizes that the
+ // hardware can handle.
+ def int_tpu_shll :
+ Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem]>;
+ def int_tpu_shrl :
+ Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem]>;
+ def int_tpu_shra :
+ Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem]>;
+
+ def int_tpu_sadd_ov : ClangBuiltin<"__builtin_tpu_sadd_ov">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, Commutative]>;
+ def int_tpu_ssub_ov : ClangBuiltin<"__builtin_tpu_ssub_ov">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+ def int_tpu_smulhi : ClangBuiltin<"__builtin_tpu_smulhi">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+ def int_tpu_sshla_ov : ClangBuiltin<"__builtin_tpu_sshla_ov">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+
+ //===--------------------------------------------------------------------===//
+ // EUP intrinsics
+ //===--------------------------------------------------------------------===//
+ // EUP intrinsics return an ERF register (a FIFO register) that needs to be
+ // popped with the llvm.tpu.eup.pop intrinsic (a usage sketch follows these
+ // definitions).
+ def int_tpu_rsqrt :
+ Intrinsic<[llvm_i32_ty], [llvm_anyvector_ty], [IntrInaccessibleMemOnly]>;
+ def int_tpu_pow2 :
+ Intrinsic<[llvm_i32_ty], [llvm_anyvector_ty], [IntrInaccessibleMemOnly]>;
+ def int_tpu_log2 :
+ Intrinsic<[llvm_i32_ty], [llvm_anyvector_ty], [IntrInaccessibleMemOnly]>;
+ def int_tpu_tanh :
+ Intrinsic<[llvm_i32_ty], [llvm_anyvector_ty], [IntrInaccessibleMemOnly]>;
+ def int_tpu_rcp :
+ Intrinsic<[llvm_i32_ty], [llvm_anyvector_ty], [IntrInaccessibleMemOnly]>;
+ def int_tpu_eup_push :
+ Intrinsic<[llvm_i32_ty], [llvm_anyvector_ty], [IntrInaccessibleMemOnly]>;
+ def int_tpu_sigshft :
+ Intrinsic<[llvm_i32_ty], [llvm_anyvector_ty], [IntrInaccessibleMemOnly]>;
+ def int_tpu_sin :
+ Intrinsic<[llvm_i32_ty], [llvm_anyvector_ty], [IntrInaccessibleMemOnly]>;
+ def int_tpu_cos :
+ Intrinsic<[llvm_i32_ty], [llvm_anyvector_ty], [IntrInaccessibleMemOnly]>;
+ def int_tpu_erf :
+ Intrinsic<[llvm_i32_ty], [llvm_anyvector_ty], [IntrInaccessibleMemOnly]>;
+
+ def int_tpu_eup_pop :
+ Intrinsic<[llvm_anyvector_ty], [llvm_i32_ty], [IntrInaccessibleMemOnly]>;
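Putting the two halves together: a full EUP operation is a push (which returns the ERF token) followed by an llvm.tpu.eup.pop of the same vector type. A minimal, hypothetical C++ sketch, not part of the CL; it assumes the generated Intrinsic::tpu_tanh and Intrinsic::tpu_eup_pop enumerators, the includes from the earlier sketches, and X as a placeholder <1024 x float> value.

// Compute tanh through the EUP FIFO: push the source, then pop the result.
Value *eupTanh(IRBuilder<> &B, Value *X) {
  Type *VecTy = X->getType();
  // Push: overloaded on the source vector type, returns the ERF token (i32).
  Value *Erf = B.CreateIntrinsic(Intrinsic::tpu_tanh, {VecTy}, {X});
  // Pop: overloaded on the result vector type, consumes the token.
  return B.CreateIntrinsic(Intrinsic::tpu_eup_pop, {VecTy}, {Erf});
}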
+
+ // Macro versions of the EUP intrinsics that do the push and pop in a single
+ // intrinsic. These get expanded before codegen but give more opportunities to
+ // the optimizer.
+ let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn] in {
+ def int_tpu_rsqrt_macro :
+ Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>]>;
+ def int_tpu_pow2_macro :
+ Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>]>;
+ def int_tpu_log2_macro :
+ Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>]>;
+ def int_tpu_tanh_macro :
+ Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>]>;
+ def int_tpu_rcp_macro :
+ Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>]>;
+ def int_tpu_sigshft_macro :
+ Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>]>;
+ def int_tpu_sin_macro :
+ Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>]>;
+ def int_tpu_cos_macro :
+ Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>]>;
+ def int_tpu_erf_macro :
+ Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>]>;
+ }// IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn]
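For comparison, the macro forms above fold the push/pop pair into one call that yields the result vector directly, which is what the optimizer sees until the pre-codegen expansion. Same assumptions as the previous sketch; not part of the CL.

// Macro form: returns the result vector directly; expanded into the
// push/pop pair before codegen.
Value *eupTanhMacro(IRBuilder<> &B, Value *X) {
  return B.CreateIntrinsic(Intrinsic::tpu_tanh_macro, {X->getType()}, {X});
}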
+
+ //===--------------------------------------------------------------------===//
+ // Intrinsics that require custom Clang emission
+ //===--------------------------------------------------------------------===//
+
+ def int_tpu_weird_f32 :
+ Intrinsic<[llvm_i1_ty], [llvm_float_ty], [IntrNoMem]>;
+ def int_tpu_weird :
+ Intrinsic<[llvm_anyvector_ty], [LLVMScalarOrSameVectorWidth<0, llvm_float_ty>],
+ [IntrNoMem]>;
+ def int_tpu_vmpcnt_ones :
+ Intrinsic<[llvm_anyvector_ty], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+ [IntrNoMem]>;
+ def int_tpu_vmctz :
+ Intrinsic<[llvm_anyvector_ty], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+ [IntrNoMem]>;
+
+ // Varargs need custom emission.
+ def int_tpu_event :
+ Intrinsic<[], [llvm_ptr_ty, llvm_vararg_ty]>;
+
+ // Used for debug.
+ def int_tpu_spill_debug :
+ Intrinsic<[], [llvm_any_ty]>;
+
+ //===--------------------------------------------------------------------===//
+ // SparseCore load/store intrinsics.
+ //===--------------------------------------------------------------------===//
+
+ def int_tpu_sld_cb :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_x86mmx_ty, llvm_i32_ty],
+ [IntrReadMem], "", [SDNPMemOperand]>;
+ def int_tpu_sst_cb :
+ Intrinsic<[],
+ [llvm_i32_ty, llvm_x86mmx_ty, llvm_i32_ty],
+ [IntrWillReturn, IntrWriteMem], "", [SDNPMemOperand]>;
+ def int_tpu_sld_cb_upd :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_x86mmx_ty, llvm_i32_ty],
+ [IntrWriteMem], "", [SDNPMemOperand]>;
+ def int_tpu_sst_cb_upd :
+ Intrinsic<[],
+ [llvm_i32_ty, llvm_x86mmx_ty, llvm_i32_ty],
+ [IntrWillReturn, IntrWriteMem], "", [SDNPMemOperand]>;
+ def int_tpu_rdcbreg_smem_base :
+ Intrinsic<[llvm_smemptr_ty],
+ [llvm_x86mmx_ty],
+ [IntrReadMem], "", []>;
+ def int_tpu_rdcbreg_tilespmem_base :
+ Intrinsic<[llvm_tilespmemptr_ty],
+ [llvm_x86mmx_ty],
+ [IntrReadMem], "", []>;
+ def int_tpu_rdcbreg_size :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_x86mmx_ty],
+ [IntrReadMem], "", []>;
+ def int_tpu_rdcbreg_offset :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_x86mmx_ty],
+ [IntrReadMem], "", []>;
+ def int_tpu_wrcbreg_smem_base :
+ Intrinsic<[llvm_x86mmx_ty],
+ [llvm_x86mmx_ty, llvm_smemptr_ty],
+ [IntrNoMem], "", []>;
+ def int_tpu_wrcbreg_tilespmem_base :
+ Intrinsic<[llvm_x86mmx_ty],
+ [llvm_x86mmx_ty, llvm_tilespmemptr_ty],
+ [IntrNoMem], "", []>;
+ def int_tpu_wrcbreg_size :
+ Intrinsic<[llvm_x86mmx_ty],
+ [llvm_x86mmx_ty, llvm_i32_ty],
+ [IntrNoMem], "", []>;
+ def int_tpu_wrcbreg_offset :
+ Intrinsic<[llvm_x86mmx_ty],
+ [llvm_x86mmx_ty, llvm_i32_ty],
+ [IntrNoMem], "", []>;
+ def int_tpu_cbreg_add_offset :
+ Intrinsic<[llvm_x86mmx_ty],
+ [llvm_x86mmx_ty, llvm_i32_ty],
+ [IntrNoMem], "", []>;
+
+ // Strided VLD/VST
+
+ def int_tpu_vld_msk_strided :
+ Intrinsic<[llvm_anyvector_ty],
+ [llvm_anyvector_ty, LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty],
+ [IntrReadMem, IntrArgMemOnly], "", [SDNPMemOperand]>;
+
+ def int_tpu_vst_msk_strided :
+ Intrinsic<[],
+ [llvm_anyvector_ty, LLVMAnyPointerType<LLVMMatchType<3>>, llvm_i32_ty, llvm_anyvector_ty],
+ [IntrWriteMem, IntrArgMemOnly], "", [SDNPMemOperand]>;
+
+ def int_tpu_vst_msk_add_strided :
+ Intrinsic<[],
+ [llvm_anyvector_ty, LLVMAnyPointerType<LLVMMatchType<3>>, llvm_i32_ty, llvm_anyvector_ty],
+ [IntrArgMemOnly], "", [SDNPMemOperand]>;
+
+ def int_tpu_vld_msk_idx_strided : Intrinsic<[llvm_anyvector_ty],
+ [llvm_anyvector_ty, LLVMAnyPointerType<LLVMMatchType<0>>,
+ LLVMScalarOrSameVectorWidth<1, llvm_i32_ty>, llvm_i32_ty],
+ [IntrReadMem, IntrArgMemOnly], "", [SDNPMemOperand]>;
+
+ def int_tpu_vst_msk_idx_strided : Intrinsic<[],
+ [llvm_anyvector_ty, LLVMAnyPointerType<LLVMMatchType<4>>,
+ LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>, llvm_i32_ty, llvm_anyvector_ty],
+ [IntrWriteMem, IntrArgMemOnly], "", [SDNPMemOperand]>;
+
+ //===--------------------------------------------------------------------===//
+ // TensorCore load/store intrinsics.
+ //===--------------------------------------------------------------------===//
+
+ // shuffled vmem load.
+ def int_tpu_vld_shuffle :
+ Intrinsic<[llvm_anyvector_ty],
+ [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty, llvm_i32_ty],
+ [IntrReadMem, IntrArgMemOnly], "", [SDNPMemOperand]>;
+ // strided vmem load.
+ def int_tpu_vld_strided :
+ Intrinsic<[llvm_anyvector_ty],
+ [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty, llvm_i32_ty],
+ [IntrReadMem, IntrArgMemOnly], "", [SDNPMemOperand]>;
+
+ // strided vmem store.
+ def int_tpu_vst_strided :
+ Intrinsic<[],
+ [llvm_anyvector_ty, LLVMAnyPointerType<LLVMMatchType<0>>,
+ llvm_i32_ty, llvm_i32_ty, llvm_v1024i1_ty],
+ [IntrArgMemOnly, IntrWillReturn], "", [SDNPMemOperand]>;
+
+ // Indexed vmem load/store case.
+ // Set IAR intrinsics take a source and an IAR index and return an IAR
+ // register.
+ def int_tpu_set_lane_indexed :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_v1024i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_set_sublane_indexed :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_v1024i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ def int_tpu_set_iar_raw :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_v1024i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly]>;
+ // Same as strided load but takes an extra IAR source and IAR index.
+ def int_tpu_vld_indexed :
+ Intrinsic<[llvm_anyvector_ty],
+ [LLVMAnyPointerType<LLVMMatchType<0>>,
+ llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrReadMem, IntrInaccessibleMemOrArgMemOnly], "", [SDNPMemOperand]>;
+ // Same as strided store but takes an extra IAR source.
+ def int_tpu_vst_indexed :
+ Intrinsic<[],
+ [llvm_anyvector_ty, LLVMAnyPointerType<LLVMMatchType<0>>,
+ llvm_i32_ty, llvm_i32_ty, llvm_v1024i1_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrWillReturn, IntrInaccessibleMemOrArgMemOnly], "", [SDNPMemOperand]>;
+
+ // Specialized version of vld/vst indexed accessing a unique pattern. This
+ // allows better alias analysis. The signature matches the indexed version
+ // for simplicity, with the exception of the iar index, which is implicit.
+
+ // Load 4 consecutive lanes and duplicate each in 2 sublanes:
+ // (a, a, a+1, a+1, a+2, a+2, a+3, a+3)
+ def int_tpu_vld_replicate_evenodd_sublanes :
+ Intrinsic<[llvm_anyvector_ty],
+ [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty],
+ [IntrReadMem, IntrArgMemOnly], "", [SDNPMemOperand]>;
+ // Store with sublane shuffle following this pattern:
+ // (0, 2, 4, 6, 1, 3, 5, 7)
+ def int_tpu_vst_evenodd_sublanes :
+ Intrinsic<[],
+ [llvm_anyvector_ty, LLVMAnyPointerType<LLVMMatchType<0>>,
+ llvm_i32_ty, llvm_i32_ty, llvm_v1024i1_ty, llvm_i32_ty],
+ [IntrWillReturn, IntrArgMemOnly], "", [SDNPMemOperand]>;
+
+ //===--------------------------------------------------------------------===//
+ // SparseCore specific intrinsics.
+ //===--------------------------------------------------------------------===//
+
+ def int_tpu_sc_sint : ClangBuiltin<"__builtin_tpu_sc_sint">,
+ Intrinsic<[], [llvm_i32_ty], []>;
+
+ def int_tpu_sc_ssettm : ClangBuiltin<"__builtin_tpu_sc_ssettm">,
+ Intrinsic<[], [llvm_i32_ty], []>;
+
+ def int_tpu_sc_strace : ClangBuiltin<"__builtin_tpu_sc_strace">,
+ Intrinsic<[], [llvm_i32_ty], []>;
+
+ //===--------------------------------------------------------------------===//
+ // TensorCore specific intrinsics.
+ //===--------------------------------------------------------------------===//
+
+ def int_tpu_vsettm :
+ Intrinsic<[], [llvm_i32_ty], []>;
+
+ def int_tpu_vtrace :
+ Intrinsic<[], [llvm_i32_ty], []>;
+
+ def int_tpu_tc_vint :
+ Intrinsic<[], [llvm_i32_ty], []>;
+
+ def int_tpu_tc_setrngseed :
+ Intrinsic<[], [llvm_v1024i32_ty], [IntrInaccessibleMemOnly]>;
+
+ def int_tpu_tc_getrngseed :
+ Intrinsic<[llvm_v1024i32_ty], [], [IntrInaccessibleMemOnly]>;
+
+ def int_tpu_tc_vrng :
+ Intrinsic<[llvm_v1024i32_ty], [], [IntrInaccessibleMemOnly]>;
+
+ //===--------------------------------------------------------------------===//
+ // BarnaCore specific intrinsics.
+ //===--------------------------------------------------------------------===//
+
+ // Load with add_loop_index on the address. The second argument is the encoded
+ // add_loop_index type, defined by the ISA.
+ def int_tpu_bc_load_aliaddr : Intrinsic<[llvm_v8f32_ty],
+ [llvm_bmemv8f32ptr_ty, llvm_i32_ty],
+ [IntrArgMemOnly, IntrReadMem, ImmArg<ArgIndex<1>>], "", [SDNPMemOperand]>;
+ // Load with add_loop_index on the address. The second argument is the encoded
+ // add_loop_index type, defined by the ISA. The third argument is the feature
+ // length multiple.
+ def int_tpu_bc_load_aliaddr_flm : Intrinsic<[llvm_v8f32_ty],
+ [llvm_bmemv8f32ptr_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrArgMemOnly, IntrReadMem, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>], "", [SDNPMemOperand]>;
+
+ // Store with add_loop_index on the address. The third argument is the encoded
+ // add_loop_index type, defined by the ISA.
+ def int_tpu_bc_store_aliaddr : Intrinsic<[],
+ [llvm_v8f32_ty, llvm_bmemv8f32ptr_ty, llvm_i32_ty],
+ [IntrArgMemOnly, ImmArg<ArgIndex<2>>], "", [SDNPMemOperand]>;
+
+ // Store with add_loop_index on the address. The third argument is the encoded
+ // add_loop_index type, defined by the ISA. The fourth argument is the feature
+ // length multiple.
+ def int_tpu_bc_store_aliaddr_flm : Intrinsic<[],
+ [llvm_v8f32_ty, llvm_bmemv8f32ptr_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrArgMemOnly, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>;
+
+ // Store to (concat_reg) as the address.
+ def int_tpu_bc_store_concat : Intrinsic<[],
+ [llvm_v8f32_ty],
+ [IntrInaccessibleMemOnly], "", [SDNPMemOperand]>;
+ def int_tpu_bc_store_concat_aliaddr : Intrinsic<[],
+ [llvm_v8f32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<1>>], "", [SDNPMemOperand]>;
+
+
+ // Shift from (concat_reg) to (cdfifo_reg), again using an aliaddr modifier.
+ def int_tpu_bc_shift : Intrinsic<[], [], [IntrInaccessibleMemOnly]>;
+ def int_tpu_bc_shift_aliaddr : Intrinsic<[], [llvm_i32_ty],
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
+
+ // Given an array aggregate of vectors, e.g. [12 x <8 x float>], insert the second
+ // argument into the array at the index given by (loop_index) and return it.
+ //
+ // This behaves as if it were the insertvalue IR instruction, but may touch
+ // any array element dynamically.
+ def int_tpu_bc_insertvalue_loopindex : Intrinsic<[llvm_any_ty],
+ [llvm_any_ty, llvm_v8f32_ty],
+ [IntrNoMem]>;
+ // Given an array aggregate of vectors, extract the value given by (loop_index),
+ // and return it.
+ //
+ // This behaves as if it were the extractvalue IR instruction, but may touch
+ // any array element dynamically.
+ def int_tpu_bc_extractvalue_loopindex : Intrinsic<[llvm_v8f32_ty],
+ [llvm_any_ty],
+ [IntrNoMem]>;
+
+ // Sets up the pipeline_depth for a loop. This must occur in the loop
+ // preheader.
+ def int_tpu_bc_loop_start : Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>]>;
+
+ // Returns an i1 to use as a loop test. The only loop allowed is a loop from
+ // 0..feature_count. The returned value is false if the loop should be exited,
+ // and true if the loop should branch to its header.
+ def int_tpu_bc_loop_end :
+ Intrinsic<[llvm_i1_ty], [], [IntrInaccessibleMemOnly], "",
+ [SDNPMemOperand]>;
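+
+ // Taken together, an illustrative loop shape (a sketch, not mandated by the
+ // definitions above) is:
+ //   preheader: call void @llvm.tpu.bc.loop.start(i32 <pipeline_depth>)
+ //   loop:      ... body ...
+ //              %cont = call i1 @llvm.tpu.bc.loop.end()
+ //              br i1 %cont, label %loop, label %exit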
+
+ // Performs a select between two operands based on a predicate. The predicate
+ // is encoded as an integer according to the ISA. If the predicate is true,
+ // the 1st operand is returned; if false, the 2nd operand.
+ def int_tpu_bc_select_predicate :
+ Intrinsic<[llvm_any_ty], [llvm_i32_ty, LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem, ImmArg<ArgIndex<0>>]>;
+
+ // Takes the allocation size in number of [h|s|t|sp]mem, sync flag or dreg
+ // words. Returns a pointer to the [h|s|t|sp]mem, sync flag or dreg memory
+ // allocated.
+ def int_tpu_alloca_hbm :
+ ClangBuiltin<"__builtin_tpu_alloca_hbm">,
+ Intrinsic<[llvm_hbmptr_ty], [llvm_i32_ty], []>;
+ def int_tpu_alloca_smem :
+ ClangBuiltin<"__builtin_tpu_alloca_smem">,
+ Intrinsic<[llvm_smemptr_ty], [llvm_i32_ty], []>;
+ def int_tpu_alloca_tilespmem :
+ ClangBuiltin<"__builtin_tpu_alloca_tilespmem">,
+ Intrinsic<[llvm_tilespmemptr_ty], [llvm_i32_ty], []>;
+ def int_tpu_alloca_spmem :
+ ClangBuiltin<"__builtin_tpu_alloca_spmem">,
+ Intrinsic<[llvm_spmemptr_ty], [llvm_i32_ty], []>;
+ def int_tpu_alloca_dreg :
+ ClangBuiltin<"__builtin_tpu_alloca_dreg">,
+ Intrinsic<[llvm_dregptr_ty], [llvm_i32_ty], []>;
+ def int_tpu_alloca_sflag :
+ ClangBuiltin<"__builtin_tpu_alloca_sflag">,
+ Intrinsic<[llvm_sflagptr_ty], [llvm_i32_ty], []>;
+ // We do not allow stack allocation in any of the _other or _any address spaces.
+
+ // Invalidate all memory allocations before the intrinsic call.
+ // After the call, future allocations will reuse the memory. This
+ // intrinsic applies to smem and tilespmem memory.
+ def int_tpu_end_allocation_scope :
+ ClangBuiltin<"__builtin_tpu_end_alloc">,
+ Intrinsic<[], [], []>;
+
+ // Allocates memory in `memory_space` that aliases only other overlapping
+ // allocations. The arguments are `size` and allocation `offset` in number of
+ // words. Please note that hbm_any and smem_any are used through specific
+ // cast intrinsics.
+ def int_tpu_allocate_hbm :
+ ClangBuiltin<"__builtin_tpu_allocate_hbm">,
+ Intrinsic<[llvm_hbmptr_ty], [llvm_i32_ty, llvm_i32_ty],
+ []>;
+ def int_tpu_allocate_hbm_any :
+ ClangBuiltin<"__builtin_tpu_allocate_hbm_any">,
+ Intrinsic<[llvm_hbmanyptr_ty], [llvm_i32_ty, llvm_i32_ty],
+ []>;
+ def int_tpu_allocate_smem :
+ ClangBuiltin<"__builtin_tpu_allocate_smem">,
+ Intrinsic<[llvm_smemptr_ty], [llvm_i32_ty, llvm_i32_ty],
+ []>;
+ def int_tpu_allocate_smem_any :
+ ClangBuiltin<"__builtin_tpu_allocate_smem_any">,
+ Intrinsic<[llvm_smemanyptr_ty], [llvm_i32_ty, llvm_i32_ty],
+ []>;
+ def int_tpu_allocate_timem :
+ ClangBuiltin<"__builtin_tpu_allocate_timem">,
+ Intrinsic<[llvm_timemptr_ty], [llvm_i32_ty, llvm_i32_ty],
+ []>;
+ def int_tpu_allocate_tilespmem :
+ ClangBuiltin<"__builtin_tpu_allocate_tilespmem">,
+ Intrinsic<[llvm_tilespmemptr_ty], [llvm_i32_ty, llvm_i32_ty],
+ []>;
+ def int_tpu_allocate_spmem :
+ ClangBuiltin<"__builtin_tpu_allocate_spmem">,
+ Intrinsic<[llvm_spmemptr_ty], [llvm_i32_ty, llvm_i32_ty],
+ []>;
+ def int_tpu_allocate_vmem :
+ Intrinsic<[llvm_vmemv1024i32ptr_ty], [llvm_i32_ty, llvm_i32_ty],
+ []>;
+ def int_tpu_allocate_dreg :
+ ClangBuiltin<"__builtin_tpu_allocate_dreg">,
+ Intrinsic<[llvm_dregptr_ty], [llvm_i32_ty, llvm_i32_ty],
+ []>;
+ def int_tpu_allocate_iova :
+ ClangBuiltin<"__builtin_tpu_allocate_iova">,
+ Intrinsic<[llvm_iovaptr_ty], [llvm_i32_ty, llvm_i32_ty],
+ []>;
+ def int_tpu_allocate_sflag :
+ ClangBuiltin<"__builtin_tpu_allocate_sflag">,
+ Intrinsic<[llvm_sflagptr_ty], [llvm_i32_ty, llvm_i32_ty],
+ []>;
+ def int_tpu_allocate_sflag_other :
+ ClangBuiltin<"__builtin_tpu_allocate_sflag_other">,
+ Intrinsic<[llvm_sflagotherptr_ty], [llvm_i32_ty, llvm_i32_ty],
+ []>;
+ // FIXME(hgreving): obsolete, remove.
+ def int_tpu_allocate_sflag_any :
+ ClangBuiltin<"__builtin_tpu_allocate_sflag_any">,
+ Intrinsic<[llvm_sflaganyptr_ty], [llvm_i32_ty, llvm_i32_ty],
+ []>;
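+
+ // For example (an illustrative sketch; tilespmem is address space 201 per
+ // TPU.h, and the exact pointer type depends on the llvm_tilespmemptr_ty
+ // definition and pointer mode):
+ //   %buf = call ptr addrspace(201) @llvm.tpu.allocate.tilespmem(i32 256, i32 0)
+ // allocates 256 tilespmem words at offset 0.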
+
+ // Allocates memory in `memory_space` at a dynamic offset. Bump allocation and
+ // bounds check are skipped for these operations if the offset is not a
+ // constant value. The arguments are `size` and allocation `offset` in number
+ // of words.
+ def int_tpu_allocate_dyn_hbm :
+ ClangBuiltin<"__builtin_tpu_allocate_dyn_hbm">,
+ Intrinsic<[llvm_hbmptr_ty], [llvm_i32_ty, llvm_i32_ty], []>;
+ def int_tpu_allocate_dyn_iova :
+ Intrinsic<[llvm_iovaptr_ty], [llvm_i32_ty, llvm_i32_ty], []>;
+
+ // Special case allocation of cbregs. This is mainly to prevent CSE from
+ // merging wrcbreg instructions into the same cbreg, and because many of the
+ // `upd` instructions do not model their effect on cbregs.
+ def int_tpu_allocate_cbreg :
+ ClangBuiltin<"__builtin_tpu_allocate_cbreg">,
+ Intrinsic<[llvm_x86mmx_ty], [],
+ []>;
+
+ // Special restrict pointer emulation intrinsic.
+ def int_tpu_make_restrict_ptr :
+ Intrinsic<[llvm_anyptr_ty], [LLVMMatchType<0>],
+ [NoAlias<ArgIndex<-1>>, IntrInaccessibleMemOnly]>;
+
+ def int_tpu_ptrtoint : Intrinsic<[llvm_i32_ty], [llvm_anyptr_ty], [IntrNoMem]>;
+ def int_tpu_inttoptr : Intrinsic<[llvm_anyptr_ty], [llvm_i32_ty], [IntrNoMem]>;
+
+ // SparseCore addressspace casts.
+ def int_tpu_addrspacecast : Intrinsic<[llvm_anyptr_ty], [llvm_anyptr_ty],
+ [IntrNoMem]>;
+ // SparseCore addressspace casts, sflag specific.
+ def int_tpu_addrspacecast_tc : Intrinsic<[llvm_sflaganyptr_ty], [llvm_sflagptr_ty],
+ [IntrNoMem]>;
+ def int_tpu_addrspacecast_scs : Intrinsic<[llvm_sflaganyptr_ty], [llvm_sflagptr_ty],
+ [IntrNoMem]>;
+ def int_tpu_addrspacecast_tac : Intrinsic<[llvm_sflaganyptr_ty], [llvm_sflagptr_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+ def int_tpu_addrspacecast_tec : Intrinsic<[llvm_sflaganyptr_ty], [llvm_sflagptr_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+ // The following variants are used with ssyncadd.tile, which uses a different
+ // encoding for the target sync flag.
+ def int_tpu_addrspacecast_tile_scs : Intrinsic<[llvm_sflagtileptr_ty], [llvm_sflagptr_ty],
+ [IntrNoMem]>;
+ def int_tpu_addrspacecast_tile_tac : Intrinsic<[llvm_sflagtileptr_ty], [llvm_sflagptr_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+ def int_tpu_addrspacecast_tile_tec : Intrinsic<[llvm_sflagtileptr_ty], [llvm_sflagptr_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+
+ // Translates a core index starting with zero into a SparseCore DMA core_id.
+ def int_tpu_sc_dma_core_id : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+
+ // SparseCore task dispatch intrinsic. It takes the following operands
+ // *) Dreg address space pointer to the task descriptor.
+ // *) Bit-packed 32-bit integer where the upper 16-bits encode the task
+ // descriptor argument count, and the lower 16-bits encode the tile bitmap.
+ def int_tpu_task_dispatch : ClangBuiltin<"__builtin_tpu_dispatch_task">,
+ Intrinsic<[], [llvm_dregptr_ty, llvm_i32_ty], []>;
+ def int_tpu_task_dispatch_clear_ibuf : ClangBuiltin<"__builtin_tpu_dispatch_task_clear_ibuf">,
+ Intrinsic<[], [llvm_dregptr_ty, llvm_i32_ty], []>;
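+ // For example (illustrative), a packed second operand of (3 << 16) | 0x00ff
+ // describes a task with 3 descriptor arguments dispatched to tiles 0-7.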
+
+ // Clears the Tile Ibuf from SCS. As its argument, it takes a Dreg address
+ // space pointer to an uninitialized task descriptor with at least two fields.
+ def int_tpu_clear_ibuf : Intrinsic<[], [llvm_dregptr_ty], []>;
+
+ // Set/Read registers intrinsics.
+ let IntrProperties = [IntrNoMem, IntrHasSideEffects] in {
+ // Local Cycle Count register lower half [0:31].
+ def int_tpu_rdreg_lcc_lo : ClangBuiltin<"__builtin_tpu_rdreg_lcc_lo">,
+ Intrinsic<[llvm_i32_ty], []>;
+ // Local Cycle Count register upper half [32:63].
+ def int_tpu_rdreg_lcc_hi : ClangBuiltin<"__builtin_tpu_rdreg_lcc_hi">,
+ Intrinsic<[llvm_i32_ty], []>;
+ // Global Time Counter register lower half [0:31].
+ def int_tpu_rdreg_gtc_lo : ClangBuiltin<"__builtin_tpu_rdreg_gtc_lo">,
+ Intrinsic<[llvm_i32_ty], []>;
+ // Global Time Counter register upper half [32:63].
+ def int_tpu_rdreg_gtc_hi : ClangBuiltin<"__builtin_tpu_rdreg_gtc_hi">,
+ Intrinsic<[llvm_i32_ty], []>;
+ // Yield_request
+ def int_tpu_rdreg_yieldreq : ClangBuiltin<"__builtin_tpu_rdreg_yieldreq">,
+ Intrinsic<[llvm_i32_ty], []>;
+ // SparseCore ID
+ def int_tpu_rdreg_scid : ClangBuiltin<"__builtin_tpu_rdreg_scid">,
+ Intrinsic<[llvm_i32_ty], []>;
+ // TensorCore ID
+ def int_tpu_rdreg_tcid : ClangBuiltin<"__builtin_tpu_rdreg_tcid">,
+ Intrinsic<[llvm_i32_ty], []>;
+ // Tag register.
+ def int_tpu_rdreg_tag : ClangBuiltin<"__builtin_tpu_rdreg_tag">,
+ Intrinsic<[llvm_i32_ty], []>;
+ // Tracemark register.
+ def int_tpu_rdreg_tm : ClangBuiltin<"__builtin_tpu_rdreg_tm">,
+ Intrinsic<[llvm_i32_ty], []>;
+ // Tile ID.
+ def int_tpu_rdreg_tid : ClangBuiltin<"__builtin_tpu_rdreg_tid">,
+ Intrinsic<[llvm_i32_ty], []>;
+ // Task Bitmap.
+ def int_tpu_rdreg_tbm : ClangBuiltin<"__builtin_tpu_rdreg_tbm">,
+ Intrinsic<[llvm_i32_ty], []>;
+ // Fence Status.
+ def int_tpu_rdreg_fsr : ClangBuiltin<"__builtin_tpu_rdreg_fsr">,
+ Intrinsic<[llvm_i32_ty], []>;
+ // DIF Depth register.
+ def int_tpu_rdreg_ddr : ClangBuiltin<"__builtin_tpu_rdreg_ddr">,
+ Intrinsic<[llvm_i32_ty], []>;
+ // DMA credit register.
+ def int_tpu_rdreg_dmacrdt : ClangBuiltin<"__builtin_tpu_rdreg_dmacrdt">,
+ Intrinsic<[llvm_i32_ty], []>;
+ // Branch target register.
+ def int_tpu_rdreg_btr : Intrinsic<[llvm_i32_ty], []>;
+ // Response from CSR read lower half [0:31].
+ def int_tpu_rdreg_crr_lo : Intrinsic<[llvm_i32_ty], []>;
+ // Response from CSR read upper half [32:63].
+ def int_tpu_rdreg_crr_hi : Intrinsic<[llvm_i32_ty], []>;
+ // Setter intrinsics
+ def int_tpu_setreg_tag : ClangBuiltin<"__builtin_tpu_setreg_tag">,
+ Intrinsic<[], [llvm_i32_ty]>;
+ def int_tpu_setreg_pdepth : ClangBuiltin<"__builtin_tpu_setreg_pdepth">,
+ Intrinsic<[], [llvm_i32_ty]>;
+ def int_tpu_setreg_ifvalue : ClangBuiltin<"__builtin_tpu_setreg_ifvalue">,
+ Intrinsic<[], [llvm_i32_ty]>;
+ def int_tpu_setreg_dmacrdt : ClangBuiltin<"__builtin_tpu_setreg_dmacrdt">,
+ Intrinsic<[], [llvm_i32_ty]>;
+ def int_tpu_setreg_sflagrange : ClangBuiltin<"__builtin_tpu_setreg_sflagrange">,
+ Intrinsic<[], [llvm_i32_ty]>;
+ } // IntrProperties = [IntrNoMem, IntrHasSideEffects]
+
+ // Reads the Local Cycle Count or the Global Time Counter with a single
+ // intrinsic. Guaranteed to be in a single bundle.
+ def int_tpu_read_local_cycle_count : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [],
+ [IntrNoMem, IntrHasSideEffects]>;
+ def int_tpu_read_global_cycle_count : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [],
+ [IntrNoMem, IntrHasSideEffects]>;
+}
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/MCTargetDesc/TPUBaseInfo.cpp b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/MCTargetDesc/TPUBaseInfo.cpp
new file mode 100644
index 0000000..3bfcbc4
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/MCTargetDesc/TPUBaseInfo.cpp
@@ -0,0 +1,318 @@
+//===-- TPUBaseInfo.cpp - Top level definitions for TPU MC ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TPUBaseInfo.h"
+#include "TPUSubtarget.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <unordered_map>
+
+namespace llvm {
+
+namespace {
+// NOTE: we have to use unordered_map instead of DenseMap because DenseMap
+// reserves special EmptyKey/TombstoneKey values, and one of the key values we
+// need to map, (uint32_t)-1, is the EmptyKey.
+using EncodingMap = std::unordered_map<uint32_t, uint32_t>;
+
+const EncodingMap &getSyEncodings(bool IsPxcOrVfc) {
+ // This idiom allows us to define a constant lookup table of encodings.
+ static auto *JfcSyEncodings = new EncodingMap({{1, 46},
+ {(uint32_t)-1, 47},
+ {0, 48},
+ {0x80000000, 49},
+ {FloatToBits(1.0), 50},
+ {FloatToBits(-1.0), 51},
+ {FloatToBits(2.0), 52},
+ {FloatToBits(-2.0), 53},
+ {FloatToBits(0.5), 54},
+ {FloatToBits(-0.5), 55},
+ {0x40490fdb, 56}, // pi
+ {0xc0490fdb, 57}, // -pi
+ {0x402df854, 58}, // e
+ {0xc02df854, 59}, // -e
+ {0xffff0000, 60}});
+
+ static auto *PxcOrVfSyEncodings = new EncodingMap({{0x100, 46},
+ {(uint32_t)-1, 47},
+ {0, 48},
+ {1, 49},
+ {2, 50},
+ {3, 51},
+ {4, 52},
+ {5, 53},
+ {6, 54},
+ {7, 55},
+ {8, 56},
+ {0x10, 57},
+ {0x18, 58},
+ {0x20, 59},
+ {0x30, 60},
+ {0x40, 61},
+ {0x60, 62},
+ {0x80, 63}});
+
+ return IsPxcOrVfc ? *PxcOrVfSyEncodings : *JfcSyEncodings;
+}
+
+const EncodingMap *getVyPFEncodings() {
+ static auto *VyEncodings = new EncodingMap({
+ {1, 1},
+ {(uint32_t)-1, 2},
+ {0, 3},
+ {FloatToBits(1.0), 4},
+ {FloatToBits(-1.0), 5},
+ {FloatToBits(2.0), 6},
+ {FloatToBits(0.5), 7},
+ });
+ return VyEncodings;
+}
+
+const EncodingMap *getVyVFEncodings() {
+ static auto *VyEncodings = new EncodingMap({
+ {1, 1},
+ {(uint32_t)-1, 2},
+ {0, 3},
+ {FloatToBits(1.0), 4},
+ {FloatToBits(-1.0), 5},
+ {FloatToBits(0.5), 6},
+ });
+ return VyEncodings;
+}
+
+const EncodingMap *getVyVFSCEncodings() {
+ static auto *VyEncodings = new EncodingMap({
+ // Encoding 1 is Lane_id, which we do not match.
+ {1, 2},
+ {(uint32_t)-1, 3},
+ {0, 4},
+ {FloatToBits(1.0), 5},
+ {FloatToBits(-1.0), 6},
+ {FloatToBits(0.5), 7},
+ });
+ return VyEncodings;
+}
+
+const EncodingMap *getVyGLSCEncodings() {
+ static auto *VyEncodings = new EncodingMap({
+ // Encoding 1 is Lane_id, which we do not match.
+ {1, 2},
+ {(uint32_t)-1, 3},
+ {0, 4},
+ {FloatToBits(1.0), 5},
+ {FloatToBits(-1.0), 6},
+ {FloatToBits(0.5), 7},
+ {2, 33},
+ {3, 34},
+ {4, 35},
+ {5, 36},
+ {6, 37},
+ {7, 38},
+ {8, 39},
+ // { Compressed_to_Interleaved, 40},
+ // { Interleaved_to_Compressed, 41},
+ });
+ return VyEncodings;
+}
+
+const EncodingMap &getSublaneMaskEncodingsMap(bool IsVfc) {
+ static auto *VfcSublaneMaskEncodings = new EncodingMap(
+ {{0xff, 0}, {1, 11}, {2, 12}, {4, 13}, {8, 14}, {0x10, 15}});
+
+ static auto *ScSublaneMaskEncodings = new EncodingMap({{0xff, 16},
+ {0x7f, 17},
+ {0x3f, 18},
+ {0x1f, 19},
+ {0xf, 20},
+ {0x7, 21},
+ {0x3, 22},
+ {0x1, 23}});
+
+ return IsVfc ? *VfcSublaneMaskEncodings : *ScSublaneMaskEncodings;
+}
+
+const EncodingMap &getMemStrideEncodingsMap(bool IsVfc, bool IsSc) {
+ static auto *VfcStrideEncodings = new EncodingMap(
+ {{1, 0}, {(uint32_t)-1, 11}, {2, 12}, {4, 13}, {8, 14}, {0x10, 15}});
+
+ static auto *ScStrideEncodings = new EncodingMap(
+ {{1, 0}, {0, 1}, {2, 2}, {4, 3}, {8, 4}, {(uint32_t)-1, 5}});
+
+ static auto *PfcStrideEncodings = new EncodingMap({{1, 0}});
+
+ return IsVfc ? *VfcStrideEncodings
+ : (IsSc ? *ScStrideEncodings : *PfcStrideEncodings);
+}
+} // namespace
+
+std::optional<uint32_t> getSublaneMaskEncodings(bool IsVfc, bool IsSc,
+ uint32_t Val) {
+ // Pfc/Jfc has no fixed-constant value encodings for sublane mask.
+ if (!IsVfc && !IsSc)
+ return {};
+
+ const EncodingMap &Encodings = getSublaneMaskEncodingsMap(IsVfc);
+ const auto It = Encodings.find(Val);
+ if (It == Encodings.end())
+ return {};
+ return It->second;
+}
+
+std::optional<uint32_t> getMemOffsetEncodings(bool IsVfc, bool IsSc,
+ uint32_t Val) {
+ // Pfc/Jfc has no constant value encodings for offset.
+ if (!IsVfc && !IsSc)
+ return {};
+
+ if (Val == 0)
+ return 0;
+
+ if (IsSc && (Val == (uint32_t)-1)) {
+ return 7;
+ }
+
+ return {};
+}
+
+std::optional<uint32_t> getMemStrideEncodings(bool IsVfc, bool IsSc, bool IsPfc,
+ uint32_t Val) {
+ if (!IsVfc && !IsSc && !IsPfc)
+ return {};
+
+ const EncodingMap &Encodings = getMemStrideEncodingsMap(IsVfc, IsSc);
+ const auto It = Encodings.find(Val);
+ if (It == Encodings.end())
+ return {};
+ return It->second;
+}
+
+std::optional<uint32_t> getSyEncodings(bool IsPxcOrVfc, uint32_t Val) {
+ const EncodingMap &Encodings = getSyEncodings(IsPxcOrVfc);
+ const auto It = Encodings.find(Val);
+ if (It == Encodings.end())
+ return {};
+ return It->second;
+}
+
+std::optional<uint32_t> getVyEncodings(const TPUSubtarget *ST, uint32_t Val) {
+ const EncodingMap *Encodings;
+ if (ST->isVfcSparseCore()) {
+ Encodings = getVyVFSCEncodings();
+ } else if (ST->isGlcSparseCore()) {
+ Encodings = getVyGLSCEncodings();
+ } else if (ST->isGfcSparseCore()) {
+ // FIXME(hgreving): Add GFC specific encodings.
+ Encodings = getVyGLSCEncodings();
+ } else if (ST->hasVfcTensorCore()) {
+ Encodings = getVyVFEncodings();
+ } else {
+ Encodings = getVyPFEncodings();
+ }
+ const auto It = Encodings->find(Val);
+ if (It == Encodings->end())
+ return {};
+ return It->second;
+}
+
+uint8_t getFirstVyZeroExtEncoding(bool IsVfc) {
+ if (IsVfc)
+ return 7;
+ return 8;
+}
+
+uint8_t getFirstSyZeroExtEncoding(bool IsVfc) {
+ (void)IsVfc;
+ return 32;
+}
+
+uint8_t getFirstVyOneExtEncoding(bool IsVfc) {
+ if (IsVfc)
+ return 13;
+ return 14;
+}
+
+uint8_t getFirstSyOneExtEncoding(bool IsVfc) {
+ (void)IsVfc;
+ return 36;
+}
+
+uint8_t getFirstVyShlEncoding(bool IsVfc) {
+ if (IsVfc)
+ return 19;
+ return 20;
+}
+
+uint8_t getFirstSyShlEncoding(bool IsVfc) {
+ (void)IsVfc;
+ return 40;
+}
+
+uint8_t getFirstVyImm32Encoding(bool IsVfc) {
+ if (IsVfc)
+ return 25;
+ return 26;
+}
+
+uint8_t getFirstSyImm32Encoding(bool IsVfc) {
+ (void)IsVfc;
+ return 44;
+}
+
+uint8_t getFirstMemOffsetZeroExtEncoding(bool IsPfc) { return IsPfc ? 0 : 1; }
+
+uint8_t getFirstMemStrideZeroExtEncoding(bool IsVfc, bool IsPfc) {
+ return IsVfc ? 5 : IsPfc ? 4 : 10;
+}
+
+uint8_t getFirstSublaneMaskZeroExtEncoding(bool IsVfc) { return IsVfc ? 5 : 4; }
+
+namespace {
+struct OpWithSpecialEncodingTy {
+ const unsigned Instr;
+ const unsigned OpIdx;
+ const unsigned EnKind;
+};
+
+using namespace TPU;
+#define GET_OpWithSpecialEncodingTable_IMPL
+#define GET_ImmediateCompatibilityTable_IMPL
+#include "TPUGenSearchableTables.inc"
+} // namespace
+
+std::optional<OpEnc::OpEncodings> getSpecialOpEncoding(const MCInstrDesc &MCID,
+ unsigned OpIdx) {
+ auto OpTypeToSpecialEncoding = [&]() -> std::optional<OpEnc::OpEncodings> {
+ std::optional<TPUOperandTypeRecord> OpRec = getOperandTypeRecord(
+ static_cast<TPUOp::OperandType>(MCID.OpInfo[OpIdx].OperandType));
+ if (!OpRec.has_value())
+ return std::nullopt;
+ return static_cast<OpEnc::OpEncodings>(OpRec->OpEncoding);
+ };
+ if (auto SE = OpTypeToSpecialEncoding())
+ return SE;
+ const OpWithSpecialEncodingTy *Ptr = OpWithSpecialEncoding(MCID.getOpcode());
+ if ((Ptr != nullptr) && (((1U << OpIdx) & Ptr->OpIdx) != 0)) {
+ return static_cast<OpEnc::OpEncodings>(Ptr->EnKind);
+ }
+ return std::nullopt;
+}
+
+std::optional<TPUOperandTypeRecord>
+getOperandTypeRecord(TPUOp::OperandType OTy) {
+ using EnumValueType = std::underlying_type<TPUOp::OperandType>::type;
+ EnumValueType OTyValue = static_cast<EnumValueType>(OTy);
+ const EnumValueType OTyFirstEnum =
+ static_cast<EnumValueType>(TPUOp::OperandType::FIRST_OPERAND_IMM);
+ if (OTyValue < OTyFirstEnum)
+ return std::nullopt;
+ const TPUOperandTypeRecord *Record = OperandTypeInfo(OTyValue - OTyFirstEnum);
+ return Record == nullptr ? std::nullopt
+ : std::optional<TPUOperandTypeRecord>(*Record);
+}
+
+} // end namespace llvm
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/MCTargetDesc/TPUBaseInfo.h b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/MCTargetDesc/TPUBaseInfo.h
new file mode 100644
index 0000000..8c820ba
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/MCTargetDesc/TPUBaseInfo.h
@@ -0,0 +1,141 @@
+//===-- TPUBaseInfo.h - Top level definitions for TPU MC --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the TPU target useful for the compiler back-end and the MC libraries.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_TPU_MCTARGETDESC_TPUBASEINFO_H
+#define LLVM_LIB_TARGET_TPU_MCTARGETDESC_TPUBASEINFO_H
+
+#include "TPUMCTargetDesc.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+namespace OpInfo {
+#define GET_TPUOperandType_DECL
+#include "TPUGenSearchableTables.inc"
+} // namespace OpInfo
+
+namespace TPUPowerGroups {
+#define GET_TPUPowerGroups_DECL
+#include "TPUGenSearchableTables.inc"
+} // namespace TPUPowerGroups
+
+// Structure used to hold information in "Searchable tables" generated through
+// tablegen containing information about a certain operand type.
+// - OperandTypeEnumVal is the enum value for the operand type this record
+// refers to.
+// - ImmMask is the immediate slot mask, which represents which immediate slots
+// this operand uses in an instruction.
+// - OpEncoding represents the type of encoding this operand uses (like VY or SY
+// encodings).
+struct TPUOperandTypeRecord {
+ const unsigned OperandTypeEnumVal;
+ const unsigned ImmMask;
+ const unsigned OpEncoding;
+};
+
+namespace TPUOp {
+enum OperandType : unsigned {
+ FIRST_OPERAND_IMM = MCOI::OPERAND_FIRST_TARGET,
+ OPERAND_IMM_SINGLE = FIRST_OPERAND_IMM + OpInfo::OPERAND_IMM_SINGLE,
+ OPERAND_MEMOFFSET = FIRST_OPERAND_IMM + OpInfo::OPERAND_MEMOFFSET,
+ OPERAND_MEMSTRIDE = FIRST_OPERAND_IMM + OpInfo::OPERAND_MEMSTRIDE,
+ OPERAND_SUBLANEMASK = FIRST_OPERAND_IMM + OpInfo::OPERAND_SUBLANEMASK,
+ OPERAND_SCALAR_IMM32 = FIRST_OPERAND_IMM + OpInfo::OPERAND_SCALAR_IMM32,
+ OPERAND_SYNC_IMM_SINGLE = FIRST_OPERAND_IMM + OpInfo::OPERAND_SYNC_IMM_SINGLE,
+ OPERAND_TC_VLD_SHUFFLE_IMM32 =
+ FIRST_OPERAND_IMM + OpInfo::OPERAND_TC_VLD_SHUFFLE_IMM32,
+ OPERAND_VECTOR_IMM32 = FIRST_OPERAND_IMM + OpInfo::OPERAND_VECTOR_IMM32,
+ OPERAND_IMM_PLAIN = FIRST_OPERAND_IMM + OpInfo::OPERAND_IMM_PLAIN,
+ OPERAND_GPR_VS0 = FIRST_OPERAND_IMM + OpInfo::OPERAND_GPR_VS0,
+ OPERAND_GPR_VS1 = FIRST_OPERAND_IMM + OpInfo::OPERAND_GPR_VS1,
+ OPERAND_GPR_VS2 = FIRST_OPERAND_IMM + OpInfo::OPERAND_GPR_VS2,
+ OPERAND_GPR_VS3 = FIRST_OPERAND_IMM + OpInfo::OPERAND_GPR_VS3,
+ OPERAND_GPR_SM_X = FIRST_OPERAND_IMM + OpInfo::OPERAND_GPR_SM_X,
+ OPERAND_GPR_SM_Y = FIRST_OPERAND_IMM + OpInfo::OPERAND_GPR_SM_Y,
+ LAST_OPERAND_IMM = OPERAND_GPR_SM_Y,
+};
+} // namespace TPUOp
+
+// Returns the encoding for the Sy value, or no value if there is no encoding.
+// Returns the Pxc/Viperfish representation if IsPxcOrVfc is true.
+std::optional<uint32_t> getSyEncodings(bool IsPxcOrVfc, uint32_t Val);
+
+// Returns the encoding for the Vy value, or no value if there is no encoding.
+std::optional<uint32_t> getVyEncodings(const TPUSubtarget *ST, uint32_t Val);
+
+// Returns the encoding for the sublane mask/offset/stride value used in
+// Vld/Vst, or no value if there is no encoding.
+std::optional<uint32_t> getSublaneMaskEncodings(bool IsVfc, bool IsSc,
+ uint32_t Val);
+std::optional<uint32_t> getMemOffsetEncodings(bool IsVfc, bool IsSc,
+ uint32_t Val);
+std::optional<uint32_t> getMemStrideEncodings(bool IsVfc, bool IsSc, bool IsPfc,
+ uint32_t Val);
+
+// Returns the first encoding for the Vy or Sy value, for the zero extend, one
+// extend, and packed shift left versions of the immediate encodings. Returns
+// the TensorCore Viperfish encoding if IsVfc is true. The SparseCore Viperfish
+// encoding is identical to the Pufferfish one.
+uint8_t getFirstVyZeroExtEncoding(bool IsVfc);
+uint8_t getFirstSyZeroExtEncoding(bool IsVfc);
+uint8_t getFirstVyOneExtEncoding(bool IsVfc);
+uint8_t getFirstSyOneExtEncoding(bool IsVfc);
+uint8_t getFirstVyShlEncoding(bool IsVfc);
+uint8_t getFirstSyShlEncoding(bool IsVfc);
+// Returns the first 32-bit immediate encoding value. Returns the Viperfish
+// value if IsVfc is true.
+uint8_t getFirstVyImm32Encoding(bool IsVfc);
+uint8_t getFirstSyImm32Encoding(bool IsVfc);
+
+// Returns the first encoding for Memory-Offset, Memory-Stride and Sublane-Mask
+// values for the zero extend version of the immediate encodings.
+uint8_t getFirstMemOffsetZeroExtEncoding(bool IsPfc);
+uint8_t getFirstMemStrideZeroExtEncoding(bool IsVfc, bool IsPfc);
+uint8_t getFirstSublaneMaskZeroExtEncoding(bool IsVfc);
+
+// Defines special operand encoding kinds, such as 'y' encoding for vector or
+// scalar slots, etc...
+namespace OpEnc {
+#define GET_OpEncodings_DECL
+#include "TPUGenSearchableTables.inc"
+} // namespace OpEnc
+
+// Returns the special encoding kind if the opcode has one for the exact
+// operand, or no value otherwise.
+std::optional<OpEnc::OpEncodings> getSpecialOpEncoding(const MCInstrDesc &MCID,
+ unsigned OpIdx);
+
+inline bool isTPUImmediate(unsigned OpTy) {
+ if (OpTy == TPUOp::OPERAND_IMM_PLAIN) {
+ // Plain immediates are TPU immediates, but not considered real immediates.
+ // They either represent encodable plain values, or pseudo registers.
+ return false;
+ }
+ if (OpTy == TPUOp::OPERAND_GPR_VS0 || OpTy == TPUOp::OPERAND_GPR_VS1 ||
+ OpTy == TPUOp::OPERAND_GPR_VS2 || OpTy == TPUOp::OPERAND_GPR_VS3 ||
+ OpTy == TPUOp::OPERAND_GPR_SM_X || OpTy == TPUOp::OPERAND_GPR_SM_Y) {
+ // Special register operands are not immediates.
+ return false;
+ }
+ return OpTy >= TPUOp::FIRST_OPERAND_IMM && OpTy <= TPUOp::LAST_OPERAND_IMM;
+}
+
+// Returns the record containing immediate mask and encodings for a specified
+// operand type.
+std::optional<TPUOperandTypeRecord>
+getOperandTypeRecord(TPUOp::OperandType OTy);
+
+} // namespace llvm
+#endif // LLVM_LIB_TARGET_TPU_MCTARGETDESC_TPUBASEINFO_H
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPU.h b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPU.h
new file mode 100644
index 0000000..8a77a56
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPU.h
@@ -0,0 +1,523 @@
+//===-------- TPU.h - Top-level interface for TPU -------*- C++ -*---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// TPU backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_TPU_H
+#define LLVM_LIB_TARGET_TPU_H
+
+#include "MCTargetDesc/TPUMCTargetDesc.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+
+#include <functional>
+#include <string>
+
+namespace llvm {
+class FunctionPass;
+class TPUMachinePipelinerInfo;
+class TPUTargetMachine;
+class MachineFunctionPass;
+class TargetMachine;
+class formatted_raw_ostream;
+class ScheduleHazardRecognizer;
+class MachineSchedContext;
+class ScheduleDAGSwing;
+
+enum BarnaCorePredicateKind : uint64_t {
+ // Note that Always is deliberately zero even though its encoding is 15.
+ // This allows us to use zero as the default predicate.
+ BCPK_Always,
+ BCPK_FirstIdOccurrence,
+ BCPK_FirstIdInFeature,
+ BCPK_NewFeatureId,
+ BCPK_NewTokenId,
+ BCPK_NewSample,
+ BCPK_OnlyIdInFeatureAndSample,
+ BCPK_FirstIdInBatch,
+ BCPK_NewTile,
+ BCPK_FeatureIdMatch,
+ BCPK_RepeatedTokenFeature,
+ BCPK_FirstIteration,
+ BCPK_LastIteration,
+ BCPK_NewSampleOrTile,
+ BCPK_RepeatedTileSample,
+ BCPK_NewFeatureOrToken
+};
+
+// Represents the register and immediate part of a TPU instruction predicate.
+class TPUPredicate {
+ static_assert(BCPK_NewFeatureOrToken < 1U << 4,
+ "Not enough bits to represent BCPK!");
+ struct Flags {
+ // Should the sense of this predicate be inverted?
+ uint64_t Invert : 1;
+ // Predicate kind for BarnaCore.
+ BarnaCorePredicateKind BcKind : 4;
+ // Pipeline stage for BarnaCore. Default is zero.
+ uint64_t BcPipelineStage : 2;
+ };
+ Register PPR = TPU::Palways;
+ union {
+ Flags F;
+ uint64_t I = 0;
+ } Imm;
+
+ void initPredicate(const MachineInstr &MI) {
+ // findFirstPredOperandIdx() can't deal with variadic reglists.
+ unsigned Idx = MI.getOpcode() == TPU::EVENT ? MI.getNumOperands() - 2
+ : MI.findFirstPredOperandIdx();
+ // If the instruction is not predicatable, keep the default predicate.
+ if (Idx != -1) {
+ PPR = MI.getOperand(Idx).getReg();
+ Imm.I = MI.getOperand(Idx + 1).getImm();
+ }
+ }
+
+public:
+ TPUPredicate() {}
+
+ explicit TPUPredicate(const MachineInstr *MI) {
+ initPredicate(*MI);
+ }
+ explicit TPUPredicate(const MachineInstr &MI) {
+ initPredicate(MI);
+ }
+ explicit TPUPredicate(const MCInst *MI, unsigned Idx) {
+ PPR = MI->getOperand(Idx).getReg();
+ Imm.I = MI->getOperand(Idx + 1).getImm();
+ }
+
+ static TPUPredicate fromRawBcEncoding(uint64_t Raw) {
+ return TPUPredicate()
+ .setInvert((Raw & 0x10) != 0)
+ // We convert from raw to BcKind by adding one to all values and mapping
+ // Always (15) to zero. We can do this by adding one and masking the
+ // bottom 4 bits.
+ .setBarnaCoreKind(static_cast<BarnaCorePredicateKind>((Raw + 1) & 0xF));
+ }
+
+ unsigned toRawBcEncoding() {
+ unsigned Enc = 15;
+ if (Imm.F.BcKind != BCPK_Always)
+ Enc = static_cast<unsigned>(Imm.F.BcKind) - 1;
+ return Enc ^ (Imm.F.Invert << 4);
+ }
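+
+ // Worked example: fromRawBcEncoding(0x10) sets Invert (bit 4) and maps the
+ // low bits 0 to BCPK_FirstIdOccurrence ((0 + 1) & 0xF); toRawBcEncoding()
+ // then returns (1 - 1) ^ (1 << 4) == 0x10, i.e. the two are inverses.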
+
+ bool operator==(const TPUPredicate &P) const {
+ return PPR == P.PPR && Imm.I == P.Imm.I;
+ }
+ bool operator!=(const TPUPredicate &P) const {
+ return PPR != P.PPR || Imm.I != P.Imm.I;
+ }
+
+ bool isAlways() const {
+ return PPR == TPU::Palways && Imm.F.Invert == 0 &&
+ Imm.F.BcKind == BCPK_Always;
+ }
+
+ TPUPredicate toggleInvert() const {
+ TPUPredicate P(*this);
+ P.Imm.F.Invert ^= 1;
+ return P;
+ }
+ bool getInvert() const { return Imm.F.Invert; }
+ TPUPredicate setInvert(bool Invert) const {
+ TPUPredicate P(*this);
+ P.Imm.F.Invert = Invert;
+ return P;
+ }
+
+ TPUPredicate setBarnaCoreKind(BarnaCorePredicateKind Kind) const {
+ TPUPredicate P(*this);
+ P.Imm.F.BcKind = Kind;
+ return P;
+ }
+ BarnaCorePredicateKind getBarnaCoreKind() const { return Imm.F.BcKind; }
+
+ TPUPredicate setBarnaCorePipelineStage(unsigned PS) const {
+ TPUPredicate P(*this);
+ assert(PS < 4 && "Too many pipeline stages to represent!");
+ P.Imm.F.BcPipelineStage = PS;
+ return P;
+ }
+ unsigned getBarnaCorePipelineStage() const { return Imm.F.BcPipelineStage; }
+
+ Register getReg() const { return PPR; }
+ TPUPredicate setReg(Register PPR) const {
+ TPUPredicate P(*this);
+ P.PPR = PPR;
+ return P;
+ }
+
+ uint64_t getImm() const { return Imm.I; }
+
+ void applyTo(MachineInstr *MI) const {
+ unsigned Idx = MI->findFirstPredOperandIdx();
+ MI->getOperand(Idx).setReg(PPR);
+ MI->getOperand(Idx + 1).setImm(Imm.I);
+ }
+ void addTo(MachineInstrBuilder *MIB) const {
+ MIB->addReg(PPR);
+ MIB->addImm(Imm.I);
+ }
+};
+
+// Add the default predicate {Palways, 0} to an instruction being built.
+inline const MachineInstrBuilder &
+AddDefaultPred(const MachineInstrBuilder &MIB) {
+ TPUPredicate P;
+ return MIB.addReg(P.getReg()).addImm(P.getImm());
+}
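+
+// For example (illustrative; TPU::SOMEOP is a placeholder opcode):
+//   AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(TPU::SOMEOP), DstReg));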
+
+// Target address spaces for TPU. Note that smem is the default address
+// space (for stack, and because it can handle all types of data at 4-byte
+// alignment).
+//
+// Note that the target address spaces start at 200 arbitrarily. Low indices are
+// used by Clang for OpenCL/CUDA address spaces, so to avoid conflicts we start
+// high.
+enum TPUAddressSpace : unsigned {
+ TPUAS_Smem = 0, ///< smem, scalar memory.
+ TPUAS_Hbm = 203, ///< HBM 32B, global slow memory.
+ TPUAS_Sflag = 204, ///< sflag, shared atomic memory.
+ // SparseCore specific memory
+ TPUAS_TileSpmem = 201, ///< tilespmem, tile local memory.
+ TPUAS_Spmem = 202, ///< spmem, tile shared memory.
+ TPUAS_Dreg = 208, ///< dreg, descriptor memory.
+ TPUAS_SflagOther = 210, ///< sflag, "other" tec/tac in the same tile.
+ TPUAS_SflagAny = 211, ///< sflag, "any" tec/tac on any chip, translated to
+ ///< sflag address space.
+ TPUAS_SflagTile = 217, ///< sflag, any tile on the same core.
+ TPUAS_SmemAny = 212, ///< smem, scalar memory on any chip.
+ TPUAS_HbmAny = 213, ///< HBM 32B, global slow memory on any chip.
+ TPUAS_Simem = 215, ///< simem, SCS instruction memory.
+ TPUAS_Timem = 214, ///< timem, task instruction memory.
+ TPUAS_Iova = 216, ///< IO virtual address host memory.
+ // TensorCore specific memory
+ TPUAS_Vmem = 205, ///< vmem, tensorcore vector memory.
+ TPUAS_Cmem = 206, ///< cmem, tensorcore common memory.
+ TPUAS_Bmem = 207, ///< bmem, BarnaCore memory.
+ // FIXME(hgreving): we may need a hbm_other address space that potentially
+ // aliases with the default hbm address space.
+};
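+
+// For example, a pointer into tilespmem appears in IR as a pointer in address
+// space 201 (e.g. `ptr addrspace(201)` with opaque pointers), while the
+// default stack lives in address space 0 (smem).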
+
+// createTPUISelDag - This pass converts a legalized DAG into a
+// TPU-specific DAG, ready for instruction scheduling.
+FunctionPass *createTPUISelDag(TPUTargetMachine &TM);
+
+// createTPUMemOpIntrinsicsPass - This pass breaks down llvm memop intrinsics
+// into raw loads and stores.
+Pass *createTPUMemOpIntrinsicsPass();
+
+// createTPUGEPLoweringPass - This pass lowers GetElementPtrs away before DAG
+// construction to allow custom address manipulation logic.
+Pass *createTPUGEPLoweringPass();
+
+// createTPUBundlePackerPass - This pass performs final scheduling and packs
+// into bundles.
+Pass *createTPUBundlePackerPass();
+
+// createTPUPostBundleLowerPseudosPass - This pass performs post-bundle-packing
+// lowering of pseudo instructions like BRreserve and BR into BRrel.
+Pass *createTPUPostBundleLowerPseudosPass();
+
+// createTPUPrintMachineFunctionPass - This pass can be used to inject function
+// dumps in opt mode.
+Pass *createTPUPrintMachineFunctionPass(const std::string Banner);
+
+// createTPULateIBufMissMitigationPass - This pass implements experimental ibuf
+// cache miss mitigation strategies.
+Pass *createTPULateIBufMissMitigationPass();
+
+// createTPURemoveIdentityCopiesPass - This pass removes trivial identity
+// copies.
+Pass *createTPURemoveIdentityCopiesPass();
+
+// createTPUInvalidateFifoFillAnalysisPass - This pass is used to inject
+// invalidation of fifo fill level analysis after upstream passes.
+Pass *createTPUInvalidateFifoFillAnalysisPass();
+
+// createTPUAddrCalcDelayPass - This pass optimizes copies that incur
+// address calculation delay.
+Pass *createTPUAddrCalcDelayPass();
+
+// createTPUBreakVResHoldPass - This pass tries to break output dependencies
+// that incur VRes hold latency.
+Pass *createTPUBreakVResHoldPass();
+
+// createTPUOverPredicatePass - This pass adds extra predication on unpredicated
+// instructions to give the scheduler more freedom.
+Pass *createTPUOverPredicatePass();
+
+// createTPUUnderPredicatePass - This pass removes superfluous predication on
+// instructions.
+Pass *createTPUUnderPredicatePass();
+
+// createTPUVLIWPreparePass - This pass performs last-minute adjustments
+// in preparation mainly for software pipelining.
+Pass *createTPUVLIWPreparePass();
+
+// createTPUCodeGenPreparePass - This pass performs last-minute adjustments in
+// preparation for DAG formation.
+Pass *createTPUCodeGenPreparePass(TPUTargetMachine *TM);
+
+// createTPUXLUOptimizationsPass - This pass performs optimizations on XLU
+// operations.
+Pass *createTPUXLUOptimizationsPass();
+
+// createTPUFifoPseudoAllocPass - This pass performs register allocation of
+// pseudo physical FIFO registers.
+Pass *createTPUFifoPseudoAllocPass();
+
+// createTPURegisterPreparePass - This pass performs (1) unused fifo register
+// optimization and will do fifo expansion in the future, and (2) RA constraint
+// enforcement on the operands of certain modify mask operations.
+Pass *createTPURegisterPreparePass();
+
+// createTPUEventDebugPass - This pass is used to automatically inject event
+// instructions in certain cases, used for debugging.
+Pass *createTPUEventDebugPass();
+
+// createTPUSpillDebugPass - This pass is used to automatically inject spill
+// and fills instructions into the code.
+Pass *createTPUSpillDebugPass();
+
+// createTPUMachineVerifierPass - TPU specific machine verifier to be used in
+// debug mode.
+Pass *createTPUMachineVerifierPass();
+
+// createTPUNopCoalescingPass - This pass takes sleds of NOPs and converts them
+// to SDELAYs/VDELAYs.
+Pass *createTPUNopCoalescingPass();
+
+// createTPUPadFunctionsPass - This pass pads SparseCore SCS functions with NOPs
+// if necessary to preserve 64-byte alignment.
+Pass *createTPUPadFunctionsPass();
+
+// createTPULoopAnalysisPass - This pass is used to dump loop information
+// for analysis purposes only.
+Pass *createTPULoopAnalysisPass();
+
+// createTPURematerializePass - This pass tries to rematerialize operations
+// to reduce register pressure.
+Pass *createTPURematerializePass();
+
+// createTPUCrossCallSpillPackerPass - This pass packs spills that pass across
+// call sites to the beginning of the range.
+Pass *createTPUCrossCallSpillPackerPass();
+
+// Raw hazard handling
+Pass *createRawHazardPass();
+
+// Perform peephole optimizations after ISel.
+Pass *createTPUEarlyPostISelMiscPass();
+
+// Perform copy rotate transformation.
+Pass *createTPUCopyRotatePass();
+
+// Perform complex addressing mode emulation.
+Pass *createTPUEmulateComplexAddressingPass();
+
+// TPU verifier pass. Makes sure that only supported functionality is being used.
+Pass *createTPUVerifierPass(TPUTargetMachine *TM, bool IsLateCodegen);
+
+// Propagates separate scopes and noalias if the tpu loop_parallel intrinsic is
+// found in the code.
+Pass *createTPULoopParallelPass();
+
+// Convert tpu alloca intrinsics into inttoptr and set aliasing metadata to be
+// able to encode aliasing information coming from the allocation.
+Pass *createTPUMemAllocPass(TPUTargetMachine *TM);
+
+// Lower the pseudo Rotate instructions to scalar rotate of sub-register. This
+// needs to be run after de-ssa and before regalloc.
+Pass *createBarnaCoreRotateLoweringPass();
+
+// Orders Fifo instructions.
+Pass *createTPUFifoSchedulingPass();
+
+// Create a DAG scheduler, using a variant of swing scheduling.
+ScheduleDAGSwing *createScheduleDAGSwingResource(MachineSchedContext *C);
+
+// Ensures program end halt location (needed for the overlayer).
+Pass *createTPUEnsureProgramEndHaltPass();
+
+// TPU specific alias analysis pass.
+ImmutablePass *createTPUAAWrapperPass(bool IsSparseCore);
+
+// Wrapper pass to be able to add external alias analysis.
+ImmutablePass *createTPUExternalAAWrapperPass();
+
+// Wrapper pass for software pipeliner super pass info.
+ImmutablePass *createTPUMachinePipelinerInfoWrapperPass();
+
+// Analysis pass for software pipeliner super pass.
+Pass *createTPUMachinePipelinerAnalysisPass(TPUMachinePipelinerInfo *ExtMPI);
+
+// TPU prepare for opt, e.g. propagate always_inline, masked intrinsics.
+Pass *createTPUOptimizePreparePass(TPUTargetMachine *TM);
+
+// TPU software pipeliner.
+Pass *createSwingModuloSchedulerPass();
+
+// TPU dovetailing pass.
+Pass *createTPUPipelineDovetailingPass();
+
+// TPU early branch-folding pass.
+Pass *createTPUEarlyBranchFoldingPass(bool KeepPrologEpilog);
+
+// TPU machine pipeliner super pass.
+Pass *createTPUMachinePipelinerSuperPass(TPUTargetMachine *TM);
+
+// createTPUPreSpillerPass - This pass attempts to reduce register pressure by
+// pre-spilling values before software pipelining.
+Pass *createTPUPreSpillerPass(TPUMachinePipelinerInfo *ExtMPI,
+ bool UseHeuristicMode);
+
+// TPU machine software pipeliner pass.
+Pass *createTPUMachinePipelinerPass(TPUMachinePipelinerInfo *ExtMPI,
+ const Twine &DbgStr = "");
+
+// TPU speculative superimposed machine software pipeliner pass.
+Pass *createTPUMachineSSIPipelinerPass();
+
+// TPU pass to call back with profiled execution graph.
+Pass *createTPUExecutionProfilePass(
+ AsmPrinter *Printer, MCInstPrinter *InstPrinter,
+ std::function<int(std::string &subtarget, int64_t startpc, int64_t endpc)>
+ ExecutionProfileCallback,
+ std::function<std::string(std::string &)> GraphRPCCallback,
+ std::function<void(std::string &, int64_t)> RegisterEndOfFuncCallback,
+ std::function<int(std::string &Subtarget)> AdjustPCSectionCallback);
+
+// TPU optimize and convert scalar spills to dregs.
+Pass *createTPUOptimizeSpillToDregPass();
+
+// TPU debug pass to insert trace instructions.
+Pass *createTPUHardwareTraceDebugPass();
+
+// TPU advanced early if predication.
+Pass *createTPUEarlyIfPredicatorPass();
+
+/// MachineScheduler - This pass schedules machine instructions.
+extern char &TPUMachineSchedulerFastID;
+
+// ID for a pass to inject invalidation of fifo fill levels.
+extern char &TPUInvalidateFifoFillAnalysisID;
+
+// ID for late ibuf cache mitigation pass.
+extern char &TPULateIBufMissMitigationID;
+
+// ID for scalar spill to dreg optimization.
+extern char &TPUOptimizeSpillToDregID;
+
+//===----------------------------------------------------------------------===//
+// Target pass initializers
+//===----------------------------------------------------------------------===//
+void initializeBundleTrackerTestPass(PassRegistry &Registry);
+void initializeBundlePackerPass(PassRegistry &Registry);
+void initializeLiveRangeReductionPass(PassRegistry &Registry);
+void initializePostBundleLowerPseudosPass(PassRegistry &Registry);
+void initializeTPUPrintMachineFunctionPass(PassRegistry &Registry);
+void initializeTPULateIBufMissMitigationPass(PassRegistry &Registry);
+void initializeTPUInvalidateFifoFillAnalysisPass(PassRegistry &Registry);
+void initializeTPURemoveIdentityCopiesPass(PassRegistry &Registry);
+void initializeTPUBreakVResHoldPass(PassRegistry &Registry);
+void initializeTPUAddrCalcDelayPass(PassRegistry &Registry);
+void initializeOverPredicatePass(PassRegistry &Registry);
+void initializeUnderPredicatePass(PassRegistry &Registry);
+void initializeTPULoopParallelPass(PassRegistry &Registry);
+void initializeTPUVLIWPreparePass(PassRegistry &Registry);
+void initializeTPUCodeGenPreparePass(PassRegistry &Registry);
+void initializeTPUXLUOptimizationsPass(PassRegistry &Registry);
+void initializeFifoPseudoAllocPass(PassRegistry &Registry);
+void initializeTPURegisterPreparePass(PassRegistry &Registry);
+void initializeTPUEventDebugPass(PassRegistry &Registry);
+void initializeTPUSpillDebugPass(PassRegistry &Registry);
+void initializeTPUMachineVerifierPass(PassRegistry &Registry);
+void initializeTPUNopCoalescingPass(PassRegistry &Registry);
+void initializeTPUPadFunctionsPass(PassRegistry &Registry);
+void initializeTPULoopAnalysisPass(PassRegistry &Registry);
+void initializeTPURematerializePass(PassRegistry &Registry);
+void initializeTPUCrossCallSpillPackerPass(PassRegistry &Registry);
+void initializeTPURawHazardPass(PassRegistry &Registry);
+void initializeTPUMemAllocPass(PassRegistry &Registry);
+void initializeTPUMemOpIntrinsicsPass(PassRegistry &Registry);
+void initializeTPUGEPLoweringPass(PassRegistry &Registry);
+void initializeTPUOptimizePreparePass(PassRegistry &Registry);
+void initializeTPUMachinePipelinerSuperPassPass(PassRegistry &Registry);
+void initializeTPUMachinePipelinerInfoWrapperPassPass(PassRegistry &Registry);
+void initializeTPUMachinePipelinerAnalysisPass(PassRegistry &Registry);
+void initializeTPUMachinePipelinerPass(PassRegistry &Registry);
+void initializeTPUMachineSSIPipelinerPass(PassRegistry &Registry);
+void initializeTPUPipelineDovetailingPass(PassRegistry &Registry);
+void initializeTPUEarlyBranchFoldingPass(PassRegistry &Registry);
+void initializeTPUEarlyPostISelMiscPass(PassRegistry &Registry);
+void initializeTPUCopyRotatePass(PassRegistry &Registry);
+void initializeTPUEmulateComplexAddressingPass(PassRegistry &Registry);
+void initializeTPUVerifierPass(PassRegistry &Registry);
+void initializeBarnaCoreRotateLoweringPass(PassRegistry &Registry);
+void initializeTPUFifoSchedulingPass(PassRegistry &Registry);
+void initializeTPUEnsureProgramEndHaltPass(PassRegistry &Registry);
+void initializeTPUAAWrapperPassPass(PassRegistry &Registry);
+void initializeTPUExternalAAWrapperPass(PassRegistry &Registry);
+void initializeMachineSchedulerFastPass(PassRegistry &Registry);
+void initializeTPUPreSpillerPass(PassRegistry &Registry);
+void initializeTPUDAGToDAGISelPass(PassRegistry &Registry);
+void initializeTPUExecutionProfilePass(PassRegistry &Registry);
+void initializeTPUOptimizeSpillToDregPass(PassRegistry &Registry);
+void initializeTPUHardwareTraceDebugPass(PassRegistry &Registry);
+void initializeTPUEarlyIfPredicatorPass(PassRegistry &Registry);
+
+Target &getTheTPUTarget();
+
+// New pass manager trampolines.
+struct TPUOptimizePreparePass : PassInfoMixin<TPUOptimizePreparePass> {
+ TPUOptimizePreparePass(TPUTargetMachine &TM) : TM(TM) {}
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+
+private:
+ TPUTargetMachine &TM;
+};
+
+struct TPUMemAllocPass : PassInfoMixin<TPUMemAllocPass> {
+ TPUMemAllocPass(TPUTargetMachine &TM) : TM(TM) {}
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+
+private:
+ TPUTargetMachine &TM;
+};
+
+struct TPULoopParallelPass : PassInfoMixin<TPULoopParallelPass> {
+ TPULoopParallelPass() {}
+ PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR, LPMUpdater &U);
+};
+
+struct TPUVerifierPass : PassInfoMixin<TPUVerifierPass> {
+ TPUVerifierPass(TPUTargetMachine &TM, bool IsLateCodegen)
+ : TM(TM), IsLateCodegen(IsLateCodegen) {}
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AF);
+
+private:
+ TPUTargetMachine &TM;
+ bool IsLateCodegen = false;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_TPU_H
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPU.td b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPU.td
new file mode 100644
index 0000000..9b3e246
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPU.td
@@ -0,0 +1,259 @@
+//===- TPU.td - Describe the TPU Target ------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+def TPUInstrInfo : InstrInfo {
+ let useDeprecatedPositionallyEncodedOperands = true;
+}
+
+def FeatureHasV1024 : SubtargetFeature<"v1024",
+ "HasV1024",
+ "true",
+ "VPU process vector of size 8x128"
+>;
+
+def FeatureHasMXU : SubtargetFeature<"mxu",
+ "HasMXU",
+ "true",
+ "Has matrix multiply hardware"
+>;
+
+def FeatureHasV8 : SubtargetFeature<"v8",
+ "HasV8",
+ "true",
+ "VPU process vector of size 8"
+>;
+
+def FeatureHasV16 : SubtargetFeature<"v16",
+ "HasV16",
+ "true",
+ "VPU process vector of size 16"
+>;
+
+def FeatureHasLPGL : SubtargetFeature<"lp-gl",
+ "HasLPGL",
+ "true",
+ "Target VPU supports Ghostlite+ packed low precision"
+>;
+
+def FeatureHasLPVF : SubtargetFeature<"lp-vf",
+ "HasLPVF",
+ "true",
+ "Target VPU supports Viperfish packed low precision"
+>;
+
+def FeatureHasTranscendental : SubtargetFeature<"tcdtl",
+ "HasTranscendental",
+ "true",
+ "Target ERF supports transcendental opcodes"
+>;
+
+def FeatureHasBcChannelControllerIsa : SubtargetFeature<"barnacore-cc-isa",
+ "HasBarnacoreChannelControllerIsa",
+ "true",
+ "True if the subtarget is the PXC BarnaCore, which has a very different ISA."
+>;
+
+def FeatureHasVfcSparsecoreIsa : SubtargetFeature<"sparsecore-vfc-isa",
+ "HasVfcSparsecoreIsa",
+ "true",
+ "True if the subtarget is the VFC Sparsecore."
+>;
+
+def FeatureHasGlcSparsecoreIsa : SubtargetFeature<"sparsecore-glc-isa",
+ "HasGlcSparsecoreIsa",
+ "true",
+ "True if the subtarget is the GLC Sparsecore."
+>;
+
+def FeatureHasGfcSparsecoreIsa : SubtargetFeature<"sparsecore-gfc-isa",
+ "HasGfcSparsecoreIsa",
+ "true",
+ "True if the subtarget is the GFC Sparsecore."
+>;
+
+def FeatureHasVectorSflags : SubtargetFeature<"vector-sflags",
+ "HasVectorSflags",
+ "true",
+ "VPU uses vector sflag instructions"
+>;
+
+def FeatureHasScalarSflags : SubtargetFeature<"scalar-sflags",
+ "HasScalarSflags",
+ "true",
+ "VPU uses scalar sflag instructions"
+>;
+
+def FeatureHasFatalRawHazard : SubtargetFeature<"FatalRawHazard",
+ "HasFatalRawHazard",
+ "true",
+ "HW fails on VMem load/store hazard"
+>;
+
+def FeatureHasMxuGsft : SubtargetFeature<"Gsft",
+ "HasGsft",
+ "true",
+ "HW has a separate Transpose MXU latch (gsft)"
+>;
+
+def FeatureHasJfcTensorCore : SubtargetFeature<"jfc-tensorcore",
+ "HasJfcTensorCore",
+ "true",
+ "HW is a JFC TensorCore"
+>;
+
+def FeatureHasDfcTensorCore : SubtargetFeature<"dfc-tensorcore",
+ "HasDfcTensorCore",
+ "true",
+ "HW is a DFC TensorCore"
+>;
+
+def FeatureHasPfcTensorCore : SubtargetFeature<"pfc-tensorcore",
+ "HasPfcTensorCore",
+ "true",
+ "HW is a PFC TensorCore"
+>;
+
+def FeatureHasVfcTensorCore : SubtargetFeature<"vfc-tensorcore",
+ "HasVfcTensorCore",
+ "true",
+ "HW is a VFC TensorCore"
+>;
+
+def FeatureHasPxcVPU : SubtargetFeature<"pxc-vpu",
+ "HasPxcVPU",
+ "true",
+ "Has Pufferfish VPU instructions (such as vclamps)."
+>;
+
+def FeatureHasSMul24 : SubtargetFeature<"smul24",
+ "HasSMul24",
+ "true",
+ "Has a 24-bit scalar multiplier available."
+>;
+
+def FeatureHasSMul32 : SubtargetFeature<"smul32",
+ "HasSMul32",
+ "true",
+ "Has a full 32-bit scalar multiplier."
+>;
+
+def FeatureHasEarlyVxposeAllocation : SubtargetFeature<"early-vxpose-alloc",
+ "HasEarlyVxposeAllocation",
+ "true",
+ "A Vxpose instruction allocating TRF will do so before any pop in the bundle."
+>;
+
+def FeatureNeedsCompilerThrottling : SubtargetFeature<"needs-compiler-throttling",
+ "HasNeedsCompilerThrottling",
+ "true",
+ "This flag indicates that the compiler should try to handle power consumption inserting NOPs."
+>;
+
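+// HwModes select alternative register/type layouts in the included .td files
+// based on subtarget features; each string is a feature predicate, e.g.
+// JfcDfcHwMode applies when neither "v8" nor "pfc-tensorcore" is set.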
+def SparseCoreV8HwMode : HwMode<"+v8">;
+def SparseCoreV16HwMode : HwMode<"+v16">;
+def JfcDfcHwMode : HwMode<"-v8,-pfc-tensorcore">;
+def PxcHwMode : HwMode<"-v8,+pfc-tensorcore">;
+def BarnaCorePxcHwMode : HwMode<"+barnacore-cc-isa">;
+
+include "TPUSchedule.td"
+include "TPURegisterInfo.td"
+include "TPUCallingConv.td"
+include "TPUInstrInfo.td"
+
+// PXC BarnaCore channel controller.
+def : ProcessorModel<"barnacore-cc-pf", BarnaCorePFSchedModel,
+ [FeatureHasBcChannelControllerIsa, FeatureHasV8,
+ FeatureHasPxcVPU, FeatureHasSMul32]>;
+
+// SparseCore VFC subtargets
+def : ProcessorModel<"sparsecore-tec-vf", SparseCoreTecVFSchedModel,
+ [FeatureHasV8, FeatureHasScalarSflags, FeatureHasLPVF,
+ FeatureHasPxcVPU, FeatureHasSMul32, FeatureHasVfcSparsecoreIsa]>;
+def : ProcessorModel<"sparsecore-scs-vf", SparseCoreScsVFSchedModel,
+ [FeatureHasScalarSflags, FeatureHasSMul32,
+ FeatureHasVfcSparsecoreIsa]>;
+def : ProcessorModel<"sparsecore-tac-vf", SparseCoreTacVFSchedModel,
+ [FeatureHasScalarSflags, FeatureHasSMul32,
+ FeatureHasVfcSparsecoreIsa]>;
+
+// SparseCore GLC subtargets
+def : ProcessorModel<"sparsecore-tec-gl", SparseCoreTecGLSchedModel,
+ [FeatureHasV8, FeatureHasScalarSflags, FeatureHasLPGL,
+ FeatureHasTranscendental, FeatureHasPxcVPU, FeatureHasSMul32,
+ FeatureHasGlcSparsecoreIsa]>;
+def : ProcessorModel<"sparsecore-scs-gl", SparseCoreScsGLSchedModel,
+ [FeatureHasScalarSflags, FeatureHasSMul32,
+ FeatureHasGlcSparsecoreIsa]>;
+def : ProcessorModel<"sparsecore-tac-gl", SparseCoreTacGLSchedModel,
+ [FeatureHasScalarSflags, FeatureHasSMul32,
+ FeatureHasGlcSparsecoreIsa]>;
+
+// SparseCore GFC subtargets
+def : ProcessorModel<"sparsecore-tec-gf", SparseCoreTecGFSchedModel,
+ [FeatureHasV16, FeatureHasScalarSflags, FeatureHasLPGL,
+ FeatureHasTranscendental, FeatureHasSMul32, FeatureHasGfcSparsecoreIsa]>;
+def : ProcessorModel<"sparsecore-scs-gf", SparseCoreScsGFSchedModel,
+ [FeatureHasScalarSflags, FeatureHasSMul32,
+ FeatureHasGfcSparsecoreIsa]>;
+def : ProcessorModel<"sparsecore-tac-gf", SparseCoreTacGFSchedModel,
+ [FeatureHasScalarSflags, FeatureHasSMul32,
+ FeatureHasGfcSparsecoreIsa]>;
+
+// Jellyfish tensor core subtarget
+def : ProcessorModel<"tensorcore-jf", TensorCoreJFSchedModel,
+[FeatureHasV1024, FeatureHasMXU, FeatureHasVectorSflags,
+ FeatureHasJfcTensorCore, FeatureHasFatalRawHazard, FeatureHasSMul24,
+ FeatureHasEarlyVxposeAllocation]>;
+// Dragonfish tensor core subtarget
+def : ProcessorModel<"tensorcore-df", TensorCoreJFSchedModel,
+[FeatureHasV1024, FeatureHasMXU, FeatureHasVectorSflags,
+ FeatureHasDfcTensorCore, FeatureHasFatalRawHazard, FeatureHasSMul32,
+ FeatureHasEarlyVxposeAllocation, FeatureNeedsCompilerThrottling]>;
+// Pufferfish tensor core subtarget
+def : ProcessorModel<"tensorcore-pf", TensorCorePFSchedModel,
+[FeatureHasV1024, FeatureHasMXU, FeatureHasVectorSflags,
+ FeatureHasPfcTensorCore, FeatureHasPxcVPU, FeatureHasMxuGsft,
+ FeatureHasSMul32]>;
+// Viperfish tensor core subtarget
+def : ProcessorModel<"tensorcore-vf", TensorCoreVFSchedModel,
+[FeatureHasV1024, FeatureHasMXU, FeatureHasVectorSflags,
+ FeatureHasVfcTensorCore, FeatureHasPxcVPU,
+ FeatureHasSMul32]>;
+
+def TPUAsmParser : AsmParser {
+ bit HasMnemonicFirst = 0;
+}
+
+def TPUAsmParserVariant : AsmParserVariant {
+ let TokenizingCharacters = "[]*!;:()";
+ let SeparatorCharacters = " \t,";
+}
+
+def TPUInstPrinter : AsmWriter {
+ string AsmWriterClassName = "InstPrinter";
+ bit isMCAsmWriter = 1;
+}
+
+// An AsmWriter variant that omits slot and immediate annotations (so that
+// the generated assembly is less messy and can be consumed by other tools,
+// but is not perfectly round-trippable).
+def TPUGenericInstPrinter : AsmWriter {
+ string AsmWriterClassName = "GenericInstPrinter";
+ bit isMCAsmWriter = 1;
+ int Variant = 1;
+}
+
+def TPU : Target {
+ let InstructionSet = TPUInstrInfo;
+ let AssemblyParsers = [TPUAsmParser];
+ let AssemblyParserVariants = [TPUAsmParserVariant];
+ let AssemblyWriters = [TPUInstPrinter, TPUGenericInstPrinter];
+ int AllowRegisterRenaming = 1;
+}
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUAsmPrinter.cpp b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUAsmPrinter.cpp
new file mode 100644
index 0000000..8293c6e
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUAsmPrinter.cpp
@@ -0,0 +1,505 @@
+//===-- TPUAsmPrinter.cpp - TPU LLVM assembly writer --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the TPU assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/TPUInstPrinter.h"
+#include "TPU.h"
+#include "TPUFifoAnalysis.h"
+#include "TPUInstrInfo.h"
+#include "TPUMCInstLower.h"
+#include "TPUMachineFunctionInfo.h"
+#include "TPURegisterInfo.h"
+#include "TPUTargetMachine.h"
+#include "TargetInfo/TPUTargetInfo.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <iterator>
+#include <memory>
+#include <numeric>
+
+#define DEBUG_TYPE "asm-printer"
+using namespace llvm;
+
+cl::opt<bool>
+ TPUReportUsedSpillSlots("tpu-report-used-spillslots", cl::init(false),
+ cl::desc("Report used spill slots in a separate "
+ "section for the runtime to use"));
+
+cl::opt<bool>
+ EncodeMCInsts("tpu-encode-mcinst-bundles", cl::init(false),
+ cl::desc("If true, encode all bundle packing information "
+ "into MCInsts during lowering. The default is false "
+ "if we are emitting assembly, true otherwise."));
+
+cl::opt<bool> EmitGlobalsSection(
+ "tpu-emit-globals-section", cl::init(true),
+ cl::desc(
+ "Enables emitting globals in a data section for each memory space."));
+
+namespace {
+
+cl::opt<bool>
+ TPUPrintBundleNumber("tpu-print-bundle-number", cl::init(true),
+ cl::desc("If enabled print the bundle number as a "
+ "comment in the assembly stream."));
+
+cl::opt<bool>
+ EnableFifoAnnotationForTest("force-fifo-annotation", cl::init(false),
+ cl::desc("Force fifo annotation when printing"
+ "asm on Sparsecore for test."));
+
+struct CompilerStatistics {
+ std::string FunctionName;
+ unsigned NumberOfBundles = 0;
+ unsigned NumberOfStaticVDelayWaits = 0;
+ unsigned NumberOfVectorSpills = 0;
+ unsigned NumberOfScalarSpills = 0;
+ unsigned NumberOfVectorReloads = 0;
+ unsigned NumberOfScalarReloads = 0;
+ unsigned VectorUsedFrameIndices = 0;
+ unsigned ScalarUsedFrameIndices = 0;
+ unsigned StaticInsertedThrottleCycles = 0;
+ std::optional<unsigned> MaxCrossCallScalarRegs;
+ std::optional<unsigned> MaxCrossCallVectorRegs;
+};
+
+class TPUAsmPrinter : public AsmPrinter {
+public:
+ explicit TPUAsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)) {}
+
+ StringRef getPassName() const override { return "TPU Assembly Printer"; }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O);
+ void emitInstruction(const MachineInstr *MI) override;
+ void emitGlobalVariable(const GlobalVariable *GV) override;
+
+private:
+ void AnalyzeFifos(MachineFunction &MF);
+ std::string AnnotateFifoPush(const MachineInstr &MI);
+ std::string AnnotateFifoPop(const MachineInstr &MI);
+ void EmitFunctionMetadata();
+ CompilerStatistics ComputeStatistics() const;
+ void PrintCompilerStatistics(const CompilerStatistics &Stats) const;
+
+ const TPUSubtarget *ST;
+ std::unique_ptr<FifoAnalysis> FA;
+ DenseMap<FifoAnalysis::Def *, unsigned> DefNum;
+ uint64_t EmittedBundles;
+};
+} // end of anonymous namespace
+
+bool TPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ EmittedBundles = 0;
+ ST = &MF.getSubtarget<TPUSubtarget>();
+ // Sparsecore may generate predicated code for software pipelined loops for
+ // which fifo analysis is broken. We have evaluated several approaches to make
+ // it work, e.g. by marking pipelined blocks and ignoring them, or by
+ // conveying stage info in the instruction, but each approach is either not
+ // working or fragile. We therefore disable this otherwise convenient
+ // annotation on SparseCore, unless it is forced for tests.
+ if (!ST->isSparseCore() || EnableFifoAnnotationForTest)
+ AnalyzeFifos(MF);
+ bool Changed = AsmPrinter::runOnMachineFunction(MF);
+ if (TPUReportUsedSpillSlots)
+ EmitFunctionMetadata();
+ if (OutStreamer->hasRawTextSupport() && isVerbose()) {
+ PrintCompilerStatistics(ComputeStatistics());
+ }
+ return Changed;
+}
+
+void TPUAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
+ raw_ostream &O) {
+ const MachineOperand &MO = MI->getOperand(OpNum);
+
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ O << TPUInstPrinter::getRegisterName(MO.getReg());
+ break;
+
+ case MachineOperand::MO_Immediate:
+ O << "$" << MO.getImm();
+ break;
+
+ case MachineOperand::MO_MachineBasicBlock:
+ O << *MO.getMBB()->getSymbol();
+ break;
+
+ case MachineOperand::MO_GlobalAddress:
+ O << *getSymbol(MO.getGlobal());
+ break;
+
+ case MachineOperand::MO_BlockAddress: {
+ MCSymbol *BA = GetBlockAddressSymbol(MO.getBlockAddress());
+ O << BA->getName();
+ break;
+ }
+
+ case MachineOperand::MO_ExternalSymbol:
+ O << *GetExternalSymbolSymbol(MO.getSymbolName());
+ break;
+
+ case MachineOperand::MO_JumpTableIndex:
+ O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() << '_'
+ << MO.getIndex();
+ break;
+
+ case MachineOperand::MO_ConstantPoolIndex:
+ O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_'
+ << MO.getIndex();
+ return;
+
+ default:
+ llvm_unreachable("<unknown operand type>");
+ }
+}
+
+void TPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
+ std::vector<std::string> Annot;
+ if (isVerbose()) {
+ if (MI->getOpcode() == TPU::BUNDLE &&
+ (TPUInstrInfo::isFifoPush(*MI) || TPUInstrInfo::isFifoPop(*MI))) {
+ auto E = MI->getParent()->instr_end();
+ for (auto I = std::next(MI->getIterator()); I != E && I->isInsideBundle();
+ ++I) {
+ if (!ST->isSparseCore() || EnableFifoAnnotationForTest) {
+ if (TPUInstrInfo::isFifoPush(*I))
+ Annot.push_back(AnnotateFifoPush(*I));
+ else if (TPUInstrInfo::isFifoPop(*I))
+ Annot.push_back(AnnotateFifoPop(*I));
+ }
+ }
+ }
+ if (TPUPrintBundleNumber)
+ Annot.push_back((Twine("num: ") + Twine(EmittedBundles++)).str());
+ }
+ if (MI->getOpcode() == TPU::BUNDLE) {
+ auto E = MI->getParent()->instr_end();
+ for (auto I = std::next(MI->getIterator()); I != E && I->isInsideBundle();
+ ++I) {
+ // Emit the debug location string as a comment so it shows up in the output.
+ std::string s;
+ llvm::raw_string_ostream os(s);
+ I->getDebugLoc().print(os);
+ if (!s.empty()) {
+ Annot.push_back(s);
+ }
+ }
+ }
+
+ bool Encode = EncodeMCInsts;
+ if (EncodeMCInsts.getNumOccurrences() == 0)
+ Encode = !OutStreamer->hasRawTextSupport();
+
+ TPUMCInstLower MCInstLowering(OutContext, *this, *ST, Encode);
+ const MCSubtargetInfo& STI = getSubtargetInfo();
+ MCInst *TmpInst = OutContext.createMCInst();
+ MCSymbol *PendingTmpSymbol = nullptr;
+ MCInstLowering.Lower(MI, *TmpInst, PendingTmpSymbol);
+ if (PendingTmpSymbol)
+ OutStreamer->emitLabel(PendingTmpSymbol);
+ if (!Annot.empty())
+ OutStreamer->AddComment(join(Annot, ", "), /*EOL=*/true);
+ OutStreamer->emitInstruction(*TmpInst, STI);
+}
+
+void TPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
+ if (!EmitGlobalsSection) {
+ // Attention: not emitting globals leaves the streamer unable to resolve
+ // them. We use this as a check on SparseCore. Certain instructions, e.g.
+ // event or sctrap, inline the global string expression and will still work
+ // as expected.
+ return;
+ }
+ StringRef Postfix;
+ // Set the section based on the address space.
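+ // For example, a global in the Smem address space ends up in ".data.smem".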
+ switch (GV->getAddressSpace()) {
+ default:
+ llvm_unreachable("Unknown address space for global variable!");
+ case TPUAS_Smem:
+ Postfix = "smem";
+ break;
+ case TPUAS_TileSpmem:
+ Postfix = "tilespmem";
+ break;
+ case TPUAS_Spmem:
+ Postfix = "spmem";
+ break;
+ case TPUAS_Hbm:
+ Postfix = "hbm";
+ break;
+ case TPUAS_Sflag:
+ Postfix = "sflag";
+ break;
+ case TPUAS_SflagOther:
+ // Note: ISS doesn't yet understand the `sflagother` section. But that's
+ // okay - in SC we won't produce globals in real programs anyway.
+ Postfix = "sflagother";
+ break;
+ case TPUAS_SflagAny:
+ // Same comment as above, no support in ISS.
+ Postfix = "sflagany";
+ break;
+ case TPUAS_SflagTile:
+ // Same comment as above, no support in ISS.
+ Postfix = "sflagtile";
+ break;
+ case TPUAS_HbmAny:
+ // Same comment as above, no support in ISS.
+ Postfix = "hbm";
+ break;
+ case TPUAS_SmemAny:
+ // Same comment as above, no support in ISS.
+ Postfix = "smem";
+ break;
+ case TPUAS_Vmem:
+ Postfix = "vmem";
+ break;
+ case TPUAS_Dreg:
+ Postfix = "dreg";
+ break;
+ case TPUAS_Iova:
+ Postfix = "iova";
+ break;
+ case TPUAS_Simem:
+ Postfix = "simem";
+ break;
+ }
+ const_cast<GlobalVariable *>(GV)->setSection(std::string(".data.") +
+ Postfix.str());
+ AsmPrinter::emitGlobalVariable(GV);
+}
+
+void TPUAsmPrinter::AnalyzeFifos(MachineFunction &MF) {
+ ST = &MF.getSubtarget<TPUSubtarget>();
+ FA = std::make_unique<FifoAnalysis>(MF, ST->getFifoInfos());
+ DefNum.clear();
+ for (auto *D : FA->defs())
+ DefNum.insert({D, DefNum.size()});
+}
+
+namespace {
+// Return a string with a section of the alphabet, starting from index I and
+// extending to index E-1. For example, makeABC(3, 8) -> "cdefg";
+std::string makeABC(unsigned I, unsigned E) {
+ std::string S(E - I, ' ');
+ std::iota(S.begin(), S.end(), 'a' + I);
+ return S;
+}
+} // namespace
+
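+// Builds the push annotation for a fifo def. For example, a def numbered 2
+// that pushes three items is annotated "push %2abc", the letters naming the
+// individual pushed items (see makeABC above).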
+std::string TPUAsmPrinter::AnnotateFifoPush(const MachineInstr &MI) {
+ FifoAnalysis::Def *D = FA->getDef(MI);
+ if (!D)
+ return "";
+ std::string S;
+ raw_string_ostream OS(S);
+ unsigned N = DefNum[D];
+ OS << "push %" << N;
+ if (D->getNumPushedItems() > 1)
+ OS << makeABC(0, D->getNumPushedItems());
+ return OS.str();
+}
+
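+// Builds the pop annotation for a fifo use, naming the def(s) the popped
+// items came from, e.g. "pop %2bc". When the reaching def is ambiguous, the
+// alternatives are joined as "pop phi(%1a or %2a)".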
+std::string TPUAsmPrinter::AnnotateFifoPop(const MachineInstr &MI) {
+ FifoAnalysis::Use *U = FA->getUse(MI);
+ if (!U)
+ return "";
+
+ std::string S;
+ raw_string_ostream OS(S);
+ OS << "pop ";
+
+ SmallVector<std::string, 4> Strs;
+ for (auto *D : U->defs()) {
+ std::string &S = Strs.emplace_back();
+ raw_string_ostream OS(S);
+
+ unsigned N = DefNum[D];
+ OS << "%" << N;
+ if (D->getNumPushedItems() == 1)
+ continue;
+ int StartIdx = D->getNumPushedItems() - U->getPrev().getNumItemsLeftToPop();
+ OS << makeABC(StartIdx, StartIdx + U->getNumPoppedItems());
+ }
+ if (Strs.size() == 1)
+ OS << Strs[0];
+ else
+ OS << "phi(" << join(Strs, " or ") << ")";
+ return OS.str();
+}
+
+CompilerStatistics TPUAsmPrinter::ComputeStatistics() const {
+ CompilerStatistics Stats;
+ const TPUInstrInfo *TII = MF->getSubtarget<TPUSubtarget>().getInstrInfo();
+ const TPURegisterInfo *TRI =
+ MF->getSubtarget<TPUSubtarget>().getRegisterInfo();
+ const TPUMachineFunctionInfo *MFI = MF->getInfo<TPUMachineFunctionInfo>();
+ Stats.FunctionName = MF->getName();
+ auto CollectVDelayWaitCycles = [](const MachineInstr &MI) -> int {
+ if (MI.getOpcode() == TPU::VDELAY || MI.getOpcode() == TPU::VDELAY_LONG) {
+ return MI.getOperand(0).getImm();
+ }
+ return 0;
+ };
+ for (auto &MBB : *MF) {
+ for (auto &I : MBB) {
+ assert(I.isBundle() && "All instructions should be bundled");
+ // Collecting bundles in program.
+ ++Stats.NumberOfBundles;
+ SmallVector<const MachineMemOperand *, 4> Accesses;
+ // Collecting stack loads.
+ // Sadly, because the MachineMemOperands do not have correct address space
+ // info for the spill/reload, the only way is to check the register class
+ // of the register being spilled/reloaded to understand what it is (or to
+ // check the opcode).
+ for (auto It = std::next(I.getIterator());; ++It) {
+ SmallVector<const MachineMemOperand *, 1> Accesses;
+ if (TII->hasLoadFromStackSlot(*It, Accesses)) {
+ assert(Accesses.size() == 1 && "Expected 1 MMO for stack accesses");
+ Register Reg = It->getOperand(0).getReg();
+ if (TRI->isVectorMemSpilledRegister(*MF, Reg)) {
+ ++Stats.NumberOfVectorReloads;
+ } else if (TRI->isScalarMemSpilledRegister(*MF, Reg)) {
+ ++Stats.NumberOfScalarReloads;
+ }
+ }
+ Stats.NumberOfStaticVDelayWaits += CollectVDelayWaitCycles(*It);
+ if (!It->isBundledWithSucc())
+ break;
+ }
+ // Collecting stack stores.
+ // Sadly, because the MachineMemOperands do not have correct address space
+ // info for the spill/reload, the only way is to check the register class
+ // of the register being spilled/reloaded to understand what it is (or to
+ // check the opcode).
+ for (auto It = std::next(I.getIterator());; ++It) {
+ SmallVector<const MachineMemOperand *, 1> Accesses;
+ if (TII->hasStoreToStackSlot(*It, Accesses)) {
+ assert(Accesses.size() == 1 && "Expected 1 MMO for stack accesses");
+ unsigned Reg = It->getOperand(0).getReg();
+ if (TRI->isVectorMemSpilledRegister(*MF, Reg)) {
+ ++Stats.NumberOfVectorSpills;
+ } else if (TRI->isScalarMemSpilledRegister(*MF, Reg)) {
+ ++Stats.NumberOfScalarSpills;
+ }
+ }
+ if (!It->isBundledWithSucc())
+ break;
+ }
+ }
+ }
+ Stats.VectorUsedFrameIndices = MFI->getNumVectorSpillSlots();
+ Stats.ScalarUsedFrameIndices = MFI->getNumSmemSpillSlots();
+ Stats.StaticInsertedThrottleCycles = MFI->getStaticInsertedThrottleCycles();
+ Stats.MaxCrossCallScalarRegs = MFI->getMaxCrossCallScalarRegisters();
+ Stats.MaxCrossCallVectorRegs = MFI->getMaxCrossCallVectorRegisters();
+ return Stats;
+}
+
+void TPUAsmPrinter::EmitFunctionMetadata() {
+ const TPUMachineFunctionInfo *MFI = MF->getInfo<TPUMachineFunctionInfo>();
+ MCSectionELF *FuncMetadata = OutStreamer->getContext().getELFNamedSection(
+ "function_metadata", MF->getName(), ELF::SHT_PROGBITS, 0);
+ OutStreamer->switchSection(FuncMetadata);
+ OutStreamer->emitLabel(createTempSymbol("vector_spill_slots_num"));
+ OutStreamer->emitIntValue(MFI->getNumVectorSpillSlots(), 4);
+ OutStreamer->emitLabel(createTempSymbol("scalar_spill_slots_num"));
+ OutStreamer->emitIntValue(MFI->getNumSmemSpillSlots(), 4);
+ OutStreamer->emitLabel(createTempSymbol("static_inserted_throttle_cycles"));
+ OutStreamer->emitIntValue(MFI->getStaticInsertedThrottleCycles(), 4);
+ if (MFI->getMaxCrossCallScalarRegisters().has_value()) {
+ OutStreamer->emitLabel(createTempSymbol("max_cross_call_scalar_regs"));
+ OutStreamer->emitIntValue(MFI->getMaxCrossCallScalarRegisters().value(), 4);
+ }
+ if (MFI->getMaxCrossCallVectorRegisters().has_value()) {
+ OutStreamer->emitLabel(createTempSymbol("max_cross_call_vector_regs"));
+ OutStreamer->emitIntValue(MFI->getMaxCrossCallVectorRegisters().value(), 4);
+ }
+ OutStreamer->popSection();
+}
+
+void TPUAsmPrinter::PrintCompilerStatistics(
+ const CompilerStatistics &Stats) const {
+ assert(OutStreamer->hasRawTextSupport() && "Can't print comments");
+
+ OutStreamer->emitRawComment(" --- Compiler Statistics for @" +
+ Twine(Stats.FunctionName) + " ---",
+ false);
+ OutStreamer->addBlankLine();
+ OutStreamer->emitRawComment(
+ " NumberOfBundles = " + Twine(Stats.NumberOfBundles), false);
+ OutStreamer->emitRawComment(" NumberOfStaticVDelayWaits = " +
+ Twine(Stats.NumberOfStaticVDelayWaits),
+ false);
+ OutStreamer->emitRawComment(" NumberOfVectorSpills = " +
+ Twine(Stats.NumberOfVectorSpills),
+ false);
+ OutStreamer->emitRawComment(" NumberOfScalarSpills = " +
+ Twine(Stats.NumberOfScalarSpills),
+ false);
+ OutStreamer->emitRawComment(" NumberOfVectorReloads = " +
+ Twine(Stats.NumberOfVectorReloads),
+ false);
+ OutStreamer->emitRawComment(" NumberOfScalarReloads = " +
+ Twine(Stats.NumberOfScalarReloads),
+ false);
+ OutStreamer->emitRawComment(" VectorUsedFrameIndices = " +
+ Twine(Stats.VectorUsedFrameIndices),
+ false);
+ OutStreamer->emitRawComment(" ScalarUsedFrameIndices = " +
+ Twine(Stats.ScalarUsedFrameIndices),
+ false);
+ OutStreamer->emitRawComment(" StaticInsertedThrottleCycles = " +
+ Twine(Stats.StaticInsertedThrottleCycles),
+ false);
+ if (Stats.MaxCrossCallScalarRegs.has_value()) {
+ OutStreamer->emitRawComment(" MaxCrossCallScalarRegs = " +
+ Twine(Stats.MaxCrossCallScalarRegs.value()),
+ false);
+ }
+ if (Stats.MaxCrossCallVectorRegs.has_value()) {
+ OutStreamer->emitRawComment(" MaxCrossCallVectorRegs = " +
+ Twine(Stats.MaxCrossCallVectorRegs.value()),
+ false);
+ }
+ OutStreamer->addBlankLine();
+ OutStreamer->emitRawComment(" --- End of Compiler Statistics for @" +
+ Twine(Stats.FunctionName) + " ---",
+ false);
+ OutStreamer->addBlankLine();
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeTPUAsmPrinter() {
+ RegisterAsmPrinter<TPUAsmPrinter> X(getTheTPUTarget());
+}
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUCallingConv.td b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUCallingConv.td
new file mode 100644
index 0000000..26bfca1
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUCallingConv.td
@@ -0,0 +1,132 @@
+//===- TPUCallingConv.td - Calling Conventions ------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the TPU architecture.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// The TPU fastcc calling convention is being lowered to an argument-less
+// function call using a special CALL_FAST instruction for Sparsecore SCS
+// trampolines.
+//
+def CC_TPU_SC_Fast : CallingConv<[
+ CCCustom<"CC_TPU_AnyReg_Error">
+]>;
+
+// TPU Pseudo C Calling convention.
+def CC_TPU_Pseudo : CallingConv<[
+ // Values marked "consecutive" come from aggregates that have been split apart
+ // on BarnaCore.
+ CCIfConsecutiveRegs<CCCustom<"CC_TPU_CustomBCAggregate">>,
+
+ // Promote i8/i16 args to i32
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Pass all arguments in the first 31 sregs.
+ CCIfType<[i32, f32],
+ CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12,
+ S13, S14, S15, S16, S17, S18, S19, S20, S21, S22, S23,
+ S24, S25, S26, S27, S28, S29, S30]>>,
+ // Pass cbregs (tests only)
+ CCIfType<[x86mmx],
+ CCAssignToReg<[CB0, CB1, CB2, CB3, CB4, CB5, CB6, CB7, CB8, CB9, CB10, CB11,
+ CB12, CB13, CB14, CB15]>>,
+ CCIfType<[i1],
+ CCAssignToReg<[P0, P1, P2, P3]>>,
+ // Ensure we have more vregs available for input on BC. This does not affect
+ // TC or SC's codegen, it just allows more registers to be passed as input.
+ CCIfType<[v1024i32, v1024f32, v8i32, v8f32, v16i32, v16f32, v16bf16, v16f16, v32bf16,
+ v32f16, v16i16, v32i16, v32i8, v64i8, v64i4, v128i4, v128i2, v256i2, v256i1,
+ v512i1],
+ CCAssignToReg<[V0, V1, V2, V3, V4, V5, V6, V7,
+ V8, V9, V10, V11, V12, V13, V14, V15]>>,
+ CCIfType<[v1024i1, v8i1, v16i1, v32i1, v64i1],
+ CCAssignToReg<[M0, M1, M2, M3]>>,
+ // Otherwise they are assigned to the stack in 4-byte aligned units.
+ CCAssignToStack<4, 4>
+]>;
+
+// TPU 32-bit C Calling convention.
+def CC_TPU32 : CallingConv<[
+ // Promote i8/i16 args to i32
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Pass all arguments in the first 20 sregs.
+ CCIfType<[i32, f32],
+ CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12,
+ S13, S14, S15, S16, S17, S18, S19]>>,
+ // Ensure we have more vregs available for input on BC. This does not affect
+ // TC or SC's codegen, it just allows more registers to be passed as input.
+ CCIfType<[v8i32, v8f32, v16i32, v16f32, v16bf16, v16f16, v32bf16,
+ v32f16, v16i16, v32i16, v32i8, v64i8, v64i4, v128i4, v128i2, v256i2, v256i1, v512i1],
+ CCAssignToReg<[V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15,
+ V16, V17, V18, V19, V20, V21, V22, V23, V24, V25, V26, V27, V28, V29, V30, V31,
+ V32, V33, V34, V35, V36, V37, V38, V39, V40, V41, V42, V43, V44, V45, V46, V47]>>,
+ CCIfType<[v8i1, v16i1, v32i1, v64i1],
+ CCAssignToReg<[M0, M1, M2, M3, M4, M5, M6, M7]>>,
+
+ // Otherwise they are assigned to the stack. No alignment here,
+ // we figure this out based on scalar/vector during lowering.
+ CCAssignToStack<0, 1>
+]>;
+
+// TPU Pseudo C return-value convention.
+def RetCC_TPU_Pseudo : CallingConv<[
+ // Values marked "consecutive" come from aggregates that have been split apart
+ // on BarnaCore.
+ CCIfConsecutiveRegs<CCCustom<"CC_TPU_CustomBCAggregate">>,
+
+ CCIfType<[i32, f32], CCAssignToReg<[S0, S1]>>,
+ CCIfType<[x86mmx], CCAssignToReg<[CB0]>>,
+ CCIfType<[i1], CCAssignToReg<[P0, P1]>>,
+ CCIfType<[v1024i32, v1024f32, v8i32, v8f32, v16i32, v16f32, v16bf16, v16f16, v32bf16,
+ v32f16, v16i16, v32i16, v32i8, v64i8, v64i4, v128i4, v128i2, v256i2, v256i1, v512i1],
+ CCAssignToReg<[V0, V1]>>,
+ CCIfType<[v1024i1, v8i1, v16i1, v32i1], CCAssignToReg<[M0, M1]>>
+]>;
+
+// TPU 32-bit C return-value convention.
+def RetCC_TPU32 : CallingConv<[
+ // FIXME(b/237788792): ABI?
+ // Promote i8/i16 args to i32
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+ CCIfType<[i32, f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
+ S9, S10, S11, S12, S13, S14, S15]>>,
+ CCIfType<[v8i32, v8f32, v16i32, v16f32, v16bf16, v16f16, v32bf16,
+ v32f16, v16i16, v32i16, v32i8, v64i8, v64i4, v128i4, v128i2, v256i2, v256i1, v512i1],
+ CCAssignToReg<[V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11,
+ V12, V13, V14, V15]>>,
+ CCIfType<[v8i1, v16i1, v32i1, v64i1], CCAssignToReg<[M0, M1, M2, M3]>>
+]>;
+
+let Entry = 1 in {
+// TPU calling convention.
+def CC_TPU : CallingConv<[
+ CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_TPU_SC_Fast>>,
+ CCIf<"State.getMachineFunction().getSubtarget<TPUSubtarget>().isTPUABIEnabled()", CCDelegateTo<CC_TPU32>>,
+ CCDelegateTo<CC_TPU_Pseudo>
+]>;
+} // Entry = 1
+
+let Entry = 1 in {
+def RetCC_TPU : CallingConv<[
+ CCDelegateTo<RetCC_TPU_Pseudo>,
+ CCIf<"State.getMachineFunction().getSubtarget<TPUSubtarget>().isTPUABIEnabled()", CCDelegateTo<RetCC_TPU32>>
+]>;
+} // Entry = 1
+
+// Callee saved convention for the pseudo ABI. IAR registers cannot be spilled;
+// it is the user's responsibility to make sure only one version is alive at a
+// time.
+def CSR_NoRegs : CalleeSavedRegs<(add IAR0, IAR1)>;
+
+// TPU 32-bit C callee saved registers. The order matters.
+def CSR_TPU32 : CalleeSavedRegs<(add LR, FPV, FPS)>;
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUCodeGenPrepare.cpp b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUCodeGenPrepare.cpp
new file mode 100644
index 0000000..03a4457
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUCodeGenPrepare.cpp
@@ -0,0 +1,1441 @@
+//===-- TPUCodeGenPrepare.cpp - Prepare for codegen -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// TPUCodeGenPrepare performs TPU specific adjustments to the IR module
+// immediately prior to DAG formation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TPU.h"
+#include "TPUIRUtils.h"
+#include "TPUTargetMachine.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsTPU.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/KnownBits.h"
+
+#define DEBUG_TYPE "tpu-codegen-prepare"
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+cl::opt<bool>
+ EnableEventDebug("tpu-enable-event-debug", cl::init(false),
+ cl::desc("Enable event injection debugging on TPU."));
+
+cl::opt<unsigned> EventDebugMax(
+ "tpu-event-debug-max", cl::init(128),
+ cl::desc("Max number of injected events, increase if necessary."));
+
+cl::opt<bool> ForceSupportVldVstIdxAdd(
+ "tpu-enable-vld-vst-idx-add", cl::init(false),
+ cl::desc("Enable vld.vst.idx.add instruction as in b/193554156."));
+
+static cl::opt<bool> ConvertHalfToBfloat(
+ "tpu-convert-half-to-bfloat", cl::Hidden, cl::init(false),
+ cl::desc("Hacks and mutates half to bfloat, use with extreme care."));
+
+static cl::opt<bool> SortFunctionSymbols(
+ "tpu-sort-global-function-symbols", cl::init(false),
+ cl::desc("When enabled, sorts the global function symbol table in the "
+ "order of scs, tac, tec, on SparseCore."));
+
+extern cl::opt<bool> EnableContinuations;
+
+namespace {
+class TPUCodeGenPrepare : public ModulePass {
+public:
+ static char ID;
+ TPUCodeGenPrepare() : ModulePass(ID), TM(nullptr) {}
+ TPUCodeGenPrepare(TPUTargetMachine *TM) : ModulePass(ID), TM(TM) {}
+
+ bool runOnModule(Module &M) override;
+ bool runOnFunction(Function &F);
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ }
+
+ StringRef getPassName() const override { return "TPU codegen prepare"; }
+
+private:
+ // Expand special SparseCore VLD/VST NP intrinsic.
+ void expandScVldVstNpIntrinsic(IntrinsicInst *Intr, Intrinsic::ID ID_VLD,
+ Intrinsic::ID ID_VST, const DataLayout &DL);
+ // Optimize and match software shift semantics to one hardware shift
+ // instruction. Returns true if successfully matched, false otherwise.
+ bool matchShiftPattern(Value *V);
+ // Match special SparseCore VLD/VST NP intrinsic.
+ bool matchScVldVstNpIntrinsic(Value *V, const DataLayout &DL);
+ // Match int.masked.load.
+ bool matchMaskedLoadIntrinsic(Value *V);
+ // Match int.masked.store.
+ bool matchMaskedStoreIntrinsic(Value *V);
+ // If a vector binary op only computes a relevant value in element 0 transform
+ // it to an equivalent scalar instruction.
+ bool scalarizeBinOp(Instruction &I);
+ bool scalarizeCmp(Instruction &I);
+ bool scalarizeSelect(Instruction &I);
+ // Match consecutive DMADone code sequence that can be merged.
+ bool matchDMADoneOpt(Instruction *I, BasicBlock::iterator &It);
+ // Reassociate chained GEPs to match the addressing mode expected by DAG
+ // selection.
+ bool reassociateGEP(Instruction &I);
+ // For ld/store instruction try to sink GEP in the same block to allow DAG
+ // selection to pattern match it.
+ bool sinkGEP(Instruction &I);
+ // Constructs target address in syncset.remote and syncadd.remote intrinsics
+ // by packing core_id, chip_id, and possibly a done_bit into a single register,
+ // as expected by the hardware instruction. We arbitrarily choose the first
+ // sflag parameter to hold the packed sflag IDs, and set the remaining
+ // parameters to zero.
+ bool matchSyncRemoteIntrinsic(Instruction *I);
+ // Expands destination sflags in SparseCore syncset.both and syncadd.both into
+ // a shift+or to pack the destinations into a single register, as expected by
+ // the hardware instruction. We arbitrarily choose the first sflag parameter
+ // to hold the packed sflag IDs, and set the second sflag parameter to zero.
+ bool matchSyncBothIntrinsic(Instruction *I);
+ // Stream intrinsics with non-constant stream_control override immediates are
+ // expanded into a shift/or with the sflag operand and the stream_control
+ // override set to zero.
+ bool matchStreamIntrinsic(Instruction *I);
+ // Sub-match indirect/strided stream intrinsics. Assumes that the caller is
+ // matchStreamIntrinsic and already filtered for stream intrinsics.
+ bool matchIndirectOrStrideStreamIntrinsic(Instruction *I);
+ // Expands the source_sync_flag and relax_sync_override operands with a
+ // shift+or to pack both operands into a single value, as expected by the
+ // DMA general hardware instruction. Set both operands to have the same value
+ // at the end. Also handles shifting Spmem pointers to 32B word size, which is
+ // a special case for DMA.
+ bool matchGeneralDMA(Instruction *I);
+ // Handles shifting Spmem pointers to 32B word size, which is a special case
+ // for DMA.
+ bool matchSimpleDMA(Instruction *I);
+ // Handles 64-bit emulation of IO virtual host address pointers and offset.
+ // Also handles shifting Spmem pointers to 32B word size, which is a special
+ // case for DMA.
+ bool matchSimpleHostDMA(Instruction *I);
+ // Shifts a DMA pointer operand to normalize to HBM word size.
+ bool shiftDMASpmemPointer(IntrinsicInst *Intr, unsigned Idx);
+ // Emulates IO virtual host address pointer, offset arithmetic.
+ bool emulateDMAIovaPointer(IntrinsicInst *Intr, unsigned PtrIdx,
+ unsigned OffIdx);
+ // Expands addrspace cast intrinsics into LLVM addrspace instructions.
+ // TODO(b/192387730): Finalize sflag space conversions and corresponding
+ // pointer arithmetic.
+ bool matchAddrspaceCast(Instruction *I);
+ // Lowers the tpu.clear.ibuf intrinsic to an sfence followed by a
+ // task.clear_ibuf.
+ bool matchClearIbuf(Instruction *I);
+ // Match TPU clamp.
+ bool matchClamp(Instruction *I, const DataLayout &DL);
+ // Check for tpu.traps that always halt and simplify.
+ bool simplifyTrivialTraps(Function &F);
+ // Prepare global strings to be used for compiler injected event debug.
+ void prepareEventDebug(Function &F);
+ // Hacks the module and tries to convert all half vectors into bfloat vectors,
+ // intended to be used for clang experiments. This is a gross hack, use with
+ // extreme care, it can easily lead to invalid or wrong IR.
+ bool convertHalfToBfloat(Module &M, TPUTargetMachine *TM);
+ // Helper function, create a tpu.inttoptr intrinsic call.
+ Value *createIntToPtr(Module *M, IRBuilder<> &B, Value *V, Type *PtrTy);
+
+ TPUTargetMachine *TM = nullptr;
+ const TPUSubtarget *ST = nullptr;
+};
+char TPUCodeGenPrepare::ID = 0;
+
+bool IsSignBitKnownSame(KnownBits KB0, KnownBits KB1) {
+ bool IsNegative = KB0.isNegative() && KB1.isNegative();
+ bool IsNonNegative = KB0.isNonNegative() && KB1.isNonNegative();
+ return IsNegative || IsNonNegative;
+}
+} // namespace
+
+INITIALIZE_PASS(TPUCodeGenPrepare, DEBUG_TYPE, "TPU codegen prepare", false,
+ false)
+
+Pass *llvm::createTPUCodeGenPreparePass(TPUTargetMachine *TM) {
+ return new TPUCodeGenPrepare(TM);
+}
+
+Value *TPUCodeGenPrepare::createIntToPtr(Module *M, IRBuilder<> &B, Value *V,
+ Type *PtrTy) {
+ Function *IntToPtrFn =
+ Intrinsic::getDeclaration(M, llvm::Intrinsic::tpu_inttoptr, PtrTy);
+ return B.CreateCall(IntToPtrFn, V);
+}
+
+void TPUCodeGenPrepare::expandScVldVstNpIntrinsic(IntrinsicInst *Intr,
+ Intrinsic::ID ID_VLD,
+ Intrinsic::ID ID_VST,
+ const DataLayout &DL) {
+ IRBuilder<> Builder(Intr);
+ llvm::Type *LoadTy = Intr->getType();
+ int ElementSizeInBytes =
+ DL.getTypeAllocSize(cast<VectorType>(LoadTy)->getElementType());
+ llvm::Type *MaskTy =
+ VectorType::get(Type::getInt1Ty(Intr->getContext()),
+ ST->vectorSizeInElements(ElementSizeInBytes),
+ /*Scalable=*/false);
+ llvm::Type *BasePtrTy = Intr->getOperand(1)->getType();
+ Value *AllOnes = ConstantVector::getSplat(
+ ElementCount::getFixed(ST->vectorSizeInElements(ElementSizeInBytes)),
+ Constant::getAllOnesValue(Type::getInt1Ty(Intr->getContext())));
+ Instruction *VldIdx = Builder.CreateIntrinsic(
+ ID_VLD, {LoadTy, MaskTy, BasePtrTy},
+ {AllOnes, Intr->getOperand(1), Intr->getOperand(2)});
+ Intr->replaceAllUsesWith(VldIdx);
+ Instruction *VstIdxAdd =
+ Builder.CreateIntrinsic(ID_VST, {MaskTy, BasePtrTy, LoadTy},
+ {Intr->getOperand(0), Intr->getOperand(1),
+ Intr->getOperand(2), Intr->getOperand(3)});
+ VldIdx->copyMetadata(*Intr);
+ VstIdxAdd->copyMetadata(*Intr);
+ Intr->eraseFromParent();
+}
+
+bool TPUCodeGenPrepare::matchShiftPattern(Value *V) {
+ // If the value is an integer constant or a splat vector constant, then this
+ // lambda returns its value, otherwise it returns -1. We're only using this to
+ // read values != -1.
+ auto getIntValue = [](Value *V) -> int {
+ Constant *CV = dyn_cast<Constant>(V);
+ if (CV) {
+ if (!isa<ConstantInt>(CV))
+ CV = CV->getSplatValue();
+ CV = dyn_cast<ConstantInt>(CV);
+ }
+ return CV ? CV->getUniqueInteger().getZExtValue() : -1;
+ };
+ // We're providing this pattern matching as an optimization to turn
+ // the superfluous semantics into one shift instruction.
+ if (SelectInst *SELI = dyn_cast<SelectInst>(V)) {
+ Instruction *SHI = dyn_cast<Instruction>(SELI->getOperand(1));
+ if (!SHI)
+ return false;
+ if (SHI->getOpcode() != Instruction::Shl &&
+ SHI->getOpcode() != Instruction::LShr)
+ return false;
+ if (getIntValue(SELI->getOperand(2)) != 0)
+ return false;
+ ICmpInst *ICI = dyn_cast<ICmpInst>(SELI->getOperand(0));
+ if (!ICI)
+ return false;
+ if (ICI->getPredicate() == CmpInst::ICMP_ULT) {
+ if (getIntValue(ICI->getOperand(1)) != 32)
+ return false;
+ SELI->replaceAllUsesWith(SHI);
+ // Matches (rhs & 31) == rhs ? lhs << rhs : 0
+ // %1 = icmp ult i32 %rhs, 32
+ // %2 = shl i32 %lhs, %rhs
+ // %3 = select i1 %1, i32 %2, i32 0
+ return true;
+ }
+ } else if (Instruction *SHR = dyn_cast<Instruction>(V)) {
+ if (SHR->getOpcode() != Instruction::LShr)
+ return false;
+ SelectInst *SELI = dyn_cast<SelectInst>(SHR->getOperand(1));
+ if (!SELI)
+ return false;
+ if (getIntValue(SELI->getOperand(2)) != 31)
+ return false;
+ ICmpInst *ICI = dyn_cast<ICmpInst>(SELI->getOperand(0));
+ if (!ICI)
+ return false;
+ if (ICI->getPredicate() != CmpInst::ICMP_ULT)
+ return false;
+ unsigned CmpVal = getIntValue(ICI->getOperand(1));
+ if (CmpVal != 31)
+ return false;
+ // Matches lhs >> (rhs < 31 ? rhs : 31)
+ // %1 = icmp ult i32 %rhs, 31
+ // %2 = select i1 %1, i32 %rhs, i32 31
+ // %3 = lshr i32 %lhs, %2
+ IRBuilder<> B(SHR);
+ Value *ReplaceV = B.CreateAShr(SHR->getOperand(0), SELI->getOperand(1));
+ cast<Instruction>(ReplaceV)->copyMetadata(*SHR);
+ SHR->replaceAllUsesWith(ReplaceV);
+ return true;
+ }
+ return false;
+}
+
+bool TPUCodeGenPrepare::matchScVldVstNpIntrinsic(Value *V,
+ const DataLayout &DL) {
+ if (ForceSupportVldVstIdxAdd || (ST && ST->supportsVldVstIdxAdd()))
+ return false;
+ if (IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(V)) {
+ switch (Intr->getIntrinsicID()) {
+ default:
+ // Nothing to do.
+ break;
+ case Intrinsic::tpu_vst_msk_idx_ret_add_np:
+ expandScVldVstNpIntrinsic(Intr, Intrinsic::tpu_vld_msk_idx_np,
+ Intrinsic::tpu_vst_msk_idx_add_np, DL);
+ return true;
+ }
+ }
+ return false;
+}
+
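+// On SparseCore, rewrite the generic llvm.masked.load intrinsic into the
+// target-specific llvm.tpu.vld.msk form. Roughly (illustrative types, name
+// mangling elided):
+//   %v = call <8 x i32> @llvm.masked.load(...)(ptr %p, i32 %align,
+//                                              <8 x i1> %m, <8 x i32> %pass)
+//   ->
+//   %v = call <8 x i32> @llvm.tpu.vld.msk(...)(<8 x i1> %m, ptr %p)
+// The alignment and passthru operands are dropped.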
+bool TPUCodeGenPrepare::matchMaskedLoadIntrinsic(Value *V) {
+ if (ST && !ST->isSparseCore())
+ return false;
+ IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(V);
+ if (!Intr || Intr->getIntrinsicID() != Intrinsic::masked_load)
+ return false;
+ llvm::Type *MaskTy = VectorType::get(
+ Type::getInt1Ty(Intr->getContext()),
+ cast<VectorType>(Intr->getType())->getElementCount().getKnownMinValue(),
+ /*Scalable=*/false);
+ IRBuilder<> B(Intr);
+ // We're discarding the alignment operand as well as the passthru.
+ Value *L = B.CreateIntrinsic(
+ Intrinsic::tpu_vld_msk,
+ {Intr->getType(), MaskTy, Intr->getType()->getPointerTo(TPUAS_TileSpmem)},
+ {Intr->getOperand(2), Intr->getOperand(0)});
+ Intr->replaceAllUsesWith(L);
+ Intr->eraseFromParent();
+ return true;
+}
+
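+// Same idea for stores: on SparseCore, llvm.masked.store(%val, %ptr, %align,
+// %mask) is rewritten to llvm.tpu.vst.msk(%mask, %ptr, %val), dropping the
+// alignment operand.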
+bool TPUCodeGenPrepare::matchMaskedStoreIntrinsic(Value *V) {
+ if (ST && !ST->isSparseCore())
+ return false;
+ IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(V);
+ if (!Intr || Intr->getIntrinsicID() != Intrinsic::masked_store)
+ return false;
+ llvm::Type *ValTy = Intr->getOperand(0)->getType();
+ llvm::Type *MaskTy = VectorType::get(
+ Type::getInt1Ty(Intr->getContext()),
+ cast<VectorType>(ValTy)->getElementCount().getKnownMinValue(),
+ /*Scalable=*/false);
+ IRBuilder<> B(Intr);
+ B.CreateIntrinsic(
+ Intrinsic::tpu_vst_msk,
+ {MaskTy, Intr->getOperand(0)->getType()->getPointerTo(TPUAS_TileSpmem),
+ ValTy},
+ {Intr->getOperand(3), Intr->getOperand(1), Intr->getOperand(0)});
+ Intr->eraseFromParent();
+ return true;
+}
+
+// %a = insertelement <1024 x i32> undef, i32 %s1, i32 0
+// %b = insertelement <1024 x i32> undef, i32 %s2, i32 0
+// %c = shl <1024 x i32> %a, %b
+// ->
+// %c1 = shl i32 %s1, %s2
+// %c = insertelement <1024 x i32> undef, i32 %c1, i32 0
+bool TPUCodeGenPrepare::scalarizeBinOp(Instruction &I) {
+ Value *ScalarSrc[2] = {nullptr, nullptr};
+ BinaryOperator *Op = dyn_cast<BinaryOperator>(&I);
+ if (Op == nullptr)
+ return false;
+ for (unsigned Idx = 0; Idx < 2; Idx++) {
+ Value *V;
+ if (match(Op->getOperand(Idx),
+ m_InsertElt(m_Undef(), m_Value(V), m_Zero()))) {
+ ScalarSrc[Idx] = V;
+ } else if (ConstantDataVector *C =
+ dyn_cast<ConstantDataVector>(Op->getOperand(Idx))) {
+ if (!isa<ConstantDataVector>(Op->getOperand(1 - Idx)))
+ ScalarSrc[Idx] = C->getElementAsConstant(0);
+ }
+ }
+ if (ScalarSrc[0] == nullptr || ScalarSrc[1] == nullptr)
+ return false;
+ IRBuilder<> B(&I);
+ Value *ScalarI = B.CreateBinOp(Op->getOpcode(), ScalarSrc[0], ScalarSrc[1]);
+ Value *VectorI =
+ B.CreateInsertElement(UndefValue::get(I.getType()), ScalarI, uint64_t(0));
+ I.replaceAllUsesWith(VectorI);
+ return true;
+}
+
+// %splatx = insertelement <1024 x i32> undef, i32 %x, i32 0
+// %vm = icmp eq <1024 x i32> %splatx, zeroinitializer
+// ->
+// %p = icmp eq i32 %x, 0
+// %vm = insertelement <1024 x i1> undef, i1 %p, i32 0
+bool TPUCodeGenPrepare::scalarizeCmp(Instruction &I) {
+ Value *ScalarSrc[2] = {nullptr, nullptr};
+ CmpInst *Cmp = dyn_cast<CmpInst>(&I);
+ if (Cmp == nullptr)
+ return false;
+ for (unsigned Idx = 0; Idx < 2; Idx++) {
+ Value *V;
+ if (match(Cmp->getOperand(Idx),
+ m_InsertElt(m_Undef(), m_Value(V), m_Zero()))) {
+ ScalarSrc[Idx] = V;
+ } else if (ConstantDataVector *C =
+ dyn_cast<ConstantDataVector>(Cmp->getOperand(Idx))) {
+ ScalarSrc[Idx] = C->getElementAsConstant(0);
+ } else if (ConstantAggregateZero *C =
+ dyn_cast<ConstantAggregateZero>(Cmp->getOperand(Idx))) {
+ ScalarSrc[Idx] = C->getElementValue(unsigned(0));
+ }
+ }
+ if (ScalarSrc[0] == nullptr || ScalarSrc[1] == nullptr)
+ return false;
+ IRBuilder<> B(&I);
+ Value *ScalarI = nullptr;
+ if (isa<FCmpInst>(Cmp))
+ ScalarI = B.CreateFCmp(Cmp->getPredicate(), ScalarSrc[0], ScalarSrc[1]);
+ else
+ ScalarI = B.CreateICmp(Cmp->getPredicate(), ScalarSrc[0], ScalarSrc[1]);
+ Value *VectorI =
+ B.CreateInsertElement(UndefValue::get(I.getType()), ScalarI, uint64_t(0));
+ I.replaceAllUsesWith(VectorI);
+ I.eraseFromParent();
+ return true;
+}
+
+// %vm = insertelement <1024 x i1> undef, i1 %x, i32 0
+// %vmsplat = shufflevector <1024 x i1> %vm, <1024 x i1> undef, <1024 x i32> zeroinitializer
+// %res = select <1024 x i1> %vmsplat, <1024 x float> 0, <1024 x float> %y
+// ->
+// %res = select i1 %x, <1024 x float> 0, <1024 x float> %y
+bool TPUCodeGenPrepare::scalarizeSelect(Instruction &I) {
+ SelectInst *Sel = dyn_cast<SelectInst>(&I);
+ if (Sel == nullptr)
+ return false;
+ Value *ScalarCond;
+ if (!match(Sel->getCondition(),
+ m_Shuffle(m_InsertElt(m_Undef(), m_Value(ScalarCond), m_Zero()),
+ m_Undef(), m_ZeroMask()))) {
+ return false;
+ }
+ IRBuilder<> B(&I);
+ Value *NewSel =
+ B.CreateSelect(ScalarCond, Sel->getTrueValue(), Sel->getFalseValue());
+ I.replaceAllUsesWith(NewSel);
+ I.eraseFromParent();
+ return true;
+}
+
+// Merge consecutive wait.ge followed by syncadd
+// call void @llvm.tpu.waitge(i32 addrspace(204)* %flag, i32 16)
+// call void @llvm.tpu.syncadd(i32 addrspace(204)* %flag, i32 -16)
+// call void @llvm.tpu.waitge(i32 addrspace(204)* %flag, i32 16)
+// call void @llvm.tpu.syncadd(i32 addrspace(204)* %flag, i32 -16)
+// call void @llvm.tpu.waitge(i32 addrspace(204)* %flag, i32 200)
+// call void @llvm.tpu.syncadd(i32 addrspace(204)* %flag, i32 -200)
+// ->
+// call void @llvm.tpu.waitge(i32 addrspace(204)* %flag, i32 232)
+// call void @llvm.tpu.syncadd(i32 addrspace(204)* %flag, i32 -232)
+//
+// TODO(sdasgup): Apply the same opt for the yieldable variants of vwait?
+bool TPUCodeGenPrepare::matchDMADoneOpt(Instruction *Inst,
+ BasicBlock::iterator &It) {
+ auto MatchDMADone = [&](IntrinsicInst *&WaitIntr, IntrinsicInst *&Sync) {
+ WaitIntr = dyn_cast<IntrinsicInst>(Inst);
+ if (WaitIntr == nullptr ||
+ WaitIntr->getIntrinsicID() != Intrinsic::tpu_waitge)
+ return false;
+ Sync = dyn_cast<IntrinsicInst>(Inst->getNextNode());
+ if (Sync == nullptr || Sync->getIntrinsicID() != Intrinsic::tpu_syncadd ||
+ WaitIntr->getOperand(0) != Sync->getOperand(0))
+ return false;
+ const APInt *C1 = nullptr, *C2 = nullptr;
+ if (!match(WaitIntr->getOperand(1), m_APInt(C1)) ||
+ !match(Sync->getOperand(1), m_APInt(C2)) || -(*C1) != *C2)
+ return false;
+ return true;
+ };
+ IntrinsicInst *FirstWaitIntr = nullptr;
+ IntrinsicInst *FirstSync = nullptr;
+ if (!MatchDMADone(FirstWaitIntr, FirstSync))
+ return false;
+ Inst = Inst->getNextNode()->getNextNode();
+ IRBuilder<> Builder(FirstWaitIntr);
+ while (1) {
+ IntrinsicInst *NextWaitIntr = nullptr;
+ IntrinsicInst *NextSync = nullptr;
+ if (!MatchDMADone(NextWaitIntr, NextSync))
+ break;
+ if (FirstWaitIntr->getOperand(0) != NextWaitIntr->getOperand(0) ||
+ FirstWaitIntr->getOperand(0) != NextSync->getOperand(0))
+ break;
+ Inst = Inst->getNextNode()->getNextNode();
+ FirstWaitIntr->setOperand(1,
+ Builder.CreateAdd(FirstWaitIntr->getOperand(1),
+ NextWaitIntr->getOperand(1)));
+ FirstSync->setOperand(1, Builder.CreateAdd(FirstSync->getOperand(1),
+ NextSync->getOperand(1)));
+ NextWaitIntr->eraseFromParent();
+ NextSync->eraseFromParent();
+ }
+ // Move the iterator past instructions we already processed.
+ It = Inst->getIterator();
+ return true;
+}
+
+// Transform :
+// %gep0 = getelementptr i8, i8 addrspace(205)* getelementptr (i8, i8
+// addrspace(205)* null, i32 8768), i32 %base
+// %gep = getelementptr i8, i8 addrspace(205)* %gep0, i32 512
+// ->
+// %gep = getelementptr i8, i8 addrspace(205)* %base, i32 9280
+//
+// In general this gets folded during DAG optimization but it may not happen
+// if the instructions are in different blocks.
+bool TPUCodeGenPrepare::reassociateGEP(Instruction &I) {
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
+ // Only process GEP of i8, this should be run after GEP lowering so we expect
+ // most GEP to be using i8*.
+ if (GEP == nullptr || !GEP->getResultElementType()->isIntegerTy(8))
+ return false;
+ ConstantInt *Offset = dyn_cast<ConstantInt>(I.getOperand(1));
+ if (Offset == nullptr)
+ return false;
+ Value *Ptr = GEP->getPointerOperand();
+ // skip bitcasts
+ while (isa<BitCastInst>(Ptr))
+ Ptr = cast<BitCastInst>(Ptr)->getOperand(0);
+ GetElementPtrInst *ParentGEP = dyn_cast<GetElementPtrInst>(Ptr);
+ if (ParentGEP == nullptr ||
+ !ParentGEP->getResultElementType()->isIntegerTy(8))
+ return false;
+ Constant *LiteralOffset = dyn_cast<Constant>(ParentGEP->getOperand(1));
+ Constant *LiteralPtr = dyn_cast<Constant>(ParentGEP->getPointerOperand());
+ IRBuilder<> Builder(&I);
+ Value *NewGEP = nullptr;
+ if (LiteralPtr != nullptr && LiteralOffset == nullptr) {
+ Value *NewBase =
+ createIntToPtr(I.getModule(), Builder, ParentGEP->getOperand(1),
+ GEP->getPointerOperandType());
+ Value *AddedOffset = Builder.CreateAdd(
+ Builder.CreatePtrToInt(LiteralPtr, Offset->getType()), Offset);
+ assert(cast<PointerType>(NewBase->getType())
+ ->isOpaqueOrPointeeTypeMatches(
+ Type::getInt8Ty(NewBase->getContext())));
+ NewGEP = Builder.CreateGEP(Builder.getInt8Ty(), NewBase, AddedOffset);
+ } else if (LiteralPtr == nullptr && LiteralOffset != nullptr) {
+ Value *AddedOffset = Builder.CreateAdd(LiteralOffset, Offset);
+ NewGEP = Builder.CreateGEP(Builder.getInt8Ty(),
+ ParentGEP->getPointerOperand(), AddedOffset);
+ } else {
+ return false;
+ }
+ I.replaceAllUsesWith(NewGEP);
+ I.eraseFromParent();
+ return true;
+}
+
+// Move GEPs into the same BB as the ld/store when we know DAG selection will
+// match them into one instruction.
+bool TPUCodeGenPrepare::sinkGEP(Instruction &I) {
+ std::optional<unsigned> OpIdx;
+ if (auto *L = dyn_cast<LoadInst>(&I))
+ OpIdx = L->getPointerOperandIndex();
+ else if (auto *S = dyn_cast<StoreInst>(&I))
+ OpIdx = S->getPointerOperandIndex();
+ else if (auto *Intr = dyn_cast<IntrinsicInst>(&I)) {
+ switch (Intr->getIntrinsicID()) {
+ case Intrinsic::tpu_vld_shuffle:
+ case Intrinsic::tpu_vld_strided:
+ case Intrinsic::tpu_vld_indexed:
+ case Intrinsic::tpu_vld_replicate_evenodd_sublanes:
+ OpIdx = 0;
+ break;
+ case Intrinsic::tpu_vst_strided:
+ case Intrinsic::tpu_vst_indexed:
+ case Intrinsic::tpu_vst_evenodd_sublanes:
+ OpIdx = 1;
+ break;
+ default:
+ return false;
+ }
+ }
+ if (!OpIdx.has_value())
+ return false;
+ Value *Pointer = I.getOperand(OpIdx.value());
+ // skip bitcasts
+ while (isa<BitCastInst>(Pointer))
+ Pointer = cast<BitCastInst>(Pointer)->getOperand(0);
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Pointer);
+ // Only consider cases where this is a GEP from a different block.
+ if (GEP == nullptr || GEP->getParent() == I.getParent() ||
+ GEP->getNumOperands() > 2)
+ return false;
+ if (!isa<Constant>(GEP->getOperand(0)) && !isa<Constant>(GEP->getOperand(1)))
+ return false;
+ auto *NewGEP = GEP->clone();
+ NewGEP->insertBefore(&I);
+ IRBuilder<> B(&I);
+ Value *NewPointer =
+ B.CreateBitCast(NewGEP, I.getOperand(OpIdx.value())->getType());
+ I.setOperand(OpIdx.value(), NewPointer);
+ return true;
+}
+
+bool TPUCodeGenPrepare::matchIndirectOrStrideStreamIntrinsic(Instruction *I) {
+ // Assumes we're only called by matchStreamIntrinsic.
+ IntrinsicInst *Intr = cast<IntrinsicInst>(I);
+ Intrinsic::ID Id = Intr->getIntrinsicID();
+ StringRef IName = Intrinsic::isOverloaded(Id) ? Intrinsic::getBaseName(Id)
+ : Intrinsic::getName(Id);
+ // Again here, unfortunately we're falling back to string operations in order
+ // to avoid handling hundreds of syntactically different stream intrinsic
+ // opcodes. We're filtering the hbm4b off_tile_type indirect and strided
+ // types, excluding the indirect.vreg types.
+ if (IName.take_front(29) == "llvm.tpu.stream.indirect.vreg" ||
+ (IName.take_front(24) != "llvm.tpu.stream.indirect" &&
+ IName.take_front(23) != "llvm.tpu.stream.strided"))
+ return false;
+ // The prefix "llvm.tpu.stream.strided.gather." ends at position 31, hence
+ // the search offset below.
+ if (IName.find("hbm4b", 31) == StringRef::npos)
+ return false;
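+ // Pack the hbm4b offset into the upper bits of the list-size/length-per-
+ // stride operand, i.e. combined = (hbm4b_off << 18) | list_size_or_length,
+ // and mirror the combined value into both operands as done below.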
+ IRBuilder<> B(I);
+ constexpr unsigned IndirectListSizeOrLengthPerStrideOp = 6;
+ constexpr unsigned Hbm4bOffOp = 7;
+ Value *Hbm4bOff = B.CreateShl(Intr->getOperand(Hbm4bOffOp), 18);
+ Value *CombinedVS1 = B.CreateOr(
+ Hbm4bOff, Intr->getOperand(IndirectListSizeOrLengthPerStrideOp));
+ Intr->setOperand(IndirectListSizeOrLengthPerStrideOp, CombinedVS1);
+ Intr->setOperand(Hbm4bOffOp, CombinedVS1);
+ return true;
+}
+
+bool TPUCodeGenPrepare::matchSyncRemoteIntrinsic(Instruction *I) {
+ IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(I);
+ if (!Intr)
+ return false;
+
+ bool SetsDoneBit = false;
+ switch (Intr->getIntrinsicID()) {
+ default:
+ return false;
+ case Intrinsic::tpu_syncadd_remote:
+ case Intrinsic::tpu_syncset_remote:
+ break;
+ case Intrinsic::tpu_syncadd_remote_done:
+ case Intrinsic::tpu_syncadd_remote_doneinv:
+ case Intrinsic::tpu_syncset_remote_done:
+ case Intrinsic::tpu_syncset_remote_doneinv:
+ SetsDoneBit = true;
+ break;
+ }
+
+ unsigned TargetAddressIdx = 0;
+ unsigned ChipIdIdx = 2;
+ unsigned CoreIdIdx = 3;
+ unsigned DoneBitIdx = 4;
+
+ IRBuilder<> B(I);
+ Value *SflagAny = Intr->getOperand(TargetAddressIdx);
+ Value *ChipId = Intr->getOperand(ChipIdIdx);
+ Value *CoreId = Intr->getOperand(CoreIdIdx);
+ Type *I32Type = Type::getInt32Ty(I->getContext());
+ Type *SflagPtrTy = SflagAny->getType();
+ assert(SflagPtrTy->isPointerTy());
+
+ // The target address operand has a number of operands packed into it. For
+ // details on the mapping of these bits, reference the table in
+ // go/vxc-isa#atomic-remote-set-add.
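+ // In effect: target = sflag_addr | (core_id << 14) | (chip_id << 17)
+ // [ | (done_bit << 31) ], matching the shifts below.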
+ Value *CombinedTarget = B.CreatePtrToInt(SflagAny, I32Type);
+ CombinedTarget = B.CreateOr(CombinedTarget, B.CreateShl(CoreId, 14));
+ CombinedTarget = B.CreateOr(CombinedTarget, B.CreateShl(ChipId, 17));
+ Intr->setOperand(CoreIdIdx, B.getInt32(0));
+ Intr->setOperand(ChipIdIdx, B.getInt32(0));
+ if (SetsDoneBit) {
+ Value *DoneBit = Intr->getOperand(DoneBitIdx);
+ CombinedTarget = B.CreateOr(CombinedTarget, B.CreateShl(DoneBit, 31));
+ Intr->setOperand(DoneBitIdx, B.getInt32(0));
+ }
+ CombinedTarget =
+ createIntToPtr(I->getModule(), B, CombinedTarget, SflagPtrTy);
+ Intr->setOperand(TargetAddressIdx, CombinedTarget);
+ return true;
+}
+
+bool TPUCodeGenPrepare::matchSyncBothIntrinsic(Instruction *I) {
+ IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(I);
+ if (!Intr)
+ return false;
+
+ Intrinsic::ID Id = Intr->getIntrinsicID();
+ if (Id != Intrinsic::tpu_syncadd_both && Id != Intrinsic::tpu_syncset_both &&
+ Id != Intrinsic::tpu_syncset_both_done) {
+ return false;
+ }
+
+ IRBuilder<> B(I);
+ Value *Sflag = Intr->getOperand(0);
+ Value *SflagOther = Intr->getOperand(1);
+ Type *SflagPtrTy = Sflag->getType();
+ Type *SflagOtherPtrTy = SflagOther->getType();
+ Type *I32Type = Type::getInt32Ty(I->getContext());
+
+ assert(SflagPtrTy->isPointerTy());
+ assert(SflagOtherPtrTy->isPointerTy());
+
+ Sflag = B.CreatePtrToInt(Sflag, I32Type);
+ SflagOther = B.CreatePtrToInt(SflagOther, I32Type);
+ Value *SflagBoth = B.CreateOr(Sflag, B.CreateShl(SflagOther, 16));
+ // Technically it's now a "pointer" to both sflag and sflag_other, but we have
+ // to choose just one address space
+ SflagBoth = createIntToPtr(I->getModule(), B, SflagBoth, SflagPtrTy);
+ Intr->setOperand(0, SflagBoth);
+ SflagOther =
+ createIntToPtr(I->getModule(), B, B.getInt32(0), SflagOtherPtrTy);
+ Intr->setOperand(1, SflagOther);
+ return true;
+}
+
+bool TPUCodeGenPrepare::matchStreamIntrinsic(Instruction *I) {
+ if (IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(I)) {
+ Intrinsic::ID Id = Intr->getIntrinsicID();
+ StringRef IName = Intrinsic::isOverloaded(Id) ? Intrinsic::getBaseName(Id)
+ : Intrinsic::getName(Id);
+ // As an exception in order to avoid matching hundreds of intrinsic ids,
+ // we're matching the stream intrinsic string here.
+ if (IName.take_front(15) != "llvm.tpu.stream")
+ return false;
+
+    // Optionally match indirect/strided stream intrinsics.
+ matchIndirectOrStrideStreamIntrinsic(I);
+
+ // In a nutshell, how stream control overrides work in our system:
+ //
+ // 1) MLO needs to set it (dynamic or immediate) as the intrinsic operand,
+ // starting at bit 0, as in go/vxc-sc-isa#stream_control left column.
+ // 2a) Iff the override is not a constant, LLVM OR's it with the sflag value
+ // (that's actually an OR instruction in the code).
+    // 2b) Iff the override is a constant, LLVM does not OR and leaves the
+    //     operand unchanged.
+    // 3) Hardware interprets those overrides as XOR.
+    // 4) In case of 2b), because of 3), the ISA emitter XORs the mnemonic-based
+    //    stream controls and the override constant bits.
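+    // For example (hypothetical values): a dynamic override of 0b101 shifted
+    // to bit 5 turns a sync flag address of 0x40 into 0x40 | (5 << 5) = 0xe0,
+    // and the override operand is then cleared to 0.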
+
+ constexpr unsigned SflagStreamControlOp = 0;
+ constexpr unsigned StreamControlOvrdOp = 1;
+ if (isa<ConstantInt>(Intr->getOperand(StreamControlOvrdOp)))
+ return false;
+ IRBuilder<> B(I);
+ Value *SflagStreamControl = Intr->getOperand(SflagStreamControlOp);
+ Value *StreamControlOvrd =
+ B.CreateShl(Intr->getOperand(StreamControlOvrdOp), 5);
+ Type *SflagPtrTy = SflagStreamControl->getType();
+ assert(SflagPtrTy->isPointerTy());
+ SflagStreamControl =
+ B.CreatePtrToInt(SflagStreamControl, Type::getInt32Ty(I->getContext()));
+ SflagStreamControl = B.CreateOr(SflagStreamControl, StreamControlOvrd);
+ SflagStreamControl =
+ createIntToPtr(I->getModule(), B, SflagStreamControl, SflagPtrTy);
+ Intr->setOperand(SflagStreamControlOp, SflagStreamControl);
+ Intr->setOperand(StreamControlOvrdOp, B.getInt32(0));
+ return true;
+ }
+ return false;
+}
+
+bool TPUCodeGenPrepare::shiftDMASpmemPointer(IntrinsicInst *Intr,
+ unsigned Idx) {
+ Value *AddrOp = Intr->getOperand(Idx);
+ Type *PtrTy = Intr->getOperand(Idx)->getType();
+ int AS = PtrTy->getPointerAddressSpace();
+ if (AS != TPUAS_Spmem)
+ return false;
+
+ unsigned HbmShiftSize = ST->getHbmWordSizeLog2();
+ unsigned ShiftSize = TPU::getShiftSize(AS, *ST);
+ assert(HbmShiftSize > ShiftSize);
+ unsigned LShr = HbmShiftSize - ShiftSize;
+ if (ConstantInt *C = dyn_cast<ConstantInt>(AddrOp)) {
+    if (C->getZExtValue() & ((1 << LShr) - 1)) {
+ report_fatal_error(
+ "Index not suitable for word size, would get dropped.\n");
+ }
+ }
+
+ IRBuilder<> B(Intr);
+ Type *I32Type = Type::getInt32Ty(Intr->getContext());
+ AddrOp = B.CreatePtrToInt(AddrOp, I32Type);
+ Value *ShiftedAddr = B.CreateLShr(AddrOp, B.getInt32(LShr));
+ AddrOp = createIntToPtr(Intr->getModule(), B, ShiftedAddr, PtrTy);
+ Intr->setOperand(Idx, AddrOp);
+ return true;
+}
+
+bool TPUCodeGenPrepare::matchSimpleDMA(Instruction *I) {
+ IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(I);
+ if (!Intr)
+ return false;
+
+ Intrinsic::ID Id = Intr->getIntrinsicID();
+ if (Id != Intrinsic::tpu_dma_hbm_to_smem_sc_simple &&
+ Id != Intrinsic::tpu_dma_hbm_to_simem_sc_simple &&
+ Id != Intrinsic::tpu_dma_hbm_to_timem_sc_simple &&
+ Id != Intrinsic::tpu_dma_hbm_to_spmem_sc_simple &&
+ Id != Intrinsic::tpu_dma_hbm_to_hbm_sc_simple &&
+ Id != Intrinsic::tpu_dma_smem_to_hbm_sc_simple &&
+ Id != Intrinsic::tpu_dma_timem_to_hbm_sc_simple &&
+ Id != Intrinsic::tpu_dma_spmem_to_hbm_sc_simple &&
+ Id != Intrinsic::tpu_dma_spmem_to_spmem_sc_simple) {
+ return false;
+ }
+
+ constexpr unsigned SrcAddrIdx = 1;
+ constexpr unsigned DstAddrIdx = 2;
+ shiftDMASpmemPointer(Intr, SrcAddrIdx);
+ shiftDMASpmemPointer(Intr, DstAddrIdx);
+
+ return true;
+}
+
+Value *pToS32(IRBuilder<> &B, Value *Val) {
+ assert(Val->getType() == B.getInt1Ty());
+ return B.CreateSelect(Val, B.getInt32(1), B.getInt32(0));
+}
+
+std::pair<Value *, Value *> addWithCarry(IRBuilder<> &B, Value *ValA,
+ Value *ValB) {
+ assert(ValA->getType() == B.getInt32Ty());
+ assert(ValB->getType() == B.getInt32Ty());
+ Value *Sum = B.CreateAdd(ValA, ValB);
+ Value *Carry = pToS32(B, B.CreateIntrinsic(Intrinsic::tpu_addcarry,
+ B.getInt1Ty(), {ValA, ValB}));
+ return {Sum, Carry};
+}
+
+bool TPUCodeGenPrepare::emulateDMAIovaPointer(IntrinsicInst *Intr,
+ unsigned PtrIdx,
+ unsigned OffIdx) {
+ Type *PtrTy = Intr->getOperand(PtrIdx)->getType();
+ int AS = PtrTy->getPointerAddressSpace();
+ if (AS != TPUAS_Iova)
+ return false;
+
+ IRBuilder<> B(Intr);
+ Type *I32Type = Type::getInt32Ty(Intr->getContext());
+ Value *OffsetOp = Intr->getOperand(OffIdx);
+ Value *AddrOp = Intr->getOperand(PtrIdx);
+ AddrOp = B.CreatePtrToInt(AddrOp, I32Type);
+
+ // Shift pointers to account for the emulated word size, yielding a 64-bit
+ // value (i.e. two 32-bit values)
+ int ShiftSize = TPU::getShiftSize(AS, *ST);
+ Value *AddrHi = B.CreateAShr(AddrOp, B.getInt32(32 - ShiftSize));
+ Value *AddrLo = B.CreateShl(AddrOp, B.getInt32(ShiftSize));
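+  // For example, with a hypothetical ShiftSize of 2 (4-byte words), word
+  // address 0x40000001 yields hi = 0x40000001 >> 30 = 0x1 and
+  // lo = 0x40000001 << 2 = 0x4, i.e. the 64-bit byte address 0x100000004.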
+
+ Value *IovaLo = AddrLo;
+ Value *IovaHi = AddrHi;
+
+ // Add offset, if needed
+ if (!isa<ConstantInt>(OffsetOp) ||
+ cast<ConstantInt>(OffsetOp)->getZExtValue() != 0) {
+ auto [Sum, Carry] = addWithCarry(B, AddrLo, OffsetOp);
+ IovaLo = Sum;
+ IovaHi = B.CreateAdd(IovaHi, Carry);
+ }
+
+ Intr->setOperand(PtrIdx, createIntToPtr(Intr->getModule(), B, IovaLo, PtrTy));
+ // IOVA high stays as a value.
+ Intr->setOperand(OffIdx, IovaHi);
+ return true;
+}
+
+bool TPUCodeGenPrepare::matchSimpleHostDMA(Instruction *I) {
+ IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(I);
+ if (!Intr)
+ return false;
+
+ Intrinsic::ID Id = Intr->getIntrinsicID();
+ if (Id != Intrinsic::tpu_dma_iova_to_hbm_sc_simple &&
+ Id != Intrinsic::tpu_dma_hbm_to_iova_sc_simple) {
+ return false;
+ }
+
+ constexpr unsigned SrcAddrIdx = 1;
+ constexpr unsigned DstAddrIdx = 2;
+ shiftDMASpmemPointer(Intr, SrcAddrIdx);
+ shiftDMASpmemPointer(Intr, DstAddrIdx);
+
+ constexpr unsigned OffsetIdx = 3;
+ emulateDMAIovaPointer(Intr, SrcAddrIdx, OffsetIdx);
+ emulateDMAIovaPointer(Intr, DstAddrIdx, OffsetIdx);
+
+ return true;
+}
+
+bool TPUCodeGenPrepare::matchGeneralDMA(Instruction *I) {
+ IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(I);
+ if (!Intr)
+ return false;
+
+ Intrinsic::ID Id = Intr->getIntrinsicID();
+ if (Id != Intrinsic::tpu_dma_hbm_to_hbm_sc_general &&
+ Id != Intrinsic::tpu_dma_smem_to_smem_sc_general &&
+ Id != Intrinsic::tpu_dma_hbm_to_smem_sc_general &&
+ Id != Intrinsic::tpu_dma_hbm_to_timem_sc_general &&
+ Id != Intrinsic::tpu_dma_hbm_to_spmem_sc_general &&
+ Id != Intrinsic::tpu_dma_smem_to_hbm_sc_general &&
+ Id != Intrinsic::tpu_dma_timem_to_hbm_sc_general &&
+ Id != Intrinsic::tpu_dma_spmem_to_hbm_sc_general &&
+ Id != Intrinsic::tpu_dma_spmem_to_spmem_sc_general) {
+ return false;
+ }
+
+ constexpr unsigned SrcSflagIdx = 5;
+ constexpr unsigned Override2Idx = 10;
+ constexpr unsigned SrcCoreIdIdx = 6;
+ Value *SrcSflag = Intr->getOperand(SrcSflagIdx);
+ Type *SrcSflagPtrTy = SrcSflag->getType();
+ Value *Override2 = Intr->getOperand(Override2Idx);
+ Value *SrcCoreId = Intr->getOperand(SrcCoreIdIdx);
+
+ // Shift the relax_sync_override operand by 16 and OR with source sync flag to
+ // form combined operands.
+ IRBuilder<> B(I);
+ Type *I32Type = Type::getInt32Ty(I->getContext());
+ SrcSflag = B.CreatePtrToInt(SrcSflag, I32Type);
+ // We're OR'ing the source sync flag core id at [15:13]. We do not check or
+ // mask the bits, see go/vxc-isa#stride_pseudo_code.
+ constexpr unsigned CoreIdShift = 13;
+ SrcSflag = B.CreateOr(B.CreateShl(SrcCoreId, CoreIdShift), SrcSflag);
+ // We're OR'ing the second override operand that represents bits 16 and up.
+ constexpr unsigned SyncModeShift = 16;
+ Value *CombinedVS0 =
+ B.CreateOr(B.CreateShl(Override2, SyncModeShift), SrcSflag);
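+  // For example (hypothetical values): src sflag address 0x100, src core id 3
+  // and a second override of 0x5 combine to
+  // (0x5 << 16) | (3 << 13) | 0x100 = 0x56100.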
+
+ // Set both operands to the same value as determined by the hardware DMA
+ // instruction.
+ Intr->setOperand(SrcSflagIdx, createIntToPtr(I->getModule(), B, CombinedVS0,
+ SrcSflagPtrTy));
+ Intr->setOperand(Override2Idx, CombinedVS0);
+
+ // Now we're OR'ing the destination sync flag core id.
+ constexpr unsigned DstSflagIdx = 0;
+ constexpr unsigned DstCoreIdIdx = 1;
+ Value *DstSflag = Intr->getOperand(DstSflagIdx);
+ Type *DstSflagPtrTy = DstSflag->getType();
+ Value *DstCoreId = Intr->getOperand(DstCoreIdIdx);
+ DstSflag = B.CreatePtrToInt(DstSflag, I32Type);
+ DstSflag = B.CreateOr(B.CreateShl(DstCoreId, 13), DstSflag);
+ Intr->setOperand(DstSflagIdx,
+ createIntToPtr(I->getModule(), B, DstSflag, DstSflagPtrTy));
+
+  // Set the core id operands to zero for lowering.
+ Intr->setOperand(SrcCoreIdIdx, B.getInt32(0));
+ Intr->setOperand(DstCoreIdIdx, B.getInt32(0));
+
+ constexpr unsigned SrcAddrIdx = 2;
+ constexpr unsigned DstAddrIdx = 3;
+ shiftDMASpmemPointer(Intr, SrcAddrIdx);
+ shiftDMASpmemPointer(Intr, DstAddrIdx);
+
+ return true;
+}
+
+bool TPUCodeGenPrepare::matchAddrspaceCast(Instruction *I) {
+ IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(I);
+ if (!Intr)
+ return false;
+ if (Intr->getIntrinsicID() == Intrinsic::tpu_addrspacecast) {
+ unsigned DstAS = Intr->getType()->getPointerAddressSpace();
+ unsigned SrcAS = Intr->getOperand(0)->getType()->getPointerAddressSpace();
+ if ((SrcAS == TPUAS_Smem && DstAS == TPUAS_SmemAny) ||
+ (SrcAS == TPUAS_Hbm && DstAS == TPUAS_HbmAny)) {
+ IRBuilder<> B(I);
+ Value *PlainCast =
+ B.CreateAddrSpaceCast(Intr->getOperand(0), Intr->getType());
+ Intr->replaceAllUsesWith(PlainCast);
+ return true;
+ }
+ }
+ // The following translation addresses could be part of TPUSubtarget, but
+  // since there is no good way to generalize this, we hardcode them here.
+ // See go/vxc-sc-isa#sync-flag-address-space.
+ if (Intr->getIntrinsicID() == Intrinsic::tpu_addrspacecast_tc) {
+ assert(Intr->getOperand(0)->getType()->getPointerAddressSpace() ==
+ TPUAS_Sflag);
+ IRBuilder<> B(I);
+ Value *PlainCast =
+ B.CreateAddrSpaceCast(Intr->getOperand(0), Intr->getType());
+ // No address offset for tc translation.
+ Intr->replaceAllUsesWith(PlainCast);
+ return true;
+ }
+
+ bool IsSCS = false;
+ bool IsTAC = false;
+
+ // If true, we will use ssync*.tile encoding:
+ // go/vxc-sc-isa#atomic-tile-set-add
+ // Otherwise we use the message address space encoding:
+ // go/vxc-mem#memories-in-the-message-address-space-ma
+ bool UseTileEncoding = false;
+
+ switch (Intr->getIntrinsicID()) {
+ case Intrinsic::tpu_addrspacecast_scs:
+ IsSCS = true;
+ break;
+ case Intrinsic::tpu_addrspacecast_tec:
+ break;
+ case Intrinsic::tpu_addrspacecast_tac:
+ IsTAC = true;
+ break;
+ case Intrinsic::tpu_addrspacecast_tile_scs:
+ UseTileEncoding = true;
+ IsSCS = true;
+ break;
+ case Intrinsic::tpu_addrspacecast_tile_tec:
+ UseTileEncoding = true;
+ break;
+ case Intrinsic::tpu_addrspacecast_tile_tac:
+ UseTileEncoding = true;
+ IsTAC = true;
+ break;
+ default:
+ return false;
+ }
+
+ int StartOffset = 0x1c00;
+ int TileMultiplier = 0x40;
+ if (IsSCS || UseTileEncoding) {
+ StartOffset = 0x0;
+ TileMultiplier = 0x10000;
+ }
+ if (IsTAC) {
+ StartOffset += 0x20;
+ }
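+  // For example, a hypothetical tile id of 2 maps to offset
+  // 0x1c00 + 2 * 0x40 = 0x1c80 in the message address space encoding, or to
+  // 0x0 + 2 * 0x10000 = 0x20000 (plus 0x20 for TAC) when the tile encoding or
+  // SCS path is taken.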
+
+ assert(Intr->getOperand(0)->getType()->getPointerAddressSpace() ==
+ TPUAS_Sflag);
+ IRBuilder<> B(I);
+
+ Value *TileId;
+ if (!IsSCS) {
+ TileId = Intr->getOperand(1);
+ } else if (UseTileEncoding) {
+ TileId = B.getInt32(16);
+ } else {
+ TileId = B.getInt32(0);
+ }
+
+ Value *PlainCast =
+ B.CreateAddrSpaceCast(Intr->getOperand(0), Intr->getType());
+ Value *OffsetV;
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(TileId)) {
+ int Offset = StartOffset + CI->getZExtValue() * TileMultiplier;
+ OffsetV = B.getInt32(Offset);
+ } else {
+ OffsetV = B.CreateMul(TileId, B.getInt32(TileMultiplier));
+ OffsetV = B.CreateAdd(OffsetV, B.getInt32(StartOffset));
+ }
+ Function *PtrToIntFn = Intrinsic::getDeclaration(
+ Intr->getModule(), llvm::Intrinsic::tpu_ptrtoint, PlainCast->getType());
+ Value *TransPtr = B.CreateCall(PtrToIntFn, PlainCast);
+ TransPtr = B.CreateAdd(TransPtr, OffsetV);
+ TransPtr =
+ createIntToPtr(Intr->getModule(), B, TransPtr, PlainCast->getType());
+ Intr->replaceAllUsesWith(TransPtr);
+
+ return true;
+}
+
+bool TPUCodeGenPrepare::matchClearIbuf(Instruction *I) {
+ IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(I);
+ if (!Intr || Intr->getIntrinsicID() != Intrinsic::tpu_clear_ibuf)
+ return false;
+ if (ST && !ST->isSparseCoreScs())
+ llvm_unreachable("tpu.clear_ibuf is only supported on SCS.");
+
+ IRBuilder<> Builder(Intr);
+
+ // Insert an sfence. The sfence waits for all tiles to complete execution of
+ // outstanding tasks. This is required so that the Tile Ibuf reaches a point
+ // of quiescence (see b/207514927#comment17). (Technically the sfence only
+ // needs to wait for task completion on the 16 tiles, so an sfence.sel $0xffff
+ // should also suffice.)
+ Instruction *SFenceInst =
+ Builder.CreateFence(AtomicOrdering::SequentiallyConsistent);
+
+ // Set the PCs for TAC and TEC to be out of bounds (-1). This will prevent
+ // new instructions from being fetched into Tile Ibuf from Tile Imem.
+ Value *PC = Builder.getInt32(-1);
+ Value *DregPtr = Intr->getOperand(0);
+ Value *AccessPCPtr =
+ Builder.CreateGEP(Builder.getInt8Ty(), DregPtr, Builder.getInt32(0));
+ Instruction *StoreAccessPCInst = Builder.CreateStore(PC, AccessPCPtr);
+ Value *ExecutePCPtr =
+ Builder.CreateGEP(Builder.getInt8Ty(), DregPtr, Builder.getInt32(1));
+ Instruction *StoreExecutePCInst = Builder.CreateStore(PC, ExecutePCPtr);
+
+ // Dispatch the dummy task with task.clear_ibuf to actually clear the Tile
+  // Ibuf. The lower 16 bits of `ArgCountAndTileBitmap` set all of the tiles to
+  // be active, while the upper bits specify the number of words in the
+ // descriptor for the dummy task (two, including the TAC and TEC PCs).
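+  // I.e. 0x2FFFF = (2 << 16) | 0xFFFF: two descriptor words, all 16 tiles.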
+ Value *ArgCountAndTileBitmap = Builder.getInt32(0x2FFFF);
+ Function *DispatchTaskFn = Intrinsic::getDeclaration(
+ Intr->getModule(), llvm::Intrinsic::tpu_task_dispatch_clear_ibuf);
+ Instruction *DispatchTaskInst =
+ Builder.CreateCall(DispatchTaskFn, {DregPtr, ArgCountAndTileBitmap});
+
+ SFenceInst->copyMetadata(*Intr);
+ StoreAccessPCInst->copyMetadata(*Intr);
+ StoreExecutePCInst->copyMetadata(*Intr);
+ DispatchTaskInst->copyMetadata(*Intr);
+ I->eraseFromParent();
+ return true;
+}
+
+bool TPUCodeGenPrepare::matchClamp(Instruction *I, const DataLayout &DL) {
+ IRBuilder<> B(I);
+ auto Replace = [&](CallInst *Intr) {
+ I->replaceAllUsesWith(Intr);
+ I->eraseFromParent();
+ return true;
+ };
+ Value *CMin, *CX, *CMax;
+ if (!match(I, m_Intrinsic<Intrinsic::tpu_clamp>(m_Value(CMin), m_Value(CX),
+ m_Value(CMax))))
+ return false;
+ if (match(I, m_Intrinsic<Intrinsic::tpu_clamp>(m_FNeg(m_Specific(CMax)),
+ m_Value(CX), m_Value()))) {
+ return Replace(
+ B.CreateBinaryIntrinsic(Intrinsic::tpu_clamp_symmetric, CX, CMax));
+ }
+ Constant *CMinFC, *CMaxFC;
+ if (match(I, m_Intrinsic<Intrinsic::tpu_clamp>(
+ m_Constant(CMinFC), m_Value(CX), m_Constant(CMaxFC)))) {
+ if (!CMinFC->isZeroValue()) {
+ ConstantDataVector *CMinFCV = cast<ConstantDataVector>(CMinFC);
+ ConstantDataVector *CMaxFCV = cast<ConstantDataVector>(CMaxFC);
+ assert(CMinFCV->getType()->getNumElements() ==
+ CMaxFCV->getType()->getNumElements());
+#ifndef NDEBUG
+ int ElementSizeInBytes =
+ DL.getTypeAllocSize(CMinFCV->getType()->getElementType());
+#endif
+ assert(CMinFCV->getType()->getNumElements() ==
+ ST->vectorSizeInElements(ElementSizeInBytes));
+ bool Inverse = true;
+ for (int i = 0; i < CMinFCV->getType()->getNumElements(); i++) {
+ APFloat CMinA =
+ cast<ConstantFP>(CMinFCV->getAggregateElement(i))->getValue();
+ APFloat CMaxA =
+ cast<ConstantFP>(CMaxFCV->getAggregateElement(i))->getValue();
+ if (CMinA != -CMaxA) {
+ Inverse = false;
+ break;
+ } else if (CMinA > CMaxA) {
+ llvm_unreachable("tpu.clamp requires max > min.");
+ }
+ }
+ if (Inverse)
+ return Replace(
+ B.CreateBinaryIntrinsic(Intrinsic::tpu_clamp_symmetric, CX, CMax));
+ }
+ }
+ // Fall back to min/max.
+ return Replace(B.CreateBinaryIntrinsic(
+ Intrinsic::maximum, B.CreateBinaryIntrinsic(Intrinsic::minimum, CX, CMax),
+ CMin));
+}
+
+void TPUCodeGenPrepare::prepareEventDebug(Function &F) {
+ if (!EnableEventDebug)
+ return;
+ Module *M = F.getParent();
+ int Sz = M->getGlobalList().size();
+ for (int i = 0; i < EventDebugMax; i++) {
+ std::string Str =
+ ".str_" + std::string(F.getName()) + "_" + std::to_string(i);
+ std::string Msg = std::string(F.getName()) + "_" + std::to_string(i);
+ ArrayType *ArrayTy =
+ ArrayType::get(IntegerType::get(M->getContext(), 8), Msg.size() + 1);
+ new GlobalVariable(*M, ArrayTy, true, GlobalValue::InternalLinkage,
+ ConstantDataArray::getString(M->getContext(), Msg, true),
+ Str);
+ }
+ for (int i = 0; i < EventDebugMax; i++) {
+ Function *IntrEvent =
+ Intrinsic::getDeclaration(F.getParent(), llvm::Intrinsic::tpu_event);
+ IRBuilder<> B(&*F.begin()->begin());
+ GlobalVariable *GV = &*std::next(M->globals().begin(), Sz + i);
+ B.CreateCall(IntrEvent, B.CreateGEP(GV->getValueType(), GV,
+ {B.getInt32(0), B.getInt32(0)}));
+ }
+}
+
+bool TPUCodeGenPrepare::simplifyTrivialTraps(Function &F) {
+ if (EnableContinuations)
+    // Continuations mess with return/HALT; skip.
+ return false;
+ if (ST && !ST->isSparseCore())
+ return false;
+ bool Changed = false;
+ SmallVector<IntrinsicInst *> Traps;
+ for (auto &BB : F) {
+ for (auto &I : BB) {
+ if (IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(&I)) {
+ if ((Intr->getIntrinsicID() == Intrinsic::tpu_halt_trap &&
+ match(Intr->getOperand(0), m_One()))) {
+ Traps.push_back(Intr);
+ Changed = true;
+ }
+ }
+ }
+ }
+ for (IntrinsicInst *Trap : Traps) {
+ BasicBlock *BeforeBB = Trap->getParent()->splitBasicBlock(
+ Trap->getIterator(), "", /*Before=*/true);
+ Instruction *AbsBr = BeforeBB->getTerminator();
+ IRBuilder<> B(AbsBr);
+ Type *RetTy = Trap->getFunction()->getReturnType();
+ if (RetTy->isVoidTy())
+ B.CreateRetVoid();
+ else
+ B.CreateRet(UndefValue::get(RetTy));
+ AbsBr->eraseFromParent();
+ Trap->eraseFromParent();
+ }
+ return Changed;
+}
+
+class FuncComparator {
+public:
+  explicit FuncComparator(TPUTargetMachine *TM) : TM(TM) {}
+  // Orders functions as scs < tac < tec. Comparing ranks keeps this a strict
+  // weak ordering even if several functions target the same core type.
+  bool operator()(const Function &Fa, const Function &Fb) {
+    return Rank(Fa) < Rank(Fb);
+  }
+
+private:
+  int Rank(const Function &F) const {
+    const auto &ST = TM->getSubtarget<TPUSubtarget>(F);
+    if (ST.isSparseCoreScs())
+      return 0;
+    if (ST.isSparseCoreTac())
+      return 1;
+    return 2;
+  }
+  TPUTargetMachine *TM;
+};
+
+bool TPUCodeGenPrepare::runOnModule(Module &M) {
+ bool Changed = false;
+ // See comment above, use with extreme care.
+ convertHalfToBfloat(M, TM);
+ for (Function &F : M) {
+ if (F.isDeclaration())
+ continue;
+ if (TM)
+ ST = &TM->getSubtarget<TPUSubtarget>(F);
+ Changed |= runOnFunction(F);
+ }
+ if (SortFunctionSymbols) {
+ // On SparseCore, we sort the symbol list of functions
+ // in the order of scs, tac, tec.
+ assert(!ST || ST->isSparseCore());
+ FuncComparator comp(TM);
+ M.getFunctionList().sort(comp);
+ }
+ return Changed;
+}
+
+bool TPUCodeGenPrepare::runOnFunction(Function &F) {
+ const DataLayout &DL = F.getParent()->getDataLayout();
+
+ prepareEventDebug(F);
+ for (auto &BB : F) {
+ for (auto I = BB.begin(); I != BB.end();) {
+ Instruction *Inst = &(*I++);
+ if (matchDMADoneOpt(Inst, I))
+ continue;
+ if (matchShiftPattern(Inst))
+ continue;
+ if (matchScVldVstNpIntrinsic(Inst, DL))
+ continue;
+ if (matchMaskedLoadIntrinsic(Inst))
+ continue;
+ if (matchMaskedStoreIntrinsic(Inst))
+ continue;
+ if (scalarizeBinOp(*Inst))
+ continue;
+ if (scalarizeCmp(*Inst))
+ continue;
+ if (scalarizeSelect(*Inst))
+ continue;
+ if (reassociateGEP(*Inst))
+ continue;
+ if (sinkGEP(*Inst))
+ continue;
+ if (matchSyncRemoteIntrinsic(Inst))
+ continue;
+ if (matchSyncBothIntrinsic(Inst))
+ continue;
+ if (matchStreamIntrinsic(Inst))
+ continue;
+ if (matchGeneralDMA(Inst))
+ continue;
+ if (matchSimpleDMA(Inst))
+ continue;
+ if (matchSimpleHostDMA(Inst))
+ continue;
+ if (matchAddrspaceCast(Inst))
+ continue;
+ if (matchClearIbuf(Inst))
+ continue;
+ if (matchClamp(Inst, DL))
+ continue;
+ }
+ }
+
+ simplifyTrivialTraps(F);
+
+ // Perform an optimization to modify icmp ult -> icmp slt where possible.
+ // At the moment we fail with an unholy error if we can't do this, because
+ // we don't support unsigned comparisons in the hardware and we don't yet
+ // emit the bitwise pattern to synthesize them.
+
+ for (auto &BB : F) {
+ for (auto I = BB.begin(); I != BB.end(); ++I) {
+ ICmpInst *ICI = dyn_cast<ICmpInst>(&*I);
+ if (!ICI || !ICI->isUnsigned() || !ICI->isRelational())
+ continue;
+ IRBuilder<> B(ICI);
+ if (ST) {
+ if (ICI->getType()->isVectorTy() && ST->hasUnsignedVectorCompare())
+ continue;
+ else if (!ICI->getType()->isVectorTy() &&
+ ST->hasUnsignedScalarCompare())
+ continue;
+ }
+
+ // We have an icmp with a non-equality unsigned comparison. Check if we
+ // can infer information from the sign bit.
+ KnownBits KB0 = llvm::computeKnownBits(ICI->getOperand(0), DL);
+ KnownBits KB1 = llvm::computeKnownBits(ICI->getOperand(1), DL);
+
+ if (IsSignBitKnownSame(KB0, KB1)) {
+ Value *V = B.CreateICmp(ICI->getSignedPredicate(), ICI->getOperand(0),
+ ICI->getOperand(1), ICI->getName());
+ cast<Instruction>(V)->copyMetadata(*ICI);
+ ICI->replaceAllUsesWith(V);
+ continue;
+ }
+
+ // Try and match (icmp ult (add X, 1), Y) -> icmp slt (X, Y-1).
+ // The (add X, 1) may be confusing KnownBits.
+ Value *X;
+ if (ICI->getPredicate() == CmpInst::ICMP_ULT &&
+ match(ICI->getOperand(0), m_Add(m_Value(X), m_One()))) {
+ KB0 = llvm::computeKnownBits(X, DL);
+ if (IsSignBitKnownSame(KB0, KB1)) {
+ Value *V = B.CreateICmp(
+ CmpInst::ICMP_SLT, X,
+ B.CreateSub(ICI->getOperand(1), B.getInt32(1)), ICI->getName());
+ cast<Instruction>(V)->copyMetadata(*ICI);
+ ICI->replaceAllUsesWith(V);
+ continue;
+ }
+ }
+
+ // If the lower bit of Y is known to be 0 change:
+ // (icmp ult X, Y) -> icmp slt (shr(X, 1), shr(Y, 1)).
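+      // With Y's low bit known to be zero, dropping the shared low bit
+      // preserves the ult relation, and after the logical shifts both sign
+      // bits are clear, so the signed compare is equivalent to the unsigned
+      // one.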
+ if (ICI->getPredicate() == CmpInst::ICMP_ULT &&
+ KB1.countMinTrailingZeros() > 0) {
+ Value *One = ConstantInt::get(ICI->getOperand(0)->getType(), 1);
+ Value *V = B.CreateICmp(
+ CmpInst::ICMP_SLT, B.CreateLShr(ICI->getOperand(0), One),
+ B.CreateLShr(ICI->getOperand(1), One), ICI->getName());
+ cast<Instruction>(V)->copyMetadata(*ICI);
+ ICI->replaceAllUsesWith(V);
+ continue;
+ }
+
+ // We don't know that the sign bits are equal, so we can't convert this
+ // from unsigned to signed.
+ }
+ }
+ return true;
+}
+
+bool TPUCodeGenPrepare::convertHalfToBfloat(Module &M, TPUTargetMachine *TM) {
+ // See comment above, use with extreme care.
+ if (!TM || !ConvertHalfToBfloat)
+ return false;
+ for (Function &F : M) {
+ if (F.isDeclaration())
+ continue;
+ ST = &TM->getSubtarget<TPUSubtarget>(F);
+ if (ST->isSparseCoreTec())
+ break;
+ }
+ if (!ST->isSparseCoreTec())
+ return false;
+ bool Changed = false;
+ int VhfLength;
+ if (ST->hasV8())
+ VhfLength = 16;
+ else if (ST->hasV16())
+ VhfLength = 32;
+ else
+ llvm_unreachable("Unexpected VPU.");
+ Type *VhfTy =
+ VectorType::get(Type::getHalfTy(M.getContext()), VhfLength, false);
+ Type *VhfTileSpmemPtrTy =
+ VectorType::get(Type::getHalfTy(M.getContext()), VhfLength, false)
+ ->getPointerTo(TPUAS_TileSpmem);
+ // This is a hack for code from C-lang that can contain vector code in Smem.
+ Type *VhfSmemPtrTy =
+ VectorType::get(Type::getHalfTy(M.getContext()), VhfLength, false)
+ ->getPointerTo(TPUAS_Smem);
+ Type *VbfTy =
+ VectorType::get(Type::getBFloatTy(M.getContext()), VhfLength, false);
+ Type *VbfTileSpmemPtrTy =
+ VectorType::get(Type::getBFloatTy(M.getContext()), VhfLength, false)
+ ->getPointerTo(TPUAS_TileSpmem);
+ Type *VbfSmemPtrTy =
+ VectorType::get(Type::getBFloatTy(M.getContext()), VhfLength, false)
+ ->getPointerTo(TPUAS_Smem);
+ for (GlobalVariable &GV : M.globals()) {
+ if (GV.getType() == VhfTy) {
+ Changed = true;
+ GV.mutateType(VbfTy);
+ }
+ }
+ auto TryToChangeType = [&](Value *V) {
+ if (V->getType() == VhfTy) {
+ Changed = true;
+ V->mutateType(VbfTy);
+ } else if (V->getType() == VhfTileSpmemPtrTy) {
+ Changed = true;
+ V->mutateType(VbfTileSpmemPtrTy);
+ } else if (V->getType() == VhfSmemPtrTy) {
+ Changed = true;
+ V->mutateType(VbfSmemPtrTy);
+ }
+ if (isa<AllocaInst>(V) && V->getType() == VhfSmemPtrTy) {
+ Changed = true;
+ cast<AllocaInst>(V)->setAllocatedType(VbfSmemPtrTy);
+ }
+ };
+ for (Function &F : M) {
+ for (BasicBlock &B : F) {
+ for (Instruction &I : B) {
+ TryToChangeType(&I);
+ }
+ }
+ }
+ return Changed;
+}
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUFifoAnalysis.cpp b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUFifoAnalysis.cpp
new file mode 100644
index 0000000..a9014c2
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUFifoAnalysis.cpp
@@ -0,0 +1,413 @@
+//===-- TPUFifoAnalysis.cpp - FIFO analysis utilities -----------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Performs analysis on FIFO registers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TPUFifoAnalysis.h"
+#include "TPU.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "MCTargetDesc/TPUMCTargetDesc.h"
+#include "TPUInstrInfo.h"
+#include "TPUMachineFunctionInfo.h"
+#include "TPURegisterInfo.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <iterator>
+#include <vector>
+
+#define DEBUG_TYPE "tpu-fifo-analysis"
+using namespace llvm;
+
+#define DEBUG_STATE(Str, MBB, State) \
+ LLVM_DEBUG(dbgs() << "MBB: "; MBB.printAsOperand(dbgs()); \
+ dbgs() << "; " << Str << ": " << State << "\n";)
+
+static cl::opt<bool>
+ ErrorsAsWarnings("tpu-fifo-analysis-errors-as-warnings", cl::init(false),
+ cl::desc("In the TPU Fifo Analysis pass if we would "
+ "generate an error make it a warning instead"));
+
+FifoAnalysis::FifoAnalysis(MachineFunction &MF, ArrayRef<const FifoInfo *> FIs,
+ SmallSet<MachineBasicBlock *, 8> &MBBs,
+ MachineRegisterInfo *MRI)
+ : FIs(FIs), MRI(MRI) {
+ RunFifoAnalysis(MF, MBBs);
+}
+
+FifoAnalysis::FifoAnalysis(MachineFunction &MF, ArrayRef<const FifoInfo *> FIs,
+ MachineRegisterInfo *MRI)
+ : FIs(FIs), MRI(MRI) {
+ SmallSet<MachineBasicBlock *, 8> Empty;
+ RunFifoAnalysis(MF, Empty);
+}
+
+void FifoAnalysis::RunFifoAnalysis(
+ MachineFunction &MF, SmallSet<MachineBasicBlock *, 8> &AllowSetMBBs) {
+ // Analyze every FifoInfo separately. We could mash all these together and do
+  // only one traversal of the CFG, but the extra DenseMaps make the code really
+ // hard to read and understand.
+ ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+ for (const FifoInfo *FI : FIs) {
+ BlockExitStateMap ExitStates;
+ BlockEntryStateMap EntryStates;
+ std::vector<DeferredEdge> Deferred;
+ for (MachineBasicBlock *MBB : RPOT) {
+ if (!AnalyzeBlock(*MBB, AllowSetMBBs, ExitStates, EntryStates, Deferred,
+ FI)) {
+ Valid = false;
+ break;
+ }
+ }
+
+ if (!Valid)
+ return;
+ for (auto &SuccAndPred : Deferred) {
+ assert(EntryStates.count(SuccAndPred.first) == 1);
+ if (ExitStates.count(SuccAndPred.second) == 0)
+ // This can happen with unreachable blocks. Ignore this edge.
+ continue;
+ auto &SuccPhis = EntryStates[SuccAndPred.first];
+ auto &PredState = ExitStates[SuccAndPred.second];
+ if (!MergeStates(SuccPhis, PredState)) {
+ Valid = false;
+ break;
+ }
+ }
+ }
+
+ // Iterate over all Phis P finding the transitive set of all Defs that feed P.
+ // This is a fixpoint algorithm but we don't expect many iterations.
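+  // For example, with P1 <- {DefA, P2} and P2 <- {DefB, P1} (P1 visited
+  // first), pass 1 yields P1 -> {DefA} and P2 -> {DefA, DefB}; pass 2 grows
+  // P1 to {DefA, DefB}; pass 3 changes nothing, so the loop terminates.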
+ DenseMap<Phi *, SmallPtrSet<Def *, 4>> PhiToDefs;
+ bool Changed = true;
+ while (Changed) {
+ Changed = false;
+ for (Phi *P : phis()) {
+ SmallPtrSet<Def *, 4> Defs;
+ for (DefUse *DU : P->getPrevs()) {
+ // Unwrap uses to get to either a Phi or a Def.
+ while (Use *U = dyn_cast<Use>(DU))
+ DU = &U->getPrev();
+ if (Phi *P = dyn_cast<Phi>(DU))
+ // For a Phi, look up the previous transitive Defs we calculated.
+ Defs.insert(PhiToDefs[P].begin(), PhiToDefs[P].end());
+ else
+ // For a Def just insert it.
+ Defs.insert(cast<Def>(DU));
+ }
+ // SmallPtrSet doesn't have operator==. But we know this analysis is
+ // additive - if a set is different now to a previous iteration, it must
+ // now be a superset. So we can use size() as a proxy for equality.
+ assert(Defs.size() >= PhiToDefs[P].size());
+ if (Defs.size() != PhiToDefs[P].size()) {
+ PhiToDefs[P] = std::move(Defs);
+ Changed = true;
+ }
+ }
+ }
+
+ // Finally assign all Uses their potential Def list.
+ for (Use *U : uses()) {
+ SmallVector<MachineInstr *, 4> Instrs;
+ // Unwrap use chains to get to a Def or Phi.
+ DefUse *DU = &U->getPrev();
+ while (Use *U = dyn_cast<Use>(DU))
+ DU = &U->getPrev();
+ if (Def *D = dyn_cast<Def>(DU)) {
+ U->Defs = {D};
+ } else {
+ auto &S = PhiToDefs[cast<Phi>(DU)];
+ U->Defs.insert(U->Defs.end(), S.begin(), S.end());
+ sort(U->Defs);
+ }
+ }
+}
+
+FifoAnalysis::~FifoAnalysis() {
+ // Defs, Uses and Phis are non-POD but are backed by a BumpPtrAllocator,
+ // so call their destructors manually.
+ for (Phi *P : phis())
+ P->~Phi();
+ for (Use *U : uses())
+ U->~Use();
+ for (Def *D : defs())
+ D->~Def();
+}
+
+bool FifoAnalysis::IsFifoInstr(MachineInstr &MI, const FifoInfo *FI) {
+ return any_of(MI.operands(), [&](const MachineOperand &MO) {
+ if (!MO.isReg())
+ return false;
+ unsigned Reg = MO.getReg();
+ if (Register::isVirtualRegister(Reg)) {
+ assert(MRI && "Need RegInfo to handle virtual registers!");
+ return MRI->getRegClass(Reg) == FI->getRegisterClass();
+ } else {
+ return FI->getRegisterClass()->contains(MO.getReg());
+ }
+ });
+}
+
+bool FifoAnalysis::MergeStates(std::deque<Phi *> &SuccPhis,
+ const FifoState &PredState) {
+ if (SuccPhis.size() != PredState.size())
+ return false;
+
+ auto SI = PredState.begin();
+ for (auto I = SuccPhis.begin(), E = SuccPhis.end(); I != E; ++I, ++SI) {
+ Phi *P = *I;
+ DefUse *D = *SI;
+ if (find(P->getPrevs(), D) != P->getPrevs().end())
+ continue;
+ if (P->getFifoInfo() != D->getFifoInfo())
+ return false;
+ if (P->getNumItemsLeftToPop() != D->getNumItemsLeftToPop())
+ return false;
+ P->Prevs.push_back(D);
+ }
+ return true;
+}
+
+bool FifoAnalysis::DiagnoseError(const MachineBasicBlock &MBB,
+ const std::deque<Phi *> &A,
+ const FifoState &B) {
+ errs() << "ERROR during FIFO analysis of BB ";
+ MBB.printAsOperand(errs());
+ errs() << "; merging: \n " << A << "\ninto\n " << B << "\n";
+ if (!ErrorsAsWarnings)
+ llvm::report_fatal_error("Broken module found, aborting!",
+ /*gen_crash_diag=*/false);
+ return false;
+}
+
+bool FifoAnalysis::DiagnoseError(const MachineBasicBlock &MBB,
+ const FifoState &A, StringRef Reason) {
+ errs() << "ERROR during FIFO analysis of BB ";
+ MBB.printAsOperand(errs());
+ errs() << ": " << Reason << "; state: " << A << "\n";
+ if (!ErrorsAsWarnings)
+ llvm::report_fatal_error("Broken module found, aborting!",
+ /*gen_crash_diag=*/false);
+ return false;
+}
+
+bool FifoAnalysis::AnalyzeBlock(MachineBasicBlock &MBB,
+ SmallSet<MachineBasicBlock *, 8> &AllowSetMBBs,
+ BlockExitStateMap &ExitStates,
+ BlockEntryStateMap &EntryStates,
+ std::vector<DeferredEdge> &Deferred,
+ const FifoInfo *FI) {
+ FifoState State;
+ bool PipelinedBlock =
+ MBB.getParent()->getInfo<TPUMachineFunctionInfo>()->isBasicBlockPipelined(
+ &MBB);
+ // First, analyze entry arcs to assemble the entry state.
+ // TODO(hgreving): Currently AllowSetMBBs is an allowed set of blocks that
+ // should be considered during fifo analysis. It is only populated by the fifo
+ // overflow mutation, and currently may only contain the block of a single
+ // basic block loop. We can extend this in the future for loops with more
+ // basic blocks.
+ if (!AllowSetMBBs.empty() && !AllowSetMBBs.count(&MBB))
+ return true;
+ SmallVector<MachineBasicBlock *, 4> Preds;
+ for (auto &PI : MBB.predecessors()) {
+ if (AllowSetMBBs.empty() || AllowSetMBBs.count(PI))
+ Preds.push_back(PI);
+ }
+ if (Preds.size() == 1) {
+ State = ExitStates[*MBB.pred_begin()];
+ } else if (Preds.size() > 1) {
+ // Multiple predecessors; create a Phi for every incoming state. All
+ // predecessors' exit states must be identical.
+ std::optional<std::deque<Phi *>> EntryState;
+ for (auto &PI : Preds) {
+ auto ESI = ExitStates.find(&*PI);
+ if (ESI == ExitStates.end()) {
+ // We don't have an exit state for this block yet. Mark it deferred
+ // and we'll come back to it later.
+ Deferred.push_back({&MBB, &*PI});
+ continue;
+ }
+ if (EntryState.has_value()) {
+ if (!MergeStates(*EntryState, ESI->second))
+ return DiagnoseError(MBB, *EntryState, ESI->second);
+ continue;
+ }
+ EntryState = std::deque<Phi *>();
+ for (DefUse *DU : ESI->second) {
+ // Create a new PHI with just DU as input. The other inputs will be
+ // added via MergeStates (below, and via Deferred for backedges).
+ EntryState->push_back(new (Allocator) Phi({DU}));
+ State.push_back(EntryState->back());
+ Phis.push_back(EntryState->back());
+ }
+ }
+ assert(EntryState.has_value() && "No dominating edges found?!");
+ EntryStates[&MBB] = std::move(*EntryState);
+ }
+ DEBUG_STATE("state on entry", MBB, State);
+ // Save the entry state of the block.
+ FifoInputState[std::make_pair(&MBB, FI)] = State.size();
+ std::vector<MachineInstr *> ForwardPop;
+ auto PopElement = [&](MachineInstr &MI) {
+ Use *U = new (Allocator) Use(MI, *State.front(), FI);
+ Uses.push_back(U);
+ InstrToDefUse[&MI] = U;
+ if (U->getNumItemsLeftToPop() < 0)
+ return DiagnoseError(MBB, State, "Popping more items than pushed");
+ if (U->getNumItemsLeftToPop() == 0)
+ State.pop_front();
+ else
+ State.front() = U;
+ DEBUG_STATE("pop", MBB, State);
+ return true;
+ };
+ for (MachineInstr &MI : MBB.instrs()) {
+ if (!IsFifoInstr(MI, FI) || MI.isBundle())
+ continue;
+ if (TPUInstrInfo::isFifoPush(MI)) {
+ Def *D = new (Allocator) Def(MI, FI);
+ State.push_back(D);
+ Defs.push_back(D);
+ InstrToDefUse[&MI] = D;
+ DEBUG_STATE("push", MBB, State);
+ } else if (TPUInstrInfo::isFifoPop(MI)) {
+ if (State.empty()) {
+ if (!PipelinedBlock)
+ return DiagnoseError(MBB, State, "Popping with an empty state.");
+ ForwardPop.push_back(&MI);
+ continue;
+ }
+ if (!PopElement(MI))
+ return false;
+ }
+ }
+  // Record definitions escaping the block.
+ for (DefUse *DefUse : State)
+ if (isa<Def>(DefUse))
+ LiveOut.insert(&cast<Def>(DefUse)->getInstr());
+ // For pipelined blocks we may have predicated pops happening before the
+ // corresponding push. We save those pops and match them with remaining
+ // pushes at the end of the basic block.
+ for (MachineInstr *MI : ForwardPop) {
+ if (State.empty())
+ return DiagnoseError(MBB, State, "Popping with an empty state.");
+ if (!PopElement(*MI))
+ return false;
+ }
+ DEBUG_STATE("state on exit", MBB, State);
+
+ // TODO(jmolloy): I'm not sure this deals with HALT non-fallthroughs properly.
+ if (MBB.succ_empty() && !State.empty())
+ DiagnoseError(MBB, State, "Return block ends with non-empty FIFO");
+ ExitStates[&MBB] = State;
+ return true;
+}
+
+void FifoAnalysis::Def::print(raw_ostream &OS) const {
+ auto *TII = Instr.getMF()->getSubtarget().getInstrInfo();
+ OS << TII->getName(getOpcode());
+ if (N != 1)
+ OS << "[" << N << "]";
+}
+
+void FifoAnalysis::Phi::print(raw_ostream &OS) const {
+ bool First = true;
+ OS << "(";
+ for (DefUse *DU : Prevs) {
+ if (!First)
+ OS << ", ";
+ First = false;
+ DU->print(OS);
+ }
+ OS << ")";
+}
+
+void FifoAnalysis::Use::print(raw_ostream &OS) const {
+ OS << "Use<" << N << ">";
+ Prev.print(OS);
+}
+
+void FifoAnalysis::DefUse::print(raw_ostream &OS) const {
+ switch (Kind) {
+ case DUK_Use:
+ return cast<Use>(this)->print(OS);
+ case DUK_Def:
+ return cast<Def>(this)->print(OS);
+ case DUK_Phi:
+ return cast<Phi>(this)->print(OS);
+ }
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const FifoAnalysis::DefUse &DU) {
+ DU.print(OS);
+ return OS;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS,
+ const std::deque<FifoAnalysis::DefUse *> &DUs) {
+ OS << "{";
+ bool First = true;
+ for (auto *DU : DUs) {
+ if (!First)
+ OS << ", ";
+ First = false;
+ DU->print(OS);
+ }
+ OS << "}";
+ return OS;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS,
+ const std::deque<FifoAnalysis::Phi *> &DUs) {
+ OS << "{";
+ bool First = true;
+ for (auto *DU : DUs) {
+ if (!First)
+ OS << ", ";
+ First = false;
+ DU->print(OS);
+ }
+ OS << "}";
+ return OS;
+}
+
+unsigned FifoAnalysis::DefUse::getPushNumItems(const MachineInstr &Instr,
+ const FifoInfo *FI) {
+ // Variable number of items pushed.
+ if (TPUInstrInfo::isTransposeEnd(Instr.getDesc())) {
+ if (TPUInstrInfo::isSegmented(Instr.getDesc())) {
+ if (TPUInstrInfo::isPacked(Instr.getDesc()))
+ return 8;
+ return 16;
+ }
+ if (TPUInstrInfo::isPacked(Instr.getDesc()))
+ return Instr.getOperand(3).getImm() / 16;
+ return Instr.getOperand(2).getImm() / 8;
+ }
+ if (TPUInstrInfo::isPackedMatMul(Instr.getDesc()))
+ return 2;
+ return FI->getPushNumItems(Instr.getOpcode());
+}
+unsigned FifoAnalysis::DefUse::getPopNumItems(const MachineInstr &Instr,
+ const FifoInfo *FI) {
+ // No variable number of items for pop instructions.
+ return FI->getPopNumItems(Instr.getOpcode());
+}
+
+unsigned FifoAnalysis::getFifoInputState(MachineBasicBlock *MBB,
+ const FifoInfo *FI) const {
+ return FifoInputState.find(std::make_pair(MBB, FI))->second;
+}
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUFifoAnalysis.h b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUFifoAnalysis.h
new file mode 100644
index 0000000..d5a38ff
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUFifoAnalysis.h
@@ -0,0 +1,232 @@
+//===-- TPUFifoAnalysis.h - FIFO analysis utilities ------------*- C++ -* -==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Performs analysis on FIFO registers.
+//
+// TODO(hgreving): This analysis is partly obsolete and partly inaccurate and
+// will be simplified.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_GOOGLETPU_TPUFIFOANALYSIS_H
+#define LLVM_LIB_TARGET_GOOGLETPU_TPUFIFOANALYSIS_H
+
+#include "TPUSubtarget.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/raw_ostream.h"
+#include <deque>
+
+namespace llvm {
+
+// Analyze FIFO usage across a function. This analysis runs in either SSA or
+// post-RA form and will analyze pushes and pops to a register or register class
+// across block boundaries.
+class FifoAnalysis {
+public:
+ enum DefUseKind { DUK_Def, DUK_Use, DUK_Phi };
+ // We consider an interaction with a FIFO to consist of one "push" followed by
+ // one or more "pops". This describes a def-use chain where each use modifies
+ // the state of the FIFO. A physical FIFO interleaves multiple such def-use
+ // chains.
+ //
+ // DefUse is the base class of Def, Use and Phi.
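+  // For example, a push of four items followed by four single-item pops forms
+  // the chain Def[4] -> Use<1> -> Use<1> -> Use<1> -> Use<1>, with
+  // getNumItemsLeftToPop() going 4, 3, 2, 1, 0 along the chain.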
+ class DefUse {
+ public:
+ virtual ~DefUse() {}
+ // At this point in the chain, how many items are left unpopped?
+ int getNumItemsLeftToPop() const { return NumLeft; }
+
+ // Return the opcode of the first push instruction.
+ unsigned getOpcode() const { return Opcode; }
+
+ // Return the FifoInfo for this chain.
+ const FifoInfo *getFifoInfo() const { return FI; }
+
+ DefUseKind getKind() const { return Kind; }
+ void print(raw_ostream &OS) const;
+
+ protected:
+ static unsigned getPushNumItems(const MachineInstr &, const FifoInfo *FI);
+ static unsigned getPopNumItems(const MachineInstr &, const FifoInfo *FI);
+
+ private:
+ friend class FifoAnalysis;
+ DefUse(DefUseKind Kind, int NumLeft, unsigned Opcode,
+ const FifoInfo *FI)
+ : Kind(Kind), NumLeft(NumLeft), Opcode(Opcode), FI(FI) {}
+ DefUseKind Kind;
+ int NumLeft;
+ unsigned Opcode;
+ const FifoInfo *FI;
+ };
+
+ // A Def is a push instruction. There is one Def at the start of a def-use
+ // chain.
+ class Def : public DefUse {
+ public:
+ // The push instruction.
+ MachineInstr &getInstr() const { return Instr; }
+ // The number of items pushed into the fifo by this Def.
+ unsigned getNumPushedItems() const { return N; }
+
+ void print(raw_ostream &OS) const;
+ static bool classof(const DefUse *DU) { return DU->getKind() == DUK_Def; }
+ private:
+ friend class FifoAnalysis;
+ Def(MachineInstr &Instr, const FifoInfo *FI)
+ : DefUse(DUK_Def, getPushNumItems(Instr, FI), Instr.getOpcode(), FI),
+ Instr(Instr), N(getPushNumItems(Instr, FI)) {}
+ MachineInstr &Instr;
+ unsigned N;
+ };
+
+ // A Use is a pop instruction. It extracts a value from the FIFO and modifies
+ // the FIFO state.
+ class Use : public DefUse {
+ public:
+ // Return the immediately prior node in the def-use chain.
+ DefUse &getPrev() { return Prev; }
+
+ // The number of items popped from the fifo by this Use.
+ unsigned getNumPoppedItems() const { return N; }
+
+ // All Defs that may be the source of items popped by this Use.
+ ArrayRef<Def *> defs() const { return Defs; }
+
+ // The pop instruction.
+ MachineInstr &getInstr() const { return Instr; }
+
+ void print(raw_ostream &OS) const;
+ static bool classof(const DefUse *DU) { return DU->getKind() == DUK_Use; }
+
+ private:
+ friend class FifoAnalysis;
+ Use(MachineInstr &Instr, DefUse &Prev, const FifoInfo *FI)
+ : DefUse(DUK_Use,
+ Prev.getNumItemsLeftToPop() - getPopNumItems(Instr, FI),
+ Prev.getOpcode(), FI),
+ Instr(Instr), Prev(Prev), N(getPopNumItems(Instr, FI)) {}
+ MachineInstr &Instr;
+ DefUse &Prev;
+ unsigned N;
+
+ // Populated at the end of the analysis phase.
+ SmallVector<Def *, 4> Defs;
+ };
+
+ // A Phi combines multiple def-use chains.
+ class Phi : public DefUse {
+ public:
+ ArrayRef<DefUse *> getPrevs() { return Prevs; }
+
+ void print(raw_ostream &OS) const;
+ static bool classof(const DefUse *DU) { return DU->getKind() == DUK_Phi; }
+
+ private:
+ friend class FifoAnalysis;
+ Phi(SmallVector<DefUse *, 4> Prevs)
+ : DefUse(DUK_Phi, Prevs[0]->getNumItemsLeftToPop(),
+ Prevs[0]->getOpcode(), Prevs[0]->getFifoInfo()),
+ Prevs(std::move(Prevs)) {}
+ SmallVector<DefUse *, 4> Prevs;
+ };
+
+ // Create a FifoAnalysis over MF for the Fifos given by FIs. If MF uses
+ // virtual registers, MRI must be given.
+ FifoAnalysis(MachineFunction &MF, ArrayRef<const FifoInfo *> FIs,
+ MachineRegisterInfo *MRI = nullptr);
+ // If MBBs contains any blocks, then all other blocks are excluded from and
+ // ignored in the analysis. If MBBs is empty, then all blocks are considered.
+ FifoAnalysis(MachineFunction &MF, ArrayRef<const FifoInfo *> FIs,
+ SmallSet<MachineBasicBlock *, 8> &MBBs,
+ MachineRegisterInfo *MRI = nullptr);
+ ~FifoAnalysis();
+
+ FifoAnalysis(FifoAnalysis &&) = default;
+
+ // The move assignment operator is deleted because BumpPtrAllocator does not
+ // allow move assignment.
+ FifoAnalysis &operator=(FifoAnalysis &&) = delete;
+
+ // Return a list of all known Defs, Phis and Uses.
+ ArrayRef<Def *> defs() { return Defs; }
+ ArrayRef<Phi *> phis() { return Phis; }
+ ArrayRef<Use *> uses() { return Uses; }
+
+ Def *getDef(const MachineInstr &MI) {
+ return dyn_cast_or_null<Def>(InstrToDefUse[&MI]);
+ }
+ Use *getUse(const MachineInstr &MI) {
+ return dyn_cast_or_null<Use>(InstrToDefUse[&MI]);
+ }
+
+ bool isValid() const { return Valid; }
+
+ // Return true if the machine instruction defines a Fifo register liveout from
+ // its basic block.
+ bool isLiveOut(const MachineInstr *I) const { return LiveOut.count(I) > 0; }
+
+ // Return the number of elements in the given Fifo at the entrance of the
+ // basic block.
+ unsigned getFifoInputState(MachineBasicBlock *MBB, const FifoInfo *) const;
+
+  // Returns whether the instruction is a fifo instruction using fifo type FI.
+ bool IsFifoInstr(MachineInstr &MI, const FifoInfo *FI);
+
+private:
+ using FifoState = std::deque<DefUse *>;
+ using BlockEntryStateMap = DenseMap<MachineBasicBlock *, std::deque<Phi *>>;
+ using BlockExitStateMap = DenseMap<MachineBasicBlock *, FifoState>;
+ using DeferredEdge = std::pair<MachineBasicBlock *, MachineBasicBlock *>;
+
+ // Runs Fifo analysis.
+ void RunFifoAnalysis(MachineFunction &MF,
+ SmallSet<MachineBasicBlock *, 8> &MBBs);
+ bool AnalyzeBlock(MachineBasicBlock &MBB,
+ SmallSet<MachineBasicBlock *, 8> &MBBs,
+ BlockExitStateMap &ExitStates,
+ BlockEntryStateMap &EntryStates,
+ std::vector<DeferredEdge> &Deferred, const FifoInfo *FI);
+ bool MergeStates(std::deque<Phi *> &SuccPhis, const FifoState &PredState);
+ bool DiagnoseError(const MachineBasicBlock &MBB, const std::deque<Phi *> &A,
+ const FifoState &B);
+ bool DiagnoseError(const MachineBasicBlock &MBB, const FifoState &A,
+ StringRef Reason);
+
+ ArrayRef<const FifoInfo *> FIs;
+ MachineRegisterInfo *MRI;
+
+ BumpPtrAllocator Allocator;
+ std::vector<Def *> Defs;
+ std::vector<Phi *> Phis;
+ std::vector<Use *> Uses;
+ DenseMap<const MachineInstr *, DefUse *> InstrToDefUse;
+  // Instructions live out of the block where they are defined.
+ DenseSet<const MachineInstr *> LiveOut;
+ // Map of the state of the Fifo per basic block.
+ DenseMap<std::pair<MachineBasicBlock *, const FifoInfo *>, unsigned>
+ FifoInputState;
+
+ bool Valid = true;
+};
+
+raw_ostream &operator<<(raw_ostream &OS, const FifoAnalysis::DefUse &DU);
+raw_ostream &operator<<(raw_ostream &OS,
+ const std::deque<FifoAnalysis::DefUse *> &DUs);
+raw_ostream &operator<<(raw_ostream &OS,
+ const std::deque<FifoAnalysis::Phi *> &DUs);
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_GOOGLETPU_TPUFIFOANALYSIS_H
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUFifoScheduler.cpp b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUFifoScheduler.cpp
new file mode 100644
index 0000000..0ea7546
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUFifoScheduler.cpp
@@ -0,0 +1,226 @@
+//===-- TPUFifoScheduler.cpp - TPU critical path scheduler ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass orders FIFO instructions to reduce the critical path. Since a push
+// and its matching pop need to be re-ordered together, we need special
+// handling. This pass tries not to modify the order of other instructions, as
+// keeping the original order is important to the following scheduling passes.
+// The logic followed is:
+// * Remove edges caused by FIFO pseudo memory operand.
+// * Recompute the depth without the edges removed.
+// * Add new edges to force the order of FIFO instructions based on the new
+// depth.
+// * Run the scheduler to find a correct instruction order (an instruction needs
+// to be after all its predecessors)
+//
+// Note that most of the logic is in the DAG mutation, so the mutation could be
+// applied directly to existing schedulers. However, it breaks the assumption
+// that the original order is correct and may not work if the scheduler relies
+// on that assumption.
+//
+//===----------------------------------------------------------------------===//
+#include "TPU.h"
+#include "TPUFifoAnalysis.h"
+#include "TPUSubtarget.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+
+#include <memory>
+using namespace llvm;
+
+#define DEBUG_TYPE "tpu-fifo-scheduler"
+
+namespace {
+
+// Remove edges between push/push and pop/pop instructions to calculate a depth
+// value independent of the FIFO order. Then sort by depth and re-add the edges
+// in a different order. This creates "backedges", but if an instruction A has a
+// greater depth than B, then B cannot depend on A, so adding those edges
+// doesn't create cycles and the result is still a DAG.
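+// For example, if the original order is push_a, push_b, pop_a, pop_b but the
+// b chain ends up with a smaller depth once the FIFO-order edges are removed,
+// the re-added edges order the b pair before the a pair, e.g. push_b, pop_b,
+// push_a, pop_a.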
+class ReorderFifoConstraint : public ScheduleDAGMutation {
+ // Helper to figure out if we can re-order the Push instruction associated to
+ // a given Pop.
+ bool PushCanBeReordered(FifoAnalysis::Use *U) {
+ // We only re-order instructions for Pop with a single associated Push
+ // in the same basic block.
+ if (U->defs().size() != 1)
+ return false;
+ // We only support cases where we push one element per instruction. And we
+ // don't support transpose as it comes with a set of transpose
+    // instructions that need to stay together.
+ auto *PushDef = *U->defs().begin();
+ if (PushDef->getNumPushedItems() != 1 ||
+ TPUInstrInfo::isTransposeEnd(PushDef->getInstr().getDesc()))
+ return false;
+ return true;
+ }
+
+public:
+ void apply(ScheduleDAGInstrs *DAG) override {
+ DenseMap<const FifoInfo *, std::vector<FifoAnalysis::Use *>> PopInst;
+ MachineFunction &MF = DAG->MF;
+ FifoAnalysis FA(MF, MF.getSubtarget<TPUSubtarget>().getFifoInfos(),
+ &MF.getRegInfo());
+ for (auto *U : FA.uses()) {
+ if (!PushCanBeReordered(U))
+ continue;
+ SUnit *SU = DAG->getSUnit(&U->getInstr());
+ SUnit *SUPush = DAG->getSUnit(&(*U->defs().begin())->getInstr());
+ if (!SU || !SUPush)
+ continue;
+ SmallVector<SDep, 16> FifoDep;
+ PopInst[U->getFifoInfo()].push_back(U);
+ for (auto &Dep : SU->Preds) {
+ if (!Dep.isNormalMemory())
+ continue;
+ auto *PrevPop = FA.getUse(*Dep.getSUnit()->getInstr());
+ if (PrevPop == nullptr || !PushCanBeReordered(PrevPop))
+ continue;
+ SUnit *PrevPush =
+ DAG->getSUnit(&(*PrevPop->defs().begin())->getInstr());
+ if (PrevPush == nullptr)
+ continue;
+        // We found a predecessor push/pop pair within the DAG. We remove the
+        // edge and will add a new one that may go in a different direction.
+ FifoDep.push_back(Dep);
+ auto I = find_if(SUPush->Preds, [&](const SDep &Dep) {
+ return (Dep.getSUnit() == PrevPush);
+ });
+ assert(I != SUPush->Preds.end());
+ SUPush->removePred(*I);
+ SUPush->setDepthDirty();
+ }
+ for (auto &Dep : FifoDep)
+ SU->removePred(Dep);
+ SU->setDepthDirty();
+ }
+ // Add back new edges in an order dictated by the new depth calculated.
+ for (auto FifoPops : PopInst) {
+ auto &Pops = FifoPops.second;
+ sort(Pops, [&](const FifoAnalysis::Use *A, const FifoAnalysis::Use *B) {
+ SUnit *SUA = DAG->getSUnit(&A->getInstr());
+ SUnit *SUB = DAG->getSUnit(&B->getInstr());
+ return std::make_tuple(SUA->getDepth(), SUB->getHeight(), SUA->NodeNum) >
+ std::make_tuple(SUB->getDepth(), SUA->getHeight(), SUB->NodeNum);
+ });
+ for (unsigned I = 0; I < Pops.size() - 1; I++) {
+ SUnit *SU = DAG->getSUnit(&Pops[I]->getInstr());
+ SUnit *SUNext = DAG->getSUnit(&Pops[I + 1]->getInstr());
+ SDep Dep(SUNext, SDep::MustAliasMem);
+ // The latency doesn't matter, we only want to enforce an order between
+ // FIFO instructions.
+ Dep.setLatency(1);
+ SU->addPred(Dep, true);
+        // Add the matching edge between the PUSHs associated with those two
+        // POPs.
+ SUnit *PushSU = DAG->getSUnit(&(*Pops[I]->defs().begin())->getInstr());
+ SUnit *PushSUNext =
+ DAG->getSUnit(&(*Pops[I + 1]->defs().begin())->getInstr());
+ SDep DepPush(PushSUNext, SDep::MustAliasMem);
+ DepPush.setLatency(1);
+ PushSU->addPred(DepPush, true);
+ }
+ }
+ }
+};
+
+// Trivial scheduler trying to keep the original order. Since we added edges
+// between FIFO instructions in a different order than the original one, this
+// will automatically move FIFO instructions while making a minimum of changes
+// to the original order.
+class FifoScheduler : public MachineSchedStrategy {
+ struct SUnitOrder {
+ bool operator()(SUnit *A, SUnit *B) const {
+ return A->NodeNum < B->NodeNum;
+ }
+ };
+  // Priority queue where instructions are ordered based on NodeNum.
+ PriorityQueue<SUnit *, std::vector<SUnit *>, SUnitOrder> BottomQ;
+
+public:
+ FifoScheduler() = default;
+
+ void initialize(ScheduleDAGMI *) override {}
+ void initPolicy(MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned NumRegionInstrs) override {}
+
+ SUnit *pickNode(bool &IsTopNode) override {
+ if (BottomQ.empty())
+ return nullptr;
+ SUnit *SU = BottomQ.top();
+ BottomQ.pop();
+ assert(!SU->isScheduled);
+ IsTopNode = false;
+ return SU;
+ }
+ bool shouldTrackPressure() const override { return false; }
+ void schedNode(SUnit *SU, bool IsTopNode) override {}
+ void releaseTopNode(SUnit *SU) override {}
+ void releaseBottomNode(SUnit *SU) override { BottomQ.push(SU); }
+};
+
+// Wrapper pass to run the scheduler.
+class TPUFifoScheduling : public MachineFunctionPass {
+public:
+ static char ID;
+ TPUFifoScheduling() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF);
+ StringRef getPassName() const override { return "TPU fifo scheduling pass"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<TargetPassConfig>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+char TPUFifoScheduling::ID = 0;
+
+bool TPUFifoScheduling::runOnMachineFunction(MachineFunction &MF) {
+ auto *PassConfig = &getAnalysis<TargetPassConfig>();
+
+ MachineSchedContext Ctx;
+ Ctx.MF = &MF;
+ Ctx.PassConfig = PassConfig;
+ Ctx.RegClassInfo->runOnMachineFunction(MF);
+
+ auto LRSPtr = std::make_unique<FifoScheduler>();
+ ScheduleDAGMILive Scheduler(&Ctx, std::move(LRSPtr));
+ Scheduler.addMutation(std::make_unique<ReorderFifoConstraint>());
+ for (auto &MBB : MF) {
+ Scheduler.startBlock(&MBB);
+ auto RegionEnd = MBB.end();
+ // Don't schedule terminators.
+ while (MBB.begin() != RegionEnd && (&*std::prev(RegionEnd))->isTerminator())
+ RegionEnd--;
+ // Skip empty regions.
+ if (MBB.begin() == RegionEnd)
+ continue;
+ Scheduler.enterRegion(&MBB, MBB.begin(), RegionEnd, MBB.size());
+ Scheduler.schedule();
+ }
+ return true;
+}
+
+} // namespace
+
+INITIALIZE_PASS(TPUFifoScheduling, DEBUG_TYPE, "TPU Fifo scheduling", false,
+ false)
+
+Pass *llvm::createTPUFifoSchedulingPass() { return new TPUFifoScheduling(); }
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUGEPLowering.cpp b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUGEPLowering.cpp
new file mode 100644
index 0000000..0100d4f
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUGEPLowering.cpp
@@ -0,0 +1,291 @@
+//===-- TPUGEPLowering.cpp - GEP lowering ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// GetElementPtr instructions perform address manipulation. On TPU
+// none of the memories are byte-addressed. The default lowering of GEPs
+// by SelectionDAGBuilder is non-overridable, so instead we lower away GEPs
+// just prior to DAG construction. We can then modify the addressing logic
+// based on the address space.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TPU.h"
+#include "TPUIRUtils.h"
+#include "TPUSubtarget.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsTPU.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+
+#define DEBUG_TYPE "tpu-gep-lowering"
+using namespace llvm;
+
+namespace {
+class TPUGEPLowering : public FunctionPass {
+public:
+ static char ID;
+ TPUGEPLowering() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ AU.setPreservesCFG();
+ }
+
+ StringRef getPassName() const override { return "TPU GEP lowering"; }
+
+private:
+ void lowerGEP(GetElementPtrInst &I);
+ void lowerPtrCast(IntrinsicInst &I);
+ // Morphs scalar loads from vector memory back to vector.
+ // FIXME(b/221423458): Is this safe?
+ bool revectorizeLoads(Function &F);
+
+ // Recursively check if a constant expression contains a GEP.
+ bool ContainsGEP(ConstantExpr *CE);
+
+ // Recursively break a constant expression into one or several instructions.
+  // Insert the instructions at the right location so that they dominate the
+  // given user instruction, and replace the use at the given operand index.
+ void BreakCstExpr(ConstantExpr *CE, Instruction &User, unsigned OperandIdx);
+
+ // The currently processed function.
+ Function *Fun = nullptr;
+ const TPUSubtarget *ST = nullptr;
+};
+char TPUGEPLowering::ID = 0;
+} // namespace
+
+Pass *llvm::createTPUGEPLoweringPass() { return new TPUGEPLowering(); }
+
+bool TPUGEPLowering::ContainsGEP(ConstantExpr *CE) {
+ if (CE->getOpcode() == Instruction::GetElementPtr) {
+ return true;
+ }
+ for (unsigned N = 0; N < CE->getNumOperands(); ++N) {
+ if (auto *CEOp = dyn_cast<ConstantExpr>(CE->getOperand(N))) {
+ if (ContainsGEP(CEOp))
+ return true;
+ }
+ }
+ return false;
+}
+
+void TPUGEPLowering::BreakCstExpr(ConstantExpr *CE, Instruction &User,
+ unsigned OperandIdx) {
+ auto *Inst = CE->getAsInstruction();
+ if (PHINode *Phi = dyn_cast<PHINode>(&User)) {
+ Inst->insertBefore(Phi->getIncomingBlock(OperandIdx)->getTerminator());
+ User.setOperand(OperandIdx, Inst);
+ } else {
+ Inst->insertBefore(&User);
+ User.replaceUsesOfWith(CE, Inst);
+ }
+ for (unsigned N = 0; N < Inst->getNumOperands(); ++N) {
+ if (auto *CEOp = dyn_cast<ConstantExpr>(Inst->getOperand(N)))
+ BreakCstExpr(CEOp, *Inst, N);
+ }
+}
+
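+// Illustrative shape of the revectorization below (the 8-element vector width
+// is an assumption for the example; the real width comes from
+// TPUSubtarget::vectorSizeInElements):
+//
+//   %s = load float, float addrspace(AS)* %p
+//
+// becomes
+//
+//   %vp = bitcast float addrspace(AS)* %p to <8 x float> addrspace(AS)*
+//   %v  = load <8 x float>, <8 x float> addrspace(AS)* %vp
+//   %s  = extractelement <8 x float> %v, i32 0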
+bool TPUGEPLowering::revectorizeLoads(Function &F) {
+ if (!ST->hasVPU())
+ return false;
+ bool Changed = false;
+ SmallVector<GetElementPtrInst *> GEPsToRemove;
+ for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E;) {
+ Instruction *Inst = &(*I++);
+ LoadInst *Load = dyn_cast<LoadInst>(Inst);
+ if (!Load)
+ continue;
+ if (Load->getType()->isVectorTy())
+ continue;
+ unsigned AS = Load->getPointerOperandType()->getPointerAddressSpace();
+ if (AS != TPUAS_TileSpmem && AS != TPUAS_Vmem)
+ continue;
+ Changed = true;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ VectorType *VTy = VectorType::get(
+ Load->getType(),
+ ST->vectorSizeInElements(DL.getTypeAllocSize(Load->getType())), false);
+ IRBuilder<> B(Load->getParent(), Load->getIterator());
+ LoadInst *NewLoad = nullptr;
+ if (GetElementPtrInst *GEP =
+ dyn_cast<GetElementPtrInst>(Load->getPointerOperand())) {
+ Type *LoadTy = Load->getType();
+ assert(cast<PointerType>(GEP->getPointerOperandType())
+ ->isOpaqueOrPointeeTypeMatches(LoadTy));
+ bool IsVector = LoadTy->isVectorTy();
+ // We do not support aggregates at this point.
+ assert(IsVector || LoadTy->isIntegerTy() || LoadTy->isFloatingPointTy());
+ assert(!LoadTy->isAggregateType());
+ assert(AS != TPUAS_TileSpmem || Load->getAlign() >= llvm::Align(32));
+ assert(AS != TPUAS_Vmem || Load->getAlign() >= llvm::Align(4096));
+ assert(GEP->hasIndices());
+ SmallVector<Value *> Indices;
+ Value *PointerOperand;
+ if (!IsVector) {
+ Indices.push_back(B.getInt32(0));
+ PointerOperand = B.CreateBitCast(GEP->getPointerOperand(),
+ PointerType::get(VTy, AS));
+ } else {
+ PointerOperand = GEP->getPointerOperand();
+ }
+ Indices.insert(Indices.end(), GEP->indices().begin(),
+ GEP->indices().end());
+ Value *NewGEP = B.CreateGEP(VTy, PointerOperand, Indices);
+ if (isa<GetElementPtrInst>(NewGEP))
+ cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEP->isInBounds());
+ // We delete later, since it may be used elsewhere in the function.
+ GEPsToRemove.push_back(GEP);
+ // Casting the scalar pointer back to vector.
+ NewGEP = B.CreateBitCast(NewGEP, PointerType::get(VTy, AS));
+ NewLoad = B.CreateLoad(VTy, NewGEP);
+ } else {
+ NewLoad = B.CreateLoad(VTy, B.CreateBitCast(Load->getPointerOperand(),
+ PointerType::get(VTy, AS)));
+ }
+ Value *ExtrElInst = B.CreateExtractElement(NewLoad, B.getInt32(0));
+ Load->replaceAllUsesWith(ExtrElInst);
+ Load->eraseFromParent();
+ }
+ for (auto *GEP : GEPsToRemove)
+ GEP->eraseFromParent();
+ return Changed;
+}
+
+bool TPUGEPLowering::runOnFunction(Function &F) {
+ Fun = &F;
+ auto &TPC = getAnalysis<TargetPassConfig>();
+ auto &TM = TPC.getTM<TargetMachine>();
+ ST = &TM.getSubtarget<TPUSubtarget>(*Fun);
+
+ // Attempt to revectorize scalarized memory accesses to vector memory.
+ revectorizeLoads(F);
+
+ // First remove all uses of ConstantGEPs - promote them to instructions.
+ for (auto &I : instructions(F)) {
+ for (unsigned N = 0; N < I.getNumOperands(); ++N) {
+ if (auto *CE = dyn_cast<ConstantExpr>(I.getOperand(N))) {
+ if (ContainsGEP(CE))
+ BreakCstExpr(CE, I, N);
+ }
+ }
+ }
+ for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E;) {
+ Instruction* Inst = &(*I++);
+ if(IntrinsicInst* Intr = dyn_cast<IntrinsicInst>(Inst)) {
+ if (Intr->getIntrinsicID() == Intrinsic::tpu_inttoptr ||
+ Intr->getIntrinsicID() == Intrinsic::tpu_ptrtoint)
+ lowerPtrCast(*Intr);
+ } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst))
+ lowerGEP(*GEP);
+ }
+ return true;
+}
+
+void TPUGEPLowering::lowerGEP(GetElementPtrInst &I) {
+ LLVM_DEBUG(dbgs() << "lowerGEP: " << I << "\n");
+ //
+ // Note the structure of this code is taken from
+ // SelectionDAGBuilder::visitGetElementPtr.
+ // It decomposes the passed GEP instruction into an equivalent chain of
+ // single-indexed GEPs, while converting the input operands to i8 pointers
+  // and scaling the indices as needed by the TPU addressing.
+ // In so doing we still delegate lowering of the GEP instructions to
+ // SelectionDAGBuilder::visitGetElementPtr, but we adjust the pointer
+ // arithmetic as needed.
+ //
+ DataLayout DL(I.getModule());
+ Value *Op0 = I.getOperand(0);
+ unsigned AS = Op0->getType()->getScalarType()->getPointerAddressSpace();
+ assert(!I.getType()->isVectorTy() && "Vector GEP not implemented!");
+
+ IRBuilder<> B(I.getParent(), I.getIterator());
+ Type *I8Ty = PointerType::get(B.getInt8Ty(), AS);
+ Value *Ptr = B.CreateBitCast(Op0, I8Ty);
+ for (auto GTI = gep_type_begin(&I), E = gep_type_end(&I); GTI != E; ++GTI) {
+ Value *Idx = GTI.getOperand();
+ if (StructType *StTy = GTI.getStructTypeOrNull()) {
+ unsigned Field = cast<Constant>(Idx)->getUniqueInteger().getZExtValue();
+ if (Field) {
+ // Get the offset of Field in the struct.
+ uint64_t Offset = DL.getStructLayout(StTy)->getElementOffset(Field);
+ APInt APOffset = TPU::adjustForWordSize(APInt(32, Offset), AS, *ST);
+ Ptr = B.CreateGEP(B.getInt8Ty(), Ptr, B.getInt(APOffset));
+ }
+ } else {
+ if (isa<ConstantInt>(Idx) && cast<ConstantInt>(Idx)->isZero())
+ continue;
+ unsigned IdxSize = DL.getIndexSizeInBits(AS);
+ APInt ElementSize(IdxSize, DL.getTypeAllocSize(GTI.getIndexedType()));
+ unsigned Shift = TPU::getShiftSize(AS, *ST);
+
+ // If this is a scalar constant or a splat vector of constants,
+ // handle it quickly.
+ const auto *CI = dyn_cast<ConstantInt>(Idx);
+ if (!CI && isa<ConstantDataVector>(Idx) &&
+ cast<ConstantDataVector>(Idx)->getSplatValue())
+ CI = cast<ConstantInt>(cast<ConstantDataVector>(Idx)->getSplatValue());
+
+ if (CI && CI->isZero()) {
+ continue;
+ }
+ Idx = B.CreateSExtOrTrunc(Idx, B.getInt32Ty());
+      // We can only handle the case where the element size is divisible by
+      // the granularity, or the granularity is divisible by the element size.
+      // Otherwise we lose some bits in the transformation.
+ Value *Offset = nullptr;
+ if (ElementSize.lshr(Shift).shl(Shift) == ElementSize) {
+ // The shift amount itself does not need arithmetic shift. Negative
+ // indices are covered by the multiply.
+ Offset = B.CreateMul(Idx, B.getInt(ElementSize.lshr(Shift)));
+ } else if ((1 << Shift) % ElementSize.getZExtValue() == 0) {
+ unsigned AShr = Shift - ElementSize.logBase2();
+ if (ConstantInt *C = dyn_cast<ConstantInt>(Idx)) {
+          if (C->getZExtValue() & ((1 << AShr) - 1)) {
+ report_fatal_error(
+ "Index not suitable for word size, would get dropped.\n");
+ }
+ }
+ // Arithmetic shift.
+ Offset = B.CreateAShr(Idx, AShr);
+ } else {
+ report_fatal_error(
+ "Address not trivially aligned on address space granularity.\n");
+ }
+
+ Ptr = B.CreateGEP(B.getInt8Ty(), Ptr, Offset);
+ }
+ }
+
+ I.replaceAllUsesWith(B.CreateBitCast(Ptr, I.getType()));
+ I.eraseFromParent();
+}
+
+void TPUGEPLowering::lowerPtrCast(IntrinsicInst &I) {
+ IRBuilder<> B(&I);
+ Value* NewCast = nullptr;
+ if(I.getIntrinsicID() == Intrinsic::tpu_inttoptr)
+ NewCast = B.CreateIntToPtr(I.getOperand(0), I.getType());
+ if(I.getIntrinsicID() == Intrinsic::tpu_ptrtoint)
+ NewCast = B.CreatePtrToInt(I.getOperand(0), I.getType());
+ assert(NewCast != nullptr);
+ I.replaceAllUsesWith(NewCast);
+ I.eraseFromParent();
+}
+
+INITIALIZE_PASS(TPUGEPLowering, DEBUG_TYPE, "TPU GEP lowering", false, false)
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUIRUtils.cpp b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUIRUtils.cpp
new file mode 100644
index 0000000..cb5480a
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUIRUtils.cpp
@@ -0,0 +1,328 @@
+//===-- TPUIRUtils.cpp - IR Lowering util functions/classes -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation file for TPUIRUtils.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TPUIRUtils.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsTPU.h"
+
+using namespace llvm;
+
+unsigned TPU::getShiftSize(unsigned AS, const TPUSubtarget &ST) {
+ unsigned Shift;
+ switch (static_cast<TPUAddressSpace>(AS)) {
+ case TPUAS_Vmem:
+ case TPUAS_Cmem:
+ // Vector word size is 512 bytes.
+ Shift = 9;
+ break;
+ case TPUAS_Hbm:
+ case TPUAS_HbmAny: {
+ Shift = ST.getHbmWordSizeLog2();
+ break;
+ }
+ case TPUAS_Simem:
+ case TPUAS_Timem:
+ // Simem and Timem word size is 32 bytes.
+ Shift = 5;
+ break;
+ case TPUAS_Smem:
+ case TPUAS_SmemAny:
+ case TPUAS_Sflag:
+ case TPUAS_SflagAny:
+ case TPUAS_SflagOther:
+ case TPUAS_SflagTile:
+ case TPUAS_TileSpmem:
+ case TPUAS_Spmem:
+ case TPUAS_Dreg:
+ // Scalar word size is 4 bytes.
+ Shift = 2;
+ break;
+ case TPUAS_Bmem:
+ llvm_unreachable("Unsupported");
+ case TPUAS_Iova:
+ // IOVA word size is 4k bytes per design. The real word size is actually 1
+ // byte.
+ Shift = 12;
+ break;
+ }
+ return Shift;
+}
+
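+// Worked example (using the word sizes documented in getShiftSize above): a
+// byte offset of 64 in Simem (32-byte words, Shift == 5) becomes a word
+// offset of 2, while a byte offset of 48 triggers report_fatal_error because
+// it is not aligned on the 32-byte word size.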
+APInt TPU::adjustForWordSize(const APInt &V, unsigned AS,
+ const TPUSubtarget &ST) {
+ unsigned Shift = getShiftSize(AS, ST);
+ if (V.ashr(Shift).shl(Shift) != V)
+ report_fatal_error("Pointer Offset not aligned on word size");
+ return V.ashr(Shift);
+}
+
+bool TPU::isTransposeEnd(const Instruction &I) {
+ const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
+ if (!II)
+ return false;
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::tpu_tc_transpose_end:
+ case Intrinsic::tpu_tc_transpose_start_end:
+ case Intrinsic::tpu_tc_transpose_end_segmented:
+ case Intrinsic::tpu_tc_transpose_end_packed:
+ case Intrinsic::tpu_tc_transpose_end_segmented_packed:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool TPU::isXLUPop(const Instruction &I) {
+ const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
+ if (!II)
+ return false;
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::tpu_tc_vtrfpop:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool TPU::isTransposePushNotPacked(const Instruction &I) {
+ const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
+ if (!II)
+ return false;
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::tpu_tc_transpose:
+ case Intrinsic::tpu_tc_transpose_start:
+ case Intrinsic::tpu_tc_transpose_start_end:
+ case Intrinsic::tpu_tc_transpose_end:
+ case Intrinsic::tpu_tc_transpose_segmented:
+ case Intrinsic::tpu_tc_transpose_start_segmented:
+ case Intrinsic::tpu_tc_transpose_start_end_segmented:
+ case Intrinsic::tpu_tc_transpose_end_segmented:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool TPU::isTransposePushPacked(const Instruction &I) {
+ const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
+ if (!II)
+ return false;
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::tpu_tc_transpose_packed:
+ case Intrinsic::tpu_tc_transpose_start_packed:
+ case Intrinsic::tpu_tc_transpose_start_end_packed:
+ case Intrinsic::tpu_tc_transpose_end_packed:
+ case Intrinsic::tpu_tc_transpose_segmented_packed:
+ case Intrinsic::tpu_tc_transpose_start_segmented_packed:
+ case Intrinsic::tpu_tc_transpose_start_end_segmented_packed:
+ case Intrinsic::tpu_tc_transpose_end_segmented_packed:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool TPU::isTransposePush(const Instruction &I) {
+ return isTransposePushNotPacked(I) || isTransposePushPacked(I);
+}
+
+bool TPU::isRotatePushNotPacked(const Instruction &I) {
+ const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
+ if (!II)
+ return false;
+ return II->getIntrinsicID() == Intrinsic::tpu_vrotate;
+}
+
+bool TPU::isRotatePushPacked(const Instruction &I) {
+ const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
+ if (!II)
+ return false;
+ return II->getIntrinsicID() == Intrinsic::tpu_vrotate_packed;
+}
+
+bool TPU::isRotatePush(const Instruction &I) {
+ return isRotatePushNotPacked(I) || isRotatePushPacked(I);
+}
+
+bool TPU::reducesPrecisionToBF16(const Instruction &I, int OpIdx) {
+ if (I.getOpcode() != Instruction::Call)
+ return false;
+ switch (cast<CallInst>(I).getIntrinsicID()) {
+ case Intrinsic::tpu_vmatmul_f32:
+ case Intrinsic::tpu_vmatpush_f32:
+ if (OpIdx == 0)
+ return true;
+ return false;
+ case Intrinsic::tpu_pack:
+ case Intrinsic::tpu_packc:
+ if (OpIdx == 0 || OpIdx == 2)
+ return true;
+ return false;
+ default:
+ return false;
+ }
+}
+
+SmallVector<int, 2> TPU::preservesBF16OperandPrecision(const Instruction &I) {
+ switch (I.getOpcode()) {
+ case Instruction::Call: {
+ switch (cast<CallInst>(I).getIntrinsicID()) {
+ case Intrinsic::tpu_xlane_max:
+ case Intrinsic::tpu_xlane_min:
+ case Intrinsic::tpu_xlane_segmented_max:
+ case Intrinsic::tpu_xlane_segmented_min:
+ case Intrinsic::tpu_vrotate:
+ case Intrinsic::tpu_vrot_sublane_down:
+ return {0};
+ default:
+ return {};
+ }
+ }
+ case Instruction::BitCast:
+ return {0};
+ case Instruction::Select:
+ return {1, 2};
+ default:
+ return {};
+ }
+}
+
+std::optional<unsigned> TPU::getMemOpPtrIndex(const Instruction &I) {
+ std::optional<unsigned> OpIdx;
+ if (auto *L = dyn_cast<LoadInst>(&I))
+ OpIdx = L->getPointerOperandIndex();
+ else if (auto *S = dyn_cast<StoreInst>(&I))
+ OpIdx = S->getPointerOperandIndex();
+ else if (auto *Intr = dyn_cast<IntrinsicInst>(&I)) {
+ switch (Intr->getIntrinsicID()) {
+ case Intrinsic::tpu_vld_strided:
+ case Intrinsic::tpu_vld_shuffle:
+ case Intrinsic::tpu_vld_indexed:
+ case Intrinsic::tpu_vld_replicate_evenodd_sublanes:
+ OpIdx = 0;
+ break;
+ case Intrinsic::tpu_vst_strided:
+ case Intrinsic::tpu_vst_indexed:
+ case Intrinsic::tpu_vst_evenodd_sublanes:
+ OpIdx = 1;
+ break;
+ default:
+ break;
+ }
+ }
+ return OpIdx;
+}
+
+LocationSize TPU::getMemOpAccessSize(const Instruction &I) {
+ switch (I.getOpcode()) {
+ case Instruction::Load:
+ return LocationSize::precise(
+ I.getModule()->getDataLayout().getTypeStoreSize(
+ cast<LoadInst>(I).getType()));
+ case Instruction::Store:
+ return LocationSize::precise(
+ I.getModule()->getDataLayout().getTypeStoreSize(
+ cast<StoreInst>(I).getValueOperand()->getType()));
+ case Instruction::Call: {
+ switch (cast<CallInst>(I).getIntrinsicID()) {
+ case Intrinsic::tpu_vld_shuffle:
+ case Intrinsic::tpu_vld_replicate_evenodd_sublanes:
+ return LocationSize::upperBound(
+ I.getModule()->getDataLayout().getTypeStoreSize(I.getType()));
+ case Intrinsic::tpu_vst_evenodd_sublanes:
+ return LocationSize::upperBound(
+ I.getModule()->getDataLayout().getTypeStoreSize(
+ I.getOperand(0)->getType()));
+ default:
+ break;
+ }
+ }
+ }
+ return LocationSize::beforeOrAfterPointer();
+}
+
+unsigned TPU::getMemOpAddrSpace(const Instruction &I) {
+ auto OpIdx = getMemOpPtrIndex(I);
+ assert(OpIdx.has_value());
+ Value *Ptr = I.getOperand(OpIdx.value());
+ assert(Ptr->getType()->isPointerTy() && "Must be pointer");
+ return cast<PointerType>(Ptr->getType())->getAddressSpace();
+}
+
+const Instruction *TPU::getPreviousTransposePush(const Instruction &I,
+ bool Packed) {
+ assert(isTransposePush(I) && "Needs to be called on transpose push");
+ Value *Op = I.getOperand(Packed ? 5 : 4);
+ if (Op == UndefValue::get(Op->getType()))
+ return nullptr;
+ assert(isa<Instruction>(Op) && "Expected previous push to be an instruction");
+ return cast<Instruction>(Op);
+}
+
+const Value *TPU::getRotateAmount(const Instruction &I) {
+ assert(isRotatePush(I) && "Needs to be called on a rotate lane");
+ if (isRotatePushNotPacked(I))
+ return I.getOperand(1);
+ return I.getOperand(2);
+}
+
+const Value *TPU::getRotateBusIdx(const Instruction &I) {
+ assert(isRotatePush(I) && "Needs to be called on a rotate lane");
+ if (isRotatePushNotPacked(I))
+ return I.getOperand(2);
+ return I.getOperand(3);
+}
+
+bool TPU::isTLPFunction(const Function &F) {
+ return F.hasFnAttribute("is-tlp-function");
+}
+
+namespace {
+// Extract a single metadata int value.
+int GetMetadataValue(const Module *M, const Twine &Name, unsigned Idx) {
+ NamedMDNode *NMDN = M->getNamedMetadata(Name);
+ if (NMDN == nullptr) {
+ report_fatal_error("Missing expected metadata: " + Name);
+ }
+ MDNode *MDN = cast<MDNode>(NMDN->getOperand(Idx));
+ ConstantAsMetadata *CAM = cast<ConstantAsMetadata>(MDN->getOperand(0));
+ ConstantInt *CI = cast<ConstantInt>(CAM->getValue());
+ return CI->getSExtValue();
+}
+} // namespace
+
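+// Illustrative module-level metadata shape expected by the helpers below (the
+// metadata names are hypothetical; callers pass in the actual names):
+//
+//   !spill.start = !{!0}
+//   !spill.limit = !{!1}
+//   !0 = !{i32 100}
+//   !1 = !{i32 200}
+//
+// GetSpillRange(M, "spill.start", "spill.limit") would then return {100, 200}.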
+// Extract spill range as a pair [start, limit).
+std::pair<int, int> TPU::GetSpillRange(const Module *M, StringRef start_name,
+ StringRef limit_name) {
+ return {GetMetadataValue(M, start_name, 0),
+ GetMetadataValue(M, limit_name, 0)};
+}
+
+// Extract spill range as a pair [start, limit).
+std::pair<int, int> TPU::GetSpillRange(const Module *M, const Function *F,
+ const Twine &FuncsName,
+ const Twine &StartName,
+ const Twine &LimitName) {
+ unsigned Idx = 0;
+ NamedMDNode *NMDNFuncs = M->getNamedMetadata(FuncsName);
+ if (NMDNFuncs == nullptr)
+ report_fatal_error("Missing expected metadata: " + FuncsName);
+ for (Idx = 0; Idx < NMDNFuncs->getNumOperands(); Idx++) {
+ MDNode *MDN = cast<MDNode>(NMDNFuncs->getOperand(Idx));
+ if (cast<ValueAsMetadata>(MDN->getOperand(0))->getValue() == F)
+ break;
+ }
+ if (Idx == NMDNFuncs->getNumOperands())
+ report_fatal_error("Can't find function in metadata");
+ return {GetMetadataValue(M, StartName, Idx),
+ GetMetadataValue(M, LimitName, Idx)};
+}
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUIRUtils.h b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUIRUtils.h
new file mode 100644
index 0000000..fd89b78
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUIRUtils.h
@@ -0,0 +1,127 @@
+//===-- TPUIRUtils.h - IR Lowering util functions/classes -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a collection of functions/classes to be used by IR based passes in
+// the TPU backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_GOOGLETPU_TPUUTILS_H_
+#define LLVM_LIB_TARGET_GOOGLETPU_TPUUTILS_H_
+
+#include "TPU.h"
+#include "TPUSubtarget.h"
+#include "llvm/Analysis/MemoryLocation.h"
+
+namespace llvm {
+namespace TPU {
+
+// Returns the number of bits to shift to convert a byte address to a word
+// address.
+unsigned getShiftSize(unsigned AS, const TPUSubtarget &ST);
+
+// Given a value in bytes, adjust it for the word size of address space AS.
+// TMem Vectors are 32-byte addressed, scalars are 4-byte, VMem Vectors are
+// 512-byte.
+APInt adjustForWordSize(const APInt &V, unsigned AS, const TPUSubtarget &ST);
+
+// Returns true if this is the last push of a transpose sequence.
+bool isTransposeEnd(const Instruction &I);
+
+// Returns true if this is an XLU pop.
+bool isXLUPop(const Instruction &I);
+
+// Return true if this is a regular XLU transpose push.
+bool isTransposePushNotPacked(const Instruction &I);
+
+// Return true if this is a packed XLU transpose push.
+bool isTransposePushPacked(const Instruction &I);
+
+// Return true if this is an XLU transpose push.
+bool isTransposePush(const Instruction &I);
+
+// Return true if this is a regular XLU rotate push.
+bool isRotatePushNotPacked(const Instruction &I);
+
+// Return true if this is a packed XLU rotate push.
+bool isRotatePushPacked(const Instruction &I);
+
+// Return true if this is a XLU rotate push.
+// Return true if this is an XLU rotate push.
+
+// Returns true if the instruction I implicitly reduces the precision of
+// operand OpIdx to bf16.
+bool reducesPrecisionToBF16(const Instruction &I, int OpIdx);
+
+// Returns the indices of the operands of instruction I that transfer their
+// precision to the output (for example, the selected operands of a select).
+SmallVector<int, 2> preservesBF16OperandPrecision(const Instruction &I);
+
+// Returns the index of the pointer operand of instruction I.
+std::optional<unsigned> getMemOpPtrIndex(const Instruction &I);
+
+// Returns the access size for memory operation I.
+LocationSize getMemOpAccessSize(const Instruction &I);
+
+// Returns the address space of memory operation I.
+unsigned getMemOpAddrSpace(const Instruction &I);
+
+// Returns the previous transpose push in the sequence. Needs to be called
+// on a Transpose push. If this is the first push then returns nullptr.
+const Instruction *getPreviousTransposePush(const Instruction &I, bool Packed);
+
+// Returns the Amount operand of a rotate lane intrinsic.
+const Value *getRotateAmount(const Instruction &I);
+
+// Returns the BusIdx operand of a rotate lane intrinsic.
+const Value *getRotateBusIdx(const Instruction &I);
+
+// Returns true if the passed Function is the TLP function.
+bool isTLPFunction(const Function &F);
+
+// Returns the spill range based on metadata per module.
+std::pair<int, int> GetSpillRange(const Module *M, StringRef start_name,
+ StringRef limit_name);
+
+// Returns the spill range based on metadata per function.
+std::pair<int, int> GetSpillRange(const Module *M, const Function *F,
+ const Twine &FuncsName,
+ const Twine &StartName,
+ const Twine &LimitName);
+
+// Helper utility that calls a member function of an object upon destruction.
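+// Illustrative usage (the tracker type and the lambda below are hypothetical,
+// shown only to demonstrate the two forms):
+//
+//   SomeTracker Tracker;
+//   Cleanup<SomeTracker> C(&Tracker, &SomeTracker::reset); // reset at scope exit
+//   Cleanup<> Done([] { dbgs() << "done\n"; });            // run at scope exit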
+template <class T = void> struct Cleanup {
+ using MemFuncPtr = void (T::*)();
+ Cleanup() = delete;
+ Cleanup(T *o, MemFuncPtr fo) : O(o), FO(fo) {}
+ Cleanup(T **oo, MemFuncPtr fo) : OO(oo), FO(fo) {}
+ ~Cleanup() {
+ if (O && FO)
+ (O->*FO)();
+ if (OO && *OO && FO)
+ ((*OO)->*FO)();
+ }
+ T *get() { return O; }
+ T *O = nullptr;
+ T **OO = nullptr;
+ MemFuncPtr FO = nullptr;
+};
+// Helper utility specialization that calls a function.
+template <> struct Cleanup<void> {
+ Cleanup(std::function<void()> f) : F(f) {}
+ ~Cleanup() {
+ if (F)
+ F();
+ }
+ std::function<void()> F = nullptr;
+};
+
+} // namespace TPU
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_GOOGLETPU_TPUUTILS_H_
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUISelDAGToDAG.cpp b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUISelDAGToDAG.cpp
new file mode 100644
index 0000000..8498229
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUISelDAGToDAG.cpp
@@ -0,0 +1,844 @@
+//===------ TPUISelDAGToDAG.cpp - A dag to dag inst selector for TPU ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the TPU target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/TPUMCTargetDesc.h"
+#include "TPU.h"
+#include "TPUInstrInfo.h"
+#include "TPURegisterInfo.h"
+#include "TPUSubtarget.h"
+#include "TPUTargetMachine.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsTPU.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+#define DEBUG_TYPE "tpu-isel"
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// TPUDAGToDAGISel - TPU specific code to select TPU machine
+// instructions for SelectionDAG operations.
+//===----------------------------------------------------------------------===//
+namespace {
+
+uint64_t invertRawBarnaCorePredicate(uint64_t X) {
+ return TPUPredicate::fromRawBcEncoding(X)
+ .toggleInvert()
+ .toRawBcEncoding();
+}
+
+class TPUDAGToDAGISel : public SelectionDAGISel {
+public:
+ static char ID;
+
+ explicit TPUDAGToDAGISel(TPUTargetMachine *tm = nullptr)
+ : SelectionDAGISel(ID, *tm) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ ST = &MF.getSubtarget<TPUSubtarget>();
+ return SelectionDAGISel::runOnMachineFunction(MF);
+ }
+
+ // Pass Name
+ StringRef getPassName() const override {
+ return "TPU DAG->DAG Pattern Instruction Selection";
+ }
+
+private:
+ // Determine whether an ISD::OR's operands are suitable to turn the operation
+ // into an addition, which often has more compact encodings.
+ bool SelectAddLikeOr(SDNode *Parent, SDValue N, SDValue &Out) {
+ assert(Parent->getOpcode() == ISD::OR && "unexpected parent");
+ Out = N;
+ return CurDAG->haveNoCommonBitsSet(N, Parent->getOperand(1));
+ }
+
+ const TPUSubtarget *ST;
+// Include the pieces autogenerated from the target description.
+#include "TPUGenDAGISel.inc"
+
+  // Instruction Selection not handled by the auto-generated tablegen
+ void Select(SDNode *N) override;
+
+ // Support functions for the opcodes of Instruction Selection
+ // not handled by the auto-generated tblgen
+ void selectDIVREM(SDNode *N);
+ void selectFrameIndex(SDNode *N);
+ void selectBRCOND(SDNode *N);
+ void selectEVENT(SDNode *N);
+ void selectCrossLaneIntrinsic(SDNode *N, const TargetRegisterClass *RC);
+ void selectErfIntrinsic(SDNode *N);
+ bool selectXOR(SDNode *N);
+ void selectReadCycleCounter(SDNode *N);
+ void selectTrapIntrinsic(SDNode *N);
+ void selectMaskAdapter(SDNode *N);
+};
+
+char TPUDAGToDAGISel::ID = 0;
+
+} // namespace
+
+// Select instructions not customized! Used for
+// expanded, promoted and normal instructions
+void TPUDAGToDAGISel::Select(SDNode *Node) {
+ unsigned Opcode = Node->getOpcode();
+
+ // If we have a custom node, we already have selected!
+ if (Node->isMachineOpcode()) {
+ LLVM_DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+ return;
+ }
+
+ // Instruction Selection not handled by the auto-generated tablegen selection
+ // should be handled here.
+ switch (Opcode) {
+ case ISD::FrameIndex:
+ return selectFrameIndex(Node);
+ case ISD::BR:
+ break;
+ case ISD::BRCOND:
+ if (ST->hasBarnaCoreChannelControllerIsa())
+ // brcond is selected in tblgen for BarnaCore.
+ break;
+ return selectBRCOND(Node);
+ case ISD::INTRINSIC_WO_CHAIN:
+ switch (Node->getConstantOperandVal(0)) {
+ default:
+ break;
+ case Intrinsic::tpu_rsqrt_macro:
+ case Intrinsic::tpu_pow2_macro:
+ case Intrinsic::tpu_log2_macro:
+ case Intrinsic::tpu_tanh_macro:
+ case Intrinsic::tpu_rcp_macro:
+ case Intrinsic::tpu_sigshft_macro:
+ case Intrinsic::tpu_sin_macro:
+ case Intrinsic::tpu_cos_macro:
+ case Intrinsic::tpu_erf_macro:
+ return selectErfIntrinsic(Node);
+ case Intrinsic::tpu_add_scan1xNi:
+ case Intrinsic::tpu_add_scan1xNf:
+ case Intrinsic::tpu_add_seg_scan1xNi:
+ case Intrinsic::tpu_add_seg_scan1xNf:
+ case Intrinsic::tpu_min_seg_scan1xNi:
+ case Intrinsic::tpu_min_seg_scan1xNf:
+ case Intrinsic::tpu_min_seg_index_scan1xNi:
+ case Intrinsic::tpu_min_seg_index_scan1xNf:
+ case Intrinsic::tpu_max_seg_scan1xNi:
+ case Intrinsic::tpu_max_seg_scan1xNf:
+ case Intrinsic::tpu_max_seg_index_scan1xNi:
+ case Intrinsic::tpu_max_seg_index_scan1xNf:
+ case Intrinsic::tpu_min_scan1xNi:
+ case Intrinsic::tpu_min_scan1xNf:
+ case Intrinsic::tpu_min_index_scan1xNi:
+ case Intrinsic::tpu_min_index_scan1xNf:
+ case Intrinsic::tpu_max_scan1xNi:
+ case Intrinsic::tpu_max_scan1xNf:
+ case Intrinsic::tpu_max_index_scan1xNi:
+ case Intrinsic::tpu_max_index_scan1xNf:
+ case Intrinsic::tpu_add_half_scan2xNbf16:
+ case Intrinsic::tpu_add_full_scan2xNbf16:
+ case Intrinsic::tpu_min_scan2xNbf16:
+ case Intrinsic::tpu_max_scan2xNbf16:
+ case Intrinsic::tpu_min_index_scan2xNbf16:
+ case Intrinsic::tpu_max_index_scan2xNbf16:
+ case Intrinsic::tpu_add_half_seg_scan2xNbf16:
+ case Intrinsic::tpu_add_full_seg_scan2xNbf16:
+ case Intrinsic::tpu_min_seg_scan2xNbf16:
+ case Intrinsic::tpu_max_seg_scan2xNbf16:
+ case Intrinsic::tpu_min_seg_index_scan2xNbf16:
+ case Intrinsic::tpu_max_seg_index_scan2xNbf16:
+ case Intrinsic::tpu_deprecated_segreduce_addf:
+ return selectCrossLaneIntrinsic(Node, &TPU::XRFPR0RegClass);
+ case Intrinsic::tpu_sort_ascdi:
+ case Intrinsic::tpu_sort_ascdf:
+ case Intrinsic::tpu_sort_dscdi:
+ case Intrinsic::tpu_sort_dscdf:
+ case Intrinsic::tpu_dupcnti:
+ case Intrinsic::tpu_dupcntf:
+ case Intrinsic::tpu_uniquei:
+ case Intrinsic::tpu_uniquef:
+ return selectCrossLaneIntrinsic(Node, &TPU::XRFPR1RegClass);
+ case Intrinsic::tpu_16i1_to_8i1:
+ case Intrinsic::tpu_8i1_to_16i1:
+ case Intrinsic::tpu_16i1_to_32i1:
+ case Intrinsic::tpu_32i1_to_8i1:
+ case Intrinsic::tpu_8i1_to_32i1:
+      // These special intrinsics lower the type adapter intrinsics that we
+      // added as a workaround to be able to mix the low precision mask types
+      // v16i1, v32i1 and v8i1.
+ return selectMaskAdapter(Node);
+ }
+ break;
+ case ISD::INTRINSIC_VOID:
+ switch (Node->getConstantOperandVal(1)) {
+ default:
+ break;
+ case Intrinsic::tpu_event:
+ return selectEVENT(Node);
+ case Intrinsic::tpu_wait_trap:
+ // We need to expand the trap intrinsic late mainly because we need to
+ // predicate the code. Otherwise, we could have done it during combine.
+ return selectTrapIntrinsic(Node);
+ }
+ break;
+ case ISD::INTRINSIC_W_CHAIN:
+ switch (Node->getConstantOperandVal(1)) {
+ default:
+ break;
+ case Intrinsic::tpu_read_global_cycle_count:
+ case Intrinsic::tpu_read_local_cycle_count:
+ return selectReadCycleCounter(Node);
+ }
+ break;
+ case ISD::XOR:
+ if (selectXOR(Node))
+ return;
+ break;
+ case ISD::SDIVREM:
+ case ISD::SDIV:
+ case ISD::SREM: {
+ // Note: If we start setting -tpu-emulate-signed-divrem, we may want to just
+ // remove the flag. This is mostly to scare people away from unknowingly
+ // triggering emulation (resulting in extra instructions, since hardware
+ // lacks signed division instructions) when rewriting to use an unsigned
+ // divide would work just as well.
+ llvm_unreachable("Signed integer division not supported (emulation can "
+ "be enabled by -tpu-emulate-signed-divrem)");
+ break;
+ }
+ case ISD::UDIV:
+ case ISD::UREM: {
+ llvm_unreachable("Should have been expanded into UDIVREM.");
+ break;
+ }
+ case ISD::UDIVREM:
+ if (!ST->hasVfcTensorCore() && !ST->isSparseCore())
+ llvm_unreachable("Scalar divide not supported on this target.");
+ return selectDIVREM(Node);
+ default:
+ break;
+ }
+
+ // Select the default instruction
+ SelectCode(Node);
+}
+
+void TPUDAGToDAGISel::selectDIVREM(SDNode *Node) {
+ assert(Node->getOpcode() == ISD::UDIVREM);
+ SDLoc DL(Node);
+ assert(Node->getVTList().NumVTs == 2);
+ const TPUTargetLowering *TLI = ST->getTargetLowering();
+
+ SmallVector<SDValue, 4> Operands;
+ // Divisor
+ Operands.push_back(Node->getOperand(0));
+
+ auto Dividend = Node->getOperand(1);
+ bool IsImmediate = isa<ConstantSDNode>(Dividend);
+ if (IsImmediate)
+ Operands.push_back(CurDAG->getTargetConstant(
+ cast<ConstantSDNode>(Dividend)->getAPIntValue(), SDLoc(Node),
+ MVT::i32));
+ else
+ Operands.push_back(Dividend);
+
+ auto PredReg = CurDAG->getRegister(TPU::Palways, MVT::i1);
+ auto PredInvert = CurDAG->getTargetConstant(APInt(1, 0), DL, MVT::i1);
+ Operands.push_back(PredReg);
+ Operands.push_back(PredInvert);
+
+ assert(!SDValue(Node, 0).use_empty() || !SDValue(Node, 1).use_empty());
+
+ SDNode *PopDiv = nullptr;
+ SDNode *PopRem = nullptr;
+ unsigned Opcode;
+
+ bool HasDiv = !SDValue(Node, 0).use_empty();
+ bool HasRem = !SDValue(Node, 1).use_empty();
+
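+  // The opcode chosen below pushes its result(s) onto the divide result FIFO
+  // (DRFPRRegClass); each result that is actually used is then retrieved with
+  // a separate SPOP_DRF pop further down.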
+ if (!HasDiv) {
+ Opcode = IsImmediate ? TPU::SREMri : TPU::SREMrr;
+ } else if (!HasRem) {
+ Opcode = IsImmediate ? TPU::SDIVri : TPU::SDIVrr;
+ } else {
+ Opcode = IsImmediate ? TPU::SDIVREMri : TPU::SDIVREMrr;
+ }
+ MachineSDNode *MN =
+ CurDAG->getMachineNode(Opcode, DL, {MVT::i32, MVT::Other}, Operands);
+ assert(MN->isMachineOpcode());
+ TLI->addTPUMemOperand(*CurDAG, MN, /*IsPush=*/true, &TPU::DRFPRRegClass);
+
+ if (HasDiv) {
+ Operands.clear();
+ Operands.push_back(SDValue(MN, 0));
+ Operands.push_back(PredReg);
+ Operands.push_back(PredInvert);
+ Operands.push_back(SDValue(MN, 1)); // Chain
+ PopDiv = CurDAG->getMachineNode(TPU::SPOP_DRF, DL, {MVT::i32, MVT::Other},
+ Operands);
+ TLI->addTPUMemOperand(*CurDAG, PopDiv, /*IsPush=*/false,
+ &TPU::DRFPRRegClass);
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(Node, 0), SDValue(PopDiv, 0));
+ }
+
+ if (HasRem) {
+ Operands.clear();
+ Operands.push_back(SDValue(MN, 0));
+ Operands.push_back(PredReg);
+ Operands.push_back(PredInvert);
+ if (HasDiv)
+ Operands.push_back(SDValue(PopDiv, 1)); // Chain
+ PopRem = CurDAG->getMachineNode(
+ TPU::SPOP_DRF, DL, {MVT::i32, MVT::Other, MVT::Glue}, Operands);
+ TLI->addTPUMemOperand(*CurDAG, PopRem, /*IsPush=*/false,
+ &TPU::DRFPRRegClass);
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(PopRem, 0));
+ }
+}
+
+void TPUDAGToDAGISel::selectFrameIndex(SDNode *N) {
+ if (!ST->isTPUABIEnabled())
+ llvm_unreachable("Not implemented");
+ SDLoc DL(N);
+ int FI = cast<FrameIndexSDNode>(N)->getIndex();
+ MachineFunction &MF = CurDAG->getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const AllocaInst *Alloca = MFI.getObjectAllocation(FI);
+ SDValue TFI = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+ if (Alloca->getAddressSpace() != TPUAS_Smem &&
+ Alloca->getAddressSpace() != TPUAS_TileSpmem)
+ llvm_unreachable("Unsupported memory space for alloca.");
+ assert(Alloca->getAddressSpace() != TPUAS_TileSpmem ||
+ MF.getSubtarget<TPUSubtarget>().hasVPU());
+ if (!Alloca->isStaticAlloca())
+ llvm_unreachable("Dynamically sized alloca is not supported.");
+ auto PredReg = CurDAG->getRegister(TPU::Palways, MVT::i1);
+ auto PredNoInvert = CurDAG->getTargetConstant(APInt(1, 0), DL, MVT::i1);
+ CurDAG->SelectNodeTo(N, TPU::IMM, MVT::i32, {TFI, PredReg, PredNoInvert});
+}
+
+void TPUDAGToDAGISel::selectBRCOND(SDNode *Node) {
+ const SDValue &Chain = Node->getOperand(0);
+ const SDValue &Predicate = Node->getOperand(1);
+ const SDValue &Target = Node->getOperand(2);
+ SDLoc DL(Node);
+ auto PredicateInvert = CurDAG->getTargetConstant(APInt(1, 0), DL, MVT::i1);
+
+ CurDAG->SelectNodeTo(Node, TPU::BRcond, MVT::Other,
+ {Target, Predicate, PredicateInvert, Chain});
+}
+
+void TPUDAGToDAGISel::selectEVENT(SDNode *Node) {
+ SDLoc DL(Node);
+ const SDValue &Chain = Node->getOperand(0);
+ unsigned FirstArgIdx = 2;
+ std::vector<SDValue> Args;
+
+ // Select (Wrapper tglobaladdr) for the first argument.
+ assert(Node->getOperand(FirstArgIdx)->getOpcode() == TPUISD::WRAPPER);
+ Args.push_back(Node->getOperand(FirstArgIdx)->getOperand(0));
+
+ for (unsigned I = FirstArgIdx + 1; I < Node->getNumOperands(); ++I)
+ Args.push_back(Node->getOperand(I));
+
+ // A variadic EVENT needs at least one argument. If we have no arguments use
+ // the special nullary form.
+ unsigned Opcode =
+ Args.size() == 1 ? TPU::EVENT_NULLARY_LOWER : TPU::EVENT_LOWER;
+
+ // Add the chain at the end.
+ Args.push_back(Chain);
+ CurDAG->SelectNodeTo(Node, Opcode, MVT::Other, Args);
+}
+
+bool TPUDAGToDAGISel::selectXOR(SDNode *Node) {
+ if (Node->getValueType(0) != MVT::i1)
+ return false;
+ if (isa<ConstantSDNode>(Node->getOperand(1)))
+ // XOR with constant uses the tablegen selector.
+ return false;
+ // (xor PPR:x, PPR:y) -> SEL x, (POR !y, !y), y.
+ auto PredReg = CurDAG->getRegister(TPU::Palways, MVT::i1);
+ auto PredNoInvert =
+ CurDAG->getTargetConstant(APInt(1, 0), SDLoc(Node), MVT::i1);
+ auto PredInvert =
+ CurDAG->getTargetConstant(APInt(32, 1), SDLoc(Node), MVT::i32);
+ SDNode *Not = CurDAG->getMachineNode(TPU::POR, SDLoc(Node), MVT::i1,
+ {Node->getOperand(1), PredInvert,
+ Node->getOperand(1), PredInvert,
+ PredReg, PredNoInvert});
+
+ CurDAG->SelectNodeTo(
+ Node, TPU::PSEUDO_PSELrr, MVT::i1,
+ {Node->getOperand(0), SDValue(Not, 0), Node->getOperand(1)});
+ return true;
+}
+
+void TPUDAGToDAGISel::selectTrapIntrinsic(SDNode *N) {
+  assert(N->getNumValues() == 1 && "Expected no outputs other than the chain");
+ SDValue Chain = N->getOperand(0);
+
+ const SDValue &TmpPredReg = N->getOperand(4);
+ int InvertVal = 0;
+ // Small peephole checking for an inverted predicate in the original code.
+ if (TmpPredReg->getOpcode() == ISD::XOR) {
+ ConstantSDNode *CI = dyn_cast<ConstantSDNode>(TmpPredReg->getOperand(1));
+ if (CI && CI->getZExtValue() == 1) {
+ InvertVal = 1;
+ }
+ }
+ const SDValue &PredReg =
+ InvertVal == 0 ? N->getOperand(4) : N->getOperand(4)->getOperand(0);
+ auto PredInvert =
+ CurDAG->getTargetConstant(APInt(32, InvertVal), SDLoc(N), MVT::i32);
+ SDNode *SyncFlagZero = CurDAG->getMachineNode(
+ TPU::IMM, SDLoc(N), MVT::i32,
+ {CurDAG->getTargetConstant(APInt(32, 0), SDLoc(N), MVT::i32), PredReg,
+ PredInvert});
+ MachineSDNode *SyncSet = CurDAG->getMachineNode(
+ TPU::scSYNCSETri, SDLoc(N), MVT::Other,
+ {SDValue(SyncFlagZero, 0),
+ CurDAG->getTargetConstant(APInt(32, 0), SDLoc(N), MVT::i32), PredReg,
+ PredInvert, Chain});
+
+ MachinePointerInfo MPI(TPUAS_Sflag, 0);
+ auto *MemRef = CurDAG->getMachineFunction().getMachineMemOperand(
+ MPI, MachineMemOperand::MOStore, /*s=*/4,
+ /*base_alignment=*/llvm::Align(4));
+ CurDAG->setNodeMemRefs(SyncSet, {MemRef});
+
+ bool IsImmediate = isa<ConstantSDNode>(N->getOperand(3));
+ SmallVector<SDValue, 4> Operands;
+ assert(N->getOperand(2)->getOpcode() == TPUISD::WRAPPER);
+ Operands.push_back(N->getOperand(2)->getOperand(0));
+ Operands.push_back(SDValue(SyncFlagZero, 0));
+ if (IsImmediate) {
+ Operands.push_back(CurDAG->getTargetConstant(
+ cast<ConstantSDNode>(N->getOperand(3))->getAPIntValue(), SDLoc(N),
+ MVT::i32));
+ } else {
+ Operands.push_back(N->getOperand(3));
+ }
+ Operands.push_back(PredReg);
+ Operands.push_back(PredInvert);
+ // Chain
+ Operands.push_back(SDValue(SyncSet, 0));
+
+ SDNode *ScTrap = CurDAG->getMachineNode(IsImmediate ? TPU::scPSEUDO_TRAPi
+ : TPU::scPSEUDO_TRAPr,
+ SDLoc(N), MVT::Other, Operands);
+
+ // Replace the head chain.
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(ScTrap, 0));
+}
+
+void TPUDAGToDAGISel::selectErfIntrinsic(SDNode *Node) {
+ SDLoc DL(Node);
+ unsigned Intrinsic = Node->getConstantOperandVal(0);
+ unsigned Opcode;
+ switch (Intrinsic) {
+ default:
+ llvm_unreachable("Unknown erf intrinsic!");
+ case Intrinsic::tpu_rsqrt_macro:
+ Opcode =
+ Node->getValueType(0) != MVT::v16bf16 ? TPU::VRSQRT : TPU::VRSQRTBF16;
+ break;
+ case Intrinsic::tpu_pow2_macro:
+ Opcode =
+ Node->getValueType(0) != MVT::v16bf16 ? TPU::VPOW2 : TPU::VPOW2BF16;
+ break;
+ case Intrinsic::tpu_log2_macro:
+ Opcode =
+ Node->getValueType(0) != MVT::v16bf16 ? TPU::VLOG2 : TPU::VLOG2BF16;
+ break;
+ case Intrinsic::tpu_tanh_macro:
+ Opcode =
+ Node->getValueType(0) != MVT::v16bf16 ? TPU::VTANH : TPU::VTANHBF16;
+ break;
+ case Intrinsic::tpu_rcp_macro:
+ Opcode = Node->getValueType(0) != MVT::v16bf16 ? TPU::VRCP : TPU::VRCPBF16;
+ break;
+ case Intrinsic::tpu_sigshft_macro:
+ Opcode = Node->getValueType(0) != MVT::v16bf16 ? TPU::VSIGSHFT
+ : TPU::VSIGSHFTBF16;
+ break;
+ case Intrinsic::tpu_sin_macro:
+ if (!ST->hasTranscendental())
+ llvm_unreachable("Instruction unsupported.");
+ Opcode = Node->getValueType(0) != MVT::v16bf16 ? TPU::VSIN : TPU::VSINBF16;
+ break;
+ case Intrinsic::tpu_cos_macro:
+ if (!ST->hasTranscendental())
+ llvm_unreachable("Instruction unsupported.");
+ Opcode = Node->getValueType(0) != MVT::v16bf16 ? TPU::VCOS : TPU::VCOSBF16;
+ break;
+ case Intrinsic::tpu_erf_macro:
+ if (!ST->hasTranscendental())
+ llvm_unreachable("Instruction unsupported.");
+ Opcode = Node->getValueType(0) != MVT::v16bf16 ? TPU::VERF : TPU::VERFBF16;
+ break;
+ }
+
+ // Add everything except the intrinsic ID as operands.
+ SmallVector<SDValue, 4> Operands;
+ for (unsigned I = 1; I < Node->getNumOperands(); ++I)
+ Operands.push_back(Node->getOperand(I));
+
+ auto PredReg = CurDAG->getRegister(TPU::Palways, MVT::i1);
+ auto PredInvert = CurDAG->getTargetConstant(APInt(1, 0), DL, MVT::i1);
+ Operands.push_back(PredReg);
+ Operands.push_back(PredInvert);
+
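+  // Issue the macro op first; its raw result is then consumed by a separate
+  // VRES_EUP node below, which produces the final vector value.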
+ MachineSDNode *MN = CurDAG->getMachineNode(Opcode, DL, MVT::i32, Operands);
+
+ Operands.clear();
+ Operands.push_back(SDValue(MN, 0));
+ Operands.push_back(PredReg);
+ Operands.push_back(PredInvert);
+ SDNode *NewNode = CurDAG->getMachineNode(TPU::VRES_EUP, DL,
+ Node->getValueType(0), Operands);
+
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(Node, 0), SDValue(NewNode, 0));
+}
+
+void TPUDAGToDAGISel::selectMaskAdapter(SDNode *Node) {
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(Node, 0), Node->getOperand(1));
+}
+
+void TPUDAGToDAGISel::selectCrossLaneIntrinsic(SDNode *Node,
+ const TargetRegisterClass *RC) {
+ SDLoc DL(Node);
+ unsigned Intrinsic = Node->getConstantOperandVal(0);
+
+ auto ValidateType = [](EVT Ty, EVT CheckTy) {
+ if (Ty != CheckTy)
+ llvm_unreachable("Invalid cross lane result type.");
+ };
+ const TPUTargetLowering *TLI = ST->getTargetLowering();
+
+ unsigned Opcode;
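+  // ComputedValues maps each result of the intrinsic, in order, to the
+  // position it occupies among the three results of the XRF pop node created
+  // below (slot 2 is the mask slot; see NewNodeResultType).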
+ SmallVector<unsigned, 3> ComputedValues;
+ assert(RC == &TPU::XRFPR0RegClass || RC == &TPU::XRFPR1RegClass);
+ switch (Intrinsic) {
+ default:
+ llvm_unreachable("Unknown cross-lane intrinsic!");
+ case Intrinsic::tpu_sort_ascdi:
+ Opcode = TPU::scVSORTASCD;
+ ValidateType(Node->getValueType(0), TLI->getVNI32Ty());
+ ComputedValues = {0, 1, 2};
+ break;
+ case Intrinsic::tpu_sort_ascdf:
+ Opcode = TPU::scVSORTASCDF;
+ ValidateType(Node->getValueType(0), TLI->getVNF32Ty());
+ ComputedValues = {0, 1, 2};
+ break;
+ case Intrinsic::tpu_sort_dscdi:
+ Opcode = TPU::scVSORTDSCD;
+ ValidateType(Node->getValueType(0), TLI->getVNI32Ty());
+ ComputedValues = {0, 1, 2};
+ break;
+ case Intrinsic::tpu_sort_dscdf:
+ Opcode = TPU::scVSORTDSCDF;
+ ValidateType(Node->getValueType(0), TLI->getVNF32Ty());
+ ComputedValues = {0, 1, 2};
+ break;
+ case Intrinsic::tpu_add_seg_scan1xNi:
+ Opcode = TPU::scVADDSEGSCAN1XNI;
+ ValidateType(Node->getValueType(0), TLI->getVNI32Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_add_seg_scan1xNf:
+ Opcode = TPU::scVADDSEGSCAN1XNF;
+ ValidateType(Node->getValueType(0), TLI->getVNF32Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_min_seg_scan1xNi:
+ Opcode = TPU::scVMINSEGSCAN1XNI;
+ ValidateType(Node->getValueType(0), TLI->getVNI32Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_min_seg_scan1xNf:
+ Opcode = TPU::scVMINSEGSCAN1XNF;
+ ValidateType(Node->getValueType(0), TLI->getVNF32Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_min_seg_index_scan1xNi:
+ Opcode = TPU::scVMINSEGIDXSCAN1XNI;
+ ValidateType(Node->getValueType(0), TLI->getVNI32Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_min_seg_index_scan1xNf:
+ Opcode = TPU::scVMINSEGIDXSCAN1XNF;
+ ValidateType(Node->getValueType(0), TLI->getVNI32Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_max_seg_scan1xNi:
+ Opcode = TPU::scVMAXSEGSCAN1XNI;
+ ValidateType(Node->getValueType(0), TLI->getVNI32Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_max_seg_scan1xNf:
+ Opcode = TPU::scVMAXSEGSCAN1XNF;
+ ValidateType(Node->getValueType(0), TLI->getVNF32Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_max_seg_index_scan1xNi:
+ Opcode = TPU::scVMAXSEGIDXSCAN1XNI;
+ ValidateType(Node->getValueType(0), TLI->getVNI32Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_max_seg_index_scan1xNf:
+ Opcode = TPU::scVMAXSEGIDXSCAN1XNF;
+ ValidateType(Node->getValueType(0), TLI->getVNI32Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_add_scan1xNi:
+ Opcode = TPU::scVADDSCAN1XNI;
+ ValidateType(Node->getValueType(0), TLI->getVNI32Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_add_scan1xNf:
+ Opcode = TPU::scVADDSCAN1XNF;
+ ValidateType(Node->getValueType(0), TLI->getVNF32Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_add_half_scan2xNbf16:
+ assert(!ST->isVfcSparseCore());
+ Opcode = TPU::scVADDSCAN2XNHALFBF16;
+ ValidateType(Node->getValueType(0), TLI->getVNBF16Ty());
+ ValidateType(Node->getValueType(1), TLI->getVMNI1Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_add_full_scan2xNbf16:
+ assert(!ST->isVfcSparseCore());
+ Opcode = TPU::scVADDSCAN2XNFULLBF16;
+ ValidateType(Node->getValueType(0), TLI->getVNF32Ty());
+ ValidateType(Node->getOperand(2)->getValueType(0), TLI->getVNBF16Ty());
+ ComputedValues = {0, 1, 2};
+ break;
+ case Intrinsic::tpu_min_scan2xNbf16:
+ assert(!ST->isVfcSparseCore());
+ Opcode = TPU::scVMINSCAN2XNBF16;
+ ValidateType(Node->getValueType(0), TLI->getVNBF16Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_max_scan2xNbf16:
+ assert(!ST->isVfcSparseCore());
+ Opcode = TPU::scVMAXSCAN2XNBF16;
+ ValidateType(Node->getValueType(0), TLI->getVNBF16Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_min_index_scan2xNbf16:
+ assert(!ST->isVfcSparseCore());
+ Opcode = TPU::scVMINIDXSCAN2XNBF16;
+ ValidateType(Node->getValueType(0), TLI->getVNI32Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_max_index_scan2xNbf16:
+ assert(!ST->isVfcSparseCore());
+ Opcode = TPU::scVMAXIDXSCAN2XNBF16;
+ ValidateType(Node->getValueType(0), TLI->getVNI32Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_add_half_seg_scan2xNbf16:
+ assert(!ST->isVfcSparseCore());
+ Opcode = TPU::scVADDSEGSCAN2XNHALFBF16;
+ ValidateType(Node->getValueType(0), TLI->getVNBF16Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_add_full_seg_scan2xNbf16:
+ assert(!ST->isVfcSparseCore());
+ ValidateType(Node->getValueType(0), TLI->getVNF32Ty());
+ ValidateType(Node->getOperand(2)->getValueType(0), TLI->getVNBF16Ty());
+ Opcode = TPU::scVADDSEGSCAN2XNFULLBF16;
+ ComputedValues = {0, 1, 2};
+ break;
+ case Intrinsic::tpu_min_seg_scan2xNbf16:
+ assert(!ST->isVfcSparseCore());
+ Opcode = TPU::scVMINSEGSCAN2XNBF16;
+ ValidateType(Node->getValueType(0), TLI->getVNBF16Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_max_seg_scan2xNbf16:
+ assert(!ST->isVfcSparseCore());
+ Opcode = TPU::scVMAXSEGSCAN2XNBF16;
+ ValidateType(Node->getValueType(0), TLI->getVNBF16Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_min_seg_index_scan2xNbf16:
+ assert(!ST->isVfcSparseCore());
+ Opcode = TPU::scVMINSEGIDXSCAN2XNBF16;
+ ValidateType(Node->getValueType(0), TLI->getVNI32Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_max_seg_index_scan2xNbf16:
+ assert(!ST->isVfcSparseCore());
+ Opcode = TPU::scVMAXSEGIDXSCAN2XNBF16;
+ ValidateType(Node->getValueType(0), TLI->getVNI32Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_dupcnti:
+ Opcode = TPU::scVDUPCNT;
+ ValidateType(Node->getValueType(0), TLI->getVNI32Ty());
+ ComputedValues = {0, 1, 2};
+ break;
+ case Intrinsic::tpu_dupcntf:
+ Opcode = TPU::scVDUPCNTF;
+ ValidateType(Node->getValueType(0), TLI->getVNI32Ty());
+ ComputedValues = {0, 1, 2};
+ break;
+ case Intrinsic::tpu_uniquei:
+ Opcode = TPU::scVUNIQUE;
+ ValidateType(Node->getValueType(0), TLI->getVNI32Ty());
+ ComputedValues = {0, 1, 2};
+ break;
+ case Intrinsic::tpu_uniquef:
+ Opcode = TPU::scVUNIQUEF;
+ ValidateType(Node->getValueType(0), TLI->getVNI32Ty());
+ ComputedValues = {0, 1, 2};
+ break;
+ case Intrinsic::tpu_deprecated_segreduce_addf:
+ Opcode = TPU::scVSEGREDUCEADDF;
+ ValidateType(Node->getValueType(0), TLI->getVNF32Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_min_scan1xNi:
+ Opcode = TPU::scVMINSCAN1XNI;
+ ValidateType(Node->getValueType(0), TLI->getVNI32Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_min_scan1xNf:
+ Opcode = TPU::scVMINSCAN1XNF;
+ ValidateType(Node->getValueType(0), TLI->getVNF32Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_min_index_scan1xNi:
+ Opcode = TPU::scVMINIDXSCAN1XNI;
+ ValidateType(Node->getValueType(0), TLI->getVNI32Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_min_index_scan1xNf:
+ Opcode = TPU::scVMINIDXSCAN1XNF;
+ ValidateType(Node->getValueType(0), TLI->getVNI32Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_max_scan1xNi:
+ Opcode = TPU::scVMAXSCAN1XNI;
+ ValidateType(Node->getValueType(0), TLI->getVNI32Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_max_scan1xNf:
+ Opcode = TPU::scVMAXSCAN1XNF;
+ ValidateType(Node->getValueType(0), TLI->getVNF32Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_max_index_scan1xNi:
+ Opcode = TPU::scVMAXIDXSCAN1XNI;
+ ValidateType(Node->getValueType(0), TLI->getVNI32Ty());
+ ComputedValues = {0, 2};
+ break;
+ case Intrinsic::tpu_max_index_scan1xNf:
+ Opcode = TPU::scVMAXIDXSCAN1XNF;
+ ValidateType(Node->getValueType(0), TLI->getVNI32Ty());
+ ComputedValues = {0, 2};
+ break;
+ }
+
+ // Add everything except the intrinsic ID as operands.
+ SmallVector<SDValue, 4> Operands;
+ for (unsigned I = 1; I < Node->getNumOperands(); ++I)
+ Operands.push_back(Node->getOperand(I));
+
+ auto PredReg = CurDAG->getRegister(TPU::Palways, MVT::i1);
+ auto PredInvert = CurDAG->getTargetConstant(APInt(1, 0), DL, MVT::i1);
+ Operands.push_back(PredReg);
+ Operands.push_back(PredInvert);
+
+ MachineSDNode *MN =
+ CurDAG->getMachineNode(Opcode, DL, {MVT::i32, MVT::Other}, Operands);
+ TLI->addTPUMemOperand(*CurDAG, MN, /*IsPush=*/true, RC);
+
+ unsigned NumResults = Node->getVTList().NumVTs;
+ // The number of outputs must match the expected count.
+ assert(ComputedValues.size() == NumResults);
+
+ // Remap the result types to their positions in the replacement node.
+ EVT NewNodeResultType[3] = {TLI->getVNI32Ty(), TLI->getVNI32Ty(),
+ TLI->getVMNI1Ty()};
+ for (unsigned I = 0; I < NumResults; ++I) {
+ unsigned J = ComputedValues[I];
+ NewNodeResultType[J] = Node->getValueType(I);
+ }
+
+ Operands.clear();
+ Operands.push_back(SDValue(MN, 0));
+ Operands.push_back(PredReg);
+ Operands.push_back(PredInvert);
+ SDNode *NewNode = CurDAG->getMachineNode(
+ RC == &TPU::XRFPR0RegClass ? TPU::scVPOP3_XRF0 : TPU::scVPOP3_XRF1, DL,
+ CurDAG->getVTList(NewNodeResultType), Operands);
+ TLI->addTPUMemOperand(*CurDAG, NewNode, /*IsPush=*/false, RC);
+
+ // Remap the uses to reference appropriate outputs of the replacement node.
+ for (unsigned I = 0; I < NumResults; ++I)
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(Node, I),
+ SDValue(NewNode, ComputedValues[I]));
+}
+
+void TPUDAGToDAGISel::selectReadCycleCounter(SDNode *N) {
+ assert(N->getNumValues() == 3 && "Expected two outputs and a chain");
+
+  // TODO(taugustine): We could avoid always expanding to lo+hi if we
+  // identified unused outputs and connected them to implicit_def.
+ unsigned Opcode;
+ switch (N->getConstantOperandVal(1)) {
+ default:
+ llvm_unreachable("Unexpected read cycle counter intrinsic");
+ case Intrinsic::tpu_read_global_cycle_count:
+ Opcode = TPU::GTC_READ;
+ break;
+ case Intrinsic::tpu_read_local_cycle_count:
+ Opcode = TPU::LCC_READ;
+ break;
+ }
+
+ CurDAG->SelectNodeTo(N, Opcode,
+ CurDAG->getVTList(MVT::i32, MVT::i32, MVT::Other),
+ {N->getOperand(0)});
+}
+
+INITIALIZE_PASS(TPUDAGToDAGISel, DEBUG_TYPE,
+ "TPU DAG->DAG Pattern Instruction Selection", false, false)
+
+// createTPUISelDag - This pass converts a legalized DAG into a
+// TPU-specific DAG, ready for instruction scheduling.
+FunctionPass *llvm::createTPUISelDag(TPUTargetMachine &TM) {
+ return new TPUDAGToDAGISel(&TM);
+}
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUISelLowering.cpp b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUISelLowering.cpp
new file mode 100644
index 0000000..4976fbc
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUISelLowering.cpp
@@ -0,0 +1,2942 @@
+//===--------- TPUISelLowering.cpp - TPU DAG Lowering Implementation ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TPUTargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TPUISelLowering.h"
+#include "MCTargetDesc/TPUBaseInfo.h"
+#include "MCTargetDesc/TPUMCTargetDesc.h"
+#include "TPU.h"
+#include "TPUCallingConv.h"
+#include "TPUIRUtils.h"
+#include "TPUMachineFunctionInfo.h"
+#include "TPURegisterInfo.h"
+#include "TPUSubtarget.h"
+#include "TPUTargetMachine.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetCallingConv.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IntrinsicsTPU.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MachineValueType.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <utility>
+
+#define DEBUG_TYPE "tpu-lower"
+
+using namespace llvm;
+
+static cl::opt<bool> PropagateTpuEmbeddedMasks(
+ "tpu-enable-embedded-masks", cl::Hidden, cl::init(true),
+ cl::desc("Enables propagating embedded hardware masks "
+ "into special mask registers."));
+
+static cl::opt<bool>
+ GenerateTpuVCMasks("tpu-enable-vcmasks", cl::Hidden, cl::init(true),
+ cl::desc("Enables generation of vcmask instructions to "
+ "create mask immediates whenever possible."));
+
+static cl::opt<bool>
+ EmulateSignedDivRem("tpu-emulate-signed-divrem", cl::Hidden,
+ cl::init(false),
+ cl::desc("Enables emulation of signed div/rem via the "
+ "unsigned div/rem instructions"));
+
+extern cl::opt<bool> TPUVerifierStrictIntoPtr;
+
+bool TPUTargetLowering::functionArgumentNeedsConsecutiveRegisters(
+ Type *Ty, CallingConv::ID CallConv, bool isVarArg,
+ const DataLayout &DL) const {
+ // All aggregates on BarnaCore are allocated consecutive registers.
+ return IsBC && (Ty->isArrayTy() || Ty->isStructTy());
+}
+
+TPUTargetLowering::TPUTargetLowering(const TargetMachine &TM,
+ const TPUSubtarget &STI)
+ : TargetLowering(TM) {
+ ST = &STI;
+ TII = ST->getInstrInfo();
+
+ IsBC = ST->isPxcBarnaCore();
+ IsSC = ST->isSparseCore();
+ IsVFTC = ST->hasVfcTensorCore();
+ HasLPVF = ST->hasLPVF();
+ HasLPGL = ST->hasLPGL();
+ HasVMinMax = ST->hasVMinMax();
+
+ if (ST->hasV1024()) {
+ HasVPU = true;
+ VNI32 = MVT::v1024i32;
+ VNF32 = MVT::v1024f32;
+ // TODO(thomasraoux): Mask can be 2 bits per element on PFC:
+ // https://g3doc.corp.google.com/platforms/deepsea/logic/pfc/g3doc/isa/tensorcore.md#create-sublane-mask-instruction
+ VMNI1 = MVT::v1024i1;
+ } else if (ST->hasV16()) {
+ HasVPU = true;
+ VNI32 = MVT::v16i32;
+ VNF32 = MVT::v16f32;
+ VMNI1 = MVT::v16i1;
+ } else if (ST->hasV8()) {
+ HasVPU = true;
+ VNI32 = MVT::v8i32;
+ VNF32 = MVT::v8f32;
+ VMNI1 = MVT::v8i1;
+ } else {
+ // No vector support.
+ VNI32 = MVT::i32;
+ VNF32 = MVT::f32;
+ VMNI1 = MVT::i1;
+ }
+
+ VNBF16 = MVT::INVALID_SIMPLE_VALUE_TYPE;
+ VNF16 = MVT::INVALID_SIMPLE_VALUE_TYPE;
+ VNI16 = MVT::INVALID_SIMPLE_VALUE_TYPE;
+ VNI4 = MVT::INVALID_SIMPLE_VALUE_TYPE;
+ VNI2 = MVT::INVALID_SIMPLE_VALUE_TYPE;
+ VNI8 = MVT::INVALID_SIMPLE_VALUE_TYPE;
+ VMNBF16I1 = MVT::INVALID_SIMPLE_VALUE_TYPE;
+ VNI8I1 = MVT::INVALID_SIMPLE_VALUE_TYPE;
+ VMN16I1 = MVT::INVALID_SIMPLE_VALUE_TYPE;
+ VMN32I1 = MVT::INVALID_SIMPLE_VALUE_TYPE;
+ VMN64I1 = MVT::INVALID_SIMPLE_VALUE_TYPE;
+ if (HasLPVF || HasLPGL) {
+ if (ST->hasV8()) {
+ VNBF16 = MVT::v16bf16;
+ VNF16 = MVT::v16f16;
+ VNI16 = MVT::v16i16;
+ VNI8 = MVT::v32i8;
+ VNI4 = MVT::v64i4;
+ VNI2 = MVT::v128i2;
+ VMNBF16I1 = MVT::v16i1;
+ VNI8I1 = MVT::v32i1;
+ VNI1 = MVT::v256i1;
+ VMN16I1 = MVT::v16i1;
+ VMN32I1 = MVT::v32i1;
+ } else if (ST->hasV16()) {
+ VNBF16 = MVT::v32bf16;
+ VNF16 = MVT::v32f16;
+ VNI16 = MVT::v32i16;
+ VNI8 = MVT::v64i8;
+ VNI1 = MVT::v512i1;
+ VNI4 = MVT::v128i4;
+ VNI2 = MVT::v256i2;
+ VMNBF16I1 = MVT::v32i1;
+ VNI8I1 = MVT::v64i1;
+ VMN32I1 = MVT::v32i1;
+ VMN64I1 = MVT::v32i1;
+ } else {
+ llvm_unreachable("Unexpected VPU size.");
+ }
+ }
+
+ // Set up the register classes.
+ addRegisterClass(MVT::i32, &TPU::GPRRegClass);
+ addRegisterClass(MVT::f32, &TPU::GPRRegClass);
+ addRegisterClass(MVT::bf16, &TPU::GPRRegClass);
+ addRegisterClass(MVT::i1, &TPU::PPRRegClass);
+ // MVT::i8 is not legal in GPR.
+
+ if (IsSC) {
+ // SparseCore is hijacking the mmx data type for cbreg.
+ addRegisterClass(MVT::x86mmx, &TPU::CBRRegClass);
+ }
+
+ if (HasVPU) {
+ if (IsBC) {
+ // BarnaCore has Vregs and Vaggregs that both have the same type, so
+ // use VPR_AGGRegClass which is the superclass of both. Restricting a
+ // regclass to a strict subset is trivial.
+ addRegisterClass(VNI32, &TPU::VPR_AGGRegClass);
+ addRegisterClass(VNF32, &TPU::VPR_AGGRegClass);
+ } else {
+ addRegisterClass(VNI32, &TPU::VPRRegClass);
+ addRegisterClass(VNF32, &TPU::VPRRegClass);
+ }
+ addRegisterClass(VMNI1, &TPU::MPRRegClass);
+ }
+ if (HasLPVF || HasLPGL) {
+ addRegisterClass(VNBF16, &TPU::VPRRegClass);
+ addRegisterClass(VNF16, &TPU::VPRRegClass);
+ addRegisterClass(VNI16, &TPU::VPRRegClass);
+ addRegisterClass(VNI8, &TPU::VPRRegClass);
+ addRegisterClass(VNI4, &TPU::VPRRegClass);
+ addRegisterClass(VNI2, &TPU::VPRRegClass);
+ addRegisterClass(VNI1, &TPU::VPRRegClass);
+ if (ST->hasV8())
+ addRegisterClass(VMN16I1, &TPU::MPRRegClass);
+ addRegisterClass(VMN32I1, &TPU::MPRRegClass);
+ if (ST->hasV16())
+ addRegisterClass(VMN64I1, &TPU::MPRRegClass);
+ }
+
+ // Compute derived properties from the register classes
+ TRI = ST->getRegisterInfo();
+ computeRegisterProperties(TRI);
+
+ setStackPointerRegisterToSaveRestore(TPU::SPS);
+
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, MVT::i8, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i8, Expand);
+
+ setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+ setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i1, Expand);
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+ setOperationAction(ISD::BRCOND, MVT::Other, Legal);
+ setOperationAction(ISD::SETCC, MVT::i32, Legal);
+ setOperationAction(ISD::SETCC, MVT::i1, Promote);
+ setOperationAction(ISD::SELECT, MVT::i32, Legal);
+ setOperationAction(ISD::SELECT, MVT::f32, Legal);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT_CC, VNF32, Expand);
+ setOperationAction(ISD::SELECT_CC, VNI32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
+
+ setOperationAction(ISD::VASTART, MVT::Other, Expand);
+ setOperationAction(ISD::VAARG, MVT::Other, Expand);
+ setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+ setOperationAction(ISD::VAEND, MVT::Other, Expand);
+
+ if (IsSC) {
+ setOperationAction(ISD::SDIVREM, MVT::i32, Legal);
+ setOperationAction(ISD::UDIVREM, MVT::i32, Legal);
+ setOperationAction(ISD::FDIV, MVT::f32, Custom);
+ } else {
+ setOperationAction(ISD::SDIVREM, VNI32, Expand);
+ setOperationAction(ISD::UDIVREM, VNI32, Expand);
+ }
+
+ // We rely on the combiner to expand into DIVREM.
+ auto SDivRemAction = EmulateSignedDivRem ? Custom : Expand;
+ setOperationAction(ISD::SDIV, MVT::i32, SDivRemAction);
+ setOperationAction(ISD::SREM, MVT::i32, SDivRemAction);
+ setOperationAction(ISD::UDIV, MVT::i32, Expand);
+ setOperationAction(ISD::UREM, MVT::i32, Expand);
+
+ // We do not currently support vector i32 division.
+ setOperationAction(ISD::SDIV, VNI32, Expand);
+ setOperationAction(ISD::UDIV, VNI32, Expand);
+ setOperationAction(ISD::SREM, VNI32, Expand);
+ setOperationAction(ISD::UREM, VNI32, Expand);
+
+ for (const auto &VT : {MVT::i32, VNI32}) {
+ setOperationAction(ISD::MUL, VT, Legal);
+ setOperationAction(ISD::MULHU, VT, Expand);
+ setOperationAction(ISD::MULHS, VT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+
+ setOperationAction(ISD::ROTR, VT, Expand);
+ setOperationAction(ISD::ROTL, VT, Expand);
+ setOperationAction(ISD::SHL_PARTS, VT, Expand);
+ setOperationAction(ISD::SRL_PARTS, VT, Expand);
+ setOperationAction(ISD::SRA_PARTS, VT, Expand);
+
+ setOperationAction(ISD::BSWAP, VT, Expand);
+ setOperationAction(ISD::CTPOP, VT, Legal);
+ setOperationAction(ISD::CTLZ, VT, Legal);
+ setOperationAction(ISD::CTTZ, VT, Expand);
+ }
+
+ // If VMul i32 is not natively supported, we need to emulate it.
+ if (!IsSC && !IsVFTC)
+ setOperationAction(ISD::MUL, VNI32, Custom);
+ // For Jellyfish do a custom lowering of i32 MUL
+ if (!ST->hasSMul32())
+ setOperationAction(ISD::MUL, MVT::i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+
+ setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+ if (IsVFTC || IsSC) {
+ setOperationAction(ISD::UMAX, MVT::i32, Legal);
+ setOperationAction(ISD::UMIN, MVT::i32, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
+ setOperationAction(ISD::FCEIL, MVT::f32, Legal);
+ } // else we will fail lowering.
+ setOperationAction(ISD::FNEG, MVT::f32, Legal);
+ setOperationAction(ISD::FNEG, VNF32, Legal);
+
+ // Extended load operations for i1 types must be promoted
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+ }
+ setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMAXIMUM, VNF32, Legal);
+ setOperationAction(ISD::FMINIMUM, VNF32, Legal);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal);
+ setOperationAction(ISD::FCOPYSIGN, VNF32, Legal);
+ if (HasLPGL) {
+ setOperationAction(ISD::FMAXIMUM, VNBF16, Legal);
+ setOperationAction(ISD::FMINIMUM, VNBF16, Legal);
+ }
+
+ // Unordered comparisons not supported.
+ setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETO, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUEQ, VNF32, Expand);
+ setCondCodeAction(ISD::SETUGT, VNF32, Expand);
+ setCondCodeAction(ISD::SETUGE, VNF32, Expand);
+ setCondCodeAction(ISD::SETULT, VNF32, Expand);
+ setCondCodeAction(ISD::SETULE, VNF32, Expand);
+ setCondCodeAction(ISD::SETONE, VNF32, Expand);
+ setCondCodeAction(ISD::SETUO, VNF32, Expand);
+ setCondCodeAction(ISD::SETO, VNF32, Expand);
+ if (HasLPGL) {
+ setCondCodeAction(ISD::SETUEQ, VNBF16, Expand);
+ setCondCodeAction(ISD::SETUGT, VNBF16, Expand);
+ setCondCodeAction(ISD::SETUGE, VNBF16, Expand);
+ setCondCodeAction(ISD::SETULT, VNBF16, Expand);
+ setCondCodeAction(ISD::SETULE, VNBF16, Expand);
+ setCondCodeAction(ISD::SETONE, VNBF16, Expand);
+ setCondCodeAction(ISD::SETUO, VNBF16, Expand);
+ setCondCodeAction(ISD::SETO, VNBF16, Expand);
+ }
+ if (HasVMinMax) {
+ setOperationAction(ISD::UMAX, VNI32, Legal);
+ setOperationAction(ISD::UMIN, VNI32, Legal);
+ if (HasLPGL) {
+ setOperationAction(ISD::UMAX, VNI16, Legal);
+ setOperationAction(ISD::UMIN, VNI16, Legal);
+ }
+ }
+
+ // Unsigned scalar comparisons supported for VF and SC subtargets.
+ LegalizeAction UnsignedCmpLegalizeAction = Custom;
+ if (ST->hasUnsignedScalarCompare()) {
+ UnsignedCmpLegalizeAction = Legal;
+ }
+ setCondCodeAction(ISD::SETUGT, MVT::i32, UnsignedCmpLegalizeAction);
+ setCondCodeAction(ISD::SETUGE, MVT::i32, UnsignedCmpLegalizeAction);
+ setCondCodeAction(ISD::SETULT, MVT::i32, UnsignedCmpLegalizeAction);
+ setCondCodeAction(ISD::SETULE, MVT::i32, UnsignedCmpLegalizeAction);
+
+ // Unsigned vector comparisons supported for SC subtargets.
+ UnsignedCmpLegalizeAction = Custom;
+ if (ST->hasUnsignedVectorCompare()) {
+ UnsignedCmpLegalizeAction = Legal;
+ }
+ setCondCodeAction(ISD::SETUGT, VNI32, UnsignedCmpLegalizeAction);
+ setCondCodeAction(ISD::SETUGE, VNI32, UnsignedCmpLegalizeAction);
+ setCondCodeAction(ISD::SETULT, VNI32, UnsignedCmpLegalizeAction);
+ setCondCodeAction(ISD::SETULE, VNI32, UnsignedCmpLegalizeAction);
+
+ setTargetDAGCombine(ISD::BUILD_VECTOR);
+ setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+ setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
+ setTargetDAGCombine(ISD::VSELECT);
+ setTargetDAGCombine(ISD::SETCC);
+ setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+
+ // We could match this during isel in tablegen, but we want a bit more
+ // control.
+ setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
+
+ // Function alignments.
+ setMinFunctionAlignment(Align(2));
+ setPrefFunctionAlignment(Align(2));
+
+ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+
+ setJumpIsExpensive(false);
+
+ // TODO(jmolloy): This is a hangover from Lanai. Evaluate if jumptables are
+ // needed or useful.
+ setMinimumJumpTableEntries(100);
+
+ // We'd run into trouble with pointer word sizes if we let native selection
+ // DAG lower these.
+ MaxStoresPerMemset = 0; // For @llvm.memset -> sequence of stores
+ MaxStoresPerMemsetOptSize = 0;
+ MaxStoresPerMemcpy = 0; // For @llvm.memcpy -> sequence of stores
+ MaxStoresPerMemcpyOptSize = 0;
+ MaxStoresPerMemmove = 0; // For @llvm.memmove -> sequence of stores
+ MaxStoresPerMemmoveOptSize = 0;
+
+ // Booleans always contain 0 or 1.
+ setBooleanContents(ZeroOrOneBooleanContent);
+}
+
+SDValue TPUTargetLowering::LowerGlobalAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
+
+ SDValue TargetAddr = DAG.getTargetGlobalAddress(
+ GV, DL, getPointerTy(DAG.getDataLayout()), Offset);
+ return DAG.getNode(TPUISD::WRAPPER, DL, MVT::i32, TargetAddr);
+}
+
+SDValue TPUTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+ CondCodeSDNode *Cond = cast<CondCodeSDNode>(Op.getOperand(2).getNode());
+ assert(!Op.getOperand(0).getValueType().isFloatingPoint() &&
+ ISD::isUnsignedIntSetCC(Cond->get()) &&
+ "Comparisons involving floating-point and signed-int types should not "
+ "be custom lowered as they are either expanded or legal.");
+
+ ISD::CondCode SignedCond;
+ switch (Cond->get()) {
+ default:
+ llvm_unreachable("Unknown signed condcode?");
+ case ISD::CondCode::SETULT:
+ SignedCond = ISD::CondCode::SETLT;
+ break;
+ case ISD::CondCode::SETULE:
+ SignedCond = ISD::CondCode::SETLE;
+ break;
+ case ISD::CondCode::SETUGT:
+ SignedCond = ISD::CondCode::SETGT;
+ break;
+ case ISD::CondCode::SETUGE:
+ SignedCond = ISD::CondCode::SETGE;
+ break;
+ }
+ SDLoc DL(Op);
+
+ // Generate unsigned setcc as:
+ // %x = setcc signed %a, %b
+ // %y = xor %a, %b // sign bit set iff %a and %b have different signs.
+ // %z = setcc slt %y, 0 // sign bit different?
+ // xor %x, %z
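+ //
+ // For example, with %a = 0xFFFFFFFF (-1 as signed) and %b = 1, "setult"
+ // should be false: the signed compare gives true, the sign bits differ so
+ // %z is true, and true xor true yields the correct false result.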
+ EVT VT = Op.getValueType();
+ EVT InputVT = Op.getOperand(0).getValueType();
+ SDValue X =
+ DAG.getSetCC(DL, VT, Op.getOperand(0), Op.getOperand(1), SignedCond);
+ SDValue Y = DAG.getNode(ISD::XOR, DL, InputVT, Op.getOperand(0),
+ Op.getOperand(1));
+ SDValue Z = DAG.getSetCC(DL, VT, Y, DAG.getConstant(0, DL, InputVT),
+ ISD::CondCode::SETLT);
+ return DAG.getNode(ISD::XOR, DL, VT, X, Z);
+}
+
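+// Lowers f32 fdiv on SparseCore as x * (1/y): y is splatted into a vector,
+// its reciprocal is computed through the VRCP/EUP result FIFO, lane 0 is read
+// back to a scalar, and the result is multiplied by x. Because this goes
+// through a hardware reciprocal, rounding may differ from an exact IEEE
+// division.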
+SDValue TPUTargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ assert(ST->isSparseCore());
+ SDValue X = Op.getOperand(0);
+ SDValue Y = Op.getOperand(1);
+ SDValue Splat = DAG.getNode(TPUISD::SPLAT, DL, VNF32, Y);
+ auto PredReg = DAG.getRegister(TPU::Palways, MVT::i1);
+ auto PredInvert = DAG.getTargetConstant(APInt(1, 0), DL, MVT::i1);
+ MachineSDNode *VRcpPush =
+ DAG.getMachineNode(TPU::VRCP, DL, MVT::i32, {Splat, PredReg, PredInvert});
+ addTPUMemOperand(DAG, VRcpPush, /*IsPush=*/true, &TPU::ERFPRRegClass);
+ MachineSDNode *VRcpPop = DAG.getMachineNode(
+ TPU::VRES_EUP, DL, VNF32, {SDValue(VRcpPush, 0), PredReg, PredInvert});
+ addTPUMemOperand(DAG, VRcpPop, /*IsPush=*/false, &TPU::ERFPRRegClass);
+ SDValue Srcp =
+ SDValue(DAG.getMachineNode(TPU::scVREADr, SDLoc(Op), MVT::f32,
+ {SDValue(VRcpPop, 0),
+ DAG.getTargetConstant(0, DL, MVT::i32),
+ PredReg, PredInvert}),
+ 0);
+ SDValue FDivRes = DAG.getNode(ISD::FMUL, DL, MVT::f32, X, Srcp);
+ return FDivRes;
+}
+
+SDValue TPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDValue X = Op.getOperand(0);
+ SDValue Y = Op.getOperand(1);
+ assert(Op.getValueType() == MVT::i32 && "Expected i32 when Lowering SDIV32");
+
+ // To emulate signed division, we:
+ // 1. Take the absolute value of the operands
+ // 2. Perform an unsigned divide of the operands
+ // 3. Possibly negate the result of (2.).
+ unsigned UnsignedOpCode;
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("Unknown signed divrem opcode");
+ case ISD::SDIV:
+ UnsignedOpCode = ISD::UDIV;
+ break;
+ case ISD::SREM:
+ UnsignedOpCode = ISD::UREM;
+ break;
+ }
+
+ EVT VT = Op.getValueType();
+
+ // 1. Compute abs(x), abs(y): abs(x) = (x ^ (x >> 31)) - (x >> 31)
+ //
+ // Note: we do this slightly differently than LLO, which uses
+ // compares+selects, but we end up with the same number of instructions.
+ // http://google3/platforms/xla/service/jellyfish/llo_region_builder.cc?l=950&rcl=378412916
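+ //
+ // For example, x = -5: x >> 31 = -1 (all ones), -5 ^ -1 = 4, and
+ // 4 - (-1) = 5 = abs(-5). For non-negative x the mask is 0 and x is
+ // unchanged.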
+ SDValue XMask = DAG.getNode(ISD::SRA, DL, VT, X, DAG.getConstant(31, DL, VT));
+ SDValue YMask = DAG.getNode(ISD::SRA, DL, VT, Y, DAG.getConstant(31, DL, VT));
+
+ SDValue XInv = DAG.getNode(ISD::XOR, DL, VT, X, XMask);
+ SDValue YInv = DAG.getNode(ISD::XOR, DL, VT, Y, YMask);
+
+ SDValue XAbs = DAG.getNode(ISD::SUB, DL, VT, XInv, XMask);
+ SDValue YAbs = DAG.getNode(ISD::SUB, DL, VT, YInv, YMask);
+
+ // 2. Compute unsigned div/rem.
+ SDValue AbsResult = DAG.getNode(UnsignedOpCode, DL, VT, XAbs, YAbs);
+
+ // 3. Possibly negate the result of the unsigned div/rem.
+ SDValue SignMask;
+ if (Op.getOpcode() == ISD::SDIV) {
+ SignMask = DAG.getNode(ISD::XOR, DL, VT, XMask, YMask);
+ } else {
+ // For rem, the sign is determined by the dividend (X), defined the same way
+ // as the remainder operator % in C:
+ // (a % b) == a - (a / b) * b
+ SignMask = XMask;
+ }
+ // SignMask is either all zeros or all ones (in which case the result should
+ // be negative). When it is all ones, we can use this mask to negate the two's
+ // complement result similar to finding abs(x):
+ // result = (abs_result ^ mask) - mask
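+ // For example, abs_result = 3 with mask = -1 gives 3 ^ -1 = -4 and
+ // -4 - (-1) = -3.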
+ SDValue AbsResultInv = DAG.getNode(ISD::XOR, DL, VT, AbsResult, SignMask);
+ SDValue SignedResult = DAG.getNode(ISD::SUB, DL, VT, AbsResultInv, SignMask);
+
+ return SignedResult;
+}
+
+SDValue TPUTargetLowering::LowerMUL32(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDValue X = Op.getOperand(0);
+ SDValue Y = Op.getOperand(1);
+ assert(Op.getValueType() == MVT::i32 && "Expected i32 when Lowering MUL32");
+ // Expand a MUL i32 operation using the UMUL24 node for Jellyfish.
+ // The decomposition looks like:
+ // c = mul i32 a, b
+ // -->
+ // ll = umul24 i32 a, b
+ // ah = srl i32 a, 24
+ // bh = srl i32 b, 24
+ // lh = umul24 i32 ah, b
+ // hl = umul24 i32 a, bh
+ // sum = add i32 lh, hl
+ // shiftsum = shl i32 sum, 24
+ // c = add i32 shiftsum, ll
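+ //
+ // This is correct modulo 2^32: writing a = ah * 2^24 + al and
+ // b = bh * 2^24 + bl, a * b = al*bl + (ah*bl + al*bh) * 2^24 + ah*bh * 2^48,
+ // and the last term vanishes in 32 bits. umul24 multiplies the low 24 bits
+ // of its operands, so each umul24 above produces the partial product we
+ // need (modulo 2^32).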
+ EVT VT = Op.getValueType();
+ KnownBits KBX = DAG.computeKnownBits(X);
+ KnownBits KBY = DAG.computeKnownBits(Y);
+ bool X_is_24bit = (KBX.Zero & 0xFF000000U) == 0xFF000000U;
+ bool Y_is_24bit = (KBY.Zero & 0xFF000000U) == 0xFF000000U;
+ // We use the fact that the smul.u24 instruction automatically zeroes out the
+ // upper bits of its operands, which saves us from masking them ourselves.
+ SDValue Low_Low = DAG.getNode(TPUISD::UMUL24, DL, VT, X, Y);
+ SDValue High_Low, Low_High;
+ if (!X_is_24bit) {
+ SDValue HighX =
+ DAG.getNode(ISD::SRL, DL, VT, X, DAG.getConstant(24, DL, VT));
+ High_Low = DAG.getNode(TPUISD::UMUL24, DL, VT, HighX, Y);
+ }
+ if (!Y_is_24bit) {
+ SDValue HighY =
+ DAG.getNode(ISD::SRL, DL, VT, Y, DAG.getConstant(24, DL, VT));
+ Low_High = DAG.getNode(TPUISD::UMUL24, DL, VT, X, HighY);
+ }
+ SDValue MixedSum;
+ if (High_Low && Low_High) {
+ MixedSum = DAG.getNode(ISD::ADD, DL, VT, High_Low, Low_High);
+ } else if (High_Low) {
+ MixedSum = High_Low;
+ } else if (Low_High) {
+ MixedSum = Low_High;
+ } else {
+ return Low_Low;
+ }
+ SDValue ShiftedSum =
+ DAG.getNode(ISD::SHL, DL, VT, MixedSum, DAG.getConstant(24, DL, VT));
+ return DAG.getNode(ISD::ADD, DL, VT, Low_Low, ShiftedSum);
+}
+
+// Handle the lowering of the simple cases where one operand is a constant.
+// This uses non-adjacent form (NAF).
+SDValue TPUTargetLowering::SimpleEmulVMUL32(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDValue X = Op.getOperand(0);
+ SDValue Y = Op.getOperand(1);
+ if (Y.getOpcode() != TPUISD::SPLAT)
+ return SDValue();
+ ConstantSDNode *C = isConstOrConstSplat(Y.getOperand(0));
+ if (C == nullptr)
+ return SDValue();
+ int M = C->getZExtValue();
+ int HighestOne = -1;
+ int NonZeroEntries = 0;
+ std::array<int, 32> SignedDigit;
+ SignedDigit.fill(0);
+
+ // The following algorithm is taken from:
+ // https://en.wikipedia.org/wiki/Non-adjacent_form
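+ //
+ // For example, M = 7 has NAF digits (-1, 0, 0, 1), i.e. 7 = 2^3 - 2^0, so
+ // x * 7 is emitted as (x << 3) - x.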
+ int64_t e = std::abs(M);
+ const int s = M < 0 ? -1 : 1;
+ int i = 0;
+ while (e > 0) {
+ int zi = 0;
+ if (e % 2 == 1) {
+ zi = 2 - (e % 4);
+ if (zi != 0) {
+ ++NonZeroEntries;
+ }
+ }
+ SignedDigit[i] = s * zi;
+ if (SignedDigit[i] == 1) {
+ HighestOne = i;
+ }
+ e = (e - zi) / 2;
+ ++i;
+ }
+
+ // Initialize the running sum to the maximally shifted positive term, i.e.
+ // the largest i such that zi == 1 and the NAF of M contains X << i as a
+ // term.
+ SDValue Res;
+ if (HighestOne == -1) {
+ Res =
+ DAG.getNode(TPUISD::SPLAT, DL, VNI32, DAG.getConstant(0, DL, MVT::i32));
+ } else {
+ Res = DAG.getNode(TPUISD::SPLAT, DL, VNI32,
+ DAG.getConstant(HighestOne, DL, MVT::i32));
+ Res = DAG.getNode(ISD::SHL, DL, VNI32, X, Res);
+ SignedDigit[HighestOne] = 0;
+ }
+
+ // Assemble multiplication from shift, add, sub using NAF form and
+ // running sum.
+ for (size_t i = 0; i < SignedDigit.size(); ++i) {
+ if (SignedDigit[i] == 0) {
+ continue;
+ }
+
+ SDValue op = X;
+ // Shifted multiplicand (v<<i).
+ if (i > 0) {
+ SDValue I = DAG.getNode(TPUISD::SPLAT, DL, VNI32,
+ DAG.getConstant(i, DL, MVT::i32));
+ op = DAG.getNode(ISD::SHL, DL, VNI32, X, I);
+ }
+ if (SignedDigit[i] == 1) {
+ Res = DAG.getNode(ISD::ADD, DL, VNI32, Res, op);
+ } else if (SignedDigit[i] == -1) {
+ Res = DAG.getNode(ISD::SUB, DL, VNI32, Res, op);
+ }
+ }
+ return Res;
+}
+
+// Logic to lower down VMUL32 copied from LLO region builder.
+SDValue TPUTargetLowering::LowerVMUL32(SDValue Op, SelectionDAG &DAG) const {
+ if (SDValue V = SimpleEmulVMUL32(Op, DAG))
+ return V;
+ SDLoc DL(Op);
+ SDValue lhs = Op.getOperand(0);
+ SDValue rhs = Op.getOperand(1);
+ // Multiword multiplication. Splits each input into three 11-bit words and
+ // uses VmulU11 (an exact 11-bit x 11-bit -> 22-bit integer multiply built
+ // on fmul) to form the partial products.
+ // Generates code:
+ // uint32 u0 = u & 0x7FF;
+ // uint32 u1 = (u >> 11) & 0x7FF;
+ // uint32 u2 = u >> 22;
+ // uint32 v0 = v & 0x7FF;
+ // uint32 v1 = (v >> 11) & 0x7FF;
+ // uint32 v2 = v >> 22;
+ // return u0 * v0 + ((u1 * v0 + u0 * v1) << 11) +
+ // ((u0 * v2 + u1 * v1 + u2 * v0) << 22);
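+ // The 11-bit split works because each partial product fits in 22 bits and
+ // is therefore exact in f32 (24-bit significand). The dropped cross terms
+ // (u1*v2, u2*v1, u2*v2) would be shifted left by 33 or more bits and so
+ // vanish modulo 2^32.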
+ auto VSplatImm32 = [&](int I) {
+ return DAG.getNode(TPUISD::SPLAT, DL, VNI32,
+ DAG.getConstant(I, DL, MVT::i32));
+ };
+ auto VandU32 = [&](SDValue X, SDValue Y) {
+ return DAG.getNode(ISD::AND, DL, VNI32, X, Y);
+ };
+ auto VaddS32 = [&](SDValue X, SDValue Y) {
+ return DAG.getNode(ISD::ADD, DL, VNI32, X, Y);
+ };
+ auto VshrlU32 = [&](SDValue X, SDValue Y) {
+ return DAG.getNode(ISD::SRL, DL, VNI32, X, Y);
+ };
+ auto VshllU32 = [&](SDValue X, SDValue Y) {
+ return DAG.getNode(ISD::SHL, DL, VNI32, X, Y);
+ };
+ auto VcvtS32ToF32 = [&](SDValue X) {
+ return DAG.getNode(ISD::SINT_TO_FP, DL, VNF32, X);
+ };
+
+ // Computes int32(x * y). We use this as a 11 bit x 11 bit -> 22 bit integer
+ // multiplication primitive without losing precision.
+ auto VmulU11 = [&](SDValue Lhs, SDValue Rhs) {
+ auto Product =
+ DAG.getNode(ISD::FMUL, DL, VNF32, VcvtS32ToF32(Lhs), VcvtS32ToF32(Rhs));
+ return DAG.getNode(ISD::FP_TO_SINT, DL, VNI32, Product);
+ };
+
+ auto mask = VSplatImm32(0x7FF);
+ auto k11 = VSplatImm32(11);
+ auto k22 = VSplatImm32(22);
+
+ auto u0 = VandU32(lhs, mask);
+ auto u1 = VandU32(VshrlU32(lhs, k11), mask);
+ auto u2 = VshrlU32(lhs, k22);
+
+ auto v0 = VandU32(rhs, mask);
+ auto v1 = VandU32(VshrlU32(rhs, k11), mask);
+ auto v2 = VshrlU32(rhs, k22);
+
+ auto w0 = VmulU11(u0, v0);
+
+ auto w1 = VmulU11(u1, v0);
+ w1 = VaddS32(w1, VmulU11(u0, v1));
+ w1 = VshllU32(w1, k11);
+
+ auto w2 = VmulU11(u0, v2);
+ w2 = VaddS32(w2, VmulU11(u1, v1));
+ w2 = VaddS32(w2, VmulU11(u2, v0));
+ w2 = VshllU32(w2, k22);
+
+ return VaddS32(VaddS32(w0, w1), w2);
+}
+
+SDValue TPUTargetLowering::LowerADDRSPACECAST(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
+ unsigned SrcAS = ASC->getSrcAddressSpace();
+ unsigned DestAS = ASC->getDestAddressSpace();
+ if ((SrcAS == TPUAS_Smem && DestAS == TPUAS_SmemAny) ||
+ (SrcAS == TPUAS_Hbm && DestAS == TPUAS_HbmAny) ||
+ (SrcAS == TPUAS_Sflag && DestAS == TPUAS_SflagAny) ||
+ (SrcAS == TPUAS_Sflag && DestAS == TPUAS_SflagTile)) {
+ return DAG.getNode(ISD::BITCAST, dl, MVT::i32, ASC->getOperand(0));
+ }
+ if (!TPUVerifierStrictIntoPtr)
+ return DAG.getNode(ISD::BITCAST, dl, MVT::i32, ASC->getOperand(0));
+ report_fatal_error("Unsupported addrspace cast " + Twine(SrcAS) + "->" +
+ Twine(DestAS) + ".\n");
+}
+
+SDValue TPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("unimplemented operand");
+ case ISD::SETCC:
+ return LowerSETCC(Op, DAG);
+ case ISD::GlobalAddress:
+ return LowerGlobalAddress(Op, DAG);
+ case ISD::SDIV:
+ case ISD::SREM:
+ return LowerSDIV32(Op, DAG);
+ case ISD::FDIV:
+ if (!ST->hasVPU())
+ llvm_unreachable("fdiv on scalar core is not supported.");
+ return LowerFDIV32(Op, DAG);
+ case ISD::MUL: {
+ if (Op.getValueType() == MVT::i32)
+ return LowerMUL32(Op, DAG);
+ return LowerVMUL32(Op, DAG);
+ }
+ case ISD::ADDRSPACECAST:
+ return LowerADDRSPACECAST(Op, DAG);
+ case TPUISD::SPLAT:
+ // We're doing some specific type checking, because this is a special case
+ // for MVT::v32i8 when the DAG legalizer tries to promote MVT::i8.
+ if (isTypeLegal(Op->getOperand(0).getValueType()))
+ llvm_unreachable(
+ "This should only happen if the splat element isn't legal.");
+ EVT VT = Op->getOperand(0).getValueType();
+ if (!VT.isSimple() || !VT.isInteger() || VT != MVT::i8)
+ llvm_unreachable("This should only happen on scalar type MVT::i8, "
+ "which is being promoted.");
+ // We're promoting the MVT::i8 Splat element and match it later.
+ return DAG.getNode(
+ TPUISD::SPLAT, SDLoc(Op), Op->getSimpleValueType(0),
+ DAG.getTargetConstant(Op->getConstantOperandAPInt(0).zext(32),
+ SDLoc(Op), MVT::i32));
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+static bool isMaskVT(MVT VT, const TPUSubtarget &ST) {
+ return VT.getScalarType() == MVT::i1 &&
+ /* This check is for real low precision i1 types */
+ VT.getSizeInBits() != 8 * ST.vectorSizeInBytes();
+}
+
+// Custom version of CCInfo.AnalyzeFormalArguments that supports separate
+// scalar and vector stacks. It patches the memory offsets, split across the
+// two stacks, into the ArgLocs.
+static void analyzeFormalArguments(const TPUTargetLowering &TLI,
+ const TPUSubtarget *ST,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ CCState &CCInfo,
+ SmallVector<CCValAssign, 16> &ArgLocs) {
+ int NumBytesScalar = 0;
+ int NumBytesVector = 0;
+ unsigned NumArgs = Ins.size();
+ for (unsigned i = 0; i != NumArgs; ++i) {
+ MVT ArgVT = Ins[i].VT;
+ ISD::ArgFlagsTy ArgFlags = Ins[i].Flags;
+ int PrevNumBytes = CCInfo.getNextStackOffset();
+ if (CC_TPU(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo))
+ report_fatal_error("unable to allocate function argument #" + Twine(i));
+ if (!ST->isTPUABIEnabled() || CCInfo.getCallingConv() == CallingConv::Fast)
+ continue;
+ CCValAssign &CCV = ArgLocs[i];
+ if (int StackOffsetDelta = CCInfo.getNextStackOffset() - PrevNumBytes) {
+ if (ArgVT.isVector()) {
+ assert(ST->hasVPU());
+ // This is a trick using the API in order to adjust the LocMemOffset,
+ // because we have two separate stacks for scalar and vector.
+ if (isMaskVT(ArgVT, *ST)) {
+ int AlignedStackOffsetDelta =
+ alignTo(StackOffsetDelta, ST->vectorSizeInBytes());
+ StackOffsetDelta = AlignedStackOffsetDelta;
+ }
+ assert(StackOffsetDelta == ST->vectorSizeInBytes());
+ CCV.convertToMem(NumBytesVector);
+ NumBytesVector += StackOffsetDelta;
+ } else {
+ // Same comment as above.
+ CCV.convertToMem(NumBytesScalar);
+ NumBytesScalar += StackOffsetDelta;
+ }
+ }
+ }
+}
+
+// Transform physical registers into virtual registers and
+// generate load operations for arguments placed on the stack.
+SDValue TPUTargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ switch (CallConv) {
+ case CallingConv::Fast:
+ case CallingConv::C:
+ break;
+ default:
+ report_fatal_error("Unsupported calling convention");
+ }
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+
+ if (ST->isTPUABIEnabled())
+ RegInfo.addLiveIn(TPU::LR);
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
+ analyzeFormalArguments(*this, ST, Ins, CCInfo, ArgLocs);
+
+ DenseMap<unsigned, SmallVector<Register, 4>> OrigArgToRegLoc;
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ assert(!MF.getFunction().hasStructRetAttr());
+ assert(!IsVarArg);
+ assert(VA.getLocInfo() == CCValAssign::Full);
+ EVT VT = VA.getLocVT();
+ Register VirtReg;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("Unhandled type in call lowering!");
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ case MVT::f32:
+ VirtReg = RegInfo.createVirtualRegister(&TPU::GPRRegClass);
+ break;
+ case MVT::i1:
+ assert(!ST->isTPUABIEnabled());
+ VirtReg = RegInfo.createVirtualRegister(&TPU::PPRRegClass);
+ break;
+ case MVT::x86mmx:
+ assert(ST->hasVPU());
+ VirtReg = RegInfo.createVirtualRegister(&TPU::CBRRegClass);
+ break;
+ case MVT::v8i32:
+ case MVT::v8f32:
+ case MVT::v16bf16:
+ case MVT::v16f16:
+ case MVT::v16i16:
+ case MVT::v32i8:
+ case MVT::v64i4:
+ case MVT::v128i2:
+ case MVT::v256i1:
+ case MVT::v16i32:
+ case MVT::v16f32:
+ case MVT::v32bf16:
+ case MVT::v32f16:
+ case MVT::v32i16:
+ case MVT::v64i8:
+ case MVT::v128i4:
+ case MVT::v256i2:
+ case MVT::v512i1:
+ case MVT::v1024i32:
+ case MVT::v1024f32:
+ assert(ST->hasVPU());
+ if (IsBC && TPU::VAGGRegClass.contains(VA.getLocReg())) {
+ assert(!ST->isTPUABIEnabled());
+ VirtReg = RegInfo.createVirtualRegister(&TPU::VAGGRegClass);
+ } else {
+ VirtReg = RegInfo.createVirtualRegister(&TPU::VPRRegClass);
+ }
+ break;
+ case MVT::v64i1:
+ assert(ST->hasVPU());
+ if (ST->hasV8())
+ llvm_unreachable("Unexpected mask type.");
+ VirtReg = RegInfo.createVirtualRegister(&TPU::MPRRegClass);
+ break;
+ case MVT::v16i1:
+ assert(ST->hasVPU());
+ VirtReg = RegInfo.createVirtualRegister(&TPU::MPRRegClass);
+ break;
+ case MVT::v32i1:
+ assert(ST->hasVPU());
+ if (ST->hasV8() && !HasLPGL)
+ llvm_unreachable("Needs +lp.");
+ VirtReg = RegInfo.createVirtualRegister(&TPU::MPRRegClass);
+ break;
+ case MVT::v8i1:
+ case MVT::v1024i1:
+ assert(ST->hasVPU());
+ VirtReg = RegInfo.createVirtualRegister(&TPU::MPRRegClass);
+ break;
+ }
+ if (VA.isRegLoc()) {
+ OrigArgToRegLoc[Ins[i].getOrigArgIndex()].push_back(VA.getLocReg());
+ RegInfo.addLiveIn(VA.getLocReg(), VirtReg);
+ InVals.push_back(DAG.getCopyFromReg(Chain, DL, VirtReg, VT));
+ } else { // VA.isRegLoc()
+ assert(VA.isMemLoc());
+ assert(!VA.needsCustom());
+ MachineFunction &MF = DAG.getMachineFunction();
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ // In order to make it easier for the callee, the stack pointer in the
+ // caller is incremented such that it points to a free slot in the callee
+ // for the return address. We adjust the argument offsets here accordingly.
+ if (!VA.getValVT().isVector())
+ LocMemOffset += ST->scalarSizeInBytes();
+ unsigned AdjustedLocMemOffset =
+ TPU::adjustForWordSize(
+ APInt(32, LocMemOffset),
+ VA.getValVT().isVector() ? TPUAS_TileSpmem : TPUAS_Smem, *ST)
+ .getZExtValue();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ EVT ArgVT = Ins[i].ArgVT;
+ int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), AdjustedLocMemOffset,
+ /*IsImmutable=*/false);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ unsigned Opcode;
+ SDValue StackPtr;
+ if (isMaskVT(VA.getValVT(), *ST)) {
+ assert(ST->hasVPU());
+ Opcode = TPU::RESTORE_MPRs;
+ StackPtr = DAG.getRegister(TPU::FPV, MVT::i32);
+ } else if (VA.getValVT().isVector()) {
+ assert(ST->hasVPU());
+ Opcode = TPU::RESTORE_VPRs;
+ StackPtr = DAG.getRegister(TPU::FPV, MVT::i32);
+ } else {
+ Opcode = TPU::RESTORE_GPRs;
+ StackPtr = DAG.getRegister(TPU::FPS, MVT::i32);
+ }
+ SmallVector<SDValue, 8> Ops;
+ SDValue TFI = DAG.getTargetFrameIndex(FI, PtrVT);
+ auto PredReg = DAG.getRegister(TPU::Palways, MVT::i1);
+ auto PredInvert = DAG.getTargetConstant(APInt(1, 0), DL, MVT::i1);
+ Ops.push_back(StackPtr);
+ Ops.push_back(TFI);
+ Ops.push_back(PredReg);
+ Ops.push_back(PredInvert);
+ MVT ValVT = VA.getValVT();
+ MachineSDNode *MN = DAG.getMachineNode(
+ Opcode, DL, isMaskVT(ValVT, *ST) ? VMNI1 : ValVT, Ops);
+ auto *MemRef = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
+ MachineMemOperand::MOLoad, /*s=*/4,
+ /*base_alignment=*/llvm::Align(4));
+ DAG.setNodeMemRefs(MN, {MemRef});
+ SDValue Arg = SDValue(MN, 0);
+ InVals.push_back(Arg);
+ }
+ }
+
+ if (IsBC) {
+ // On BarnaCore, we obtain aggregates as function inputs and refer to them
+ // by their base register throughout the function. We need to block the
+ // register allocator from clobbering them. Aggregates are identified by
+ // multiple registers having the same input argument index.
+ TPUMachineFunctionInfo &MFInfo = *MF.getInfo<TPUMachineFunctionInfo>();
+ for (auto &Range : OrigArgToRegLoc) {
+ if (Range.second.size() == 1)
+ continue;
+ // Note that we rely on the range already being sorted from above.
+ MFInfo.addBarnaCoreAggregateRange(Range.second.front() - TPU::VAGG0,
+ Range.second.back() - TPU::VAGG0 + 1);
+ }
+ }
+ return Chain;
+}
+
+SDValue
+TPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &DL, SelectionDAG &DAG) const {
+ // CCValAssign - represent the assignment of the return value to a location
+ SmallVector<CCValAssign, 16> RVLocs;
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ // CCState - Info about the registers and stack slot.
+ CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext());
+
+ // Analyze return values.
+ CCInfo.AnalyzeReturn(Outs, RetCC_TPU);
+ SmallVector<SDValue, 4> RetOps(1, Chain);
+
+ // Copy the result values into the output registers.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ // FIXME(b/237788792): Finalize return ABI.
+ assert(VA.isRegLoc() && "Can only return in registers!");
+ assert(!VA.needsCustom());
+ Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), OutVals[i], Chain);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ }
+
+ RetOps[0] = Chain; // Update chain
+
+ // We're checking the call graph here and setting whether or not a function is
+ // an entry function. At least on our system, this is good enough.
+ TPUMachineFunctionInfo &MFInfo = *MF.getInfo<TPUMachineFunctionInfo>();
+ // Ugly cast, CallGraph should really take a const Module. FIXME(hgreving):
+ // maybe try to change upstream. The cast here is safe because nobody will
+ // change the Module.
+ CallGraph CG(*const_cast<Module *>(MF.getMMI().getModule()));
+ const CallGraphNode *CGN = CG[&MF.getFunction()];
+ // There's always at least one null node referencing the function.
+ if (CGN->getNumReferences() == 1)
+ MFInfo.setIsTopLevel(true);
+ else
+ MFInfo.setIsTopLevel(false);
+
+ if (!ST->isTPUABIEnabled() || MFInfo.isTopLevel())
+ return DAG.getNode(TPUISD::HALT, DL, MVT::Other,
+ ArrayRef<SDValue>(&RetOps[0], RetOps.size()));
+ return DAG.getNode(TPUISD::RET, DL, MVT::Other,
+ ArrayRef<SDValue>(&RetOps[0], RetOps.size()));
+}
+
+//===----------------------------------------------------------------------===//
+// Custom Lowerings
+//===----------------------------------------------------------------------===//
+
+SDValue TPUTargetLowering::PerformSCALAR_TO_VECTORCombine(
+ SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
+ const SDValue &Val = N->getOperand(0);
+ MVT VecVT = N->getSimpleValueType(0);
+
+ return DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VecVT, Val);
+}
+
+SDValue TPUTargetLowering::PerformINSERT_VECTOR_ELTCombine(
+ SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
+ const SDValue &Vec = N->getOperand(0);
+ const SDValue &Val = N->getOperand(1);
+
+ auto PredReg = DCI.DAG.getRegister(TPU::Palways, MVT::i1);
+ auto PredInvert = DCI.DAG.getTargetConstant(APInt(1, 0), SDLoc(N), MVT::i1);
+ MVT VecVT = N->getSimpleValueType(0);
+
+ SDValue SplatVal = DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VecVT, Val);
+
+ SmallVector<SDValue, 8> Ops;
+ SDValue Vseq = SDValue(DCI.DAG.getMachineNode(TPU::VLANESEQ, SDLoc(N), VNI32,
+ PredReg, PredInvert),
+ 0);
+ Ops.push_back(Vseq);
+
+ SDValue Mask;
+ if (const ConstantSDNode *Idx =
+ dyn_cast<ConstantSDNode>(N->getOperand(2).getNode())) {
+ Ops.push_back(DCI.DAG.getTargetConstant(*Idx->getConstantIntValue(),
+ SDLoc(N), MVT::i32));
+ Ops.push_back(PredReg);
+ Ops.push_back(PredInvert);
+ Mask =
+ SDValue(DCI.DAG.getMachineNode(TPU::VMLANEi, SDLoc(N), VMNI1, Ops), 0);
+ } else {
+ Ops.push_back(SDValue(cast<SDNode>(N->getOperand(2).getNode()), 0));
+ Ops.push_back(PredReg);
+ Ops.push_back(PredInvert);
+ Mask =
+ SDValue(DCI.DAG.getMachineNode(TPU::VMLANEr, SDLoc(N), VMNI1, Ops), 0);
+ }
+ return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), VecVT, Mask, SplatVal, Vec);
+}
+
+bool TPUTargetLowering::isNonNaNFPConstSplat(SDValue N) const {
+ if (N->getOpcode() == TPUISD::SPLAT) {
+ if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+ return !CN->isNaN();
+ }
+ return false;
+}
+
+EVT TPUTargetLowering::getOptimalMemOpType(
+ const MemOp &Op, const AttributeList &FuncAttributes) const {
+ // We're returning something that makes sense, though it is useless since we
+ // neither know the memory space, nor can we let selection DAG do the LLVM
+ // MemOp lowering. See the header file for an explanation.
+ return VNI32;
+}
+
+SDValue TPUTargetLowering::PerformSETCCCombine(
+ SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
+ // We help the DAG combiner by recognizing ordered setcc of splats that can't
+ // be NaN. LLVM can do that if BUILD_VECTOR, but we combine early into SPLAT,
+ // hence this code.
+ if (!isNonNaNFPConstSplat(N->getOperand(0)) ||
+ !isNonNaNFPConstSplat(N->getOperand(1)))
+ return SDValue();
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ // TODO(hgreving): what about SETO?
+ ISD::CondCode NoNaNCC = getFCmpCodeWithoutNaN(CC);
+ if (NoNaNCC != CC)
+ return DCI.DAG.getSetCC(SDLoc(N), N->getSimpleValueType(0),
+ N->getOperand(0), N->getOperand(1), NoNaNCC);
+ return SDValue();
+}
+
+SDValue TPUTargetLowering::getSupportedVCMask(SelectionDAG &DAG, int VectorMask,
+ SDLoc Loc) const {
+ if (!ST->hasVCMasks() || !GenerateTpuVCMasks)
+ return SDValue();
+ int MaskSizeInBits = EVT(VMNI1).getSizeInBits();
+ int FullMask = (1 << MaskSizeInBits) - 1;
+ // Technically `< MaskSizeInBits` would be enough because a full mask should
+ // be covered by embedded masks.
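+ //
+ // For example, with MaskSizeInBits = 8, VectorMask = 0b00111000 matches a
+ // run of i = 3 ones rotated left by j = 3, giving S = 3 * 4 = 12,
+ // E = ((3 + 3 - 1) % 8) * 4 + 3 = 23, and the VCMASKi immediate
+ // (23 << 8) | 12.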
+ for (int i = 1; i <= MaskSizeInBits; i++) {
+ int CompareMask = (1 << i) - 1;
+ for (int j = 0; j < MaskSizeInBits; j++) {
+ int RotCompareMask =
+ (CompareMask << j | CompareMask >> (MaskSizeInBits - j)) & FullMask;
+ if (VectorMask == RotCompareMask) {
+ int S = j * 4;
+ int E = ((i + j - 1) % MaskSizeInBits) * 4 + 3;
+ assert(S < EVT(VMNI1).getSizeInBits() * 4);
+ assert(E < EVT(VMNI1).getSizeInBits() * 4);
+ auto PredReg = DAG.getRegister(TPU::Palways, MVT::i1);
+ auto PredInvert = DAG.getTargetConstant(APInt(1, 0), Loc, MVT::i1);
+ return SDValue(
+ DAG.getMachineNode(
+ TPU::VCMASKi, Loc, VMNI1,
+ DAG.getTargetConstant(APInt(32, E << 8 | S), Loc, MVT::i32),
+ PredReg, PredInvert),
+ 0);
+ }
+ }
+ }
+ return SDValue();
+}
+
+SDValue TPUTargetLowering::getSupportedVCMask(SelectionDAG &DAG,
+ SDNode *N) const {
+ if (!ST->hasVCMasks() || !GenerateTpuVCMasks)
+ return SDValue();
+ int MaskSizeInBits = EVT(VMNI1).getSizeInBits();
+ if (N->getNumOperands() != MaskSizeInBits)
+ return SDValue();
+ int BuildVectorMask = 0;
+ for (int i = 0; i < MaskSizeInBits; i++) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(i));
+ if (C == nullptr)
+ return SDValue();
+ BuildVectorMask |= C->getZExtValue() << i;
+ }
+ return getSupportedVCMask(DAG, BuildVectorMask, SDLoc(N));
+}
+
+SDValue TPUTargetLowering::PerformBUILD_VECTORCombine(
+ SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
+ // Combine a BUILD_VECTOR(42, 42, 42, 42, ...) -> SPLAT(42)
+ MVT VecVT = N->getSimpleValueType(0);
+ if (VecVT != VNI32 && VecVT != VNF32 && VecVT != VMNI1) {
+ if (!HasLPGL)
+ return SDValue();
+ if (VecVT != VNBF16 && VecVT != VNI8)
+ return SDValue();
+ }
+ MVT ScalarVT;
+ if (VecVT == VNI32)
+ ScalarVT = MVT::i32;
+ else if (VecVT == VNF32)
+ ScalarVT = MVT::f32;
+ else if (VecVT == VNBF16)
+ ScalarVT = MVT::bf16;
+ else if (VecVT == VNI8)
+ ScalarVT = MVT::i8;
+ else if (VecVT == VMNI1)
+ // Low precision build_vector masks are currently not supported.
+ ScalarVT = MVT::i1;
+ else
+ llvm_unreachable("Bad vector ty!");
+
+ // Checking for supported embedded hardware masks. I would have preferred to
+ // do this in tablegen, and this would be possible with something like this:
+ //
+ // def tpuvm17 : PatLeaf<(build_vector), [{
+ // return isMask7f(N);
+ // }]>;
+ //
+ // let Predicates = [HasV8,NotBC] in {
+ // def : Pat<(vNi1 (Splat -1)), (COPY !cast<TPUReg>("M16"))>;
+ // def : Pat<(vNi1 (tpuvm17)), (COPY !cast<TPUReg>("M17"))>;
+ //
+ // However, since we already combine BUILD_VECTOR here, we would have to check
+ // for the embedded masks here anyway and potentially bail out of the combine.
+ // Additionally, it is harder to turn on/off the feature in tablegen. Lastly,
+ // we may run into cases with instructions not supporting the special mask, in
+ // which case we probably want to legalize them, and this will be easier if we
+ // combine the hardware mask here. All of the above is the reason why the code
+ // is here, and not in tablegen.
+ //
+ if (ScalarVT == MVT::i1) {
+ Register EmbeddedMask = getSupportedEmbeddedMask(N);
+ if (EmbeddedMask != TPU::NoRegister)
+ return DCI.DAG.getCopyFromReg(DCI.DAG.getEntryNode(), SDLoc(N),
+ EmbeddedMask, VMNI1);
+ SDValue VMCreate = getSupportedVCMask(DCI.DAG, N);
+ if (VMCreate.getNode())
+ return VMCreate;
+ }
+
+ unsigned VecSize = MVT(VecVT).getVectorNumElements();
+ bool IsSplat = true;
+ bool IsVlaneSeq = true;
+ assert(N->getNumOperands() == VecSize);
+ SDValue Val0 = N->getOperand(0);
+ int IC = -1;
+ if (Val0.getSimpleValueType() != ScalarVT)
+ return SDValue();
+ for (unsigned I = 0; I < VecSize; ++I) {
+ if (N->getOperand(I) != Val0 && !N->getOperand(I).isUndef())
+ IsSplat = false;
+ ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(N->getOperand(I));
+ if (!ValC) {
+ IsVlaneSeq = false;
+ continue;
+ }
+ if (ValC->getZExtValue() != IC++ + 1)
+ IsVlaneSeq = false;
+ if (!IsVlaneSeq && !IsSplat)
+ break;
+ }
+
+ if (IsSplat)
+ return DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VecVT, Val0);
+
+ auto PredReg = DCI.DAG.getRegister(TPU::Palways, MVT::i1);
+ auto PredInvert = DCI.DAG.getTargetConstant(APInt(1, 0), SDLoc(N), MVT::i1);
+
+ if (IsVlaneSeq)
+ return SDValue(DCI.DAG.getMachineNode(TPU::VLANESEQ, SDLoc(N), VNI32,
+ PredReg, PredInvert),
+ 0);
+
+ // BUILD_VECTOR(a, b, c, d, ...) -> VSEL(Splat(a), ...)
+ // This is really ugly but is the only way :(
+
+ // Pick an initial splat value.
+ SDValue InitialSplatted = N->getOperand(VecSize - 1);
+ SDValue V = DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VecVT, InitialSplatted);
+ for (unsigned I = 0; I < VecSize; ++I) {
+ if (N->getOperand(I)->isUndef() || N->getOperand(I) == InitialSplatted)
+ continue;
+ SDValue SplatVal =
+ DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VecVT, N->getOperand(I));
+
+ SDValue VMCreate = getSupportedVCMask(DCI.DAG, 1 << I, SDLoc(N));
+ SDValue Mask;
+ if (VMCreate.getNode()) {
+ Mask = VMCreate;
+ } else {
+ SmallVector<SDValue, 8> Ops;
+ SDValue Vseq = SDValue(DCI.DAG.getMachineNode(TPU::VLANESEQ, SDLoc(N),
+ VNI32, PredReg, PredInvert),
+ 0);
+ Ops.push_back(Vseq);
+ Ops.push_back(DCI.DAG.getTargetConstant(I, SDLoc(N), MVT::i32));
+ Ops.push_back(PredReg);
+ Ops.push_back(PredInvert);
+ Mask = SDValue(DCI.DAG.getMachineNode(TPU::VMLANEi, SDLoc(N), VMNI1, Ops),
+ 0);
+ }
+
+ // And use that mask to select-in this value.
+ V = DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), VecVT, Mask, SplatVal, V);
+ }
+ return V;
+}
+
+SDValue TPUTargetLowering::PerformVECTOR_SHUFFLECombine(
+ ShuffleVectorSDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
+ // Combine a VECTOR_SHUFFLE(1, 2, 3, 4, 5, 6, 7, 0) -> VROTDOWN()
+ // or VECTOR_SHUFFLE(VECTOR_INSERT(x,y, n), n, n, ...) -> VSPLAT(y)
+ // or VECTOR_SHUFFLE(x, x, x, x, x, x, x, x) -> VSPLAT(VROTDOWN())
+
+ MVT VecVT = N->getSimpleValueType(0);
+ if (VecVT != VNI32 && VecVT != VNF32 && VecVT != VMNI1)
+ return SDValue();
+ assert(N->getNumOperands() == 2);
+ SDValue Val = N->getOperand(0);
+
+ unsigned VecSize = MVT(VecVT).getVectorNumElements();
+ bool IsSequence = true;
+ bool IsSame = true;
+ unsigned Offset = N->getMaskElt(0);
+ for (unsigned I = 0; I < VecSize; ++I) {
+ if (N->getMaskElt(I) != (I + Offset) % VecSize)
+ IsSequence = false;
+ if (N->getMaskElt(I) != Offset)
+ IsSame = false;
+ }
+
+ bool NeedsTrunc = false;
+ if (VecVT == VMNI1) {
+ Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VNI32, Val);
+ VecVT = VNI32;
+ NeedsTrunc = true;
+ }
+
+ // Helper function to truncate the result if we extended the operation
+ // from i1.
+ auto TruncateReturnIfNeed = [&](SDValue V) {
+ if (NeedsTrunc)
+ return DCI.DAG.getNode(ISD::TRUNCATE, SDLoc(N), VMNI1, V);
+ return V;
+ };
+
+ if (IsSequence && ST->isSparseCore())
+ return TruncateReturnIfNeed(
+ DCI.DAG.getNode(TPUISD::VROTDOWN, SDLoc(N), VecVT, Val,
+ DCI.DAG.getConstant(Offset, SDLoc(N), MVT::i32)));
+
+ if (!IsSame && ST->isSparseCore()) {
+ // SparseCore has a vector permute that permutes the elements into all lanes
+ // of a vector based on a vector mask.
+ SmallVector<SDValue, 8> MaskElements;
+ for (int El : N->getMask())
+ MaskElements.push_back(DCI.DAG.getConstant(El, SDLoc(N), MVT::i32));
+ SDValue VMask =
+ DCI.DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VNI32, MaskElements);
+ return TruncateReturnIfNeed(
+ DCI.DAG.getNode(TPUISD::VPERMUTE, SDLoc(N), VecVT, Val, VMask));
+ }
+
+ if (!IsSame)
+ return SDValue();
+
+ // On TensorCore we cannot use rotdown to move an arbitrary element into lane 0.
+ if (!ST->isSparseCore() && Offset != 0)
+ return SDValue();
+
+ MVT ScalarVT = VecVT == VNI32 ? MVT::i32 : MVT::f32;
+ // If the replicated value comes from an insert, splat the original value
+ // directly.
+ if (N->getOperand(0).getOpcode() == ISD::INSERT_VECTOR_ELT) {
+ SDNode *ExtractElt = cast<SDNode>(N->getOperand(0));
+ const ConstantSDNode *Idx =
+ cast<ConstantSDNode>(ExtractElt->getOperand(2).getNode());
+ if (Idx->getConstantIntValue()->getZExtValue() == Offset) {
+ SDValue ExtractedVal = ExtractElt->getOperand(1);
+ MVT ExtractedSplatVT = NeedsTrunc ? VMNI1 : VecVT;
+ return DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), ExtractedSplatVT,
+ ExtractedVal);
+ }
+ }
+ if (ST->hasBroadcast()) {
+ // SparseCore has a vector broadcast that broadcasts the element at Offset
+ // into all lanes of a vector without traversing the scalar side.
+ return TruncateReturnIfNeed(
+ DCI.DAG.getNode(TPUISD::VBROADCAST, SDLoc(N), VecVT, Val,
+ DCI.DAG.getConstant(Offset, SDLoc(N), MVT::i32)));
+ }
+ // Extract the splatted value from the vector and re-splat it.
+ // Rotate the vector if the offset is not zero.
+ if (Offset != 0) {
+ Val = DCI.DAG.getNode(TPUISD::VROTDOWN, SDLoc(N), VecVT, Val,
+ DCI.DAG.getConstant(Offset, SDLoc(N), MVT::i32));
+ }
+ Val = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ScalarVT, Val,
+ DCI.DAG.getConstant(0, SDLoc(N), MVT::i32));
+ Val = DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VecVT, Val);
+ return TruncateReturnIfNeed(Val);
+}
+
+SDValue TPUTargetLowering::PerformVSELECTCombine(
+ SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
+ if (N->getValueType(0) != VMNI1)
+ return SDValue();
+ SDValue Cond = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ SDValue Op2 = N->getOperand(2);
+ if (Op1.getOpcode() == TPUISD::SPLAT && Op2.getOpcode() == TPUISD::SPLAT &&
+ isa<ConstantSDNode>(Op1->getOperand(0)) &&
+ isa<ConstantSDNode>(Op2->getOperand(0))) {
+ bool TrueVal = cast<ConstantSDNode>(Op1->getOperand(0))->getLimitedValue();
+ bool FalseVal = cast<ConstantSDNode>(Op2->getOperand(0))->getLimitedValue();
+
+ if (TrueVal == FalseVal)
+ // select(C, X, X) -> X
+ return Op1;
+ if (TrueVal == true && FalseVal == false)
+ // select(C, 1, 0) -> C
+ return Cond;
+ assert(TrueVal == false && FalseVal == true);
+ // select(C, 0, 1) -> !C === C xor -1
+ return DCI.DAG.getNode(ISD::XOR, SDLoc(N), VMNI1, Cond, Op2);
+ }
+
+ // select(C, X, Y) -> (C & X) | (~C & Y)
+ SDValue CAndX = DCI.DAG.getNode(ISD::AND, SDLoc(N), VMNI1, Cond, Op1);
+ SDValue NotC = DCI.DAG.getNode(
+ ISD::XOR, SDLoc(N), VMNI1, Cond,
+ DCI.DAG.getNode(TPUISD::SPLAT, SDLoc(N), VMNI1,
+ DCI.DAG.getConstant(-1, SDLoc(N), MVT::i1)));
+ SDValue NotCAndY = DCI.DAG.getNode(ISD::AND, SDLoc(N), VMNI1, NotC, Op2);
+ return DCI.DAG.getNode(ISD::OR, SDLoc(N), VMNI1, CAndX, NotCAndY);
+}
+
+SDValue TPUTargetLowering::PerformBcInsertValueCombine(
+ SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
+ // Combine llvm.tpu.bc.insertvalue.loopindex -> BC_INSERTVALUE.
+ // The intrinsic takes an array and returns an array. This is lowered to
+ // %a = merge_values a0,a1,a2,...,an-1
+ // %b1,b2,...,bn-1 = @llvm.tpu.bc.insertvalue.loopindex %a, %c
+ //
+ // We don't care about the values of any physical registers. We've already
+ // reserved a block of registers for this aggregate, all we need to do is
+ // keep the zeroth register to plumb through as the base value.
+ //
+ // Here we replace the intrinsic with a BC_INSERTVALUE of the base register
+ // and a MERGE_VALUES result, with the base register in value 0 and the rest
+ // UNDEF. The optimizer will then clean things up.
+
+ SDLoc DL(N);
+ SDValue BaseReg = N->getOperand(1);
+ SDValue InsertedValue = N->getOperand(2);
+ EVT VT = BaseReg.getValueType();
+ SDValue NewN =
+ DCI.DAG.getNode(TPUISD::BC_INSERTVALUE, DL, VT, BaseReg, InsertedValue);
+ SmallVector<SDValue, 4> Vs(N->getNumValues(), DCI.DAG.getUNDEF(VT));
+ Vs[0] = NewN;
+ return DCI.DAG.getMergeValues(Vs, DL);
+}
+
+SDValue TPUTargetLowering::PerformBcExtractValueCombine(
+ SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
+ // Combine llvm.tpu.bc.extractvalue.loopindex -> BC_EXTRACTVALUE.
+ // The intrinsic takes an array and returns a vector. This is lowered to
+ // %a = merge_values a0,a1,a2,...,an-1
+ // %b:v8f32 = @llvm.tpu.bc.extractvalue.loopindex %a
+ //
+ // We don't care about the values of any physical registers. We've already
+ // reserved a block of registers for this aggregate, all we need to do is
+ // keep the zeroth register to plumb through as the base value.
+ //
+ // We're already accessing MERGE_VALUES:0, so just rewrite in place.
+
+ SDLoc DL(N);
+ SDValue BaseReg = N->getOperand(1);
+ EVT VT = BaseReg.getValueType();
+ return DCI.DAG.getNode(TPUISD::BC_EXTRACTVALUE, DL, VT, BaseReg);
+}
+
+SDValue TPUTargetLowering::PerformPtrToIntCombine(SDNode *N) const {
+ return N->getOperand(1);
+}
+
+const char *TPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch (Opcode) {
+ default:
+ return "<TPU unknown opcode>";
+ case TPUISD::HALT:
+ return "TPUISD::HALT";
+ case TPUISD::VROTDOWN:
+ return "TPUISD::VROTDOWN";
+ case TPUISD::VBROADCAST:
+ return "TPUISD::VBROADCAST";
+ case TPUISD::VPERMUTE:
+ return "TPUISD::VPERMUTE";
+ case TPUISD::SPLAT:
+ return "TPUISD::SPLAT";
+ case TPUISD::WRAPPER:
+ return "TPUISD::WRAPPER";
+ case TPUISD::BC_INSERTVALUE:
+ return "TPUISD::BC_INSERTVALUE";
+ case TPUISD::BC_EXTRACTVALUE:
+ return "TPUISD::BC_EXTRACTVALUE";
+ case TPUISD::UMUL24:
+ return "TPUISD::UMUL24";
+ case TPUISD::CALL:
+ return "TPUISD::CALL";
+ }
+}
+
+SDValue TPUTargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ switch (N->getOpcode()) {
+ case ISD::BUILD_VECTOR:
+ return PerformBUILD_VECTORCombine(N, DCI);
+ case ISD::VECTOR_SHUFFLE:
+ return PerformVECTOR_SHUFFLECombine(cast<ShuffleVectorSDNode>(N), DCI);
+ case ISD::INSERT_VECTOR_ELT:
+ return PerformINSERT_VECTOR_ELTCombine(N, DCI);
+ case ISD::SCALAR_TO_VECTOR:
+ return PerformSCALAR_TO_VECTORCombine(N, DCI);
+ case ISD::VSELECT:
+ return PerformVSELECTCombine(N, DCI);
+ case ISD::INTRINSIC_WO_CHAIN:
+ switch (N->getConstantOperandVal(0)) {
+ default:
+ return SDValue();
+ case Intrinsic::tpu_bc_insertvalue_loopindex:
+ return PerformBcInsertValueCombine(N, DCI);
+ case Intrinsic::tpu_bc_extractvalue_loopindex:
+ return PerformBcExtractValueCombine(N, DCI);
+ case Intrinsic::tpu_inttoptr:
+ case Intrinsic::tpu_ptrtoint:
+ return PerformPtrToIntCombine(N);
+ }
+ case ISD::SETCC:
+ return PerformSETCCCombine(N, DCI);
+ default:
+ break;
+ }
+
+ return SDValue();
+}
+
+std::optional<bool>
+TPUTargetLowering::IsFifoAccess(MachineInstr &MI,
+ const TargetRegisterClass *RegClass) const {
+ const MCInstrDesc &MCID = TII->get(MI.getOpcode());
+ for (auto I = MCID.opInfo_begin(), IE = MCID.opInfo_end(); I != IE; I++) {
+ if (I->RegClass == RegClass->getID()) {
+ // For a push instruction, the destination register needs to match the
+ // given reg class. For a pop instruction, one of the operands needs to
+ // match the given reg class.
+ if (I == MCID.opInfo_begin())
+ return false;
+ else if (I != MCID.opInfo_begin())
+ return true;
+ }
+ }
+ return std::nullopt;
+}
+
+bool TPUTargetLowering::UsesSpecialReg(
+ MachineInstr &MI, const TargetRegisterClass *RegClass) const {
+ const MCInstrDesc &MCID = TII->get(MI.getOpcode());
+ for (auto I = MCID.opInfo_begin(), IE = MCID.opInfo_end(); I != IE; I++) {
+ if (I->RegClass == RegClass->getID()) {
+ return true;
+ }
+ }
+ return false;
+}
+
+Register TPUTargetLowering::getSupportedEmbeddedMask(SDNode *N) const {
+ if (!ST->hasEmbeddedMasks() || !PropagateTpuEmbeddedMasks)
+ return TPU::NoRegister;
+ assert(N->getOpcode() == ISD::BUILD_VECTOR);
+ // See e.g. go/vfc-sc-isa#vector-modify-mask-instructions.
+ DenseMap<int, Register> SupportedEmbeddedMasks{
+ {0xff, TPU::M16}, {0x7f, TPU::M17}, {0x3f, TPU::M18}, {0x1f, TPU::M19},
+ {0xf, TPU::M20}, {0x7, TPU::M21}, {0x3, TPU::M22}, {0x1, TPU::M23},
+ };
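+ // Bit i of each key corresponds to operand i of the BUILD_VECTOR, so e.g. a
+ // build_vector with lanes 0 through 6 set and lane 7 clear encodes 0x7f and
+ // maps to the embedded mask register M17.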
+ int MaskSizeInBits = EVT(VMNI1).getSizeInBits();
+ if (N->getNumOperands() != MaskSizeInBits)
+ return TPU::NoRegister;
+ auto MatchesBitMask = [MaskSizeInBits, N](int BitMask) {
+ for (int i = 0; i < MaskSizeInBits; i++) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(i));
+ if (C == nullptr)
+ return false;
+ if (C->getZExtValue() != ((BitMask >> i) & 0x1))
+ return false;
+ }
+ return true;
+ };
+ for (auto &KV : SupportedEmbeddedMasks) {
+ if (MatchesBitMask(KV.first))
+ return KV.second;
+ }
+ return TPU::NoRegister;
+}
+
+void TPUTargetLowering::SetDependency(MachineInstr &MI, MachineBasicBlock *MBB,
+ const TargetRegisterClass *RegClass,
+ bool IsPush) const {
+ const TPUTargetMachine &TM =
+ static_cast<const TPUTargetMachine &>(MBB->getParent()->getTarget());
+ MachinePointerInfo MPI(TM.getFifoPSV(IsPush, RegClass));
+ auto *MemRef = MBB->getParent()->getMachineMemOperand(
+ MPI, MachineMemOperand::MOLoad | MachineMemOperand::MOStore, /*s=*/4,
+ /*base_alignment=*/llvm::Align(4));
+ MI.addMemOperand(*MBB->getParent(), MemRef);
+}
+
+// DWG needs dependencies with all matmuls.
+// The first matmul after a DWG needs dependencies with all matpushes.
+// DWG can be re-ordered across matpush instructions.
+// This function adds the memory operands to enforce this ordering.
+MachineBasicBlock *TPUTargetLowering::SetDWGDep(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ MachineRegisterInfo &RegInfo = MBB->getParent()->getRegInfo();
+ SmallDenseSet<MachineInstr *, 32> DWGUses;
+ Register Dst = MI.getOperand(0).getReg();
+ for (MachineInstr &MIUser : RegInfo.use_instructions(Dst)) {
+ assert(MIUser.getParent() == MBB &&
+ "matmul uses DWG from a different block; this case is currently "
+ "not supported");
+ DWGUses.insert(&MIUser);
+ }
+ if (DWGUses.empty())
+ return MBB;
+ auto E = MBB->end();
+ MachineInstr *FirstMatMul = nullptr;
+ for (auto I = MI.getIterator(); I != E; I++) {
+ if (DWGUses.count(&(*I)) > 0) {
+ FirstMatMul = &(*I);
+ break;
+ }
+ }
+ assert(FirstMatMul != nullptr && "didn't find any matmul");
+ // The first MatMul needs to have an explicit dependency with gsfn as it
+ // triggers the copy from gsfn/gsft to gmr. This means the following push
+ // cannot be re-ordered across the first matmul.
+ const TargetRegisterClass *GSFNRegClass =
+ RegInfo.getRegClass(MI.getOperand(1).getReg());
+ SetDependency(*FirstMatMul, MBB, GSFNRegClass);
+ // DWG cannot be re-ordered across any matmul instruction so add a dependency
+ // to push MRF to represent that.
+ const TargetRegisterClass *MRFRegClass =
+ RegInfo.getRegClass(FirstMatMul->getOperand(0).getReg());
+ SetDependency(MI, MBB, MRFRegClass, /*isPush=*/true);
+ return MBB;
+}
+
+MachineBasicBlock *
+TPUTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ // Generic handling of instructions that need dependencies set.
+ if (static_cast<const TPUInstrInfo *>(TII)->isDWGInst(MI)) {
+ return SetDWGDep(MI, MBB);
+ }
+ bool IsSpecialRegAccess = false;
+ for (auto Fifo : FifoClasses) {
+ if (auto IsPop = IsFifoAccess(MI, Fifo)) {
+ SetDependency(MI, MBB, Fifo, !*IsPop);
+ IsSpecialRegAccess = true;
+ }
+ }
+ for (auto ImplicitReg : SpecialStagingReg) {
+ if (UsesSpecialReg(MI, ImplicitReg)) {
+ SetDependency(MI, MBB, ImplicitReg);
+ IsSpecialRegAccess = true;
+ }
+ }
+ // Instructions with special register accesses only need to be modified to
+ // have an extra pseudo source.
+ if (IsSpecialRegAccess)
+ return MBB;
+
+ auto &ST = MI.getMF()->getSubtarget<TPUSubtarget>();
+ unsigned PopOpcode = TPU::SPOP_V2SF;
+ const TargetRegisterClass *RegClass = &TPU::V2SFPRRegClass;
+ if (ST.hasVfcTensorCore()) {
+ PopOpcode = TPU::SPOP_SFRF;
+ RegClass = &TPU::SFRFPRRegClass;
+ }
+
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unknown instruction for custom emission!");
+ case TPU::VROTDOWNri:
+ return EmitVROTDOWN(MI, MBB);
+ case TPU::VFREADi:
+ return EmitVecOrSFlagToScalar(
+ MI, MBB, ST.hasVfcTensorCore() ? TPU::tcvfVSYNCMOVEi : TPU::VSYNCMOVEi,
+ 1, PopOpcode, RegClass);
+ case TPU::VFREADr:
+ return EmitVecOrSFlagToScalar(
+ MI, MBB, ST.hasVfcTensorCore() ? TPU::tcvfVSYNCMOVEr : TPU::VSYNCMOVEr,
+ 1, PopOpcode, RegClass);
+ case TPU::VFREADDONEi:
+ return EmitVecOrSFlagToScalar(
+ MI, MBB,
+ ST.hasVfcTensorCore() ? TPU::tcvfVSYNCMOVEDONEi : TPU::VSYNCMOVEDONEi,
+ 1, PopOpcode, RegClass);
+ case TPU::VFREADDONEr:
+ return EmitVecOrSFlagToScalar(
+ MI, MBB,
+ ST.hasVfcTensorCore() ? TPU::tcvfVSYNCMOVEDONEr : TPU::VSYNCMOVEDONEr,
+ 1, PopOpcode, RegClass);
+ case TPU::VFREADPAi:
+ return EmitVecOrSFlagToScalar(MI, MBB, TPU::tcvfVSYNCMOVEPAi, 1, PopOpcode,
+ RegClass);
+ case TPU::VFREADPAr:
+ return EmitVecOrSFlagToScalar(MI, MBB, TPU::tcvfVSYNCMOVEPAr, 1, PopOpcode,
+ RegClass);
+ case TPU::VREAD:
+ assert(!IsSC);
+ return EmitVecOrSFlagToScalar(MI, MBB, TPU::VPUSH, 1, TPU::SPOP_V2SF,
+ &TPU::V2SFPRRegClass);
+ case TPU::scVREADi:
+ assert(IsSC);
+ return EmitVecOrSFlagToScalar(MI, MBB, TPU::scVPUSHi, 2, PopOpcode,
+ RegClass);
+ case TPU::scVREADr:
+ assert(IsSC);
+ return EmitVecOrSFlagToScalar(MI, MBB, TPU::scVPUSHr, 2, PopOpcode,
+ RegClass);
+ case TPU::VMREAD:
+ return EmitVmread(MI, MBB);
+ }
+}
+
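+// Expands a "vector or sync-flag to scalar" pseudo into an explicit push/pop
+// pair through a FIFO. A rough sketch of the emitted sequence:
+//   %fifo = <PushOpcode> <inputs...>   ; push into a register of RegClass
+//   <dst>  = <PopOpcode>  %fifo<kill>  ; pop the result into the original dst
+// Both instructions get FIFO PSV memory operands so that they stay ordered.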
+MachineBasicBlock *TPUTargetLowering::EmitVecOrSFlagToScalar(
+ MachineInstr &MI, MachineBasicBlock *MBB, unsigned PushOpcode,
+ int NumOfInputs, unsigned PopOpcode,
+ const TargetRegisterClass *RegClass) const {
+ auto &MRI = MBB->getParent()->getRegInfo();
+ auto InsertPt = MI.getIterator();
+
+ const unsigned FifoReg = MRI.createVirtualRegister(RegClass);
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, InsertPt, MI.getDebugLoc(), TII->get(PushOpcode), FifoReg);
+ for (int i = 1; i <= NumOfInputs; i++)
+ MIB.add(MI.getOperand(i));
+ MachineInstr *Push = AddDefaultPred(MIB);
+ MachineInstr *Pop =
+ AddDefaultPred(BuildMI(*MBB, InsertPt, MI.getDebugLoc(),
+ TII->get(PopOpcode), MI.getOperand(0).getReg())
+ .addReg(FifoReg, getKillRegState(true)));
+ MI.eraseFromParent();
+
+ for (auto &I : {Push, Pop}) {
+ const TPUTargetMachine &TM =
+ static_cast<const TPUTargetMachine &>(MBB->getParent()->getTarget());
+ MachinePointerInfo MPI(TM.getFifoPSV(I == Push, RegClass));
+ auto *MemRef = MBB->getParent()->getMachineMemOperand(
+ MPI, MachineMemOperand::MOLoad | MachineMemOperand::MOStore, /*s=*/4,
+ /*base_alignment=*/llvm::Align(4));
+ I->addMemOperand(*MBB->getParent(), MemRef);
+ }
+ return MBB;
+}
+
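+// Lowers VMREAD by materializing a zero vector and selecting against it,
+// roughly dst = select(mask, splat(1), splat(0)) per lane. This is only a
+// sketch of the VIMMI + VSELir expansion emitted below.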
+MachineBasicBlock *TPUTargetLowering::EmitVmread(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ auto &MRI = MBB->getParent()->getRegInfo();
+ auto InsertPt = MI.getIterator();
+
+ unsigned ZeroReg = MRI.createVirtualRegister(&TPU::VPRRegClass);
+ AddDefaultPred(BuildMI(*MBB, InsertPt, MI.getDebugLoc(),
+ TII->get(TPU::VIMMI), ZeroReg)
+ .addImm(0));
+ AddDefaultPred(BuildMI(*MBB, InsertPt, MI.getDebugLoc(),
+ TII->get(TPU::VSELir),
+ MI.getOperand(0).getReg())
+ .add(MI.getOperand(1))
+ .addImm(1)
+ .addReg(ZeroReg));
+ MI.eraseFromParent();
+ return MBB;
+}
+
+MachineBasicBlock *
+TPUTargetLowering::EmitVROTDOWN(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ // Emit VROTDOWNri as a sequence of N VROTDOWNr's.
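+ // For example (illustrative): a rotate-down by an immediate of 3 becomes
+ // three chained VROTDOWNr instructions, with intermediate results in fresh
+ // virtual registers and the last one writing the original destination.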
+ auto &MRI = MBB->getParent()->getRegInfo();
+
+ unsigned Imm = MI.getOperand(2).getImm();
+ auto OpReg = MI.getOperand(1).getReg();
+ auto FinalReg = MI.getOperand(0).getReg();
+ auto InsertPt = MI.getIterator();
+ if (Imm == 0) {
+ BuildMI(*MBB, InsertPt, MI.getDebugLoc(), TII->get(TPU::COPY), FinalReg)
+ .addReg(OpReg);
+ MI.eraseFromParent();
+ return MBB;
+ }
+
+ // TODO(hgreving): Sparsecore and Viperfish should be able to use
+ // one xlane instruction.
+ MachineInstr *TheMI = &MI;
+ for (unsigned I = 0; I < Imm; ++I) {
+ unsigned OutReg = (I == (Imm - 1))
+ ? FinalReg
+ : MRI.createVirtualRegister(&TPU::VPRRegClass);
+ TheMI = AddDefaultPred(
+ BuildMI(*MBB, InsertPt, MI.getDebugLoc(),
+ TII->get(TPU::VROTDOWNr), OutReg)
+ .addReg(OpReg, getKillRegState(true)));
+ OpReg = OutReg;
+ }
+ MI.eraseFromParent();
+
+ return MBB;
+}
+
+bool TPUTargetLowering::allowsMemoryAccess(LLVMContext &Context,
+ const DataLayout &DL, EVT VT,
+ unsigned AddrSpace, Align Alignment,
+ MachineMemOperand::Flags Flags,
+ unsigned *Fast) const {
+ // Disallow load/store we don't support natively.
+ if (VT != MVT::i32 && VT != MVT::f32 && VT != VNF32 && VT != VNI32)
+ return false;
+ bool Allows = TargetLowering::allowsMemoryAccess(Context, DL, VT, AddrSpace,
+ Alignment, Flags, Fast);
+ if (Allows && Fast)
+ *Fast = 1;
+ return Allows;
+}
+
+bool TPUTargetLowering::allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
+ unsigned *Fast) const {
+ // No memory access on TPU requires alignment > 4 bytes.
+ return Alignment >= Align(4);
+}
+
+bool TPUTargetLowering::allowsMisalignedMemoryAccesses(
+ LLT LT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
+ unsigned *Fast) const {
+ // No memory access on TPU requires alignment > 4 bytes.
+ return Alignment >= Align(4);
+}
+
+void TPUTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
+ SDNode *Node) const {
+ MachineBasicBlock *MBB = MI.getParent();
+ const TPUTargetMachine &TM =
+ static_cast<const TPUTargetMachine &>(MBB->getParent()->getTarget());
+ if (MI.getOpcode() == TPU::INIT_STACK) {
+ // Move stack initialization to the very top of the function.
+ assert(ST->isTPUABIEnabled());
+ MI.setFlags(MachineInstr::FrameSetup);
+ MI.moveBefore(&*MBB->instr_begin());
+ return;
+ }
+ if (MI.getOpcode() == TPU::bcVST_concat ||
+ MI.getOpcode() == TPU::bcVST_concat_aliaddr) {
+ MachinePointerInfo MPI(
+ TM.getPSV(TPUTargetMachine::PSV_BarnaCore_ConcatReg));
+ auto *MemRef = MBB->getParent()->getMachineMemOperand(
+ MPI, MachineMemOperand::MOStore, /*s=*/4, /*base_alignment=*/llvm::Align(4));
+ MI.addMemOperand(*MBB->getParent(), MemRef);
+ return;
+ }
+ if (MI.getOpcode() == TPU::bcVSHIFT ||
+ MI.getOpcode() == TPU::bcVSHIFT_aliaddr) {
+ {
+ MachinePointerInfo MPI(
+ TM.getPSV(TPUTargetMachine::PSV_BarnaCore_ShiftReg));
+ auto *MemRef = MBB->getParent()->getMachineMemOperand(
+ MPI, MachineMemOperand::MOStore, /*s=*/4, /*base_alignment=*/llvm::Align(4));
+ MI.addMemOperand(*MBB->getParent(), MemRef);
+ }
+ {
+ MachinePointerInfo MPI(
+ TM.getPSV(TPUTargetMachine::PSV_BarnaCore_ConcatReg));
+ auto *MemRef = MBB->getParent()->getMachineMemOperand(
+ MPI, MachineMemOperand::MOLoad, /*s=*/4, /*base_alignment=*/llvm::Align(4));
+ MI.addMemOperand(*MBB->getParent(), MemRef);
+ }
+ return;
+ }
+ // We rely on the brcond ordering convention to match bcLOOP_END correctly.
+ // Ensure we actually matched correctly here: bcLOOP_END should point back
+ // to its own block (only single-block loops are allowed).
+ assert(MI.getOpcode() == TPU::bcLOOP_END);
+ assert(MI.getOperand(0).getMBB() == MI.getParent() &&
+ "bcLOOP_END does not point to its parent!");
+ MI.getParent()->setMachineBlockAddressTaken();
+}
+
+// Custom version of CCInfo.AnalyzeCallOperands, supporting separate scalar and
+// vector stacks. Rewrites the memory offsets in ArgLocs so that they are split
+// across the two stacks and returns the scalar and vector stack sizes for the
+// call parameters. Also returns the extra bytes used to align masks on the
+// vector stack.
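+// Illustrative example (sizes are assumptions, not actual values): if call
+// operands (i32, vNi32, i32) all end up on the stack, the scalar stack grows
+// by 2 * scalarSizeInBytes, the vector stack by 1 * vectorSizeInBytes, and
+// each ArgLoc's LocMemOffset becomes relative to its own stack rather than to
+// a single combined stack.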
+static void analyzeCallOperands(const TPUTargetLowering &TLI,
+ const TPUSubtarget *ST,
+ const TargetLowering::CallLoweringInfo &CLI,
+ CCState &CCInfo,
+ SmallVector<CCValAssign, 16> &ArgLocs,
+ int &NumBytesScalar, int &NumBytesVector,
+ int &ExtraAlignBytesVector) {
+ const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+ unsigned NumOps = Outs.size();
+ for (unsigned i = 0; i != NumOps; ++i) {
+ MVT ArgVT = Outs[i].VT;
+ ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+ int PrevNumBytes = CCInfo.getNextStackOffset();
+ if (CC_TPU(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo)) {
+#ifndef NDEBUG
+ dbgs() << "Call operand #" << i << " has unhandled type "
+ << EVT(ArgVT).getEVTString() << '\n';
+#endif
+ llvm_unreachable(nullptr);
+ }
+ if (!ST->isTPUABIEnabled() || CCInfo.getCallingConv() == CallingConv::Fast)
+ continue;
+ assert(!ArgLocs[i].isMemLoc() ||
+ PrevNumBytes == ArgLocs[i].getLocMemOffset());
+ CCValAssign &CCV = ArgLocs[i];
+ if (int StackOffsetDelta = CCInfo.getNextStackOffset() - PrevNumBytes) {
+ if (ArgVT.isVector()) {
+ assert(ST->hasVPU());
+ // This is a trick using the API in order to adjust the LocMemOffset,
+ // because we have two separate stacks for scalar and vector.
+ if (isMaskVT(ArgVT, *ST)) {
+ int AlignedStackOffsetDelta =
+ alignTo(StackOffsetDelta, ST->vectorSizeInBytes());
+ ExtraAlignBytesVector += AlignedStackOffsetDelta - StackOffsetDelta;
+ StackOffsetDelta = AlignedStackOffsetDelta;
+ }
+ assert(StackOffsetDelta == ST->vectorSizeInBytes());
+ CCV.convertToMem(NumBytesVector);
+ NumBytesVector += StackOffsetDelta;
+ } else {
+ assert(StackOffsetDelta == ST->scalarSizeInBytes());
+ // Same comment as above.
+ CCV.convertToMem(NumBytesScalar);
+ NumBytesScalar += StackOffsetDelta;
+ }
+ }
+ }
+ assert(CCInfo.getCallingConv() == CallingConv::Fast ||
+ ArgLocs.size() == NumOps);
+}
+
+SDValue TPUTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ SDLoc &DL = CLI.DL;
+ SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &IsTailCall = CLI.IsTailCall;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool IsVarArg = CLI.IsVarArg;
+ // Not supported.
+ assert(!IsVarArg);
+ // FIXME(b/237788792): Support return values.
+ assert(CLI.RetTy->isVoidTy() &&
+ "Return values should be passed by reference");
+ // No support for tail calls right now.
+ IsTailCall = false;
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
+ GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+ // How many bytes are to be pushed on the scalar stack.
+ int NumBytesScalar = 0;
+ // How many bytes are to be pushed on the vector stack.
+ int NumBytesVector = 0;
+ // Extra bytes added for vector memory alignment, used for masks.
+ int ExtraAlignBytesVector = 0;
+ analyzeCallOperands(*this, ST, CLI, CCInfo, ArgLocs, NumBytesScalar,
+ NumBytesVector, ExtraAlignBytesVector);
+ assert(NumBytesScalar + NumBytesVector - ExtraAlignBytesVector ==
+ CCInfo.getNextStackOffset());
+
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytesScalar, NumBytesVector, DL);
+
+ SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
+ SmallVector<SDValue, 12> MemOpChains;
+
+ // Walk the register assignments, inserting copies.
+ for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+ CCValAssign &VA = ArgLocs[I];
+ assert(VA.getValVT() == VA.getLocVT());
+ SDValue Arg = OutVals[I];
+ if (VA.isRegLoc()) {
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ default:
+ llvm_unreachable("Unknown loc info!");
+ }
+
+ // Arguments that can be passed in registers must be kept in the RegsToPass
+ // vector.
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else { // VA.isRegLoc()
+ assert(VA.isMemLoc());
+ assert(!VA.needsCustom());
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ MachineFunction &MF = DAG.getMachineFunction();
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ // In order to make it easier for the callee, the stack pointer in the
+ // caller is incremented such that it points to a free slot in the callee
+ // for the return address. Adjusting the argument offsets here.
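+ // Illustrative with assumed sizes: a scalar argument that the calling
+ // convention places at offset 0 is actually stored at SPS +
+ // scalarSizeInBytes, leaving the first slot free for the return address.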
+ if (!VA.getValVT().isVector())
+ LocMemOffset += ST->scalarSizeInBytes();
+ else
+ assert(ST->hasVPU());
+ unsigned AdjustedLocMemOffset =
+ TPU::adjustForWordSize(
+ APInt(32, LocMemOffset),
+ VA.getValVT().isVector() ? TPUAS_TileSpmem : TPUAS_Smem, *ST)
+ .getZExtValue();
+ SDValue PtrOff = DAG.getIntPtrConstant(AdjustedLocMemOffset, DL);
+ // Stack pointer (not frame pointer) based after call stack adjustments.
+ SDValue DstAddr = DAG.getNode(
+ ISD::ADD, DL, PtrVT,
+ DAG.getRegister(VA.getValVT().isVector() ? TPU::SPV : TPU::SPS,
+ MVT::i32),
+ PtrOff);
+ MachinePointerInfo DstInfo =
+ VA.getValVT().isVector()
+ ? MachinePointerInfo(TPUAS_TileSpmem, LocMemOffset)
+ : MachinePointerInfo::getStack(MF, LocMemOffset);
+ SDValue Store;
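+ // Sketch of the mask case below: each mask lane is widened via VSELECT to a
+ // full i32 lane, 0xFFFFFFFF for true and 0 for false, so the mask can be
+ // stored like any other vector argument.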
+ if (isMaskVT(VA.getValVT(), *ST)) {
+ SDValue Select =
+ DAG.getNode(ISD::VSELECT, DL, VNI32, Arg,
+ DAG.getNode(TPUISD::SPLAT, DL, VNI32,
+ DAG.getConstant(0xFFFFFFFF, DL, MVT::i32)),
+ DAG.getNode(TPUISD::SPLAT, DL, VNI32,
+ DAG.getConstant(0, DL, MVT::i32)));
+ Store = DAG.getStore(Chain, DL, Select, DstAddr, DstInfo);
+ } else {
+ Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
+ }
+ MemOpChains.push_back(Store);
+ }
+ }
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
+
+ SDValue InFlag;
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain and
+ // flag operands which copy the outgoing args into registers. The InFlag is
+ // necessary since all emitted instructions must be stuck together.
+ for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I) {
+ Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[I].first,
+ RegsToPass[I].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // If the callee is a GlobalAddress node (quite common, every direct call is)
+ // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+ // Likewise ExternalSymbol -> TargetExternalSymbol.
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL,
+ getPointerTy(DAG.getDataLayout()), 0);
+ Callee = DAG.getNode(TPUISD::WRAPPER, DL, MVT::i32, Callee);
+
+ // Functions always return void.
+ SDVTList NodeTys = DAG.getVTList(MVT::isVoid, MVT::Glue);
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ // Add a register mask operand representing the call-preserved registers.
+ const uint32_t *Mask =
+ TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
+ // Add argument registers to the end of the list so that they are
+ // known live into the call.
+ for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I)
+ Ops.push_back(DAG.getRegister(RegsToPass[I].first,
+ RegsToPass[I].second.getValueType()));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ Chain = DAG.getNode(CallConv == CallingConv::Fast ? TPUISD::CALL_FAST
+ : TPUISD::CALL,
+ DL, NodeTys, ArrayRef<SDValue>(&Ops[0], Ops.size()));
+ InFlag = Chain.getValue(1);
+
+ // Create the CALLSEQ_END node.
+ Chain = DAG.getCALLSEQ_END(
+ Chain,
+ DAG.getConstant(NumBytesScalar, DL, getPointerTy(DAG.getDataLayout()),
+ true),
+ DAG.getConstant(NumBytesVector, DL, getPointerTy(DAG.getDataLayout()),
+ true),
+ InFlag, DL);
+ InFlag = Chain.getValue(1);
+ return Chain;
+}
+
+bool TPUTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+ const CallInst &I,
+ MachineFunction &MF,
+ unsigned Intrinsic) const {
+ const TPUTargetMachine &TM =
+ static_cast<const TPUTargetMachine &>(MF.getTarget());
+ switch (Intrinsic) {
+ case Intrinsic::tpu_syncadd:
+ case Intrinsic::tpu_syncadd_done:
+ case Intrinsic::tpu_syncadd_notdone:
+ case Intrinsic::tpu_syncadd_remote:
+ case Intrinsic::tpu_syncadd_remote_done:
+ case Intrinsic::tpu_syncadd_remote_doneinv:
+ case Intrinsic::tpu_syncadd_tile:
+ case Intrinsic::tpu_syncset_done:
+ case Intrinsic::tpu_syncset_notdone:
+ case Intrinsic::tpu_syncset_remote:
+ case Intrinsic::tpu_syncset_remote_doneinv:
+ case Intrinsic::tpu_syncdonemov:
+ Info.opc = (Intrinsic == Intrinsic::tpu_syncdonemov)
+ ? ISD::INTRINSIC_W_CHAIN
+ : ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::i32;
+ Info.ptrVal = I.getOperand(0);
+ Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ return true;
+ case Intrinsic::tpu_vld_shuffle:
+ case Intrinsic::tpu_vld_strided:
+ case Intrinsic::tpu_vld_indexed:
+ case Intrinsic::tpu_vld_replicate_evenodd_sublanes:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(I.getCalledFunction()->getReturnType());
+ Info.ptrVal = I.getOperand(0);
+ Info.size = MemoryLocation::UnknownSize;
+ Info.flags = MachineMemOperand::MOLoad;
+ return true;
+ case Intrinsic::tpu_rdcbreg_smem_base:
+ case Intrinsic::tpu_rdcbreg_tilespmem_base:
+ case Intrinsic::tpu_rdcbreg_size:
+ case Intrinsic::tpu_rdcbreg_offset:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ // FIXME(hgreving): re-visit memory operand strategy for this. The reason
+ // this reads memory at all is the cb.upd semantics, which are not
+ // modeled through register dependencies.
+ Info.memVT = MVT::getVT(I.getCalledFunction()->getArg(0)->getType());
+ Info.ptrVal = nullptr;
+ Info.size = MemoryLocation::UnknownSize;
+ Info.flags = MachineMemOperand::MOLoad;
+ return true;
+ case Intrinsic::tpu_sld_cb:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(I.getCalledFunction()->getArg(0)->getType());
+ // FIXME(hgreving): re-visit memory operand strategy for this. We don't
+ // have a pointer and PSV values also don't work well here (upstream bug:
+ // can't set address space).
+ Info.ptrVal = nullptr;
+ Info.size = MemoryLocation::UnknownSize;
+ Info.flags = MachineMemOperand::MOLoad;
+ return true;
+ case Intrinsic::tpu_sld_cb_upd:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(I.getCalledFunction()->getArg(0)->getType());
+ // FIXME(hgreving): re-visit memory operand strategy for this. We don't
+ // have a pointer and PSV values also don't work well here (upstream bug:
+ // can't set address space).
+ Info.ptrVal = nullptr;
+ Info.size = MemoryLocation::UnknownSize;
+ // upd modeled as store.
+ Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ return true;
+ case Intrinsic::tpu_vld_msk:
+ case Intrinsic::tpu_vld_msk_strided:
+ case Intrinsic::tpu_vld_msk_idx_strided:
+ case Intrinsic::tpu_vld_msk_idx:
+ case Intrinsic::tpu_vld_msk_idx_np:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(I.getCalledFunction()->getReturnType());
+ Info.ptrVal = I.getOperand(1);
+ Info.size = MemoryLocation::UnknownSize;
+ Info.flags = MachineMemOperand::MOLoad;
+ return true;
+ case Intrinsic::tpu_vst_strided:
+ case Intrinsic::tpu_vst_indexed:
+ case Intrinsic::tpu_vst_evenodd_sublanes:
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::getVT(I.getOperand(0)->getType());
+ Info.size = MemoryLocation::UnknownSize;
+ Info.ptrVal = I.getOperand(1);
+ Info.flags = MachineMemOperand::MOStore;
+ return true;
+ case Intrinsic::tpu_sst_cb:
+ case Intrinsic::tpu_sst_cb_upd:
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::getVT(I.getOperand(0)->getType());
+ Info.size = MemoryLocation::UnknownSize;
+ // FIXME(hgreving): re-visit memory operand strategy for this. We don't
+ // have a pointer and PSV values also don't work well here (upstream bug:
+ // can't set address space).
+ Info.ptrVal = nullptr;
+ Info.flags = MachineMemOperand::MOStore;
+ return true;
+ case Intrinsic::tpu_vst_msk_idx_add:
+ case Intrinsic::tpu_vst_msk_idx_add_np:
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::getVT(I.getOperand(3)->getType());
+ Info.size = MemoryLocation::UnknownSize;
+ Info.ptrVal = I.getOperand(1);
+ Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOLoad;
+ return true;
+ case Intrinsic::tpu_vst_msk_idx_ret_add_np:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(I.getOperand(3)->getType());
+ Info.size = MemoryLocation::UnknownSize;
+ Info.ptrVal = I.getOperand(1);
+ Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOLoad;
+ return true;
+ case Intrinsic::tpu_vst_msk:
+ case Intrinsic::tpu_vst_msk_add:
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::getVT(I.getOperand(2)->getType());
+ Info.size = MemoryLocation::UnknownSize;
+ Info.ptrVal = I.getOperand(1);
+ Info.flags = MachineMemOperand::MOStore;
+ return true;
+ case Intrinsic::tpu_vst_cb_msk:
+ case Intrinsic::tpu_vst_cb_msk_add:
+ case Intrinsic::tpu_vst_cb_msk_add_strided:
+ case Intrinsic::tpu_vst_cb_msk_idx:
+ case Intrinsic::tpu_vst_cb_msk_idx_add:
+ case Intrinsic::tpu_vst_cb_msk_strided:
+ case Intrinsic::tpu_vst_cb_upd_msk:
+ case Intrinsic::tpu_vst_cb_upd_msk_add:
+ case Intrinsic::tpu_vst_cb_upd_msk_add_strided:
+ case Intrinsic::tpu_vst_cb_upd_msk_strided:
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::getVT(I.getOperand(3)->getType());
+ Info.size = MemoryLocation::UnknownSize;
+ // FIXME(hgreving): re-visit memory operand strategy for this. We don't
+ // have a pointer and PSV values also don't work well here (upstream bug:
+ // can't set address space).
+ Info.ptrVal = nullptr;
+ Info.flags = MachineMemOperand::MOStore;
+ return true;
+ case Intrinsic::tpu_vld_cb_msk:
+ case Intrinsic::tpu_vld_cb_msk_idx:
+ case Intrinsic::tpu_vld_cb_msk_idx_np:
+ case Intrinsic::tpu_vld_cb_msk_strided:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(I.getCalledFunction()->getReturnType());
+ Info.size = MemoryLocation::UnknownSize;
+ Info.ptrVal = nullptr;
+ Info.flags = MachineMemOperand::MOLoad;
+ return true;
+ case Intrinsic::tpu_vld_cb_upd_msk:
+ case Intrinsic::tpu_vld_cb_upd_msk_strided:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(I.getCalledFunction()->getReturnType());
+ Info.size = MemoryLocation::UnknownSize;
+ // FIXME(hgreving): re-visit memory operand strategy for this. We don't
+ // have a pointer and PSV values also don't work well here (upstream bug:
+ // can't set address space).
+ Info.ptrVal = nullptr;
+ // upd modeled as store
+ Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ return true;
+ case Intrinsic::tpu_vst_msk_strided:
+ case Intrinsic::tpu_vst_msk_add_strided:
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::getVT(I.getOperand(3)->getType());
+ Info.size = MemoryLocation::UnknownSize;
+ Info.ptrVal = I.getOperand(1);
+ Info.flags = MachineMemOperand::MOStore;
+ return true;
+ case Intrinsic::tpu_vst_msk_idx:
+ case Intrinsic::tpu_vst_msk_idx_np:
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::getVT(I.getOperand(3)->getType());
+ Info.size = MemoryLocation::UnknownSize;
+ Info.ptrVal = I.getOperand(1);
+ Info.flags = MachineMemOperand::MOStore;
+ return true;
+ case Intrinsic::tpu_vst_msk_idx_strided:
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::getVT(I.getOperand(4)->getType());
+ Info.size = MemoryLocation::UnknownSize;
+ Info.ptrVal = I.getOperand(1);
+ Info.flags = MachineMemOperand::MOStore;
+ return true;
+ case Intrinsic::tpu_dma_hbm_to_smem:
+ case Intrinsic::tpu_dma_hbm_to_smem_sc_simple:
+ case Intrinsic::tpu_dma_hbm_to_vmem:
+ case Intrinsic::tpu_dma_hbm_to_spmem_sc_simple:
+ case Intrinsic::tpu_dma_hbm_to_hbm:
+ case Intrinsic::tpu_dma_hbm_to_hbm_sc_simple:
+ case Intrinsic::tpu_dma_hbm_to_hib:
+ case Intrinsic::tpu_dma_hbm_to_vmem_hib_update:
+ case Intrinsic::tpu_dma_smem_to_hbm:
+ case Intrinsic::tpu_dma_smem_to_hbm_sc_simple:
+ case Intrinsic::tpu_dma_vmem_to_hbm:
+ case Intrinsic::tpu_dma_spmem_to_hbm_sc_simple:
+ case Intrinsic::tpu_dma_spmem_to_spmem_sc_simple:
+ case Intrinsic::tpu_dma_timem_to_hbm:
+ case Intrinsic::tpu_dma_timem_to_hbm_sc_simple:
+ case Intrinsic::tpu_dma_hbm_to_simem_sc_simple:
+ case Intrinsic::tpu_dma_hbm_to_timem:
+ case Intrinsic::tpu_dma_hbm_to_timem_sc_simple:
+ case Intrinsic::tpu_dma_hbm_to_smem_single_strided:
+ case Intrinsic::tpu_dma_hbm_to_vmem_single_strided:
+ case Intrinsic::tpu_dma_smem_to_hbm_single_strided:
+ case Intrinsic::tpu_dma_vmem_to_hbm_single_strided:
+ case Intrinsic::tpu_dma_hbm_to_smem_general:
+ case Intrinsic::tpu_dma_hbm_to_vmem_general:
+ case Intrinsic::tpu_dma_smem_to_hbm_general:
+ case Intrinsic::tpu_dma_vmem_to_hbm_general:
+ case Intrinsic::tpu_dma_hbm_to_hbm_sc_general:
+ case Intrinsic::tpu_dma_smem_to_smem_sc_general:
+ case Intrinsic::tpu_dma_hbm_to_smem_sc_general:
+ case Intrinsic::tpu_dma_hbm_to_timem_sc_general:
+ case Intrinsic::tpu_dma_hbm_to_spmem_sc_general:
+ case Intrinsic::tpu_dma_smem_to_hbm_sc_general:
+ case Intrinsic::tpu_dma_timem_to_hbm_sc_general:
+ case Intrinsic::tpu_dma_spmem_to_spmem_sc_general:
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = VNI32;
+ // These intrinsics access multiple pointers, so set ptrVal to null so that
+ // alias analysis doesn't make any assumptions.
+ // TODO(thomasraoux): We could have finer-grained aliasing information by
+ // adding several memory operands and actually adding the pointers.
+ Info.ptrVal = nullptr;
+ Info.size = MemoryLocation::UnknownSize;
+ Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOLoad;
+ return true;
+ case Intrinsic::tpu_dma_hbm_to_iova_sc_simple:
+ case Intrinsic::tpu_dma_iova_to_hbm_sc_simple:
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::v1024i32;
+ // Same comments as above.
+ Info.ptrVal = nullptr;
+ Info.size = MemoryLocation::UnknownSize;
+ Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOLoad;
+ return true;
+ case Intrinsic::tpu_stream_indirect_gather_add_f32_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_add_f32_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_add_f32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_gather_add_f32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_add_s32_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_add_s32_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_add_s32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_gather_add_s32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_cb_add_f32_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_cb_add_f32_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_cb_add_f32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_gather_cb_add_f32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_cb_add_s32_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_cb_add_s32_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_cb_add_s32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_gather_cb_add_s32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_cb_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_cb_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_cb_spmem_to_smem:
+ case Intrinsic::tpu_stream_indirect_gather_cb_spmem_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_cb_tilespmem_tileN_to_smem:
+ case Intrinsic::tpu_stream_indirect_gather_cb_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_gather_cb_upd_add_f32_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_cb_upd_add_f32_hbm_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_gather_cb_upd_add_f32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_gather_cb_upd_add_f32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_gather_cb_upd_add_s32_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_cb_upd_add_s32_hbm_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_gather_cb_upd_add_s32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_gather_cb_upd_add_s32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_cb_upd_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_cb_upd_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_cb_upd_spmem_to_smem:
+ case Intrinsic::tpu_stream_indirect_gather_cb_upd_spmem_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_cb_upd_tilespmem_tileN_to_smem:
+ case Intrinsic::
+ tpu_stream_indirect_gather_cb_upd_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_spmem_to_smem:
+ case Intrinsic::tpu_stream_indirect_gather_spmem_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_tilespmem_tileN_to_smem:
+ case Intrinsic::tpu_stream_indirect_gather_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_scatter_add_f32_smem_to_spmem:
+ case Intrinsic::tpu_stream_indirect_scatter_add_f32_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_scatter_add_f32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_scatter_add_f32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_scatter_add_s32_smem_to_spmem:
+ case Intrinsic::tpu_stream_indirect_scatter_add_s32_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_scatter_add_s32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_scatter_add_s32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_scatter_cb_add_f32_smem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_scatter_cb_add_f32_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_scatter_cb_add_f32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_scatter_cb_add_f32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_scatter_cb_add_s32_smem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_scatter_cb_add_s32_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_scatter_cb_add_s32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_scatter_cb_add_s32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_scatter_cb_smem_to_spmem:
+ case Intrinsic::tpu_stream_indirect_scatter_cb_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_scatter_cb_tilespmem_to_hbm:
+ case Intrinsic::tpu_stream_indirect_scatter_cb_tilespmem_to_hbm4b:
+ case Intrinsic::tpu_stream_indirect_scatter_cb_tilespmem_to_spmem:
+ case Intrinsic::tpu_stream_indirect_scatter_cb_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_scatter_cb_upd_add_f32_smem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_scatter_cb_upd_add_f32_smem_to_tilespmem_tileN:
+ case Intrinsic::
+ tpu_stream_indirect_scatter_cb_upd_add_f32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_scatter_cb_upd_add_f32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_scatter_cb_upd_add_s32_smem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_scatter_cb_upd_add_s32_smem_to_tilespmem_tileN:
+ case Intrinsic::
+ tpu_stream_indirect_scatter_cb_upd_add_s32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_scatter_cb_upd_add_s32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_scatter_cb_upd_smem_to_spmem:
+ case Intrinsic::tpu_stream_indirect_scatter_cb_upd_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_scatter_cb_upd_tilespmem_to_hbm:
+ case Intrinsic::tpu_stream_indirect_scatter_cb_upd_tilespmem_to_hbm4b:
+ case Intrinsic::tpu_stream_indirect_scatter_cb_upd_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_scatter_cb_upd_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_scatter_smem_to_spmem:
+ case Intrinsic::tpu_stream_indirect_scatter_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_scatter_tilespmem_to_hbm:
+ case Intrinsic::tpu_stream_indirect_scatter_tilespmem_to_hbm4b:
+ case Intrinsic::tpu_stream_indirect_scatter_tilespmem_to_spmem:
+ case Intrinsic::tpu_stream_indirect_scatter_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_add_f32_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_add_f32_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_add_f32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_gather_add_f32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_add_s32_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_add_s32_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_add_s32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_gather_add_s32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_gather_cb_add_f32_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_cb_add_f32_hbm_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_gather_cb_add_f32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_gather_cb_add_f32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_gather_cb_add_s32_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_cb_add_s32_hbm_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_gather_cb_add_s32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_gather_cb_add_s32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_cb_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_cb_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_cb_spmem_to_smem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_cb_spmem_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_cb_tilespmem_tileN_to_smem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_gather_cb_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_gather_cb_upd_add_f32_hbm4b_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_gather_cb_upd_add_f32_hbm_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_gather_cb_upd_add_f32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_gather_cb_upd_add_f32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_gather_cb_upd_add_s32_hbm4b_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_gather_cb_upd_add_s32_hbm_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_gather_cb_upd_add_s32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_gather_cb_upd_add_s32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_cb_upd_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_cb_upd_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_cb_upd_spmem_to_smem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_cb_upd_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_gather_cb_upd_tilespmem_tileN_to_smem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_gather_cb_upd_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_spmem_to_smem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_spmem_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_tilespmem_tileN_to_smem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_gather_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_add_f32_smem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_scatter_add_f32_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_add_f32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_scatter_add_f32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_add_s32_smem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_scatter_add_s32_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_add_s32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_scatter_add_s32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_add_f32_smem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_scatter_cb_add_f32_smem_to_tilespmem_tileN:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_scatter_cb_add_f32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_scatter_cb_add_f32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_add_s32_smem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_scatter_cb_add_s32_smem_to_tilespmem_tileN:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_scatter_cb_add_s32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_scatter_cb_add_s32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_smem_to_spmem:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_tilespmem_to_hbm:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_tilespmem_to_hbm4b:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_scatter_cb_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_scatter_cb_upd_add_f32_smem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_scatter_cb_upd_add_f32_smem_to_tilespmem_tileN:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_scatter_cb_upd_add_f32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_scatter_cb_upd_add_f32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_scatter_cb_upd_add_s32_smem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_scatter_cb_upd_add_s32_smem_to_tilespmem_tileN:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_scatter_cb_upd_add_s32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_scatter_cb_upd_add_s32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_upd_smem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_scatter_cb_upd_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_upd_tilespmem_to_hbm:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_upd_tilespmem_to_hbm4b:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_cb_upd_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_scatter_cb_upd_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_smem_to_spmem:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_tilespmem_to_hbm:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_tilespmem_to_hbm4b:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_scatter_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_linear_gather_add_f32_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_add_f32_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_add_f32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_linear_gather_add_f32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_add_s32_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_add_s32_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_add_s32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_linear_gather_add_s32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_cb_add_f32_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_cb_add_f32_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_cb_add_f32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_linear_gather_cb_add_f32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_cb_add_s32_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_cb_add_s32_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_cb_add_s32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_linear_gather_cb_add_s32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_cb_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_cb_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_cb_spmem_to_smem:
+ case Intrinsic::tpu_stream_linear_gather_cb_spmem_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_cb_tilespmem_tileN_to_smem:
+ case Intrinsic::tpu_stream_linear_gather_cb_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_cb_upd_add_f32_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_cb_upd_add_f32_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_cb_upd_add_f32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_linear_gather_cb_upd_add_f32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_cb_upd_add_s32_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_cb_upd_add_s32_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_cb_upd_add_s32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_linear_gather_cb_upd_add_s32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_cb_upd_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_cb_upd_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_cb_upd_spmem_to_smem:
+ case Intrinsic::tpu_stream_linear_gather_cb_upd_spmem_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_cb_upd_tilespmem_tileN_to_smem:
+ case Intrinsic::
+ tpu_stream_linear_gather_cb_upd_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_spmem_to_smem:
+ case Intrinsic::tpu_stream_linear_gather_spmem_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_tilespmem_tileN_to_smem:
+ case Intrinsic::tpu_stream_linear_gather_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_scatter_add_f32_smem_to_spmem:
+ case Intrinsic::tpu_stream_linear_scatter_add_f32_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_linear_scatter_add_f32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_linear_scatter_add_f32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_linear_scatter_add_s32_smem_to_spmem:
+ case Intrinsic::tpu_stream_linear_scatter_add_s32_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_linear_scatter_add_s32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_linear_scatter_add_s32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_linear_scatter_cb_add_f32_smem_to_spmem:
+ case Intrinsic::
+ tpu_stream_linear_scatter_cb_add_f32_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_linear_scatter_cb_add_f32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_linear_scatter_cb_add_f32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_linear_scatter_cb_add_s32_smem_to_spmem:
+ case Intrinsic::
+ tpu_stream_linear_scatter_cb_add_s32_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_linear_scatter_cb_add_s32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_linear_scatter_cb_add_s32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_linear_scatter_cb_smem_to_spmem:
+ case Intrinsic::tpu_stream_linear_scatter_cb_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_linear_scatter_cb_tilespmem_to_hbm:
+ case Intrinsic::tpu_stream_linear_scatter_cb_tilespmem_to_hbm4b:
+ case Intrinsic::tpu_stream_linear_scatter_cb_tilespmem_to_spmem:
+ case Intrinsic::tpu_stream_linear_scatter_cb_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_linear_scatter_cb_upd_add_f32_smem_to_spmem:
+ case Intrinsic::
+ tpu_stream_linear_scatter_cb_upd_add_f32_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_linear_scatter_cb_upd_add_f32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_linear_scatter_cb_upd_add_f32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_linear_scatter_cb_upd_add_s32_smem_to_spmem:
+ case Intrinsic::
+ tpu_stream_linear_scatter_cb_upd_add_s32_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_linear_scatter_cb_upd_add_s32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_linear_scatter_cb_upd_add_s32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_linear_scatter_cb_upd_smem_to_spmem:
+ case Intrinsic::tpu_stream_linear_scatter_cb_upd_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_linear_scatter_cb_upd_tilespmem_to_hbm:
+ case Intrinsic::tpu_stream_linear_scatter_cb_upd_tilespmem_to_hbm4b:
+ case Intrinsic::tpu_stream_linear_scatter_cb_upd_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_linear_scatter_cb_upd_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_linear_scatter_smem_to_spmem:
+ case Intrinsic::tpu_stream_linear_scatter_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_linear_scatter_tilespmem_to_hbm:
+ case Intrinsic::tpu_stream_linear_scatter_tilespmem_to_hbm4b:
+ case Intrinsic::tpu_stream_linear_scatter_tilespmem_to_spmem:
+ case Intrinsic::tpu_stream_linear_scatter_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_strided_gather_add_f32_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_add_f32_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_add_f32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_strided_gather_add_f32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_add_s32_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_add_s32_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_add_s32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_strided_gather_add_s32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_cb_add_f32_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_cb_add_f32_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_cb_add_f32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_strided_gather_cb_add_f32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_cb_add_s32_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_cb_add_s32_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_cb_add_s32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_strided_gather_cb_add_s32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_cb_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_cb_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_cb_spmem_to_smem:
+ case Intrinsic::tpu_stream_strided_gather_cb_spmem_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_cb_tilespmem_tileN_to_smem:
+ case Intrinsic::tpu_stream_strided_gather_cb_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_cb_upd_add_f32_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_cb_upd_add_f32_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_cb_upd_add_f32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_strided_gather_cb_upd_add_f32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_cb_upd_add_s32_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_cb_upd_add_s32_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_cb_upd_add_s32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_strided_gather_cb_upd_add_s32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_cb_upd_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_cb_upd_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_cb_upd_spmem_to_smem:
+ case Intrinsic::tpu_stream_strided_gather_cb_upd_spmem_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_cb_upd_tilespmem_tileN_to_smem:
+ case Intrinsic::
+ tpu_stream_strided_gather_cb_upd_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_hbm4b_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_spmem_to_smem:
+ case Intrinsic::tpu_stream_strided_gather_spmem_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_tilespmem_tileN_to_smem:
+ case Intrinsic::tpu_stream_strided_gather_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_scatter_add_f32_smem_to_spmem:
+ case Intrinsic::tpu_stream_strided_scatter_add_f32_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_strided_scatter_add_f32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_strided_scatter_add_f32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_strided_scatter_add_s32_smem_to_spmem:
+ case Intrinsic::tpu_stream_strided_scatter_add_s32_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_strided_scatter_add_s32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_strided_scatter_add_s32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_strided_scatter_cb_add_f32_smem_to_spmem:
+ case Intrinsic::
+ tpu_stream_strided_scatter_cb_add_f32_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_strided_scatter_cb_add_f32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_strided_scatter_cb_add_f32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_strided_scatter_cb_add_s32_smem_to_spmem:
+ case Intrinsic::
+ tpu_stream_strided_scatter_cb_add_s32_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_strided_scatter_cb_add_s32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_strided_scatter_cb_add_s32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_strided_scatter_cb_smem_to_spmem:
+ case Intrinsic::tpu_stream_strided_scatter_cb_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_strided_scatter_cb_tilespmem_to_hbm:
+ case Intrinsic::tpu_stream_strided_scatter_cb_tilespmem_to_hbm4b:
+ case Intrinsic::tpu_stream_strided_scatter_cb_tilespmem_to_spmem:
+ case Intrinsic::tpu_stream_strided_scatter_cb_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_strided_scatter_cb_upd_add_f32_smem_to_spmem:
+ case Intrinsic::
+ tpu_stream_strided_scatter_cb_upd_add_f32_smem_to_tilespmem_tileN:
+ case Intrinsic::
+ tpu_stream_strided_scatter_cb_upd_add_f32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_strided_scatter_cb_upd_add_f32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_strided_scatter_cb_upd_add_s32_smem_to_spmem:
+ case Intrinsic::
+ tpu_stream_strided_scatter_cb_upd_add_s32_smem_to_tilespmem_tileN:
+ case Intrinsic::
+ tpu_stream_strided_scatter_cb_upd_add_s32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_strided_scatter_cb_upd_add_s32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_strided_scatter_cb_upd_smem_to_spmem:
+ case Intrinsic::tpu_stream_strided_scatter_cb_upd_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_strided_scatter_cb_upd_tilespmem_to_hbm:
+ case Intrinsic::tpu_stream_strided_scatter_cb_upd_tilespmem_to_hbm4b:
+ case Intrinsic::tpu_stream_strided_scatter_cb_upd_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_strided_scatter_cb_upd_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_strided_scatter_smem_to_spmem:
+ case Intrinsic::tpu_stream_strided_scatter_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_strided_scatter_tilespmem_to_hbm:
+ case Intrinsic::tpu_stream_strided_scatter_tilespmem_to_hbm4b:
+ case Intrinsic::tpu_stream_strided_scatter_tilespmem_to_spmem:
+ case Intrinsic::tpu_stream_strided_scatter_tilespmem_to_tilespmem_tileN:
+ // We don't strictly need to add memory operands for stream intrinsics; we
+ // would get regular barriers from the DAG builder otherwise. We do it
+ // properly anyway and stick to memory edges here.
+ assert(IsSC);
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = VNI32;
+ // TODO(hgreving): We could have finer-grained aliasing information by
+ // adding several memory operands. We currently only attach the TileSpmem
+ // memory operand, because that's all we currently consider when analyzing
+ // the DAG's edges later. We also don't want to hard-code the operand
+ // number, because there are too many stream intrinsics. Instead, we just
+ // search the operands.
+ Info.ptrVal = nullptr;
+ for (auto &Op : I.operands()) {
+ if (!Op->getType()->isPointerTy())
+ continue;
+ if (Op->getType()->getPointerAddressSpace() != TPUAS_TileSpmem)
+ continue;
+ Info.ptrVal = Op;
+ break;
+ }
+ Info.size = MemoryLocation::UnknownSize;
+ Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOLoad;
+ return true;
+ case Intrinsic::tpu_bc_load_aliaddr:
+ case Intrinsic::tpu_bc_load_aliaddr_flm:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(I.getType());
+ Info.ptrVal = I.getOperand(0);
+ Info.size = MemoryLocation::UnknownSize;
+ Info.flags = MachineMemOperand::MOLoad;
+ return true;
+ case Intrinsic::tpu_bc_store_aliaddr:
+ case Intrinsic::tpu_bc_store_aliaddr_flm:
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::getVT(I.getOperand(0)->getType());
+ Info.ptrVal = I.getOperand(1);
+ Info.size = MemoryLocation::UnknownSize;
+ Info.flags = MachineMemOperand::MOStore;
+ return true;
+ case Intrinsic::tpu_bc_loop_end: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::i1;
+ Info.ptrVal = TM.getPSV(TPUTargetMachine::PSV_BarnaCoreChannel_LoopEnd);
+ Info.size = 1;
+ Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ return true;
+ }
+ default:
+ return false;
+ }
+}
+
+void TPUTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+ KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
+ KnownBits Known2;
+ Known.resetAll();
+
+ switch (Op.getOpcode()) {
+ default:
+ break;
+ case TPUISD::UMUL24:
+ unsigned BitWidth = 32;
+ Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ // The instruction zeroes out the top 8 bits.
+ Known.Zero.setHighBits(8);
+ Known2.Zero.setHighBits(8);
+ // If low bits are zero in either operand, output low known-0 bits.
+ // Also compute a conservative estimate for high known-0 bits.
+ unsigned TrailZ =
+ Known.countMinTrailingZeros() + Known2.countMinTrailingZeros();
+ unsigned LeadZ =
+ std::max(Known.countMinLeadingZeros() + Known2.countMinLeadingZeros(),
+ BitWidth) -
+ BitWidth;
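+ // Worked example (illustrative): if one operand has at least 4 known
+ // trailing zero bits and the other at least 2, the product has at least 6.
+ // The leading-zero estimate max(lead0(a) + lead0(b), 32) - 32 is the usual
+ // conservative bound for a 32-bit product of operands known to fit in 24
+ // bits.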
+
+ Known.resetAll();
+ Known.Zero.setLowBits(std::min(TrailZ, BitWidth));
+ Known.Zero.setHighBits(std::min(LeadZ, BitWidth));
+ break;
+ }
+}
+
+void TPUTargetLowering::addTPUMemOperand(SelectionDAG &DAG, SDNode *N,
+ bool IsPush,
+ const TargetRegisterClass *RC) const {
+ // Add a MachineMemOperand to N, marking it as a push or pop of the given
+ // register class.
+ MachineSDNode *MN = cast<MachineSDNode>(N);
+ MachinePointerInfo MPI(
+ static_cast<const TPUTargetMachine &>(getTargetMachine())
+ .getFifoPSV(IsPush, RC));
+ auto *MemRef = DAG.getMachineFunction().getMachineMemOperand(
+ MPI, MachineMemOperand::MOLoad | MachineMemOperand::MOStore, /*s=*/4,
+ /*base_alignment=*/llvm::Align(4));
+ DAG.setNodeMemRefs(MN, {MemRef});
+}
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUISelLowering.h b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUISelLowering.h
new file mode 100644
index 0000000..7d10490
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUISelLowering.h
@@ -0,0 +1,276 @@
+//===-- TPUISelLowering.h - TPU DAG Lowering Interface ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that TPU uses to lower LLVM code into
+// a selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_TPU_TPUISELLOWERING_H
+#define LLVM_LIB_TARGET_TPU_TPUISELLOWERING_H
+
+#include "TPU.h"
+#include "TPURegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+
+namespace llvm {
+namespace TPUISD {
+enum {
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ // HALT - Stops execution. Used in place of RETURN from the main kernel.
+ HALT,
+
+ // VROTDOWN - Rotate vector towards lane zero: shuffle(1,2,3,4,5,6,7,0).
+ VROTDOWN,
+
+ // SPLAT - Splat a value into all lanes of a vector (broadcast scalar).
+ SPLAT,
+
+ // VBROADCAST - Broadcast an element of a vector into all lanes of a vector.
+ VBROADCAST,
+
+ // VPERMUTE - Permute the elements of a vector across all lanes of a vector.
+ VPERMUTE,
+
+ // WRAPPER - Work around insufficiencies of GLOBALADDR and TARGET_GLOBALADDR.
+ WRAPPER,
+
+ // BC_INSERTVALUE - Insert a vector into an aggregate based on (loop_index).
+ // The aggregate is referenced by its base register.
+ // Usage: newagg = BC_INSERTVALUE agg, value
+ BC_INSERTVALUE,
+
+ // BC_EXTRACTVALUE - Extract a vector from a vector of aggregates based on
+ // (loop_index). The aggregate is referenced by its base
+ // register.
+ BC_EXTRACTVALUE,
+
+ // UMUL24 - This represents the Jellyfish SMUL.U24 instruction.
+ UMUL24,
+
+ // CALL - This represents a function call.
+ CALL,
+
+ // CALL_FAST - This represents a fast pseudo function call.
+ CALL_FAST,
+
+ // RET - This represents a function return.
+ RET,
+};
+} // namespace TPUISD
+
+class TPUSubtarget;
+
+class TPUTargetLowering : public TargetLowering {
+public:
+ TPUTargetLowering(const TargetMachine &TM, const TPUSubtarget &STI);
+
+ // LowerOperation - Provide custom lowering hooks for some operations.
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+ // getTargetNodeName - This method returns the name of a target specific
+ // DAG node.
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+ bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
+ EVT NewVT) const override {
+ // It's never a good idea to reduce the load width; we only have one width
+ // of load per memory type.
+ return false;
+ }
+
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const override;
+
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override {
+ return true;
+ }
+
+ bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT,
+ unsigned AddrSpace, Align Alignment,
+ MachineMemOperand::Flags Flags,
+ unsigned *Fast) const override;
+
+ bool allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned AddrSpace = 0, Align Alignment = Align(1),
+ MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
+ unsigned *Fast = nullptr) const override;
+
+ bool allowsMisalignedMemoryAccesses(
+ LLT LT, unsigned AddrSpace = 0, Align Alignment = Align(1),
+ MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
+ unsigned *Fast = nullptr) const override;
+
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &context,
+ EVT VT) const override {
+ if (VT.isVector())
+ return EVT::getVectorVT(context, MVT::i1, VT.getVectorElementCount());
+ return MVT::i1;
+ }
+
+ bool functionArgumentNeedsConsecutiveRegisters(
+ Type *Ty, CallingConv::ID CallConv, bool isVarArg,
+ const DataLayout &DL) const override;
+
+ void AdjustInstrPostInstrSelection(MachineInstr &MI,
+ SDNode *Node) const override;
+
+ bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+ MachineFunction &MF,
+ unsigned Intrinsic) const override;
+
+ void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
+
+ // The TargetLowering default is true. We're explicit here to avoid
+ // confusion.
+ bool isLegalAddImmediate(int64_t Imm) const override { return true; }
+ // Returns true if N is a TPUISD::SPLAT of a non-NaN value.
+ bool isNonNaNFPConstSplat(SDValue N) const;
+
+ // We'd run into trouble with pointer word sizes. Selection DAG also currently
+ // does not convey the address space for the MemOp here. We make this function
+ // unreachable in our backend.
+ EVT getOptimalMemOpType(const MemOp &Op,
+ const AttributeList &FuncAttributes) const override;
+
+ MVT getVMNI1Ty() const { return VMNI1; }
+ MVT getVNI32Ty() const { return VNI32; }
+ MVT getVNF32Ty() const { return VNF32; }
+ MVT getVNBF16Ty() const { return VNBF16; }
+
+ // Add a machine memory operand.
+ void addTPUMemOperand(SelectionDAG &DAG, SDNode *N, bool IsPush,
+ const TargetRegisterClass *RC) const;
+
+private:
+ SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+ bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
+ SelectionDAG &DAG) const override;
+
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSDIV32(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerMUL32(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVMUL32(SDValue Op, SelectionDAG &DAG) const;
+ SDValue SimpleEmulVMUL32(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
+
+ MachineBasicBlock *EmitVROTDOWN(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+ MachineBasicBlock *
+ EmitVecOrSFlagToScalar(MachineInstr &MI, MachineBasicBlock *MBB,
+ unsigned PushOpcode, int NumOfInputs,
+ unsigned PopOpcode,
+ const TargetRegisterClass *RegClass) const;
+ MachineBasicBlock *EmitVmread(MachineInstr &MI, MachineBasicBlock *MBB) const;
+ MachineBasicBlock *SetDWGDep(MachineInstr &MI, MachineBasicBlock *MBB) const;
+
+ SDValue
+ PerformINSERT_VECTOR_ELTCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) const;
+ SDValue
+ PerformSCALAR_TO_VECTORCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) const;
+ SDValue
+ PerformBUILD_VECTORCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) const;
+ SDValue
+ PerformVECTOR_SHUFFLECombine(ShuffleVectorSDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) const;
+ SDValue
+ PerformVSELECTCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) const;
+ SDValue
+ PerformBcInsertValueCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) const;
+ SDValue
+ PerformBcExtractValueCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) const;
+
+ SDValue PerformPtrToIntCombine(SDNode *N) const;
+ SDValue PerformSETCCCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) const;
+
+ // Returns a value if the instruction accesses the given FIFO register
+ // class. The returned value is true if the instruction is a pop and false
+ // if it is a push.
+ std::optional<bool> IsFifoAccess(MachineInstr &MI,
+ const TargetRegisterClass *RegClass) const;
+
+ // Return true if the instruction uses a special architecture register
+ // from the given register class.
+ bool UsesSpecialReg(MachineInstr &MI,
+ const TargetRegisterClass *RegClass) const;
+
+ // Attach a pseudo memory dependency to the instruction.
+ void SetDependency(MachineInstr &MI, MachineBasicBlock *MBB,
+ const TargetRegisterClass *RegClass,
+ bool isPush = false) const;
+
+ // Return the hardware mask register if the SDNode matches any of the
+ // supported embedded masks. Assumes that the SDNode is an ISD::BUILD_VECTOR.
+ Register getSupportedEmbeddedMask(SDNode *N) const;
+
+ // Supports immediate mask creation according to
+ // go/vxc-isa#create-sublane-mask-instruction.
+ // Returns an SDValue wrapping a MachineSDNode if possible, or an empty
+ // SDValue otherwise.
+ SDValue getSupportedVCMask(SelectionDAG &DAG, SDNode *N) const;
+ // Same as above, but based on an integer representing the mask.
+ SDValue getSupportedVCMask(SelectionDAG &DAG, int VectorMask,
+ SDLoc Loc) const;
+
+ const TPURegisterInfo *TRI;
+ const TargetInstrInfo *TII;
+ const TPUSubtarget *ST;
+ MVT::SimpleValueType VNI32;
+ MVT::SimpleValueType VNF32;
+ MVT::SimpleValueType VNBF16;
+ MVT::SimpleValueType VNF16;
+ MVT::SimpleValueType VNI16;
+ MVT::SimpleValueType VNI8;
+ MVT::SimpleValueType VNI4;
+ MVT::SimpleValueType VNI2;
+ MVT::SimpleValueType VNI1;
+ MVT::SimpleValueType VMN16I1;
+ MVT::SimpleValueType VMN32I1;
+ MVT::SimpleValueType VMN64I1;
+ MVT::SimpleValueType VMNBF16I1;
+ MVT::SimpleValueType VNI8I1;
+ MVT::SimpleValueType VMNI1;
+ bool IsBC;
+ bool IsSC;
+ bool IsVFTC;
+ bool HasVPU = false;
+ bool HasLPVF = false;
+ bool HasLPGL = false;
+ bool HasVMinMax = false;
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_TPU_TPUISELLOWERING_H
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUInstrFormats.td b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUInstrFormats.td
new file mode 100644
index 0000000..75669a9
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUInstrFormats.td
@@ -0,0 +1,1352 @@
+//===---- TPUInstrFormats.td - TPU Instruction Formats --*- tablegen -*----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+include "llvm/TableGen/SearchableTable.td"
+
+// A predicate (Preg) that defaults to Always. The first element is a predicate
+// register (or always); the second is 1 if the sense of the predicate should
+// be negated.
+def pred : PredicateOperand<i1, (ops PPR:$reg, i32imm:$invert), (ops Palways, (i32 0))> {
+ let PrintMethod = "printPredicateOperand";
+ let EncoderMethod = "encodePredicateOperand";
+}
+
+class PredicateManipulationOperand<ValueType ty, dag OpTypes>
+ : Operand<ty> {
+ let MIOperandInfo = OpTypes;
+}
+
+// A predicate operand (Preg) similar to above. It prints differently in asm than
+// a "real" predicate operand and has no defaults.
+def predM : PredicateManipulationOperand<i1, (ops PPR:$reg, i32imm:$invert)> {
+ let PrintMethod = "printPredicateManipulationOperand";
+ let EncoderMethod = "encodePredicateOperand";
+}
+
+// A SparseCore trap string operand.
+def tsctrapS : Operand<i32> {
+ let MIOperandInfo = (ops imm:$tag);
+ let PrintMethod = "printSCTrapString";
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// A BundleSlot describes which slots in a bundle an instruction can take up.
+// It defines the instruction's itinerary class too, because the DFA packetizer
+// uses itineraries to construct its DFA.
+class BundleSlot<InstrItinClass IIC> {
+ InstrItinClass Itinerary = IIC;
+ bit HasS0 = 0;
+ bit HasS1 = 0;
+ bit HasSM = 0;
+ bit HasSany = 0;
+ bit HasSanyMisc = 0;
+ bit HasV0 = 0;
+ bit HasV1 = 0;
+ bit HasV2 = 0;
+ bit HasV3 = 0;
+ bit HasVany = 0;
+}
+
+class Bundle<BundleSlot slot> {
+ BundleSlot bundleSlot = slot;
+ InstrItinClass Itinerary = slot.Itinerary;
+}
+
+// The symbol type for a bundle slot requirement. It is mapped directly
+// into a bitmask.
+class ImmSlotRequirement<bits<6> mask> {
+ bits<6> immMask = mask;
+}
+
+// Note that on BarnaCore the immediate fields imm_4 and imm_5 alias with
+// fields imm_0 and imm_1. Definitions of bits for imm_4 and imm_5
+// should never be used for BarnaCore instructions.
+def IMM_NONE : ImmSlotRequirement<0b000000>;
+def IMM_0_to_3 : ImmSlotRequirement<0b001111>;
+def IMM_2_to_3 : ImmSlotRequirement<0b001100>;
+def IMM_2_to_5 : ImmSlotRequirement<0b111100>;
+def IMM_0_to_5 : ImmSlotRequirement<0b111111>;
+def IMM_0 : ImmSlotRequirement<0b000001>;
+def IMM_2 : ImmSlotRequirement<0b000100>;
+def IMM_3 : ImmSlotRequirement<0b001000>;
+
+// The symbol type defines indices of immediate operands for which slots are to
+// be allocated. Note that the symbol represents indices among only those
+// operands that may require an immediate slot (currently MCOI_IMMEDIATE,
+// OPERAND_UNKNOWN or OPERAND_PCREL); this is intended to minimize the storage
+// needed for the field and is less error-prone.
+class ImmOperRequirement<bits<4> mask> {
+ bits<4> operMask = mask;
+}
+
+def IMM_OP_NONE : ImmOperRequirement<0b0000>;
+def IMM_OP_0 : ImmOperRequirement<0b0001>;
+def IMM_OP_1 : ImmOperRequirement<0b0010>;
+def IMM_OP_2 : ImmOperRequirement<0b0100>;
+def IMM_OP_3 : ImmOperRequirement<0b1000>;
+
+// Mixin that asserts that the instruction requires an immediate slot in the
+// bundle. Note that 'immRequirement' defines the slots to be used and
+// 'operands' defines a bitmask of immediate operands we should assign slots to,
+// with the bitmask representing indices of operands of types MCOI_IMMEDIATE,
+// OPERAND_UNKNOWN or OPERAND_PCREL (which minimizes the storage needed for the
+// field and is less error-prone).
+class BundleImm<ImmSlotRequirement imms, list<ImmOperRequirement> operands = [IMM_OP_0]> {
+ ImmSlotRequirement immRequirement = imms;
+ ImmOperRequirement operRequirement =
+ ImmOperRequirement<!foldl(0, operands, a, b, !add(a, b.operMask))>;
+}
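+
+// Illustrative use (hypothetical instruction): an op whose operands 0 and 2
+// are immediates that must land in slots 0-3 would mix in
+//   BundleImm<IMM_0_to_3, [IMM_OP_0, IMM_OP_2]>
+// which folds the operand masks into an operRequirement of 0b0101.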
+
+// Mixins that assert that the instruction requires a particular immediate slot
+// in the bundle.
+class BundleImm2 {
+ bit bundleRequiresImm2 = 1;
+}
+
+class BundleImm3 {
+ bit bundleRequiresImm3 = 1;
+}
+
+// Same as BundleImm, except it asserts that the immediate slots are to be
+// encoded according to Vy rules.
+class BundleImmVy<list<ImmOperRequirement> operands = [IMM_OP_0],
+ ImmSlotRequirement imms = IMM_0_to_5>
+ : BundleImm<imms, operands> {
+ bit isInVectorSlot = 1;
+}
+
+// Same as BundleImm, except it asserts that the immediate slots are to be
+// encoded according to Sy rules.
+class BundleImmSy<list<ImmOperRequirement> operands = [IMM_OP_0],
+ ImmSlotRequirement imms = IMM_0_to_3>
+ : BundleImm<imms, operands> {
+ bit isInScalarSlot = 1;
+}
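+
+// Illustrative use (hypothetical instruction): a vector op whose second
+// operand is an immediate would mix in BundleImmVy<[IMM_OP_1]>, reserving one
+// of imm slots 0-5 and marking the operand for Vy-style encoding via
+// isInVectorSlot.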
+
+// Marks vector instructions. Those instructions require scalar sources to go
+// through vector-to-scalar slots. This information is also used to model the
+// VIF FIFO for RAW hazards.
+class IsVectorInstruction {
+ bit isVectorInstruction = 1;
+}
+
+class IsXLUInst {
+ bit isXLUInst = 1;
+}
+
+class IsMXUInst {
+ bit isMXUInst = 1;
+}
+
+// Bundle slot definitions.
+def B_S0 : BundleSlot<IIC_S0> { let HasS0 = 1; }
+def B_S1 : BundleSlot<IIC_S1> { let HasS1 = 1; }
+def B_SM : BundleSlot<IIC_SM> { let HasSM = 1; }
+def B_Sany : BundleSlot<IIC_Sany> { let HasSany = 1; }
+def B_SanyMisc : BundleSlot<IIC_SanyMisc> { let HasSanyMisc = 1; }
+def B_Sboth : BundleSlot<IIC_Sboth>;
+def B_SLD : BundleSlot<IIC_SLD>;
+def B_SST : BundleSlot<IIC_SST>;
+def B_V0 : BundleSlot<IIC_V0> { let HasV0 = 1; }
+def B_V1 : BundleSlot<IIC_V1> { let HasV1 = 1; }
+def B_V2 : BundleSlot<IIC_V2> { let HasV2 = 1; }
+def B_V3 : BundleSlot<IIC_V3> { let HasV3 = 1; }
+def B_Vany : BundleSlot<IIC_Vany> { let HasVany = 1; }
+def B_VLD : BundleSlot<IIC_VLD>;
+def B_VST : BundleSlot<IIC_VST>;
+def B_VLDVST : BundleSlot<IIC_VLDVST>;
+def B_VEX : BundleSlot<IIC_VEX>;
+def B_V_VEX : BundleSlot<IIC_V_VEX>;
+def B_VEX0 : BundleSlot<IIC_VEX0>;
+def B_VEX1 : BundleSlot<IIC_VEX1>;
+def B_VEXBoth : BundleSlot<IIC_VEXBoth>;
+def B_VRes0 : BundleSlot<IIC_VRES0>;
+def B_VRes1 : BundleSlot<IIC_VRES1>;
+def B_VResAny : BundleSlot<IIC_VRES>;
+// Architecture and instruction specific bundle definitions.
+def B_VARI : BundleSlot<IIC_VARI> { let HasVany = 1; }
+def B_VCLAMP : BundleSlot<IIC_VCLAMP> { let HasVany = 1; }
+def B_EUP_OP : BundleSlot<IIC_EUP_OP> { let HasVany = 1; }
+def B_VCVT : BundleSlot<IIC_VCVT> { let HasVany = 1; }
+def B_VMPCNT : BundleSlot<IIC_VMPCNT> { let HasVany = 1; }
+def B_VMPREFIX : BundleSlot<IIC_VMPREFIX> { let HasVany = 1; }
+def B_VM_OP : BundleSlot<IIC_VM_OP> { let HasVany = 1; }
+def B_VPUSH : BundleSlot<IIC_VPUSH> { let HasV2 = 1; }
+def B_VBCAST : BundleSlot<IIC_VBCAST> { let HasVany = 1; }
+def B_VPERM : BundleSlot<IIC_VPERM> { let HasVany = 1; }
+def B_VSHIFTI : BundleSlot<IIC_VSHIFTI> { let HasVany = 1; }
+def B_VFMUL : BundleSlot<IIC_VFMUL> { let HasVany = 1; }
+def B_VFADD : BundleSlot<IIC_VFADD> { let HasVany = 1; }
+def B_VMOVR : BundleSlot<IIC_VMOVR> { let HasVany = 1; }
+def B_TASK : BundleSlot<IIC_TASK>;
+def B_ALUOV : BundleSlot<IIC_ALUOV> { let HasSany = 1; }
+def B_PACK : BundleSlot<IIC_PACK>;
+def B_UNPACK : BundleSlot<IIC_UNPACK>;
+
+// A map from an instruction used during ISel and compilation (e.g. ADDri) to
+// a specifically bundled form (e.g. ADDri_S0). Note that we can't store the
+// functional units here as these differ per subtarget.
+class BundledVariant<string unbundled, string bundled> {
+ Instruction Unbundled = !cast<Instruction>(unbundled);
+ Instruction Bundled = !cast<Instruction>(bundled);
+}
+
+// Create a searchable table, allowing assembler and disassembler to map between
+// the different bundled variants of an instruction. This allows mapping from
+// ADDri to ADDri_S0 or ADDri_S1, for example.
+def BundledVariantTable : GenericTable {
+ let FilterClass = "BundledVariant";
+ let CppTypeName = "BundledVariantTy";
+ let Fields = ["Unbundled", "Bundled"];
+ let PrimaryKey = ["Unbundled"];
+ let PrimaryKeyName = "BundledVariant";
+}
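+
+// The searchable-table backend emits a lookup keyed on the unbundled opcode
+// (the function name is derived from PrimaryKeyName above), letting the
+// assembler and disassembler translate e.g. ADDri to its bundled forms such
+// as ADDri_S0.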
+
+// An enum representing the type of encoding an operand can use in an
+// instruction. This differentiates between operands of different instructions
+// (and at different positions within an instruction), as they are encoded
+// differently and support different values (VY supports different directly
+// encoded values than SY, for example).
+class OpEncodingType<string name, bits<4> enc> {
+ string Name = name;
+ bits<4> Encoding = enc;
+}
+
+def OpEncodings : GenericEnum {
+ let FilterClass = "OpEncodingType";
+ let NameField = "Name";
+ let ValueField = "Encoding";
+}
+
+// Encoding supporting a standard 16b/20b immediate without any special encoding.
+def ENCODING_NORMAL : OpEncodingType<"Normal", 0>;
+def ENCODING_MEMOFFSET : OpEncodingType<"MemOffset", 1>;
+def ENCODING_MEMSTRIDE : OpEncodingType<"MemStride", 2>;
+def ENCODING_SUBLANEMASK : OpEncodingType<"SublaneMask", 3>;
+def ENCODING_VY : OpEncodingType<"VY", 4>;
+def ENCODING_SY : OpEncodingType<"SY", 5>;
+def ENCODING_BC_VLD_VST_BASE : OpEncodingType<"BcVldVstBase", 6>;
+// Encoding supporting 32-bit immediates (using 2 slots), but doesn't
+// support any special encoding.
+def ENCODING_NORMAL32 : OpEncodingType<"Normal32", 7>;
+def ENCODING_SY_STREAM : OpEncodingType<"SY_Stream", 8>;
+// No-encoding type. The immediate value is encoded as-is.
+def ENCODING_PLAIN : OpEncodingType<"Plain", 9>;
+def ENCODING_VS0 : OpEncodingType<"VS0", 10>;
+def ENCODING_VS1 : OpEncodingType<"VS1", 11>;
+def ENCODING_VS2 : OpEncodingType<"VS2", 12>;
+def ENCODING_VS3 : OpEncodingType<"VS3", 13>;
+def ENCODING_SM_X : OpEncodingType<"SM_X", 14>;
+def ENCODING_SM_Y : OpEncodingType<"SM_Y", 15>;
+
+// The symbol type defines indices of non-implicit operands.
+class OperandIndex<bits<8> index> {
+ bits<8> Index = index;
+}
+
+foreach N = ["Y", "Simm"] in {
+def N#OpIdx#None : OperandIndex<0>;
+foreach I = 0-7 in {
+ def N#OpIdx#I : OperandIndex<!shl(1, I)>;
+}
+}
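+// For illustration, the foreach above expands to defs such as
+//   YOpIdxNone = OperandIndex<0>, YOpIdx0 = OperandIndex<1>,
+//   YOpIdx2 = OperandIndex<4>, ..., SimmOpIdx7 = OperandIndex<128>.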
+// TODO(maggioni): The idea is to move to a model where we attach this
+// information to the operand, because passing indices around is potentially
+// error-prone. To unblock VST/VLD short term for b/147597613 we are going to
+// have an index that covers all the operands.
+def OpIdxAll : OperandIndex<255>;
+
+// A map from an instruction to an index of special encoded ('y'-style encoded)
+// operand, such as 'y' operand in vector or scalar slots, etc.
+class OpWithSpecialEncoding<string instr,
+ list<OperandIndex> indices,
+ OpEncodingType enKind> {
+ Instruction Instr = !cast<Instruction>(instr);
+ bits<8> OpIdx = !foldl(0, indices, a, b, !or(a, b.Index));
+ bits<4> EnKind = enKind.Encoding;
+}
+
+// Create a searchable table, allowing code emitter to know the instructions and
+// indices of the operands that need to be encoded in special way.
+def OpWithSpecialEncodingTable : GenericTable {
+ let FilterClass = "OpWithSpecialEncoding";
+ let CppTypeName = "OpWithSpecialEncodingTy";
+ let Fields = ["Instr", "OpIdx", "EnKind"];
+ let PrimaryKey = ["Instr"];
+ let PrimaryKeyName = "OpWithSpecialEncoding";
+}
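+
+// For example, the TPUInstV0/V1/V2 classes further below mix in
+// OpWithSpecialEncoding<NAME, [yOpIdx], ENCODING_VY> so that the 'y' operand
+// (YOpIdx2 by default) is encoded using Vy rules.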
+
+// Allows setting predicates at different levels of the definitions.
+class ExtraPredicates<list<Predicate> PlatformPredicates> {
+ list<Predicate> OtherPredicates = [];
+ list<Predicate> Predicates =
+ !listconcat(PlatformPredicates, OtherPredicates);
+}
+
+// Encoding for a subunit (computational unit of a functional unit), which
+// defines what kind of abstract hardware is required to execute an instruction.
+// This subunit information is consumed by the mdl compiler (go/mpact-mdl) to
+// generate cycle-by-cycle pipeline behavior in terms of register reads/writes
+// and resources used.
+class SubUnitEncoding<string name> {
+ string subunit_name = name;
+}
+
+def SU_atomic : SubUnitEncoding<"atomic">;
+def SU_control : SubUnitEncoding<"control">;
+def SU_delay : SubUnitEncoding<"delay">;
+def SU_descriptor_dma : SubUnitEncoding<"descriptor_dma">;
+def SU_dma : SubUnitEncoding<"dma">;
+def SU_local_dma : SubUnitEncoding<"local_dma">;
+def SU_eup_result : SubUnitEncoding<"eup_result">;
+def SU_extended_unary : SubUnitEncoding<"extended_unary">;
+def SU_f_convert : SubUnitEncoding<"f_convert">;
+def SU_fence : SubUnitEncoding<"fence">;
+def SU_f_math : SubUnitEncoding<"f_math">;
+def SU_f_multiply : SubUnitEncoding<"f_multiply">;
+def SU_halt : SubUnitEncoding<"halt">;
+def SU_i_multiply : SubUnitEncoding<"i_multiply">;
+def SU_load : SubUnitEncoding<"load">;
+def SU_packed_permute : SubUnitEncoding<"packed_permute">;
+def SU_packed_segmented_transpose_end : SubUnitEncoding<"packed_segmented_transpose_end">;
+def SU_packed_transpose_end : SubUnitEncoding<"packed_transpose_end">;
+def SU_packed_transpose_start : SubUnitEncoding<"packed_transpose_start">;
+def SU_permute : SubUnitEncoding<"permute">;
+def SU_png_gen : SubUnitEncoding<"png_gen">;
+def SU_png_read : SubUnitEncoding<"png_read">;
+def SU_png_set : SubUnitEncoding<"png_set">;
+def SU_pop : SubUnitEncoding<"pop">;
+def SU_read_sync : SubUnitEncoding<"read_sync">;
+def SU_reduce : SubUnitEncoding<"reduce">;
+def SU_segmented_reduce : SubUnitEncoding<"segmented_reduce">;
+def SU_scalar_alu : SubUnitEncoding<"scalar_alu">;
+def SU_scalar_alu_both : SubUnitEncoding<"scalar_alu_both">;
+def SU_scalar_cmp : SubUnitEncoding<"scalar_cmp">;
+def SU_scalar_cmp_ordered : SubUnitEncoding<"scalar_cmp_ordered">;
+def SU_scalar_misc1 : SubUnitEncoding<"scalar_misc1">;
+def SU_scalar_misc2 : SubUnitEncoding<"scalar_misc2">;
+def SU_scalar_misc3 : SubUnitEncoding<"scalar_misc3">;
+def SU_segmented_transpose_end : SubUnitEncoding<"segmented_transpose_end">;
+def SU_set_iar : SubUnitEncoding<"set_iar">;
+def SU_set_pattern : SubUnitEncoding<"set_pattern">;
+def SU_set_pattern_all : SubUnitEncoding<"set_pattern_all">;
+def SU_set_pattern_jfc : SubUnitEncoding<"set_pattern_jfc">;
+def SU_set_pattern_pfc : SubUnitEncoding<"set_pattern_pfc">;
+def SU_set_sync : SubUnitEncoding<"set_sync">;
+def SU_store : SubUnitEncoding<"store">;
+def SU_task : SubUnitEncoding<"task">;
+def SU_transpose_continue : SubUnitEncoding<"transpose_continue">;
+def SU_transpose_end : SubUnitEncoding<"transpose_end">;
+def SU_transpose_jfc : SubUnitEncoding<"transpose_jfc">;
+def SU_transpose_start : SubUnitEncoding<"transpose_start">;
+def SU_u_divide : SubUnitEncoding<"u_divide">;
+def SU_v2s_push : SubUnitEncoding<"v2s_push">;
+def SU_vdelay : SubUnitEncoding<"vdelay">;
+def SU_vector_load : SubUnitEncoding<"vector_load">;
+def SU_vector_store : SubUnitEncoding<"vector_store">;
+def SU_vector_cmp : SubUnitEncoding<"vector_cmp">;
+def SU_vector_compose : SubUnitEncoding<"vector_compose">;
+def SU_vector_float : SubUnitEncoding<"vector_float">;
+def SU_vector_fmul : SubUnitEncoding<"vector_fmul">;
+def SU_vector_move : SubUnitEncoding<"vector_move">;
+def SU_vector_op : SubUnitEncoding<"vector_op">;
+def SU_vector_cvt : SubUnitEncoding<"vector_cvt">;
+def SU_vector_cvt_ext : SubUnitEncoding<"vector_cvt_ext">;
+def SU_vector_math : SubUnitEncoding<"vector_math">;
+def SU_vector_math_ext : SubUnitEncoding<"vector_math_ext">;
+def SU_vector_pack : SubUnitEncoding<"vector_pack">;
+def SU_vector_rotate : SubUnitEncoding<"vector_rotate">;
+def SU_vector_shift : SubUnitEncoding<"vector_shift">;
+def SU_vector_ext0 : SubUnitEncoding<"vector_ext0">;
+def SU_vector_ext1 : SubUnitEncoding<"vector_ext1">;
+def SU_vector_ext2 : SubUnitEncoding<"vector_ext2">;
+def SU_vector_xlane : SubUnitEncoding<"vector_xlane">;
+def SU_xlu_result : SubUnitEncoding<"xlu_result">;
+def SU_mxu_result : SubUnitEncoding<"mxu_result">;
+def SU_vmask : SubUnitEncoding<"vmask">;
+def SU_vmisc : SubUnitEncoding<"vmisc">;
+def SU_vwait : SubUnitEncoding<"vwait">;
+def SU_matdwg_jfc : SubUnitEncoding<"matdwg_jfc">;
+def SU_matlmr : SubUnitEncoding<"matlmr">;
+def SU_matpush_jfc : SubUnitEncoding<"matpush_jfc">;
+def SU_matpush_jfc_xp : SubUnitEncoding<"matpush_jfc_xp">;
+def SU_matpush_pfc : SubUnitEncoding<"matpush_pfc">;
+def SU_matpush_f32 : SubUnitEncoding<"matpush_f32">;
+def SU_matpush_f32x : SubUnitEncoding<"matpush_f32x">;
+def SU_matpush_f16 : SubUnitEncoding<"matpush_f16">;
+def SU_matpush_f16x : SubUnitEncoding<"matpush_f16x">;
+def SU_matpush_f8 : SubUnitEncoding<"matpush_f8">;
+def SU_matpush_f8x : SubUnitEncoding<"matpush_f8x">;
+def SU_matmul_jfc : SubUnitEncoding<"matmul_jfc">;
+def SU_matmul_pfc : SubUnitEncoding<"matmul_pfc">;
+def SU_matmul_dwgn_pfc : SubUnitEncoding<"matmul_dwgn_pfc">;
+def SU_matmul_dwgt_pfc : SubUnitEncoding<"matmul_dwgt_pfc">;
+def SU_matmul_pfc_packed : SubUnitEncoding<"matmul_pfc_packed">;
+def SU_matmul_dwgn_pfc_packed : SubUnitEncoding<"matmul_dwgn_pfc_packed">;
+def SU_matmul_dwgt_pfc_packed : SubUnitEncoding<"matmul_dwgt_pfc_packed">;
+def SU_matmul_f32 : SubUnitEncoding<"matmul_f32">;
+def SU_matmul_f16 : SubUnitEncoding<"matmul_f16">;
+def SU_matmul_f8 : SubUnitEncoding<"matmul_f8">;
+def SU_matmul_u8 : SubUnitEncoding<"matmul_u8">;
+def SU_matmul_s8 : SubUnitEncoding<"matmul_s8">;
+def SU_matmul_lmr_8 : SubUnitEncoding<"matmul_lmr_8">;
+def SU_matmul_lmr_16 : SubUnitEncoding<"matmul_lmr_16">;
+def SU_xrf_result : SubUnitEncoding<"xrf_result">;
+def SU_stream : SubUnitEncoding<"stream">;
+def SU_stream_cbupd : SubUnitEncoding<"stream_cbupd">;
+def SU_load_circ : SubUnitEncoding<"load_circ">;
+def SU_store_circ : SubUnitEncoding<"store_circ">;
+def SU_read_cbreg : SubUnitEncoding<"read_cbreg">;
+def SU_write_cbreg : SubUnitEncoding<"write_cbreg">;
+def SU_read_write_dreg : SubUnitEncoding<"read_write_dreg">;
+def SU_tile_load : SubUnitEncoding<"tile_load">;
+def SU_tile_load_update : SubUnitEncoding<"tile_load_update">;
+def SU_tile_store : SubUnitEncoding<"tile_store">;
+def SU_tile_store_update : SubUnitEncoding<"tile_store_update">;
+def SU_tile_store_add : SubUnitEncoding<"tile_store_add">;
+def SU_tile_store_add_update : SubUnitEncoding<"tile_store_add_update">;
+
+class SubUnits<list<SubUnitEncoding> subunits> {
+ list<SubUnitEncoding> SubUnits = subunits;
+}
+
+class TPUInst<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : Instruction {
+ let Namespace = "TPU";
+ let DecoderNamespace = "TPU";
+ // This is a placeholder Inst, because all instructions must have an Inst
+ // field.
+ bits<0> Inst;
+
+ // The names of subunits an instruction can run on.
+ list<SubUnitEncoding> SubUnits = [];
+
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ let AsmString = asmstr;
+ let Pattern = pattern;
+ let hasSideEffects = 0;
+ field BundleSlot bundleSlot = ?;
+ field bit isVectorInstruction = 0;
+ field bit isPush = 0;
+ field bit isPop = 0;
+ field bit isVMemLoadInstr = 0;
+ field bit isTransposeEnd = 0;
+ field bit isTranspose = 0;
+ field bit isPermute = 0;
+ field bit isReduce = 0;
+ field bit isPacked = 0;
+ field bit isSegmented = 0;
+ field bit isIndexedLoadStore = 0;
+ field bit isDwg = 0;
+ field bit isPackedMatMul = 0;
+ field bit isInVectorSlot = 0;
+ field bit isInScalarSlot = 0;
+ field bit isVMemStoreInstr = 0;
+ field bit isFifoPseudoCopy = 0;
+ field bit isMXUInst = 0;
+ field bit isXLUInst = 0;
+ field ImmSlotRequirement immRequirement = IMM_NONE;
+ field ImmOperRequirement operRequirement = IMM_OP_NONE;
+ field bit supportsPopVoid = 0;
+ field bit isComposedErfFifo = 0;
+ field bit isComposedXrf0Fifo = 0;
+ field bit isComposedXrf1Fifo = 0;
+ field bit isComposedV2SFifo = 0;
+ field bit isComposedDrfFifo = 0;
+ field bit isStream = 0;
+ field bit isIndirectOrStridedStream = 0;
+ field bit isIndirectVregCbStream = 0;
+ field bit isIndirectVregStream = 0;
+ field bit supportsEmbeddedMask = 0;
+ field bit isNoParallel = 0;
+ field bit isDMA = 0;
+ field bit isCb = 0;
+ field bit isCbUpd = 0;
+ let TSFlags{2} = isVectorInstruction;
+ let TSFlags{3} = isPush;
+ let TSFlags{4} = isPop;
+ let TSFlags{5} = isVMemLoadInstr;
+ let TSFlags{6} = isTransposeEnd;
+ let TSFlags{7} = isTranspose;
+ let TSFlags{8} = isPermute;
+ let TSFlags{9} = isReduce;
+ let TSFlags{10} = isPacked;
+ let TSFlags{11} = isSegmented;
+ let TSFlags{12} = isIndexedLoadStore;
+ let TSFlags{13} = isDwg;
+ let TSFlags{14} = isPackedMatMul;
+ let TSFlags{15} = isInVectorSlot;
+ let TSFlags{16} = isInScalarSlot;
+ let TSFlags{22-17} = immRequirement.immMask;
+ let TSFlags{26-23} = operRequirement.operMask;
+ let TSFlags{27} = isVMemStoreInstr;
+ let TSFlags{28} = isFifoPseudoCopy;
+ let TSFlags{29} = isMXUInst;
+ let TSFlags{30} = isXLUInst;
+ let TSFlags{31} = supportsPopVoid;
+ let TSFlags{32} = isComposedErfFifo;
+ let TSFlags{33} = isComposedXrf0Fifo;
+ let TSFlags{34} = isComposedXrf1Fifo;
+ let TSFlags{35} = isComposedV2SFifo;
+ let TSFlags{36} = isComposedDrfFifo;
+ let TSFlags{37} = isStream;
+ let TSFlags{38} = isIndirectOrStridedStream;
+ let TSFlags{39} = isIndirectVregCbStream;
+ let TSFlags{40} = isIndirectVregStream;
+ let TSFlags{41} = supportsEmbeddedMask;
+ let TSFlags{42} = isNoParallel;
+ let TSFlags{43} = isDMA;
+ let TSFlags{44} = isCb;
+ let TSFlags{45} = isCbUpd;
+}
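+
+// Illustrative C++-side use (bit positions follow the TSFlags assignments
+// above): a target query can recover a flag as, e.g.,
+//   bool IsPush = (MI.getDesc().TSFlags >> 3) & 1;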
+
+// A predicable scalar instruction. The predicate is a pair<PPR, flags>
+// and is always the last two operands in the instruction.
+class TPUInstP<dag oops, dag iops, string asmstr, list<dag> pattern>
+ : TPUInst<oops, iops, asmstr, pattern> {
+ let InOperandList = !con(iops, (ins pred:$pred));
+}
+
+//===----------------------------------------------------------------------===//
+// Scalar encodings - simple (non-DMA) instructions in S0 or S1.
+//===----------------------------------------------------------------------===//
+
+class ScalarOperands {
+ bits<5> pred;
+ bits<5> d;
+ bits<5> x;
+ bits<6> y;
+}
+
+class Encoding_JFC_Empty : InstructionEncoding {
+ bits<322> Inst;
+}
+def EmptyEncoding_JFC : Encoding_JFC_Empty;
+
+class Encoding_JFC : Encoding_JFC_Empty {
+ // Setting all bits to 0b1 in 64-bit chunks (the largest type easily usable).
+ let Inst{63-0} = -1;
+ let Inst{127-64} = -1;
+ let Inst{191-128} = -1;
+ let Inst{255-192} = -1;
+ let Inst{319-256} = -1;
+ let Inst{321-320} = -1;
+}
+
+class Encoding_PXC_Empty : InstructionEncoding {
+ bits<408> Inst;
+}
+def EmptyEncoding_PXC : Encoding_PXC_Empty;
+
+class Encoding_PXC : Encoding_PXC_Empty {
+ // Setting all bits to 0b1 in 64-bit chunks (the largest type easily usable).
+ let Inst{63-0} = -1;
+ let Inst{127-64} = -1;
+ let Inst{191-128} = -1;
+ let Inst{255-192} = -1;
+ let Inst{319-256} = -1;
+ let Inst{383-320} = -1;
+ let Inst{407-384} = -1;
+}
+
+class EncodingS0_JFC<bits<6> opc> : Encoding_JFC, ScalarOperands {
+ let Inst{321-317} = pred;
+ let Inst{316-311} = opc;
+ let Inst{310-306} = x;
+ let Inst{305-300} = y;
+ let Inst{299-295} = d;
+}
+
+class EncodingS0_PXC<bits<6> opc> : Encoding_PXC, ScalarOperands {
+ let Inst{407-403} = pred;
+ let Inst{402-397} = opc;
+ let Inst{396-392} = x;
+ let Inst{391-386} = y;
+ let Inst{385-381} = d;
+}
+
+class EncodingS1_JFC<bits<6> opc> : Encoding_JFC, ScalarOperands {
+ let Inst{294-290} = pred;
+ let Inst{289-284} = opc;
+ let Inst{283-279} = x;
+ let Inst{278-273} = y;
+ let Inst{272-268} = d;
+}
+
+class EncodingS1_PXC<bits<6> opc> : Encoding_PXC, ScalarOperands {
+ let Inst{380-376} = pred;
+ let Inst{375-370} = opc;
+ let Inst{369-365} = x;
+ let Inst{364-359} = y;
+ let Inst{358-354} = d;
+}
+
+class TPUInstS0<bits<6> opc, dag oops, dag iops, string asmstr, list<dag> pattern>
+ : TPUInstP<oops, iops, asmstr, pattern>, Bundle<B_S0> {
+ let EncodingInfos = EncodingByHwMode<[JfcDfcHwMode, PxcHwMode],
+ [EncodingS0_JFC<opc>,
+ EncodingS0_PXC<opc>]>;
+}
+
+class TPUInstS1<bits<6> opc, dag oops, dag iops, string asmstr, list<dag> pattern>
+ : TPUInstP<oops, iops, asmstr, pattern>, Bundle<B_S1> {
+ let EncodingInfos = EncodingByHwMode<[JfcDfcHwMode, PxcHwMode],
+ [EncodingS1_JFC<opc>,
+ EncodingS1_PXC<opc>]>;
+}
+
+class TPUInstSM<bits<6> opc, dag oops, dag iops, string asmstr, list<dag> pattern>
+ : TPUInstP<oops, iops, asmstr, pattern>, Bundle<B_SM> {
+ let EncodingInfos = EncodingByHwMode<[JfcDfcHwMode, PxcHwMode],
+ // TODO(hgreving): needs update.
+ [EncodingS1_JFC<opc>,
+ EncodingS1_PXC<opc>]>;
+}
+
+multiclass TPUInstSany<bits<6> opc, dag oops, dag iops, string asmstr,
+ list<dag> pattern> {
+ // The base def is bundle packed in B_Sany and has the isel pattern.
+ def "" : TPUInstP<oops, iops, asmstr, pattern>, Bundle<B_Sany>;
+ // Generate post-bundle-packing versions that have their slots baked in.
+ // These don't get isel patterns.
+ // The {|} syntax causes us to emit "(slot_s0) " in assembler variant 0 and
+ // "" in assembler variant 1. This is controlled by -print-encoding-annotations
+ // or by setting the AsmVariant when creating an InstPrinter.
+ def _S0 : TPUInstS0<opc, oops, iops, "{(slot_s0) |}"#asmstr, []>,
+ BundledVariant<NAME, NAME#_S0>;
+ def _S1 : TPUInstS1<opc, oops, iops, "{(slot_s1) |}"#asmstr, []>,
+ BundledVariant<NAME, NAME#_S1>;
+}
+
+multiclass TPUInstSanyMisc<bits<6> opc, dag oops, dag iops, string asmstr,
+ list<dag> pattern> {
+ def "" : TPUInstP<oops, iops, asmstr, pattern>, Bundle<B_SanyMisc>;
+ def _S0 : TPUInstS0<opc, oops, iops, "{(slot_s0) |}"#asmstr, []>,
+ BundledVariant<NAME, NAME#_S0>;
+ def _S1 : TPUInstS1<opc, oops, iops, "{(slot_s1) |}"#asmstr, []>,
+ BundledVariant<NAME, NAME#_S1>;
+ def _SM : TPUInstSM<opc, oops, iops, "{(slot_sm) |}"#asmstr, []>,
+ BundledVariant<NAME, NAME#_SM>;
+}
+
+// This is a proxy class that forwards to different base classes depending on
+// Slot. This allows defs to inherit from a different base class depending on
+// slot.
+multiclass TPUInstS<BundleSlot Slot, bits<6> opc, dag oops, dag iops,
+ string asmstr, list<dag> pattern> {
+ if !eq(Slot.HasS0, 1) then {
+ def "" : TPUInstS0<opc, oops, iops, asmstr, pattern>;
+ }
+ if !eq(Slot.HasS1, 1) then {
+ def "" : TPUInstS1<opc, oops, iops, asmstr, pattern>;
+ }
+ if !eq(Slot.HasSM, 1) then {
+ def "" : TPUInstSM<opc, oops, iops, asmstr, pattern>;
+ }
+ if !eq(Slot.HasSany, 1) then {
+ defm "" : TPUInstSany<opc, oops, iops, asmstr, pattern>;
+ }
+ if !eq(Slot.HasSanyMisc, 1) then {
+ defm "" : TPUInstSanyMisc<opc, oops, iops, asmstr, pattern>;
+ }
+}
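+
+// Illustrative use (operand lists, opcode, asm string and pattern below are
+// placeholders, not the real ADDri definition):
+//   defm ADDri : TPUInstS<B_Sany, /*opc=*/opc, (outs GPR:$d),
+//                         (ins GPR:$x, timmsi:$y), asmstr, pattern>;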
+
+// Defines the 'pred' operand fields.
+class PredOperand {
+ bits<7> pred; // Bits{4-0} are {Invert, Reg}, Bits{6-5} are the BarnaCore pipeline stage.
+}
+
+// Common fields for V0/V1/V2 instructions.
+class VectorOperands : PredOperand {
+ bits<5> d;
+ bits<5> x;
+ bits<128> y;
+}
+
+class EncodingVy_JFC : Encoding_JFC, VectorOperands {
+ let Inst{262-247} = y{47-32}; // imm0
+ let Inst{246-231} = y{63-48}; // imm1
+ let Inst{230-215} = y{79-64}; // imm2
+ let Inst{214-199} = y{95-80}; // imm3
+ let Inst{198-183} = y{111-96}; // imm4
+ let Inst{182-167} = y{127-112}; // imm5
+
+ let Inst{166-162} = y{20-16}; // vs0
+ let Inst{161-157} = y{25-21}; // vs1
+ let Inst{156-152} = y{30-26}; // vs2
+}
+
+class EncodingV0_JFC<bits<6> opc> : EncodingVy_JFC {
+ let Inst{151-147} = pred{4-0};
+ let Inst{146-141} = opc;
+ let Inst{140-136} = x;
+ let Inst{135-131} = y{4-0}; // y_src
+ let Inst{130-126} = y{9-5}; // y_vreg
+ let Inst{125-121} = d;
+}
+
+class EncodingV1_JFC<bits<6> opc> : EncodingVy_JFC {
+ let Inst{120-116} = pred{4-0};
+ let Inst{115-110} = opc;
+ let Inst{109-105} = x;
+ let Inst{104-100} = y{4-0}; // y_src
+ let Inst{99-95} = y{9-5}; // y_vreg
+ let Inst{94-90} = d;
+}
+
+class EncodingVy_PXC : Encoding_PXC, VectorOperands {
+ let Inst{353-338} = y{47-32}; // imm0
+ let Inst{335-320} = y{63-48}; // imm1
+ let Inst{319-304} = y{79-64}; // imm2
+ let Inst{303-288} = y{95-80}; // imm3
+ let Inst{287-272} = y{111-96}; // imm4
+ let Inst{271-256} = y{127-112}; // imm5
+
+ let Inst{255-251} = y{20-16}; // vs0
+ let Inst{250-246} = y{25-21}; // vs1
+ let Inst{245-241} = y{30-26}; // vs2
+}
+
+class EncodingV0_PXC<bits<6> opc> : EncodingVy_PXC {
+ let Inst{240-236} = pred{4-0};
+ let Inst{235-230} = opc;
+ let Inst{229-225} = x;
+ let Inst{224-220} = y{4-0}; // y_src
+ let Inst{207-203} = y{9-5}; // y_vreg
+ let Inst{202-198} = d;
+}
+
+class EncodingV1_PXC<bits<6> opc> : EncodingVy_PXC {
+ let Inst{197-193} = pred{4-0};
+ let Inst{192-187} = opc;
+ let Inst{186-182} = x;
+ let Inst{181-177} = y{4-0}; // y_src
+ let Inst{176-172} = y{9-5}; // y_vreg
+ let Inst{171-167} = d;
+}
+
+class Encoding_BC_Empty : InstructionEncoding {
+ bits<239> Inst;
+}
+def EmptyEncoding_BC : Encoding_BC_Empty;
+
+class Encoding_BC : Encoding_BC_Empty {
+ // Setting all bits to 0b1 in 64-bit chunks (the largest type easily usable).
+ let Inst{63-0} = -1;
+ let Inst{127-64} = -1;
+ let Inst{191-128} = -1;
+ let Inst{238-192} = -1;
+
+ // XLA based codegen never uses/sets these values, so we hardcode them to
+ // always be 0.
+ let Inst{59-59} = 0; // add_loop_index_to_v1_vselect_mask
+ let Inst{58-58} = 0; // add_loop_index_to_v0_vselect_mask
+ let Inst{31-31} = 0; // add_loop_index_to_v1_dst
+ let Inst{30-30} = 0; // add_loop_index_to_v0_dst
+ let Inst{29-29} = 0; // add_loop_index_to_v1_y_reg
+ let Inst{28-28} = 0; // add_loop_index_to_v0_y_reg
+ let Inst{27-27} = 0; // add_loop_index_to_v1_x
+ let Inst{26-26} = 0; // add_loop_index_to_v0_x
+ let Inst{25-25} = 0; // add_loop_index_to_vst_src
+ let Inst{24-24} = 0; // add_loop_index_to_vld_dst
+}
+
+class EncodingVy_BC : Encoding_BC, VectorOperands {
+ let Inst{190-175} = y{47-32}; // imm0
+ let Inst{206-191} = y{63-48}; // imm1
+ let Inst{222-207} = y{79-64}; // imm2
+ let Inst{238-223} = y{95-80}; // imm3
+ // Since imm0/1 are the same as imm5/4 we should ignore the latter so they
+ // don't overwrite the former. imm4/5 should never be allocated by the
+ // bundle packer on this platform.
+ // let Inst{206-191} = y{111-96}; // imm4
+ // let Inst{190-175} = y{127-112}; // imm5
+
+ let Inst{36-35} = y{17-16}; // vs0 (lower two bits)
+ let Inst{38-37} = y{22-21}; // vs1 (lower two bits)
+ let Inst{40-39} = y{27-26}; // vs2 (lower two bits)
+}
+
+class EncodingV0_BC<bits<6> opc> : EncodingVy_BC {
+ let Inst{61-60} = pred{6-5};
+ let Inst{66-62} = pred{4-0};
+ let Inst{72-67} = opc;
+ let Inst{77-73} = x;
+ let Inst{87-83} = y{4-0};
+ let Inst{82-78} = y{9-5};
+ let Inst{92-88} = d;
+}
+
+// Instruction encoding for the V1 slot.
+class EncodingV1_BC<bits<6> opc> : EncodingVy_BC {
+ let Inst{94-93} = pred{6-5};
+ let Inst{99-95} = pred{4-0};
+ let Inst{105-100} = opc;
+ let Inst{110-106} = x;
+ let Inst{120-116} = y{4-0};
+ let Inst{115-111} = y{9-5};
+ let Inst{125-121} = d;
+}
+
+// Defines TPU instruction encoding for all supported HW modes.
+class TPUEncodingByHwMode<list<InstructionEncoding> encodings>
+ : EncodingByHwMode<[JfcDfcHwMode, PxcHwMode, BarnaCorePxcHwMode],
+ encodings>;
+
+// Defines TPU instruction encoding in V0/V1/V2 slots.
+class TPUInstVEncoding<EncodingByHwMode pv0, EncodingByHwMode pv1> {
+ EncodingByHwMode v0 = pv0;
+ EncodingByHwMode v1 = pv1;
+}
+
+// Base encoding for vector ALU ops in V0/V1 slots.
+class VIntALUOpEncoding<bits<6> opc> : TPUInstVEncoding<
+ TPUEncodingByHwMode<[EncodingV0_JFC<opc>, EncodingV0_PXC<opc>, EncodingV0_BC<opc>]>,
+ TPUEncodingByHwMode<[EncodingV1_JFC<opc>, EncodingV1_PXC<opc>, EncodingV1_BC<opc>]>>;
+
+class EncodingV0UnOp_JFC<bits<6> opc> : EncodingV0_JFC<opc> {
+ let x = 0;
+}
+class EncodingV1UnOp_JFC<bits<6> opc> : EncodingV1_JFC<opc> {
+ let x = 0;
+}
+class EncodingV0UnOp_PXC<bits<6> opc> : EncodingV0_PXC<opc> {
+ let x = 0;
+}
+class EncodingV1UnOp_PXC<bits<6> opc> : EncodingV1_PXC<opc> {
+ let x = 0;
+}
+class EncodingV0UnOp_BC<bits<6> opc> : EncodingV0_BC<opc> {
+ let x = 0;
+}
+class EncodingV1UnOp_BC<bits<6> opc> : EncodingV1_BC<opc> {
+ let x = 0;
+}
+
+// Defines TPU instruction encoding for a unary op in V0/V1 slots.
+class VIntALUUnOpEncoding<bits<6> opc> : TPUInstVEncoding<
+ TPUEncodingByHwMode<[EncodingV0UnOp_JFC<opc>, EncodingV0UnOp_PXC<opc>, EncodingV0UnOp_BC<opc>]>,
+ TPUEncodingByHwMode<[EncodingV1UnOp_JFC<opc>, EncodingV1UnOp_PXC<opc>, EncodingV1UnOp_BC<opc>]>>;
+
+class EncodingV0EupOp_JFC<bits<6> opc> : EncodingV0_JFC<opc> {
+ let d = 0;
+ let y{63-0} = -1;
+ let y{127-64} = -1;
+}
+class EncodingV1EupOp_JFC<bits<6> opc> : EncodingV1_JFC<opc> {
+ let d = 0;
+ let y{63-0} = -1;
+ let y{127-64} = -1;
+}
+class EncodingV0EupOp_PXC<bits<6> opc> : EncodingV0_PXC<opc> {
+ let d = 0;
+ let y{63-0} = -1;
+ let y{127-64} = -1;
+}
+class EncodingV1EupOp_PXC<bits<6> opc> : EncodingV1_PXC<opc> {
+ let d = 0;
+ let y{63-0} = -1;
+ let y{127-64} = -1;
+}
+class EncodingV0EupOp_BC<bits<6> opc> : EncodingV0_BC<opc> {
+ let d = 0;
+ let y{63-0} = -1;
+ let y{127-64} = -1;
+}
+class EncodingV1EupOp_BC<bits<6> opc> : EncodingV1_BC<opc> {
+ let d = 0;
+ let y{63-0} = -1;
+ let y{127-64} = -1;
+}
+
+// Defines TPU instruction encoding for an EUP op in V0/V1 slots.
+class VIntALUEupOpEncoding<bits<6> opc> : TPUInstVEncoding<
+ TPUEncodingByHwMode<[EncodingV0EupOp_JFC<opc>, EncodingV0EupOp_PXC<opc>, EncodingV0EupOp_BC<opc>]>,
+ TPUEncodingByHwMode<[EncodingV1EupOp_JFC<opc>, EncodingV1EupOp_PXC<opc>, EncodingV1EupOp_BC<opc>]>>;
+
+// Instruction encoding for a set of Vselect instructions in V0/V1/V2 slots. Note
+// that there are 8 different opcodes for vsel (16 for 'Vselect Vmsk0 ...', 17
+// for 'Vselect Vmsk1 ...', etc.), so we model them by using 16 as the opcode and
+// then overwriting the lower 3 bits of the opcode with the mask register. This
+// hack exploits the fact that the codegen treats the opcode as a constant and
+// then overwrites part of it, guaranteeing that the opcode does not clobber the
+// vmreg field.
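+// For example, with mask register Vmsk3 the lower 3 bits encode 3, so the
+// effective opcode becomes 16 | 3 = 19.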
+class VmselParam {
+ bits<3> m;
+}
+def EncodingV0Vmsel_JFC : EncodingV0_JFC<16>, VmselParam {
+ let Inst{143-141} = m;
+}
+def EncodingV1Vmsel_JFC : EncodingV1_JFC<16>, VmselParam {
+ let Inst{112-110} = m;
+}
+def EncodingV0Vmsel_PXC : EncodingV0_PXC<16>, VmselParam {
+ let Inst{232-230} = m;
+}
+def EncodingV1Vmsel_PXC : EncodingV1_PXC<16>, VmselParam {
+ let Inst{189-187} = m;
+}
+def EncodingV0Vmsel_BC : EncodingV0_BC<16>, VmselParam {
+ let Inst{69-67} = m;
+}
+def EncodingV1Vmsel_BC : EncodingV1_BC<16>, VmselParam {
+ let Inst{102-100} = m;
+}
+
+// Defines TPU instruction encoding for a vector mask sel in V0/V1 slots.
+def VIntALUVmselEncoding : TPUInstVEncoding<
+ TPUEncodingByHwMode<[EncodingV0Vmsel_JFC, EncodingV0Vmsel_PXC, EncodingV0Vmsel_BC]>,
+ TPUEncodingByHwMode<[EncodingV1Vmsel_JFC, EncodingV1Vmsel_PXC, EncodingV1Vmsel_BC]>>;
+
+class TPUInstV0<BundleSlot Slot, EncodingByHwMode enc, dag oops, dag iops, string asmstr,
+ list<dag> pattern, OperandIndex yOpIdx = YOpIdx2>
+ : TPUInstP<oops, iops, asmstr, pattern>, Bundle<Slot>,
+ OpWithSpecialEncoding<NAME, [yOpIdx], ENCODING_VY> {
+ let EncodingInfos = enc;
+}
+
+class TPUInstV1<BundleSlot Slot, EncodingByHwMode enc, dag oops, dag iops, string asmstr,
+ list<dag> pattern, OperandIndex yOpIdx = YOpIdx2>
+ : TPUInstP<oops, iops, asmstr, pattern>, Bundle<Slot>,
+ OpWithSpecialEncoding<NAME, [yOpIdx], ENCODING_VY> {
+ let EncodingInfos = enc;
+}
+
+class TPUInstV2<BundleSlot Slot, EncodingByHwMode enc, dag oops, dag iops, string asmstr,
+ list<dag> pattern, OperandIndex yOpIdx = YOpIdx2>
+ : TPUInstP<oops, iops, asmstr, pattern>, Bundle<Slot>,
+ OpWithSpecialEncoding<NAME, [yOpIdx], ENCODING_VY> {
+ let EncodingInfos = enc;
+}
+
+// The Vany instruction class is also used for instructions that may be associated with
+// itineraries that don't necessarily use all the post-bundle variants that the Vany class
+// instantiates.
+multiclass TPUInstVany<BundleSlot Slot, TPUInstVEncoding enc, dag oops, dag iops, string asmstr,
+ list<dag> pattern, OperandIndex yOpIdx = YOpIdx2> {
+ // The base def is bundle packed in the given Slot and has the isel pattern.
+ def "" : TPUInstP<oops, iops, asmstr, pattern>, Bundle<Slot>;
+ // Generate post-bundle-packing versions that have their slots baked in.
+ // These don't get isel patterns.
+ // The {|} syntax causes us to emit "(slot_v0) " etc. in assembler variant 0
+ // and "" in assembler variant 1. This is controlled by
+ // -print-encoding-annotations or by setting the AsmVariant when creating an
+ // InstPrinter.
+ def _V0 : TPUInstV0<B_V0, enc.v0, oops, iops, "{(slot_v0) |}"#asmstr, [], yOpIdx>,
+ BundledVariant<NAME, NAME#_V0>;
+ def _V1 : TPUInstV1<B_V1, enc.v1, oops, iops, "{(slot_v1) |}"#asmstr, [], yOpIdx>,
+ BundledVariant<NAME, NAME#_V1>;
+ // enc.v2 isn't needed and is not used; the V2 variant reuses the V1 encoding.
+ def _V2 : TPUInstV2<B_V2, enc.v1, oops, iops, "{(slot_v2) |}"#asmstr, [], yOpIdx>,
+ BundledVariant<NAME, NAME#_V2>;
+}
+
+// Defines TPU instruction encoding for a vector extended result in VRes0/VRes1
+// slots; it also uses pieces of slots V0/V1/VLD/VAUX (on some platforms).
+class TPUInstVResEncoding<EncodingByHwMode pnone,
+ EncodingByHwMode pv0, EncodingByHwMode pv1,
+ EncodingByHwMode pvld, EncodingByHwMode pvaux> {
+ EncodingByHwMode none = pnone;
+ EncodingByHwMode v0 = pv0;
+ EncodingByHwMode v1 = pv1;
+ EncodingByHwMode vld = pvld;
+ EncodingByHwMode vaux = pvaux;
+}
+
+// Defines encoding for a pair of TPU instructions in VRes0 and VRes1, each
+// with its own set of secondary slots.
+class TPUInstVresEncoding<TPUInstVResEncoding pvres0, TPUInstVResEncoding pvres1> {
+ TPUInstVResEncoding vres0 = pvres0;
+ TPUInstVResEncoding vres1 = pvres1;
+}
+
+class EncodingVResBase_BC<bits<2> dest_code> : PredOperand, Encoding_BC {
+ bits<5> v; // dest register.
+ let Inst{174-173} = dest_code;
+ let Inst{172-172} = 1; // opcode, always 1.
+ let Inst{171-167} = pred{4-0};
+}
+
+def EncodingVResV0_BC : EncodingVResBase_BC<0> {
+ let Inst{92-88} = v;
+ let Inst{61-60} = pred{6-5};
+}
+
+def EncodingVResV1_BC : EncodingVResBase_BC<1> {
+ let Inst{125-121} = v;
+ let Inst{94-93} = pred{6-5};
+}
+
+def EncodingVResVLD_BC : EncodingVResBase_BC<2> {
+ let Inst{161-157} = v;
+ let Inst{148-147} = pred{6-5};
+}
+
+// FIXME: add encoding for other platforms.
+class VRES_EUPEncodingByHwMode<InstructionEncoding bc_enc>
+ : TPUEncodingByHwMode<[EmptyEncoding_JFC, EmptyEncoding_PXC, bc_enc]>;
+def EmptyVRES_EUPEncodingByHwMode : VRES_EUPEncodingByHwMode<EmptyEncoding_BC>;
+
+// Defines TPU instruction encoding for osp in VRes0/VRes1 slots. Note that for
+// different platforms this op has different opcodes.
+def VRES_EUPEncoding : TPUInstVresEncoding<
+ TPUInstVResEncoding<
+ /*none:*/ EmptyVRES_EUPEncodingByHwMode,
+ /* v0:*/ VRES_EUPEncodingByHwMode<EncodingVResV0_BC>,
+ /* v1:*/ VRES_EUPEncodingByHwMode<EncodingVResV1_BC>,
+ /* vld:*/ VRES_EUPEncodingByHwMode<EncodingVResVLD_BC>,
+ /*vaux:*/ EmptyVRES_EUPEncodingByHwMode
+ >,
+ TPUInstVResEncoding<
+ /*none:*/ EmptyVRES_EUPEncodingByHwMode,
+ /* v0:*/ EmptyVRES_EUPEncodingByHwMode,
+ /* v1:*/ EmptyVRES_EUPEncodingByHwMode,
+ /* vld:*/ EmptyVRES_EUPEncodingByHwMode,
+ /*vaux:*/ EmptyVRES_EUPEncodingByHwMode
+ >
+>;
+
+class TPUInstVResBase<EncodingByHwMode enc, InstrItinClass iic,
+ dag oops, dag iops, string asmstr, list<dag> pattern>
+ : TPUInstP<oops, iops, asmstr, pattern> {
+ let EncodingInfos = enc;
+ let Itinerary = iic;
+}
+
+// Note that this creates 2 opcodes for the different VRES slots, plus 4
+// additional versions for combinations of a particular slot with one of the
+// extra slots. Not all the opcodes are used on all platforms, but the unused
+// ones should never be created during specialization (and thus never used),
+// since unsupported combinations of resources will never be used by the
+// scheduler.
+multiclass TPUInstVResOneSlot<
+ TPUInstVResEncoding enc, string orig_name, string vres, string slot,
+ dag oops, dag iops, string asmstr, list<dag> pattern> {
+
+ // The base def is bundle packed in either of VRes0/VRes1 slots without any
+ // secondary slots (used only in 'sparsecore-vf').
+ def "" : TPUInstVResBase<enc.none, !cast<InstrItinClass>("IIC_"#vres),
+ oops, iops, "{("#slot#") |}"#asmstr, pattern>,
+ BundledVariant<orig_name, NAME>;
+
+ // Generate post-bundle-packing versions that have their slots baked in.
+ // These don't get isel patterns.
+ // The {|} syntax causes us to emit "(slot_vres0 slot_<secondary>) " in
+ // assembler variant 0 and "" in assembler variant 1.
+ def _V0 : TPUInstVResBase<enc.v0, !cast<InstrItinClass>("IIC_"#vres#"_V0"),
+ oops, iops, "{(slot_v0 "#slot#") |}"#asmstr, []>,
+ BundledVariant<orig_name, NAME#_V0>;
+ def _V1 : TPUInstVResBase<enc.v1, !cast<InstrItinClass>("IIC_"#vres#"_V1"),
+ oops, iops, "{(slot_v1 "#slot#") |}"#asmstr, []>,
+ BundledVariant<orig_name, NAME#_V1>;
+ def _VLD : TPUInstVResBase<enc.vld, !cast<InstrItinClass>("IIC_"#vres#"_VLD"),
+ oops, iops, "{(slot_vld "#slot#") |}"#asmstr, []>,
+ BundledVariant<orig_name, NAME#_VLD>;
+ def _VAUX : TPUInstVResBase<enc.vaux, !cast<InstrItinClass>("IIC_"#vres#"_VAUX"),
+ oops, iops, "{(slot_vaux "#slot#") |}"#asmstr, []>,
+ BundledVariant<orig_name, NAME#_VAUX>;
+}
+
+multiclass TPUInstVResAny<TPUInstVresEncoding enc, dag oops, dag iops,
+ string asmstr, list<dag> pattern> {
+ // The base def is bundle packed in B_VResAny and has the isel pattern.
+ def "" : TPUInstP<oops, iops, asmstr, pattern>, Bundle<B_VResAny>;
+
+ // Specialized instruction versions.
+ defm _VRES0 : TPUInstVResOneSlot<enc.vres0, NAME, "VRES0", "slot_vres0",
+ oops, iops, asmstr, []>;
+ defm _VRES1 : TPUInstVResOneSlot<enc.vres1, NAME, "VRES1", "slot_vres1",
+ oops, iops, asmstr, []>;
+}
+
+// This is a proxy class that forwards to different base classes depending on
+// Slot. This allows defs to inherit from a different base class depending on
+// slot.
+multiclass TPUInst<BundleSlot Slot, TPUInstVEncoding enc, dag oops, dag iops,
+ string asmstr, list<dag> pattern,
+ OperandIndex yOpIdx = YOpIdx2> {
+ if !eq(Slot.HasV0, 1) then {
+ def "" : TPUInstV0<B_V0, enc.v0, oops, iops, asmstr, pattern, yOpIdx>;
+ }
+ if !eq(Slot.HasV1, 1) then {
+ def "" : TPUInstV1<B_V1, enc.v1, oops, iops, asmstr, pattern, yOpIdx>;
+ }
+ if !eq(Slot.HasV2, 1) then {
+ // enc.v2 isn't needed and is not used; the V2 variant reuses the V1 encoding.
+ def "" : TPUInstV2<B_V2, enc.v1, oops, iops, asmstr, pattern, yOpIdx>;
+ }
+ if !eq(Slot.HasVany, 1) then {
+ defm "" : TPUInstVany<Slot, enc, oops, iops, asmstr, pattern, yOpIdx>;
+ }
+}
+
+multiclass MultiVTypePat<dag Intr, dag result> {
+ def : Pat<(vNf32 Intr), result>;
+ def : Pat<(vNi32 Intr), result>;
+}
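+
+// Illustrative use (hypothetical node, instruction and register class names):
+//   defm : MultiVTypePat<(SomeTPUNode VPR:$x), (SOMEINST VPR:$x)>;
+// expands into one Pat for a vNf32-typed result and one for a vNi32-typed
+// result.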
+
+def load_smem : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<MemSDNode>(N)->getAddressSpace() == TPUAS_Smem;
+}]>;
+def store_smem : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<MemSDNode>(N)->getAddressSpace() == TPUAS_Smem;
+}]>;
+def extload_smem : PatFrag<(ops node:$ptr), (extload node:$ptr), [{
+ return cast<MemSDNode>(N)->getAddressSpace() == TPUAS_Smem;
+}]>;
+
+def load_tilespmem : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<MemSDNode>(N)->getAddressSpace() == TPUAS_TileSpmem;
+}]>;
+def store_tilespmem : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<MemSDNode>(N)->getAddressSpace() == TPUAS_TileSpmem;
+}]>;
+def load_sflag : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<MemSDNode>(N)->getAddressSpace() == TPUAS_Sflag;
+}]>;
+def store_sflag : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<MemSDNode>(N)->getAddressSpace() == TPUAS_Sflag;
+}]>;
+def store_sflag_other : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<MemSDNode>(N)->getAddressSpace() == TPUAS_SflagOther;
+}]>;
+def load_vmem : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<MemSDNode>(N)->getAddressSpace() == TPUAS_Vmem;
+}]>;
+def store_vmem : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<MemSDNode>(N)->getAddressSpace() == TPUAS_Vmem;
+}]>;
+def load_bmem : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<MemSDNode>(N)->getAddressSpace() == TPUAS_Bmem;
+}]>;
+def store_bmem : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<MemSDNode>(N)->getAddressSpace() == TPUAS_Bmem;
+}]>;
+def load_dreg : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<MemSDNode>(N)->getAddressSpace() == TPUAS_Dreg;
+}]>;
+def store_dreg : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<MemSDNode>(N)->getAddressSpace() == TPUAS_Dreg;
+}]>;
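+
+// Illustrative use (hypothetical instruction name): selection patterns can
+// then be restricted by address space, e.g.
+//   def : Pat<(i32 (load_smem GPR:$addr)), (SLDri GPR:$addr, 0)>;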
+
+// A list of registers. This is used by EVENT instructions to parse and dump
+// variadic lists of registers.
+def RegListAsmOperand : AsmOperandClass { let Name = "RegList"; }
+def reglist : Operand<i32> {
+ let ParserMatchClass = RegListAsmOperand;
+ let PrintMethod = "printRegisterList";
+}
+
+// An Enum representing a type of operand, an immediate mask
+// and the type of encoding it supports.
+class OpInfo<int v, bits<8> mask, OpEncodingType enc_enum> {
+ string OperandTypeName = NAME;
+ bits<8> OperandTypeEnumVal = v;
+ bits<8> ImmMask = mask;
+ bits<4> OpEncodingType = enc_enum.Encoding;
+}
+
+// Immediate OpInfo types.
+def OPERAND_IMM_SINGLE : OpInfo<0, 0b00111111, ENCODING_NORMAL>;
+def OPERAND_MEMOFFSET : OpInfo<1, 0b00111111, ENCODING_MEMOFFSET>;
+def OPERAND_MEMSTRIDE : OpInfo<2, 0b00111111, ENCODING_MEMSTRIDE>;
+def OPERAND_SUBLANEMASK : OpInfo<3, 0b00111111, ENCODING_SUBLANEMASK>;
+def OPERAND_SCALAR_IMM32 : OpInfo<4, 0b00001111, ENCODING_SY>;
+def OPERAND_SYNC_IMM_SINGLE : OpInfo<5, 0b00111100, ENCODING_NORMAL>;
+def OPERAND_TC_VLD_SHUFFLE_IMM32 : OpInfo<7, 0b00111111, ENCODING_NORMAL32>;
+def OPERAND_VECTOR_IMM32 : OpInfo<8, 0b00111111, ENCODING_VY>;
+def OPERAND_IMM_PLAIN : OpInfo<9, 0, ENCODING_PLAIN>;
+
+// Register OpInfo types. Same special enumeration as immediate types.
+def OPERAND_GPR_VS0 : OpInfo<10, 0, ENCODING_VS0>;
+def OPERAND_GPR_VS1 : OpInfo<11, 0, ENCODING_VS1>;
+def OPERAND_GPR_VS2 : OpInfo<12, 0, ENCODING_VS2>;
+def OPERAND_GPR_VS3 : OpInfo<13, 0, ENCODING_VS3>;
+def OPERAND_GPR_SM_X : OpInfo<14, 0, ENCODING_SM_X>;
+def OPERAND_GPR_SM_Y : OpInfo<15, 0, ENCODING_SM_Y>;
+
+def TPUOperandType : GenericEnum {
+ let FilterClass = "OpInfo";
+ let ValueField = "OperandTypeEnumVal";
+}
+
+def ImmediateCompatibilityTable : GenericTable {
+ let FilterClass = "OpInfo";
+ let CppTypeName = "TPUOperandTypeRecord";
+ let Fields = ["OperandTypeEnumVal", "ImmMask", "OpEncodingType"];
+ let PrimaryKey = ["OperandTypeEnumVal"];
+ let PrimaryKeyName = "OperandTypeInfo";
+}
+
+// An explicit floating point immediate. We use the normal parsing methods
+// (which bitcast Real tokens to integer) but override the render method so we
+// add FP imm operands.
+def FpImmAsmOperand : AsmOperandClass {
+ let Name = "FpImm";
+ let PredicateMethod = "isImm";
+ let RenderMethod = "addFpImmOperands";
+}
+
+// Operand class with immediate slot semantics.
+class TPUSlotImmediate<ValueType Type, OpInfo Info> : Operand<Type> {
+ let OperandType = Info.OperandTypeName;
+}
+
+// Operand class with special register slot semantics, currently used
+// for special VS slot cases.
+class TPUSlotRegister<RegisterClass Type, OpInfo Info> : RegisterOperand<Type> {
+ let OperandType = Info.OperandTypeName;
+}
+
+// TODO(hgreving): Some of the immediates below are unsigned. We should add
+// a type for unsigned immediates, so we can use the highest bit. This only
+// matters for single slot immediates bigger than 19 bits, which doesn't seem
+// likely.
+
+def tcbreg : RegisterOperand<CBR> {
+ let PrintMethod = "printCBR";
+}
+
+let OperandNamespace = "TPUOp" in {
+// Register operands are named according to the following scheme:
+// [t]: followed by
+// <description>: Descriptive code of the operand.
+// [Register class]: Actual register class.
+
+def tvs0gpr : TPUSlotRegister<GPR, OPERAND_GPR_VS0>;
+def tvs1gpr : TPUSlotRegister<GPR, OPERAND_GPR_VS1>;
+def tvs2gpr : TPUSlotRegister<GPR, OPERAND_GPR_VS2>;
+def tvs3gpr : TPUSlotRegister<GPR, OPERAND_GPR_VS3>;
+def tsmxgpr : TPUSlotRegister<GPR, OPERAND_GPR_SM_X>;
+def tsmygpr : TPUSlotRegister<GPR, OPERAND_GPR_SM_Y>;
+
+// Immediate operands are named according to the following scheme:
+// [t]: followed by
+// <description>: Descriptive code of the operand.
+// [v|s]: Vector, scalar, or don't care.
+// <imm>: Indicates an immediate.
+// [s]: Single slot. Can be 32-bit if omitted.
+// [i|f]: Integer or float.
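+//
+// For example, reading the defs below against this scheme: tmoffimmsi is a
+// single-slot integer immediate holding a memory offset, and tvimmf is a
+// vector-encoded 32-bit float immediate.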
+
+// Operand doesn't use an immediate slot. The immediate value is encoded as-is.
+def tplnimmi : TPUSlotImmediate<i32, OPERAND_IMM_PLAIN>,
+ ImmLeaf<i32, [{}]>;
+
+// Operand doesn't use an immediate slot and represents dreg.
+def tdreg : TPUSlotImmediate<i32, OPERAND_IMM_PLAIN>;
+
+// Single slot immediate following no specific encoding scheme.
+def timmsi : TPUSlotImmediate<i32, OPERAND_IMM_SINGLE>,
+ ImmLeaf<i32, [{
+ if (ST->hasVfcTensorCore() || ST->isSparseCore()) {
+ return isInt<20>(Imm);
+ }
+ return isInt<16>(Imm);
+ }]> {
+}
+
+// Single slot immediate for vector lane shift insert.
+def tvshlliimmsi : TPUSlotImmediate<i32, OPERAND_VECTOR_IMM32>,
+ ImmLeaf<i32, [{
+ return Imm >= 0 && Imm <= 8;
+ }]> {
+}
+
+// Single slot immediate for memory offset.
+def tmoffimmsi : TPUSlotImmediate<i32, OPERAND_MEMOFFSET>,
+ ImmLeaf<i32, [{
+ if (ST->hasVfcTensorCore() || ST->isSparseCore()) {
+ return isInt<20>(Imm);
+ }
+ return isInt<16>(Imm);
+ }]> {
+}
+
+// Single slot immediate for memory stride.
+def tstrdimmsi : TPUSlotImmediate<i32, OPERAND_MEMSTRIDE>,
+ ImmLeaf<i32, [{
+ if (ST->hasVfcTensorCore() || ST->isSparseCore()) {
+ return isInt<20>(Imm);
+ }
+ return isInt<16>(Imm);
+ }]> {
+}
+
+// Single slot immediate for sublane masks.
+def tsmskimmsi : TPUSlotImmediate<i32, OPERAND_SUBLANEMASK>,
+ ImmLeaf<i32, [{
+ if (ST->hasVfcTensorCore() || ST->isSparseCore()) {
+ return isInt<20>(Imm);
+ }
+ return isInt<16>(Imm);
+ }]> {
+}
+
+// Immediate with a max size of 32-bit supporting the encoding type for
+// scalar instructions (float variant).
+def tsimmf : TPUSlotImmediate<f32, OPERAND_SCALAR_IMM32> {
+ let ParserMatchClass = FpImmAsmOperand;
+}
+
+// Immediate with a max size of 32-bit supporting the encoding type for
+// vector instructions (float variant).
+def tvimmf : TPUSlotImmediate<f32, OPERAND_VECTOR_IMM32> {
+ let ParserMatchClass = FpImmAsmOperand;
+}
+
+// The tvimmf equivalent for bf16 scalars. The immediate isn't 32-bit, but this
+// fact is ignored since there is no real bf16 scalar support in hardware. The
+// type exists here only to be used for splats.
+def tvimmbf16 : TPUSlotImmediate<bf16, OPERAND_VECTOR_IMM32> {
+ let ParserMatchClass = FpImmAsmOperand;
+}
+
+// Immediate with a max size of 32-bit supporting the encoding type for
+// vector instructions (integer variant).
+def tvimmi : TPUSlotImmediate<i32, OPERAND_VECTOR_IMM32>,
+ ImmLeaf<i32, [{}]>;
+
+// Immediate with a max size of 32-bit supporting the encoding type for
+// scalar instructions (integer variant).
+def tsimmi : TPUSlotImmediate<i32, OPERAND_SCALAR_IMM32>,
+ ImmLeaf<i32, [{}]>;
+
+// Immediate specifically used for the sflags operand of stream instructions.
+// It's a single-slot 5-bit immediate.
+def tstrmsimmsi : TPUSlotImmediate<i32, OPERAND_SCALAR_IMM32>,
+ ImmLeaf<i32, [{
+ assert(ST->isSparseCore());
+ return isUInt<5>(Imm);
+ }]>;
+
+// Special stream immediate that only matches if zero.
+def tzstrmimmi : TPUSlotImmediate<i32, OPERAND_IMM_PLAIN>,
+ ImmLeaf<i32, [{
+ return Imm == 0;
+ }]>;
+
+// Single slot immediate with a max size of 16/20-bit used by sync instructions.
+// Doesn't support any embedded encoding.
+def tsyncimmsi : TPUSlotImmediate<i32, OPERAND_SYNC_IMM_SINGLE>,
+ ImmLeaf<i32, [{
+ if (ST->hasVfcTensorCore() || ST->isSparseCore()) {
+ return isInt<20>(Imm);
+ }
+ return isInt<16>(Imm);
+ }]>;
+
+// Immediate with a max size of 32-bit used by the VLD instruction for the
+// shuffle selector operand. Doesn't support any embedded encoding.
+def tvldimmi : TPUSlotImmediate<i32, OPERAND_TC_VLD_SHUFFLE_IMM32>,
+ ImmLeaf<i32, [{}]>;
+} // OperandNamespace = "TPUOp"
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUInstrInfo.cpp b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUInstrInfo.cpp
new file mode 100644
index 0000000..916673e
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUInstrInfo.cpp
@@ -0,0 +1,2533 @@
+//===------ TPUInstrInfo.cpp - TPU Instruction Information ----*- C++ -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the TPU implementation of the TargetInstrInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TPUInstrInfo.h"
+#include "MCTargetDesc/TPUMCTargetDesc.h"
+#include "TPU.h"
+#include "TPUAliasAnalysis.h"
+#include "TPUMachineFunctionInfo.h"
+#include "TPURegisterInfo.h"
+#include "TPUSchedule.h"
+#include "TPUSubtarget.h"
+#include "TPUTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/ScopedNoAliasAA.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+#define DEBUG_TYPE "tpu-instrinfo"
+
+#define GET_INSTRINFO_CTOR_DTOR
+#define GET_INSTRMAP_INFO
+#include "TPUGenInstrInfo.inc"
+
+static cl::opt<bool> AliasAnalysisForMIRTest(
+ "tpu-mir-aa-for-test", cl::init(false),
+ cl::desc("Runs address space based analysis on memory operands used for "
+ "MIR based tests."));
+
+static cl::opt<bool>
+ RdWrDregWellDefined("tpu-rd-wr-dreg-well-defined", cl::init(false),
+ cl::desc("Assumes that SCS only writes DREG, and TEC "
+ "only reads DREG for parameter passing."));
+
+TPUInstrInfo::TPUInstrInfo(const TPUSubtarget *ST, unsigned HwMode)
+ : TPUGenInstrInfo(
+ ST->isTPUABIEnabled() ? TPU::ADJCALLSTACKDOWN : TPU::CALLSEQ_START,
+ ST->isTPUABIEnabled() ? TPU::ADJCALLSTACKUP : TPU::CALLSEQ_END),
+ RegisterInfo(HwMode) {}
+
+ScheduleHazardRecognizer *
+TPUInstrInfo::CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
+ const ScheduleDAG *DAG) const {
+ TPUHazardRecognizer *TPUHR =
+ new TPUHazardRecognizer(STI->getInstrItineraryData(), DAG);
+ TPUHR->setCanHandleDelaySlots(false);
+ return TPUHR;
+}
+
+ScheduleHazardRecognizer *
+TPUInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
+ const ScheduleDAGMI *DAG) const {
+ TPUHazardRecognizer *TPUHR = new TPUHazardRecognizer(II, DAG);
+ TPUHR->setCanHandleDelaySlots(true);
+ return TPUHR;
+}
+
+bool TPUInstrInfo::requiresImmediateSlots(const MachineInstr &MI,
+ uint64_t &SMask, uint64_t &OMask) {
+ SMask = MI.getDesc().TSFlags & TSF_ImmediateRangeMask;
+ OMask = MI.getDesc().TSFlags & TSF_ImmediateOperandsMask;
+ if (SMask != 0) {
+ assert((OMask != 0) && "Slots without operands?");
+ SMask >>= TSF_ImmediateRangeOffset;
+ OMask >>= TSF_ImmediateOperandsOffset;
+ return true;
+ }
+ assert((OMask == 0) && "Operands without slots?");
+ SMask = 0;
+ OMask = 0;
+ // TODO(maggioni): This computes the slot mask and the operand mask
+ // for the new code path. In the new code-path the operand mask shouldn't
+ // be necessary, but to update the rest of the code we have to first change
+ // all the places using the old system with the operand mask and slot mask.
+ // This method could also be retired in the future when all the code is
+ // updated.
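+  // Illustrative example: with one OPERAND_MEMOFFSET operand (ImmMask
+  // 0b00111111) and one OPERAND_SCALAR_IMM32 operand (ImmMask 0b00001111),
+  // SMask ends up as 0b00111111 and OMask as 0b11 (both immediate operands).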
+ unsigned ImmOpIdx = 0;
+ for (unsigned OpIdx = 0; OpIdx < MI.getDesc().NumOperands; ++OpIdx) {
+ auto OperandType = MI.getDesc().OpInfo[OpIdx].OperandType;
+ // We don't produce such operands until post bundle packing.
+ assert(OperandType != MCOI::OPERAND_PCREL);
+ // Has to be in sync with the 'if' in BundleTracker.
+ if (!isTPUImmediate(OperandType) &&
+ (OperandType != MCOI::OPERAND_IMMEDIATE) &&
+ (OperandType != MCOI::OPERAND_UNKNOWN /*BR operand*/) &&
+ (OperandType != MCOI::OPERAND_PCREL /*BRrel operand*/)) {
+ continue;
+ }
+ auto OperandRecord =
+ getOperandTypeRecord(static_cast<TPUOp::OperandType>(OperandType));
+ if (!OperandRecord.has_value()) {
+ ImmOpIdx++;
+ continue;
+ }
+ SMask |= (*OperandRecord).ImmMask;
+ OMask |= 1U << ImmOpIdx++;
+ }
+ assert(((SMask == 0 || OMask != 0) && (OMask == 0 || SMask != 0)) &&
+ "Operand masks found but no slots indicated.");
+ return SMask != 0;
+}
+
+int TPUInstrInfo::getTransposeFifoUsage(const MachineInstr &MI,
+ int SequenceNum) {
+ assert(SequenceNum >= 0 && "Didn't properly handle sequence number");
+ const TPUSubtarget &ST = MI.getMF()->getSubtarget<TPUSubtarget>();
+ if (ST.hasPfcTensorCore() || ST.hasVfcTensorCore()) {
+ if (SequenceNum != 0)
+ return 0;
+ if (TPUInstrInfo::isPacked(MI.getDesc()))
+ return MI.getOperand(3).getImm() / 16;
+ return MI.getOperand(2).getImm() / 8;
+ }
+  // JFC/DFC have a quirk where the allocation actually happens at the 2nd
+  // instruction of the sequence, while on pufferfish it starts at the first
+  // one.
+ if (SequenceNum != 1)
+ return 0;
+ return 16;
+}
+
+void TPUInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Position,
+ const DebugLoc &DL,
+ MCRegister DestinationRegister,
+ MCRegister SourceRegister,
+ bool KillSource) const {
+ // Special-case a copy of $vN -> $vaggN. This can happen during tests as
+ // all elements of a struct return are assumed to be aggregates.
+ if (TPU::VAGGRegClass.contains(DestinationRegister) &&
+ TPU::VPRRegClass.contains(SourceRegister) &&
+ DestinationRegister - TPU::VAGG0 == SourceRegister - TPU::V0) {
+ return;
+ }
+
+ constexpr std::pair<const TargetRegisterClass *, unsigned>
+ RegClassesAndOpcodes[] = {
+ {&TPU::GPRRegClass, TPU::MOV},
+ {&TPU::PPRRegClass, TPU::PMOV},
+ {&TPU::VPRRegClass, TPU::VMOVr},
+ {&TPU::MPRRegClass, TPU::VMMOV},
+ {&TPU::CBRRegClass, TPU::COPY_CBR},
+ {&TPU::ERFPRRegClass, TPU::COPY_ERFPR},
+ {&TPU::V2SFPRRegClass, TPU::COPY_V2SFPR},
+ {&TPU::SFRFPRRegClass, TPU::COPY_SFRFPR},
+ {&TPU::MRFPR0RegClass, TPU::COPY_MRFPR0},
+ {&TPU::MRFPR1RegClass, TPU::COPY_MRFPR1},
+ {&TPU::MRFPR2RegClass, TPU::COPY_MRFPR2},
+ {&TPU::MRFPR3RegClass, TPU::COPY_MRFPR3},
+ {&TPU::TRFPR0RegClass, TPU::COPY_TRFPR0},
+ {&TPU::TRFPR1RegClass, TPU::COPY_TRFPR1},
+ {&TPU::TRFPR2RegClass, TPU::COPY_TRFPR2},
+ {&TPU::XRFPR0RegClass, TPU::COPY_XRFPR0},
+ {&TPU::XRFPR1RegClass, TPU::COPY_XRFPR1},
+ };
+ for (const auto &KV : RegClassesAndOpcodes) {
+ if (KV.first->contains(DestinationRegister, SourceRegister)) {
+ MachineInstrBuilder MIB(
+ BuildMI(MBB, Position, DL, get(KV.second), DestinationRegister)
+ .addReg(SourceRegister, getKillRegState(KillSource)));
+ // We could support predicates in these pseudo instructions, but we chose
+ // not to.
+ if (MIB->isPredicable())
+ AddDefaultPred(MIB);
+ return;
+ }
+ }
+ llvm_unreachable("Impossible reg-to-reg copy");
+}
+
+bool TPUInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ auto SelLowering = [&](unsigned Opcode, bool UseFalseSrc) {
+ // tied $d PSEUDO_SELrr = $pred, tied $a, $b -> $d = MOV $b, $pred, 1
+ //
+ // Note that in this and the other variants we mark the tied register as
+ // implicit use to stop the compiler thinking the incoming value of the tied
+ // register is killed.
+ unsigned SourceIdx = UseFalseSrc ? 2 : 3;
+ assert(TPU::PPRRegClass.contains(MI.getOperand(1).getReg()));
+ // Pseudo SEL instructions don't have a predicate operand. Predication is
+ // done through predicate manipulation, and the instruction itself is not
+ // predicated.
+ assert(!isPredicated(MI));
+ auto I = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+ get(Opcode), MI.getOperand(0).getReg());
+ I.add(MI.getOperand(SourceIdx));
+ I.addReg(MI.getOperand(1).getReg())
+ .addImm(UseFalseSrc ? 0 : 1)
+ .addReg(MI.getOperand(0).getReg(), getImplRegState(true));
+ MI.eraseFromParent();
+ return true;
+ };
+
+ auto BcSelLowering = [&](unsigned Opcode) {
+ // tied $d bcVSELrr = $pred, tied $a, $b -> $d = MOV $b, $pred, 1
+ //
+ // Note that in this and the other variants we mark the tied register as
+ // implicit use to stop the compiler thinking the incoming value of the tied
+ // register is killed.
+ unsigned SourceIdx = 3;
+ auto I = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+ get(Opcode), MI.getOperand(0).getReg());
+ I.add(MI.getOperand(SourceIdx));
+
+ TPUPredicate CurrentP(MI);
+ TPUPredicate P =
+ TPUPredicate::fromRawBcEncoding(MI.getOperand(1).getImm())
+ .toggleInvert()
+ .setBarnaCorePipelineStage(CurrentP.getBarnaCorePipelineStage());
+ I.addReg(P.getReg());
+ I.addImm(P.getImm());
+ MI.eraseFromParent();
+ return true;
+ };
+
+ auto CvmnegLowering = [&]() {
+ // Lowering CVMNEG to an actual VMNEG real instruction.
+ BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+ get(TPU::VMNEG), MI.getOperand(0).getReg())
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
+ .addImm(0);
+ MI.eraseFromParent();
+ return true;
+ };
+
+ auto InitStackLowering = [&]() {
+ if (MI.getMF()->getSubtarget<TPUSubtarget>().hasVPU()) {
+ AddDefaultPred(BuildMI(*MI.getParent(), MI.getIterator(),
+ MI.getDebugLoc(), get(TPU::IMM), TPU::SPV)
+ .addImm(MI.getOperand(0).getImm()));
+ }
+ AddDefaultPred(BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+ get(TPU::IMM), TPU::SPS)
+ .addImm(MI.getOperand(1).getImm()));
+ MI.eraseFromParent();
+ return true;
+ };
+
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ case TPU::PSEUDO_SELrr:
+ return SelLowering(TPU::MOV, false);
+ case TPU::PSEUDO_SELri:
+ return SelLowering(TPU::IMM, false);
+ case TPU::PSEUDO_SELir:
+ return SelLowering(TPU::IMM, true);
+ case TPU::PSEUDO_PSELrr:
+ return SelLowering(TPU::PMOV, false);
+ case TPU::PSEUDO_MSELrr:
+ return SelLowering(TPU::VMMOV, false);
+ case TPU::PSEUDO_CSELrr:
+    // This case is special because COPY_CBR is itself a pseudo instruction
+    // that won't be expanded into real instructions until the second run of
+    // this pseudo expansion pass. First, this lets us reuse the COPY_CBR
+    // lowering. Second, and more importantly, we can't create final code here
+    // yet, because COPY_CBR requires register scavenging, and the block has to
+    // be stable first.
+ return SelLowering(TPU::COPY_CBR, false);
+ case TPU::PSEUDO_VSELrr:
+ return SelLowering(TPU::VMOVr, false);
+ case TPU::PSEUDO_VSELri:
+ return SelLowering(TPU::VIMMI, false);
+ case TPU::PSEUDO_VSELrif:
+ return SelLowering(TPU::VIMMF, false);
+ case TPU::PSEUDO_VSELir:
+ return SelLowering(TPU::VIMMI, true);
+ case TPU::PSEUDO_VSELirf:
+ return SelLowering(TPU::VIMMF, true);
+ case TPU::bcVSELrr:
+ return BcSelLowering(TPU::VMOVr);
+ case TPU::bcVSELrf:
+ return BcSelLowering(TPU::VIMMF);
+ case TPU::CVMNEG:
+ return CvmnegLowering();
+ case TPU::INIT_STACK:
+ return InitStackLowering();
+ }
+ if (MI.getOpcode() == TPU::VMZERO) {
+ unsigned Reg = MI.getOperand(0).getReg();
+ AddDefaultPred(BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+ get(TPU::VMXOR), Reg)
+ .addReg(Reg)
+ .addReg(Reg));
+ MI.eraseFromParent();
+ return true;
+ }
+ if (MI.getOpcode() == TPU::VMLANEi || MI.getOpcode() == TPU::VMLANEr) {
+ unsigned Reg = MI.getOperand(0).getReg();
+ unsigned VseqReg = MI.getOperand(1).getReg();
+ unsigned PredReg = MI.getOperand(3).getReg();
+ unsigned PredInvert = MI.getOperand(4).getImm();
+
+ bool ConstantIdx = MI.getOpcode() == TPU::VMLANEi;
+ auto MIB = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+ get(ConstantIdx ? TPU::VCMPEQri : TPU::VCMPEQrr), Reg)
+ .addReg(VseqReg, getKillRegState(true));
+ if (ConstantIdx)
+ MIB.addImm(MI.getOperand(2).getImm());
+ else
+ MIB.addReg(MI.getOperand(2).getReg());
+ MIB.addReg(PredReg);
+ MIB.addImm(PredInvert);
+
+ MI.eraseFromParent();
+ return true;
+ }
+ // Legalize SparseCore pseudo side-effect moves.
+ const TargetSubtargetInfo *STI = &MI.getMF()->getSubtarget();
+ const TargetInstrInfo *TII = STI->getInstrInfo();
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ case TPU::MOV_SE:
+ MI.setDesc(TII->get(TPU::MOV));
+ return true;
+ case TPU::VMOV_SEr:
+ MI.setDesc(TII->get(TPU::VMOVr));
+ return true;
+ case TPU::VMMOV_SE:
+ MI.setDesc(TII->get(TPU::VMMOV));
+ return true;
+ case TPU::PMOV_SE:
+ MI.setDesc(TII->get(TPU::PMOV));
+ return true;
+ }
+ if (MI.getOpcode() == TPU::COPY_CBR) {
+ // We're expanding pseudo cbr copies in the second run of postRA pseudo
+ // instruction expansion pass, once we can analyze the finished block to
+ // scavenge a register.
+ // FIXME(b/239705210): Spilling during scavenging to emergency location
+ // currently unsupported.
+ RegScavenger RS;
+ RS.enterBasicBlockEnd(*MI.getParent());
+ RS.backward(MI);
+ RS.backward();
+ unsigned Sreg = RS.FindUnusedReg(&TPU::GPRRegClass);
+ assert(Sreg != 0);
+ Register FromReg = MI.getOperand(1).getReg();
+ Register ToReg = MI.getOperand(0).getReg();
+ // COPY_CBR can be predicated, because it can be the result of a pseudo
+ // select.
+ TPUPredicate Pred(MI);
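+    // The copy is done piece by piece: each scRDCBREG/scWRCBREG pair below
+    // moves one piece (index 0, 1, 2) of the circular buffer register state
+    // from FromReg to ToReg through the scavenged scalar register, under the
+    // original predicate.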
+ BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+ get(TPU::scRDCBREG), Sreg)
+ .addReg(FromReg)
+ .addImm(0)
+ .addReg(Pred.getReg())
+ .addImm(Pred.getImm());
+ BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+ get(TPU::scWRCBREGr), ToReg)
+ .addReg(Sreg)
+ .addReg(ToReg)
+ .addImm(0)
+ .addReg(Pred.getReg())
+ .addImm(Pred.getImm());
+ BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+ get(TPU::scRDCBREG), Sreg)
+ .addReg(FromReg)
+ .addImm(1)
+ .addReg(Pred.getReg())
+ .addImm(Pred.getImm());
+ BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+ get(TPU::scWRCBREGr), ToReg)
+ .addReg(Sreg)
+ .addReg(ToReg)
+ .addImm(1)
+ .addReg(Pred.getReg())
+ .addImm(Pred.getImm());
+ BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+ get(TPU::scRDCBREG), Sreg)
+ .addReg(FromReg)
+ .addImm(2)
+ .addReg(Pred.getReg())
+ .addImm(Pred.getImm());
+ BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+ get(TPU::scWRCBREGr), ToReg)
+ .addReg(Sreg)
+ .addReg(ToReg)
+ .addImm(2)
+ .addReg(Pred.getReg())
+ .addImm(Pred.getImm());
+ MI.eraseFromParent();
+ return true;
+ }
+ // Legalize NP opcodes.
+ switch (MI.getOpcode()) {
+ default:
+ // Check for NP opcodes that should be handled below.
+ assert(!TPUInstrInfo::isNoParallel(MI));
+ break;
+ case TPU::scVLD_CB_IDX_MSK_NP:
+ MI.setDesc(TII->get(TPU::scVLD_CB_IDX_MSK));
+ return true;
+ case TPU::scVLD_IDX_MSK_NP:
+ MI.setDesc(TII->get(TPU::scVLD_IDX_MSK));
+ return true;
+ case TPU::scVLD_IDX_MSK_STRIDEi_NP:
+ MI.setDesc(TII->get(TPU::scVLD_IDX_MSK_STRIDEi));
+ return true;
+ case TPU::scVLD_IDX_MSK_STRIDEr_NP:
+ MI.setDesc(TII->get(TPU::scVLD_IDX_MSK_STRIDEr));
+ return true;
+ case TPU::scVST_CB_IDX_MSK_ADDF_NP:
+ MI.setDesc(TII->get(TPU::scVST_CB_IDX_MSK_ADDF));
+ return true;
+ case TPU::scVST_CB_IDX_MSK_ADD_NP:
+ MI.setDesc(TII->get(TPU::scVST_CB_IDX_MSK_ADD));
+ return true;
+ case TPU::scVST_CB_IDX_MSK_NP:
+ MI.setDesc(TII->get(TPU::scVST_CB_IDX_MSK));
+ return true;
+ case TPU::scVST_IDX_MSK_ADDF_NP:
+ MI.setDesc(TII->get(TPU::scVST_IDX_MSK_ADDF));
+ return true;
+ case TPU::scVST_IDX_MSK_ADD_NP:
+ MI.setDesc(TII->get(TPU::scVST_IDX_MSK_ADD));
+ return true;
+ case TPU::scVST_IDX_MSK_RET_ADD_NP:
+ MI.setDesc(TII->get(TPU::scVST_IDX_MSK_RET_ADD));
+ return true;
+ case TPU::scVST_IDX_MSK_RET_ADDF_NP:
+ MI.setDesc(TII->get(TPU::scVST_IDX_MSK_RET_ADDF));
+ return true;
+ case TPU::scVST_IDX_MSK_NP:
+ MI.setDesc(TII->get(TPU::scVST_IDX_MSK));
+ return true;
+ case TPU::scVST_IDX_MSK_STRIDEi_NP:
+ MI.setDesc(TII->get(TPU::scVST_IDX_MSK_STRIDEi));
+ return true;
+ case TPU::scVST_IDX_MSK_STRIDEr_NP:
+ MI.setDesc(TII->get(TPU::scVST_IDX_MSK_STRIDEr));
+ return true;
+ case TPU::scVST_IDX_MSK_ADD_S16_NP:
+ MI.setDesc(TII->get(TPU::scVST_IDX_MSK_ADD_S16));
+ return true;
+ case TPU::scVST_IDX_MSK_ADD_BF16_NP:
+ MI.setDesc(TII->get(TPU::scVST_IDX_MSK_ADD_BF16));
+ return true;
+ case TPU::scVST_CB_IDX_MSK_ADD_S16_NP:
+ MI.setDesc(TII->get(TPU::scVST_CB_IDX_MSK_ADD_S16));
+ return true;
+ case TPU::scVST_CB_IDX_MSK_ADD_BF16_NP:
+ MI.setDesc(TII->get(TPU::scVST_CB_IDX_MSK_ADD_BF16));
+ return true;
+ case TPU::scVST_IDX_MSK_RET_ADD_S16_NP:
+ MI.setDesc(TII->get(TPU::scVST_IDX_MSK_RET_ADD_S16));
+ return true;
+ case TPU::scVST_IDX_MSK_RET_ADD_BF16_NP:
+ MI.setDesc(TII->get(TPU::scVST_IDX_MSK_RET_ADD_BF16));
+ return true;
+ }
+ return false;
+}
+
+std::pair<unsigned, unsigned>
+TPUInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
+ return std::make_pair(TF, 0u);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+TPUInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
+ return {};
+}
+
+bool TPUInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
+ Register &SrcReg2, int64_t &CmpMask,
+ int64_t &CmpValue) const {
+ return false;
+}
+
+bool TPUInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
+ Register SrcReg2, int64_t /*CmpMask*/,
+ int64_t CmpValue,
+ const MachineRegisterInfo *MRI) const {
+ return false;
+}
+
+bool TPUInstrInfo::analyzeSelect(const MachineInstr &MI,
+ SmallVectorImpl<MachineOperand> &Cond,
+ unsigned &TrueOp, unsigned &FalseOp,
+ bool &Optimizable) const {
+ assert(MI.isSelect());
+ auto IsOpImm = [&MI](unsigned Op) {
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ MachineInstr *OpMI = MRI.getUniqueVRegDef(MI.getOperand(Op).getReg());
+ // Could support globals, chose not to.
+ return OpMI->isMoveImmediate() && !OpMI->getOperand(1).isGlobal();
+ };
+ switch (MI.getOpcode()) {
+ case TPU::PSEUDO_SELrr:
+ case TPU::PSEUDO_VSELrr:
+ case TPU::VPSELrr:
+ case TPU::VPSELrs:
+ case TPU::VPSELsr: {
+ Cond.push_back(MI.getOperand(1));
+ Cond.push_back(MachineOperand::CreateImm(0));
+ TrueOp = 2;
+ FalseOp = 3;
+ if (IsOpImm(TrueOp) || IsOpImm(FalseOp)) {
+ Optimizable = true;
+ return false;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ return true;
+}
+
+MachineInstr *
+TPUInstrInfo::optimizeSelect(MachineInstr &MI,
+ SmallPtrSetImpl<MachineInstr *> &SeenMIs,
+ bool /*PreferFalse*/) const {
+ assert(MI.isSelect());
+ auto IsOpImm = [&MI](unsigned Op, int64_t &Imm, const ConstantFP *&FPImm,
+ bool &IsFPImm) {
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ MachineInstr *OpMI = MRI.getUniqueVRegDef(MI.getOperand(Op).getReg());
+ if (OpMI->isMoveImmediate()) {
+ IsFPImm = OpMI->getOperand(1).isFPImm();
+ if (IsFPImm)
+ FPImm = OpMI->getOperand(1).getFPImm();
+ else
+ Imm = OpMI->getOperand(1).getImm();
+ return true;
+ }
+ return false;
+ };
+ auto GetOptOpcode = [](unsigned Opcode, bool ImmediateInTrue, bool IsFPImm) {
+ switch (Opcode) {
+ case TPU::VPSELrr:
+ case TPU::VPSELrs:
+ case TPU::VPSELsr:
+ return ImmediateInTrue ? TPU::VPSELir : TPU::VPSELri;
+ case TPU::PSEUDO_SELrr:
+ return ImmediateInTrue ? TPU::PSEUDO_SELir : TPU::PSEUDO_SELri;
+ case TPU::PSEUDO_VSELrr:
+ if (ImmediateInTrue) {
+ return IsFPImm ? TPU::PSEUDO_VSELirf : TPU::PSEUDO_VSELir;
+ } else {
+ return IsFPImm ? TPU::PSEUDO_VSELrif : TPU::PSEUDO_VSELri;
+ }
+ default:
+ llvm_unreachable("Unsupported opcode.");
+ return TPU::SNOP;
+ }
+ };
+ bool IsVpSel = false;
+ switch (MI.getOpcode()) {
+ case TPU::VPSELrr:
+ case TPU::VPSELrs:
+ case TPU::VPSELsr:
+ IsVpSel = true;
+ LLVM_FALLTHROUGH;
+ case TPU::PSEUDO_SELrr:
+ case TPU::PSEUDO_VSELrr: {
+ unsigned TrueOp = 2;
+ unsigned FalseOp = 3;
+ int64_t Imm;
+ const ConstantFP *FPImm;
+ bool IsFPImm;
+ if (IsOpImm(TrueOp, Imm, FPImm, IsFPImm)) {
+ auto MIB = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+ get(GetOptOpcode(MI.getOpcode(),
+ /*ImmediateInTrue=*/true, IsFPImm)),
+ MI.getOperand(0).getReg())
+ .addReg(MI.getOperand(1).getReg());
+ if (IsVpSel && IsFPImm) {
+ // We don't have a float version of vpsel.
+ APInt I = FPImm->getValue().bitcastToAPInt();
+ Imm = I.getZExtValue();
+ IsFPImm = false;
+ }
+ if (IsFPImm) {
+ MIB.addFPImm(FPImm);
+ } else {
+ MIB.addImm(Imm);
+ }
+ MIB.addReg(MI.getOperand(FalseOp).getReg());
+ return MIB;
+ }
+ if (IsOpImm(FalseOp, Imm, FPImm, IsFPImm)) {
+ auto MIB = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+ get(GetOptOpcode(MI.getOpcode(),
+ /*ImmediateInTrue=*/false, IsFPImm)),
+ MI.getOperand(0).getReg())
+ .addReg(MI.getOperand(1).getReg())
+ .addReg(MI.getOperand(TrueOp).getReg());
+ if (IsVpSel && IsFPImm) {
+ // We don't have a float version of vpsel.
+ APInt I = FPImm->getValue().bitcastToAPInt();
+ Imm = I.getZExtValue();
+ IsFPImm = false;
+ }
+ if (IsFPImm)
+ MIB.addFPImm(FPImm);
+ else
+ MIB.addImm(Imm);
+ return MIB;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ llvm_unreachable("Attempting to optimize unsupported select.");
+ return nullptr;
+}
+
+// The analyzeBranch function is used to examine conditional instructions and
+// remove unnecessary instructions. This method is used by BranchFolder and
+// IfConverter machine function passes to improve the CFG.
+// - TrueBlock is set to the destination if condition evaluates true (it is the
+// nullptr if the destination is the fall-through branch);
+// - FalseBlock is set to the destination if condition evaluates to false (it
+// is the nullptr if the branch is unconditional);
+// - condition is populated with machine operands needed to generate the branch
+// to insert in insertBranch;
+// Returns: false if branch could successfully be analyzed.
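+// The analyzable shapes are: no branch (fallthrough), a single unconditional
+// BR, a single conditional BRcond with fallthrough, and a conditional BRcond
+// followed by an unconditional BR.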
+bool TPUInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TrueBlock,
+ MachineBasicBlock *&FalseBlock,
+ SmallVectorImpl<MachineOperand> &Condition,
+ bool AllowModify) const {
+ if (!MBB.empty() && MBB.back().isReturn())
+ return true;
+ SmallVector<MachineInstr *, 4> Branches;
+ for (MachineInstr &MI : MBB) {
+ if (MachineInstr *Br = getFirstFromBundle(
+ &MI, {TPU::BR, TPU::BRClr, TPU::BRcond, TPU::BRcondClr}))
+ Branches.push_back(Br);
+ if (getFirstFromBundle(&MI, {TPU::HALT, TPU::bcHALT}) != nullptr)
+ // Block may not exit - treat as unanalyzable.
+ return true;
+ if (getFirstFromBundle(&MI, {TPU::bcBR, TPU::bcLOOP_END}) != nullptr)
+ // FIXME: Barnacore branches not yet handled.
+ return true;
+ if (getFirstFromBundle(&MI, {TPU::BR, TPU::BRClr, TPU::BRcond,
+ TPU::BRcondClr}) != nullptr) {
+ if (!MI.getOperand(0).isMBB())
+ // Treat absolute branches as unanalyzable.
+ return true;
+ }
+ }
+ if (Branches.empty()) {
+ // Unconditional fallthrough.
+ TrueBlock = nullptr;
+ return false;
+ }
+ if (Branches.size() > 2)
+ // More than two exits; unanalyzable.
+ return true;
+
+ // There are either 1 or 2 branches at this point. If we have two branches,
+ // the predicated one must be first.
+ TPUPredicate Pred(Branches.front());
+ if (Branches.size() == 1 && Pred.getReg() == TPU::Palways) {
+ // Unconditional non-fallthrough.
+ TrueBlock = Branches.front()->getOperand(0).getMBB();
+ FalseBlock = nullptr;
+ return false;
+ }
+
+ Condition.push_back(MachineOperand::CreateReg(Pred.getReg(), /*IsDef=*/false));
+ Condition.push_back(MachineOperand::CreateImm(Pred.getImm()));
+ if (Branches.size() == 1) {
+ // Conditional fallthrough.
+ TrueBlock = Branches.front()->getOperand(0).getMBB();
+ FalseBlock = nullptr;
+ return false;
+ }
+
+ if (TPUPredicate(Branches.back()).getReg() != TPU::Palways)
+ // Two conditional branches - unanalyzable (because there are three
+ // successors).
+ return true;
+
+ // Conditional branch and unconditional branch.
+ TrueBlock = Branches.front()->getOperand(0).getMBB();
+ FalseBlock = Branches.back()->getOperand(0).getMBB();
+ return false;
+}
+
+// reverseBranchCondition - Reverses the branch condition of the specified
+// condition list, returning false on success and true if it cannot be
+// reversed.
+bool TPUInstrInfo::reverseBranchCondition(
+ SmallVectorImpl<llvm::MachineOperand> &Condition) const {
+ Condition[1].ChangeToImmediate(Condition[1].getImm() ^ 1);
+ return false;
+}
+
+// Insert the branch with condition specified in condition and given targets
+// (TrueBlock and FalseBlock). This function returns the number of machine
+// instructions inserted.
+unsigned TPUInstrInfo::insertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TrueBlock,
+ MachineBasicBlock *FalseBlock,
+ ArrayRef<MachineOperand> Condition,
+ const DebugLoc &DL, int *BytesAdded) const {
+ LLVM_DEBUG(dbgs() << "insertBranch (true %bb." << TrueBlock->getNumber()
+ << "), (false %bb."
+ << (FalseBlock ? FalseBlock->getNumber() : -1) << ") "
+ << MBB);
+ if (Condition.empty()) {
+ // Unconditional branch.
+ assert(!FalseBlock);
+ BuildMI(&MBB, DL, get(TPU::BR)).addMBB(TrueBlock);
+ return 1;
+ }
+
+ // Conditional branch.
+ BuildMI(&MBB, DL, get(TPU::BRcond))
+ .addMBB(TrueBlock)
+ .addReg(Condition[0].getReg(), getKillRegState(true))
+ .add(Condition[1]);
+ if (!FalseBlock)
+ // Just fallthrough.
+ return 1;
+ BuildMI(&MBB, DL, get(TPU::BR)).addMBB(FalseBlock);
+ return 2;
+}
+
+unsigned TPUInstrInfo::removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved) const {
+ LLVM_DEBUG(dbgs() << "RemoveBranch: " << MBB);
+ unsigned NumRemoved = 0;
+ while (!MBB.empty() && isBR(&MBB.back())) {
+ MBB.back().eraseFromParent();
+ NumRemoved++;
+ }
+ return NumRemoved;
+}
+
+bool TPUInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
+ Register Reg, MachineRegisterInfo *MRI) const {
+ return TargetInstrInfo::FoldImmediate(UseMI, DefMI, Reg, MRI);
+}
+
+bool TPUInstrInfo::isLoopMetadata(MachineBasicBlock &MBB, StringRef MS) {
+ const BasicBlock *OrigBB = MBB.getBasicBlock();
+ const Instruction *TI = nullptr;
+ MDNode *LoopID = nullptr;
+ if (OrigBB != nullptr)
+ TI = OrigBB->getTerminator();
+ if (TI != nullptr)
+ LoopID = TI->getMetadata(LLVMContext::MD_loop);
+ if (LoopID != nullptr) {
+ for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
+ MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+ if (MD == nullptr)
+ continue;
+ MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+ if (S == nullptr)
+ continue;
+ if (S->getString() == MS)
+ return true;
+ }
+ }
+ return false;
+}
+
+bool TPUInstrInfo::isLoopParallel(MachineBasicBlock &MBB) {
+ return TPUInstrInfo::isLoopMetadata(MBB, "llvm.loop.parallel_accesses");
+}
+
+bool TPUInstrInfo::isLoopPipelineDisabled(MachineBasicBlock &MBB) {
+ return TPUInstrInfo::isLoopMetadata(MBB, "llvm.loop.pipeline.disable");
+}
+
+bool TPUInstrInfo::updateTerminator(MachineBasicBlock &MBB,
+ MachineBasicBlock *FromMBB,
+ MachineBasicBlock *ToMBB) {
+ if (MBB.empty())
+ return false;
+ bool Changed = false;
+ if (MBB.instr_begin()->isBundle()) {
+ // Post bundle-packing.
+ for (auto BundleIt =
+ MachineBasicBlock::iterator::getAtBundleBegin(MBB.instr_begin());
+ BundleIt != MBB.instr_end(); BundleIt++) {
+ auto It = BundleIt.getInstrIterator();
+ for (auto MII = getBundleStart(It); MII != getBundleEnd(It); MII++) {
+ if (MII->isBundle() || !MII->isTerminator())
+ continue;
+ for (MachineOperand &MO : MII->operands()) {
+ if (!MO.isMBB())
+ continue;
+ if (MO.getMBB() != FromMBB)
+ continue;
+ MO.setMBB(ToMBB);
+ Changed = true;
+ }
+ }
+ }
+ } else {
+ // Pre bundle-packing.
+ for (auto &TermInstr : MBB.terminators()) {
+ for (MachineOperand &MO : TermInstr.operands()) {
+ if (!MO.isMBB())
+ continue;
+ if (MO.getMBB() != FromMBB)
+ continue;
+ MO.setMBB(ToMBB);
+ Changed = true;
+ }
+ }
+ }
+ return Changed;
+}
+
+bool TPUInstrInfo::analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
+ MachineInstr *&CmpInst) const {
+ IndVarInst = nullptr;
+ CmpInst = nullptr;
+ MachineBasicBlock *LatchBlock = L.findLoopControlBlock();
+ if (!LatchBlock)
+ return true;
+ MachineRegisterInfo &MRI = LatchBlock->getParent()->getRegInfo();
+ if (!MRI.isSSA())
+ return true;
+ for (auto TermInstrIt = LatchBlock->getFirstInstrTerminator();
+ TermInstrIt != LatchBlock->instr_end(); TermInstrIt++) {
+ MachineInstr *LoopTerm = &*TermInstrIt;
+    // We assume the loop's backbranch is in the canonical form of a BRcond
+    // followed by a BR.
+ if (LoopTerm->getOpcode() != TPU::BRcond)
+ continue;
+ unsigned PredReg = LoopTerm->getOperand(1).getReg();
+ if (PredReg == TPU::Palways)
+ return true;
+ CmpInst = MRI.getUniqueVRegDef(PredReg);
+ if (!CmpInst || !CmpInst->isCompare())
+ return true;
+    // We assume that the compare instruction is in canonical form, with the
+    // post-increment loop induction variable as the first operand.
+ //
+ // Enforce that the compare's operand is an ADDri of a PHI, and that PHI
+ // uses the ADDri. This pattern covers both upcounting and downcounting
+ // loops (but not loops that use pointers as their induction variable).
+ MachineInstr *IncInst =
+ MRI.getUniqueVRegDef(CmpInst->getOperand(1).getReg());
+ if (!IncInst || IncInst->getOpcode() != TPU::ADDri)
+ return true;
+ MachineInstr *LoopPhi =
+ MRI.getUniqueVRegDef(IncInst->getOperand(1).getReg());
+ if (!LoopPhi || !LoopPhi->isPHI() ||
+ !LoopPhi->readsVirtualRegister(IncInst->getOperand(0).getReg()))
+ return true;
+
+ IndVarInst = LoopPhi;
+ return false;
+ }
+ return true;
+}
+
+unsigned TPUInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ return 0;
+}
+
+unsigned TPUInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
+ int &FrameIndex) const {
+ return 0;
+}
+
+unsigned TPUInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ return 0;
+}
+
+bool TPUInstrInfo::getMemOperandWithOffsetWidth(
+ MachineInstr &LdSt, MachineOperand *&BaseOp, int64_t &Offset,
+ unsigned &Width, const TargetRegisterInfo * /*TRI*/) const {
+ assert(0 && "Implement");
+ return false;
+}
+
+bool TPUInstrInfo::getMemOperandsWithOffsetWidth(
+ const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
+ int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+ const TargetRegisterInfo *TRI) const {
+ return false;
+}
+
+namespace {
+
+class TPUTensorCorePipelinerLoopInfo : public TPUPipelinerLoopInfo {
+ unsigned CmpOpcode;
+ MachineInstr *Cmp, *IVUpdate;
+ bool ReverseCond = false;
+ int64_t TripCount = -1;
+
+public:
+ TPUTensorCorePipelinerLoopInfo(MachineInstr *Cmp, MachineInstr *IVUpdate,
+ bool ReverseCond);
+ TPUTensorCorePipelinerLoopInfo() : Cmp(nullptr), IVUpdate(nullptr) {}
+ bool shouldIgnoreForPipelining(const MachineInstr *MI) const override;
+  // Obsolete and buggy, don't use. FIXME(hgreving): fix upstream.
+ std::optional<bool> createTripCountGreaterCondition(
+ int TC, MachineBasicBlock &MBB,
+ SmallVectorImpl<MachineOperand> &Cond) override;
+ std::optional<bool>
+ createTripCountGreaterCondition(int TC, MachineBasicBlock &MBB,
+ MachineInstr *Cmp,
+ SmallVectorImpl<MachineOperand> &Cond);
+ MachineInstr *getIVUpdate() const override { return IVUpdate; }
+ MachineInstr *getCmp() const override { return Cmp; }
+ bool isReverseCond() const override { return ReverseCond; }
+ int64_t getTripCount() const override { return TripCount; }
+ void setPreheader(MachineBasicBlock *NewPreheader) override {}
+ void adjustTripCount(int TripCountAdjust) override {}
+ void disposed() override {}
+};
+
+class TPUBarnaCorePipelinerLoopInfo : public TPUPipelinerLoopInfo {
+public:
+ TPUBarnaCorePipelinerLoopInfo() = default;
+ bool shouldIgnoreForPipelining(const MachineInstr *MI) const override;
+  // Obsolete and buggy, don't use. FIXME(hgreving): fix upstream.
+ std::optional<bool> createTripCountGreaterCondition(
+ int TC, MachineBasicBlock &MBB,
+ SmallVectorImpl<MachineOperand> &Cond) override;
+ std::optional<bool>
+ createTripCountGreaterCondition(int TC, MachineBasicBlock &MBB,
+ MachineInstr *Cmp,
+ SmallVectorImpl<MachineOperand> &Cond);
+ MachineInstr *getIVUpdate() const override { return nullptr; }
+ MachineInstr *getCmp() const override { return nullptr; }
+ bool isReverseCond() const override { return false; }
+ int64_t getTripCount() const override { return -1; }
+ void setPreheader(MachineBasicBlock *NewPreheader) override {}
+ void adjustTripCount(int TripCountAdjust) override {}
+ void disposed() override {}
+};
+
+TPUTensorCorePipelinerLoopInfo::TPUTensorCorePipelinerLoopInfo(
+ MachineInstr *Cmp, MachineInstr *IVUpdate, bool ReverseCond)
+ : CmpOpcode(Cmp->getOpcode()), Cmp(Cmp), IVUpdate(IVUpdate),
+ ReverseCond(ReverseCond) {
+ auto &MRI = Cmp->getMF()->getRegInfo();
+ if (!MRI.isSSA() || !IVUpdate->getOperand(2).isImm())
+ return;
+ int64_t Inc = IVUpdate->getOperand(2).getImm();
+ MachineInstr *LoopPhi =
+ MRI.getUniqueVRegDef(IVUpdate->getOperand(1).getReg());
+ if (LoopPhi->getNumOperands() != 5)
+ return;
+ int OriginalValOperand = 1;
+ if (LoopPhi->getOperand(2).getMBB() == LoopPhi->getParent())
+ OriginalValOperand = 3;
+ MachineInstr *SetImm =
+ MRI.getUniqueVRegDef(LoopPhi->getOperand(OriginalValOperand).getReg());
+ if (SetImm->getOpcode() != TPU::IMM || !Cmp->getOperand(2).isImm())
+ return;
+ int64_t OriginalValue = SetImm->getOperand(1).getImm();
+ MachineInstr *CmpTest = MRI.getUniqueVRegDef(Cmp->getOperand(1).getReg());
+ assert(CmpTest == LoopPhi || CmpTest == IVUpdate ||
+ CmpTest->getOpcode() == TPU::SRLri);
+ int64_t EndValue = Cmp->getOperand(2).getImm();
+ if (CmpTest->getOpcode() == TPU::SRLri)
+ EndValue <<= CmpTest->getOperand(2).getImm();
+ unsigned CmpOpCode = Cmp->getOpcode();
+ if (ReverseCond) {
+ switch (CmpOpCode) {
+ case TPU::CMPEQri:
+ CmpOpCode = TPU::CMPNEri;
+ break;
+ case TPU::CMPGEri:
+ CmpOpCode = TPU::CMPLTri;
+ break;
+ case TPU::CMPLEri:
+ CmpOpCode = TPU::CMPGTri;
+ break;
+ case TPU::CMPUGEri:
+ CmpOpCode = TPU::CMPULTri;
+ break;
+ case TPU::CMPULEri:
+ CmpOpCode = TPU::CMPUGTri;
+ break;
+ default:
+ return;
+ }
+ }
+ switch (CmpOpCode) {
+ case TPU::CMPNEri:
+ if ((EndValue - OriginalValue) % Inc != 0)
+ return;
+ break;
+ case TPU::CMPLTri:
+ if (Inc < 0)
+ return;
+ break;
+ case TPU::CMPGTri:
+ if (Inc > 0)
+ return;
+ break;
+ case TPU::CMPULTri:
+ if (Inc < 0)
+ return;
+    // Make sure we don't wrap around; wrapping is well-defined for unsigned
+    // values, so it could legitimately happen.
+ if (OriginalValue + Inc < 0 || EndValue < 0)
+ return;
+ break;
+ case TPU::CMPUGTri:
+ if (Inc > 0)
+ return;
+    // Make sure we don't wrap around; wrapping is well-defined for unsigned
+    // values, so it could legitimately happen.
+ if (OriginalValue + Inc < 0 || EndValue < 0)
+ return;
+ break;
+ default:
+ return;
+ }
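+  // For example, a loop counting i = 0, 4, 8, ... with "i < 64" as the
+  // continue condition gives OriginalValue = 0, EndValue = 64, Inc = 4, and
+  // hence a trip count of 16.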
+ TripCount = (EndValue - OriginalValue) / Inc;
+}
+
+bool TPUTensorCorePipelinerLoopInfo::shouldIgnoreForPipelining(
+ const MachineInstr *MI) const {
+ return MI->isTerminator() || MI == Cmp || MI == IVUpdate;
+}
+
+// Obsolete, this function is used by stitching, but may fail to find the right
+// loop test.
+std::optional<bool>
+TPUTensorCorePipelinerLoopInfo::createTripCountGreaterCondition(
+ int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond) {
+ if (TripCount != -1)
+ return TripCount > TC;
+ auto I = MBB.instr_back().getIterator();
+ while (I->getOpcode() != CmpOpcode && I != MBB.instr_begin())
+ --I;
+ MachineInstr &NewCmp = *I;
+
+ // The branch condition is always the loop comparison. Invert the
+ // predicate's value if it's a loop exit condition.
+ Cond.push_back(NewCmp.getOperand(0));
+ Cond.push_back(MachineOperand::CreateImm(ReverseCond ? 0 : 1));
+ return {};
+}
+
+std::optional<bool>
+TPUTensorCorePipelinerLoopInfo::createTripCountGreaterCondition(
+ int TC, MachineBasicBlock &MBB, MachineInstr *Cmp,
+ SmallVectorImpl<MachineOperand> &Cond) {
+ if (TripCount != -1)
+ return TripCount > TC;
+ auto I = MBB.instr_back().getIterator();
+ while (&*I != Cmp)
+ --I;
+ MachineInstr &NewCmp = *I;
+
+ // The branch condition is always the loop comparison. Invert the
+ // predicate's value if it's a loop exit condition.
+ Cond.push_back(NewCmp.getOperand(0));
+ Cond.push_back(MachineOperand::CreateImm(ReverseCond ? 0 : 1));
+ return {};
+}
+
+bool TPUBarnaCorePipelinerLoopInfo::shouldIgnoreForPipelining(
+ const MachineInstr *MI) const {
+ return MI->isTerminator();
+}
+
+std::optional<bool>
+TPUBarnaCorePipelinerLoopInfo::createTripCountGreaterCondition(
+ int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond) {
+ assert(0 && "Barnacore cannot generate prologue/epilogue");
+ return {};
+}
+
+std::optional<bool>
+TPUBarnaCorePipelinerLoopInfo::createTripCountGreaterCondition(
+ int TC, MachineBasicBlock &MBB, MachineInstr *Cmp,
+ SmallVectorImpl<MachineOperand> &Cond) {
+ assert(0 && "Barnacore cannot generate prologue/epilogue");
+ return {};
+}
+
+} // namespace
+
+// Enforce that the compare's operand is a supported iv update using a PHI, and
+// that PHI uses the iv update. This pattern covers both upcounting and
+// downcounting loops (but not loops that use pointers as their induction
+// variable).
+// TODO(hgreving): If the first operand happens to be an ADDri fed by a PHI as
+// well, we may fail to analyze the loop.
+MachineInstr *TPUInstrInfo::analyzeIVUpdateforPipelining(
+ MachineBasicBlock *LoopBB, MachineInstr *CmpInst, MachineLoop *Loop) const {
+ MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
+ MachineInstr *InstSrc = nullptr;
+ // FIXME(hgreving): This analysis code here is too fragile and case specific.
+ // We need something better, more robust and general.
+ auto supportedIVUpdateForPipelining = [](const MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ case TPU::ADDri:
+ case TPU::ADDrr:
+ return true;
+ default:
+ return false;
+ }
+ };
+ auto FollowToPhi = [&MRI,
+ &supportedIVUpdateForPipelining](MachineInstr *InstSrc) {
+ if (InstSrc->getOpcode() == TPU::SRLri) {
+ InstSrc = MRI.getUniqueVRegDef(InstSrc->getOperand(1).getReg());
+ if (supportedIVUpdateForPipelining(InstSrc)) {
+ // TODO(hgreving): we really need to make this code prettier and more
+ // generic.
+ if (InstSrc->getOperand(2).isReg())
+ return (MachineInstr *)nullptr;
+ InstSrc = MRI.getUniqueVRegDef(InstSrc->getOperand(1).getReg());
+ }
+ }
+ if (InstSrc->isPHI())
+ return InstSrc;
+ return (MachineInstr *)nullptr;
+ };
+ for (int s = 1; s <= 2; s++) {
+ MachineOperand &MO = CmpInst->getOperand(s);
+ InstSrc = MO.isReg() ? MRI.getUniqueVRegDef(MO.getReg()) : nullptr;
+ // We currently only support the simple case of an add directly feeding
+ // the cmp, either directly or through a phi.
+ if (!InstSrc)
+ continue;
+ if (supportedIVUpdateForPipelining(InstSrc)) {
+ MachineInstr *LoopPhi =
+ MRI.getUniqueVRegDef(InstSrc->getOperand(1).getReg());
+ if (!LoopPhi || !LoopPhi->isPHI() ||
+ !LoopPhi->readsVirtualRegister(InstSrc->getOperand(0).getReg()))
+ continue;
+ return InstSrc;
+ } else if (InstSrc = FollowToPhi(InstSrc); InstSrc) {
+ // We currently support a phi that has 2 inputs, with the add being
+ // the induction variable update in the loop, as second operand of
+ // the phi.
+ const int SupportedPhiOperands = 2;
+ if (InstSrc->getNumOperands() != 2 * SupportedPhiOperands + 1)
+ continue;
+ if (!InstSrc->getOperand(2 * SupportedPhiOperands - 1).isReg() ||
+ !InstSrc->getOperand(2 * SupportedPhiOperands).isMBB())
+ continue;
+ MachineBasicBlock *RMBB =
+ InstSrc->getOperand(2 * SupportedPhiOperands).getMBB();
+ if ((Loop && !Loop->contains(RMBB)) || (!Loop && RMBB != LoopBB))
+ continue;
+ MachineInstr *IVUpdate = MRI.getUniqueVRegDef(
+ InstSrc->getOperand(2 * SupportedPhiOperands - 1).getReg());
+ if (!supportedIVUpdateForPipelining(IVUpdate))
+ continue;
+ if (MRI.getUniqueVRegDef(IVUpdate->getOperand(1).getReg()) != InstSrc)
+ continue;
+ return IVUpdate;
+ }
+ }
+ return (MachineInstr *)nullptr;
+}
+
+std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
+TPUInstrInfo::analyzeLoopForSSIPipelining(MachineBasicBlock *MBB,
+ MachineLoop *Loop) const {
+ if (MBB->empty() || MBB->instr_begin() == MBB->getFirstInstrTerminator())
+ return nullptr;
+ for (MachineInstr &MI : *MBB) {
+ if (TPUInstrInfo::isSpill(&MI))
+ // A spill from the pre-spiller by definition writes to the same memory
+ // location, which makes the loop not "loop parallel".
+ return nullptr;
+ }
+ assert(Loop->contains(MBB));
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ if (!MRI.isSSA())
+ return nullptr;
+ MachineBasicBlock *LatchMBB = Loop->findLoopControlBlock();
+ assert(LatchMBB);
+ if (!TPUInstrInfo::isLoopParallel(*LatchMBB))
+    // Loops that are not loop-parallel violate the SSI dependence assumptions.
+ return nullptr;
+ if (!Loop->getExitBlock() || !Loop->getLoopPreheader())
+ return nullptr;
+
+ MachineInstr *ExitBranch = nullptr;
+ MachineInstr *BackBranch = nullptr;
+ for (auto &Inst : *LatchMBB) {
+ if (Inst.getOpcode() == TPU::BR || Inst.getOpcode() == TPU::BRcond) {
+ assert(Inst.getOperand(0).isMBB() && "Unexpected operand to branch");
+ if (Loop->contains(Inst.getOperand(0).getMBB())) {
+ BackBranch = &Inst;
+ break;
+ }
+ if (ExitBranch != nullptr)
+ return nullptr;
+ ExitBranch = &Inst;
+ }
+ }
+ if (!BackBranch)
+ return nullptr;
+ bool ReverseCond = false;
+ unsigned PredReg = TPUPredicate(BackBranch).getReg();
+ if (PredReg == TPU::Palways) {
+ // Unconditional backbranch.
+ if (ExitBranch == nullptr) {
+ // We currently only handle up to one loop exit branch, preceding the
+ // unconditional backbranch in the loop's single bb.
+ return nullptr;
+ }
+ ReverseCond = true;
+ PredReg = TPUPredicate(ExitBranch).getReg();
+ assert(PredReg != TPU::Palways &&
+ "Multiple unconditional branches in a BB");
+ }
+ MachineInstr *CmpInst = MRI.getUniqueVRegDef(PredReg);
+ if (!CmpInst || !CmpInst->isCompare())
+ return nullptr;
+ // We're making an extra check, constraining the loop comparison to a
+ // post-comparison IV update only. SSI pipelining will rely on this check.
+ MachineInstr *CmpPHI = nullptr;
+ for (int s = 1; s <= 2; s++) {
+ MachineOperand &MO = CmpInst->getOperand(s);
+ MachineInstr *InstSrc =
+ MO.isReg() ? MRI.getUniqueVRegDef(MO.getReg()) : nullptr;
+ if (InstSrc && InstSrc->isPHI()) {
+ if (CmpPHI != nullptr)
+ return nullptr;
+ CmpPHI = InstSrc;
+ } else if (InstSrc && Loop->contains(InstSrc->getParent())) {
+ return nullptr;
+ }
+ }
+ if (CmpPHI == nullptr)
+ return nullptr;
+ MachineInstr *IVUpdate = analyzeIVUpdateforPipelining(MBB, CmpInst, Loop);
+ if (!IVUpdate)
+ return nullptr;
+ // The SSI code currently only works if the IV update is easy to understand
+ // and part of the SSI scheduled block.
+ if (IVUpdate->getParent() != MBB)
+ return nullptr;
+ return std::make_unique<TPUTensorCorePipelinerLoopInfo>(CmpInst, IVUpdate,
+ ReverseCond);
+}
+
+std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
+TPUInstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
+ MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
+ if (!MRI.isSSA())
+ return nullptr;
+
+ MachineInstr *ExitBranch = nullptr;
+ MachineInstr *BackBranch = nullptr;
+ for (auto &Inst : *LoopBB) {
+ if (Inst.getOpcode() == TPU::BR || Inst.getOpcode() == TPU::BRcond ||
+ Inst.getOpcode() == TPU::bcLOOP_END) {
+ assert(Inst.getOperand(0).isMBB() && "Unexpected operand to branch");
+ if (Inst.getOperand(0).getMBB() == LoopBB) {
+ BackBranch = &Inst;
+ break;
+ }
+ if (ExitBranch != nullptr)
+ return nullptr;
+ ExitBranch = &Inst;
+ }
+ }
+ if (!BackBranch)
+ return nullptr;
+ if (LoopBB->getParent()->getSubtarget<TPUSubtarget>().isPxcBarnaCore())
+ return std::make_unique<TPUBarnaCorePipelinerLoopInfo>();
+ bool ReverseCond = false;
+ unsigned PredReg = TPUPredicate(BackBranch).getReg();
+ if (PredReg == TPU::Palways) {
+ // Unconditional backbranch.
+ if (ExitBranch == nullptr) {
+ // We currently only handle up to one loop exit branch, preceding the
+ // unconditional backbranch in the loop's single bb.
+ return nullptr;
+ }
+ ReverseCond = true;
+ PredReg = TPUPredicate(ExitBranch).getReg();
+ assert(PredReg != TPU::Palways &&
+ "Multiple unconditional branches in a BB");
+ }
+ MachineInstr *CmpInst = MRI.getUniqueVRegDef(PredReg);
+ if (!CmpInst || !CmpInst->isCompare())
+ return nullptr;
+ MachineInstr *IVUpdate =
+ analyzeIVUpdateforPipelining(LoopBB, CmpInst, /* Loop = */ nullptr);
+ if (!IVUpdate)
+ return nullptr;
+ return std::make_unique<TPUTensorCorePipelinerLoopInfo>(CmpInst, IVUpdate,
+ ReverseCond);
+}
+
+// reduceLoopCount is used by the MachinePipeliner. A definition of this
+// function only currently exists for the Hexagon backend, and its signature
+// and semantics feel slightly overfit to that backend. The semantics as the
+// TPU backend implements them are:
+//
+// MBB is a peeled iteration of a loop. IndVar and Cmp are the returns of
+// analyzeLoop on that original loop. IndVar is unused here, and Cmp is the
+// compare (CMPNE etc) that defines the original loop's backbranch predicate.
+//
+// reduceLoopCount must populate Cond with the exit condition for the peeled
+// iteration, which is then passed as the Condition argument to
+// TPUInstrInfo::insertBranch.
+//
+// In the Hexagon backend, there are clever LOOP/ENDLOOP instructions that
+// need to be found and updated. In TPU we just use normal conditional
+// branches, and in fact the loop peeling process correctly duplicates the
+// induction variable update and threads it correctly. We only support loops
+// with an induction variable and a fixed termination condition (rather than
+// encoding the trip count explicitly like Hexagon does). So all we need to do
+// here is just find the cloned loop backbranch condition, invert it to become
+// a loop exit condition, and return that.
+unsigned TPUInstrInfo::reduceLoopCount(
+ MachineBasicBlock &MBB, MachineBasicBlock &PreHeader, MachineInstr *IndVar,
+ MachineInstr &Cmp, SmallVectorImpl<MachineOperand> &Cond,
+ SmallVectorImpl<MachineInstr *> &PrevInsts, unsigned Iter,
+ unsigned MaxIter) const {
+#ifndef NDEBUG
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ assert(MRI.isSSA());
+#endif
+ MachineInstr &NewCmp = MBB.back();
+ assert(NewCmp.getOpcode() == Cmp.getOpcode());
+
+ // The branch condition is always the loop comparison. Invert it for the loop
+ // exit condition.
+ Cond.push_back(NewCmp.getOperand(0));
+ Cond.push_back(MachineOperand::CreateImm(1));
+ return NewCmp.getOperand(0).getReg();
+}
+
+bool TPUInstrInfo::isPredicable(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ default:
+ return TargetInstrInfo::isPredicable(MI);
+ case TPU::PSEUDO_SELrr:
+ case TPU::PSEUDO_SELri:
+ case TPU::PSEUDO_SELir:
+ case TPU::PSEUDO_VSELrr:
+ case TPU::PSEUDO_VSELri:
+ case TPU::PSEUDO_VSELir:
+ case TPU::PSEUDO_MSELrr:
+ case TPU::PSEUDO_PSELrr:
+ case TPU::VPSELrr:
+ case TPU::VPSELri:
+ case TPU::VPSELrs:
+ return true;
+ }
+}
+
+bool TPUInstrInfo::canPredicatePredicatedInstr(const MachineInstr &MI) const {
+ assert(MI.isPredicable());
+ // We can always just insert a `pand`.
+ return true;
+}
+
+void TPUInstrInfo::swapSelOperands(MachineInstr &MI) const {
+ DenseMap<unsigned, unsigned> SwapOpcode = {
+ {TPU::PSEUDO_SELrr, TPU::PSEUDO_SELrr},
+ {TPU::PSEUDO_VSELrr, TPU::PSEUDO_VSELrr},
+ {TPU::PSEUDO_MSELrr, TPU::PSEUDO_MSELrr},
+ {TPU::PSEUDO_PSELrr, TPU::PSEUDO_PSELrr},
+ {TPU::PSEUDO_SELri, TPU::PSEUDO_SELir},
+ {TPU::PSEUDO_VSELri, TPU::PSEUDO_VSELir},
+ {TPU::PSEUDO_VSELrif, TPU::PSEUDO_VSELirf},
+ {TPU::PSEUDO_SELir, TPU::PSEUDO_SELri},
+ {TPU::PSEUDO_VSELir, TPU::PSEUDO_VSELri},
+ {TPU::PSEUDO_VSELirf, TPU::PSEUDO_VSELrif},
+ {TPU::VPSELrr, TPU::VPSELrr},
+ {TPU::VPSELri, TPU::VPSELir},
+ {TPU::VPSELir, TPU::VPSELri},
+ {TPU::VPSELrs, TPU::VPSELsr},
+ {TPU::VPSELsr, TPU::VPSELrs},
+ };
+ switch (MI.getOpcode()) {
+ case TPU::VPSELrs:
+ case TPU::VPSELsr: {
+ unsigned RegT = MI.getOperand(2).getReg();
+ MI.getOperand(2).setReg(MI.getOperand(3).getReg());
+ MI.getOperand(3).setReg(RegT);
+ MI.setDesc(get(SwapOpcode[MI.getOpcode()]));
+ break;
+ }
+ case TPU::PSEUDO_SELrr:
+ case TPU::PSEUDO_VSELrr:
+ case TPU::VPSELrr:
+ case TPU::PSEUDO_MSELrr:
+ case TPU::PSEUDO_PSELrr: {
+ unsigned RegT = MI.getOperand(2).getReg();
+ MI.getOperand(2).setReg(MI.getOperand(3).getReg());
+ MI.getOperand(3).setReg(RegT);
+ break;
+ }
+ case TPU::PSEUDO_SELri:
+ case TPU::PSEUDO_VSELri:
+ case TPU::VPSELri:
+ case TPU::PSEUDO_VSELrif: {
+ unsigned Reg = MI.getOperand(2).getReg();
+ bool IsDef = MI.getOperand(2).isDef();
+ bool IsImplicit = MI.getOperand(2).isImplicit();
+ int64_t Imm = MI.getOperand(3).getImm();
+ // VPSEL is a real select, no need to tie.
+ if (MI.getOpcode() != TPU::VPSELri)
+ MI.untieRegOperand(2);
+ MI.getOperand(2).ChangeToImmediate(Imm);
+ MI.getOperand(3).ChangeToRegister(Reg, IsDef, IsImplicit);
+ MI.setDesc(get(SwapOpcode[MI.getOpcode()]));
+ if (MI.getOpcode() != TPU::VPSELri)
+ MI.tieOperands(0, 3);
+ break;
+ }
+ case TPU::PSEUDO_SELir:
+ case TPU::PSEUDO_VSELir:
+ case TPU::VPSELir:
+ case TPU::PSEUDO_VSELirf: {
+ int64_t Imm = MI.getOperand(2).getImm();
+ unsigned Reg = MI.getOperand(3).getReg();
+ bool IsDef = MI.getOperand(3).isDef();
+ bool IsImplicit = MI.getOperand(3).isImplicit();
+ // VPSEL is a real select, no need to tie.
+ if (MI.getOpcode() != TPU::VPSELir)
+ MI.untieRegOperand(3);
+ MI.getOperand(2).ChangeToRegister(Reg, IsDef, IsImplicit);
+ MI.getOperand(3).ChangeToImmediate(Imm);
+ MI.setDesc(get(SwapOpcode[MI.getOpcode()]));
+ if (MI.getOpcode() != TPU::VPSELir)
+ MI.tieOperands(0, 2);
+ break;
+ }
+ default:
+ llvm_unreachable("Unsupported opcode.");
+ break;
+ }
+}
+
+bool TPUInstrInfo::PredicateInstruction(MachineInstr &MI,
+ ArrayRef<MachineOperand> Pred) const {
+ LLVM_DEBUG(dbgs() << "PredicateInstruction: " << MI);
+ assert(Pred.size() == 2);
+ assert(Pred[1].getImm() == 0 || Pred[1].getImm() == 1);
+ MachineRegisterInfo *MRI = &MI.getMF()->getRegInfo();
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ case TPU::VPSELrr:
+ case TPU::VPSELri:
+ case TPU::VPSELir:
+ case TPU::VPSELrs:
+ case TPU::VPSELsr:
+ case TPU::PSEUDO_SELrr:
+ case TPU::PSEUDO_SELri:
+ case TPU::PSEUDO_SELir:
+ case TPU::PSEUDO_VSELrr:
+ case TPU::PSEUDO_VSELri:
+ case TPU::PSEUDO_VSELir:
+ case TPU::PSEUDO_MSELrr:
+ case TPU::PSEUDO_PSELrr:
+ // Special support for pseudo SEL instructions, for example:
+ // $a SEL = $Ps, $b, $c
+ // ->
+ // Pt = POR !Ps, !Pn // Pt0 == !(Ps && Pn), if Pred is not inverse
+ // Pt = POR !Ps, Pn // Pt0 == !(Ps && !Pn), if Pred is inverse
+ // $a SELrr = $Pt, $c, $b // !Pt by swapping operands
+ // This only works because we assume its operands are tied.
+ assert(MI.getOpcode() != TPU::PSEUDO_VSELrr || MI.getOperand(2).isTied());
+ assert(MI.getOpcode() != TPU::PSEUDO_VSELri || MI.getOperand(2).isTied());
+ assert(MI.getOpcode() != TPU::PSEUDO_VSELir || MI.getOperand(3).isTied());
+ assert(MI.getOpcode() != TPU::PSEUDO_SELrr || MI.getOperand(2).isTied());
+ assert(MI.getOpcode() != TPU::PSEUDO_SELri || MI.getOperand(2).isTied());
+ assert(MI.getOpcode() != TPU::PSEUDO_SELir || MI.getOperand(3).isTied());
+ assert(MI.getOpcode() != TPU::PSEUDO_MSELrr || MI.getOperand(2).isTied());
+ Register PredTmp = MRI->createVirtualRegister(&TPU::PPRRegClass);
+ AddDefaultPred(BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+ get(TPU::POR), PredTmp)
+ .addReg(MI.getOperand(1).getReg())
+ .addImm(1)
+ .addReg(Pred[0].getReg())
+ .addImm(Pred[1].getImm() == 1 ? 0 : 1));
+ MI.getOperand(1).setReg(PredTmp);
+ // Negate the predicate by swapping the pseudo SEL's operands.
+ swapSelOperands(MI);
+ return true;
+ }
+ if (isPredicated(MI)) {
+    // De Morgan's law: Pred && Pred(MI) = !(!Pred || !Pred(MI))
+ Register PredTmp = MRI->createVirtualRegister(&TPU::PPRRegClass);
+ AddDefaultPred(BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+ get(TPU::POR), PredTmp)
+ .addReg(TPUPredicate(MI).getReg())
+ .addImm(TPUPredicate(MI).getImm() == 1 ? 0 : 1)
+ .addReg(Pred[0].getReg())
+ .addImm(Pred[1].getImm() == 1 ? 0 : 1));
+ TPUPredicate(MI).setReg(PredTmp).setInvert(1).applyTo(&MI);
+ return true;
+ }
+ assert(isPredicable(MI));
+ assert(!isPredicated(MI));
+ unsigned PredRegOpIdx = MI.getNumExplicitOperands() - 2;
+ unsigned PredInvertOpIdx = PredRegOpIdx + 1;
+ MI.getOperand(PredRegOpIdx).setReg(Pred[0].getReg());
+ MI.getOperand(PredInvertOpIdx).setImm(Pred[1].getImm());
+ return true;
+}
+
+/// Returns true if the first specified predicate
+/// subsumes the second, e.g. GE subsumes GT.
+bool TPUInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
+ ArrayRef<MachineOperand> Pred2) const {
+ return false;
+}
+
+/// If the specified instruction defines any predicate
+/// or condition code register(s) used for predication, returns true as well
+/// as the definition predicate(s) by reference.
+bool TPUInstrInfo::ClobbersPredicate(MachineInstr &MI,
+ std::vector<MachineOperand> &Pred,
+ bool SkipDead) const {
+ if (MI.getNumDefs() >= 1 && MI.getOperand(0).isReg() &&
+ TPU::PPRRegClass.contains(MI.getOperand(0).getReg())) {
+ Pred.push_back(MI.getOperand(0));
+ Pred.push_back(MachineOperand::CreateImm(0));
+ return true;
+ }
+ return false;
+}
+
+bool TPUInstrInfo::isPredicated(const MachineInstr &MI) const {
+ if (MI.getOpcode() == TPU::BUNDLE)
+ return false;
+ if (!MI.isPredicable())
+ return false;
+ return !TPUPredicate(MI).isAlways();
+}
+
+bool TPUInstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
+ unsigned NumCycles,
+ unsigned ExtraPredCycles,
+ BranchProbability Probability) const {
+ // Picked out of thin air. We aggressively predicate before pre-RA scheduling
+ // to allow better scheduling and potentially better software pipelining. We
+ // keep the post-RA if-conversion pass as it handles more cases. The same
+ // threshold is used pre- and post-RA.
+ return NumCycles < 64;
+}
+
+namespace {
+//===----------------------------------------------------------------------===//
+// Table gen LdStInfo mapping.
+//===----------------------------------------------------------------------===//
+struct LdStInfoTy {
+ unsigned Opcode;
+ bool HasAddress;
+ bool HasMask;
+ bool HasStride;
+ bool HasShuffle;
+ bool HasVMask;
+ bool HasLdReplicateEvenOdd;
+ bool HasVsEvenOdd;
+ bool HasIndex;
+};
+
+using namespace TPU;
+#define GET_LdSTMemAccessTable_IMPL
+#include "TPUGenSearchableTables.inc"
+#undef GET_LdSTMemAccessTable_IMPL
+
+// Represent the span of a memory access.
+struct MemAccess {
+ Register Base = 0;
+ Register DynamicStride = 0;
+ // Stride and Offset can be negative.
+ int Offset = 0;
+ int Stride = 1;
+ unsigned Mask = ~0;
+ // Right now all Vector memory accesses have a width of 8 words.
+ // FIXME(hgreving): Not always 8 for SparseCore.
+ static const int Width = 8;
+};
+
+// Return a vector of addresses accessed by the given memory access.
+// It can have up to 8 addresses.
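+// For illustration (hypothetical values): Offset = 4, Stride = 2 and
+// Mask = 0b00001111 yield the relative addresses {4, 6, 8, 10}; masked-off
+// sublanes contribute no address.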
+SmallVector<int, 8> getRelativeAddresses(const MemAccess &M) {
+ SmallVector<int, 8> Addresses;
+ for (int I = 0; I < M.Width; I++) {
+ if (M.Mask & (1 << I)) {
+ int64_t Address = M.Offset + I * M.Stride;
+ Addresses.push_back(Address);
+ }
+ }
+ return Addresses;
+}
+
+// Return true if the given memory accesses may overlap.
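+// A small worked example (hypothetical values): with the same Base, Offsets 0
+// and 4, Stride 1 and a full mask, the accesses cover {0..7} and {4..11},
+// which intersect, so this returns true; with Offsets 0 and 8 the address
+// sets are disjoint and it returns false.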
+bool MemAccessOverlap(const MemAccess &Ma, const MemAccess &Mb) {
+ if (Ma.Base != Mb.Base)
+ return true;
+ if (Ma.DynamicStride != 0 || Mb.DynamicStride != 0) {
+ // The stride is dynamic. If the offsets match and the masks are disjoint,
+ // we know the accesses don't overlap; otherwise we don't have enough
+ // information.
+ if (Ma.Offset == Mb.Offset && ((Ma.Mask & Mb.Mask) == 0))
+ return false;
+ return true;
+ }
+ SmallVector<int, 8> AddressA = getRelativeAddresses(Ma);
+ SmallVector<int, 8> AddressB = getRelativeAddresses(Mb);
+ for (auto I : AddressA)
+ for (auto J : AddressB)
+ if (I == J)
+ return true;
+ return false;
+}
+
+// Apply shuffle to a given mask and return the new mask.
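+// Bit I of Mask moves to bit ShuffleIdx[I] of the result; e.g. (hypothetical
+// values) Mask = 0b0011 with ShuffleIdx = {2, 3, 0, 1, 4, 5, 6, 7} yields
+// 0b1100.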
+unsigned ShuffleMask(unsigned Mask, const ArrayRef<int> ShuffleIdx) {
+ unsigned NewMask = 0;
+ for (int I = 0; I < ShuffleIdx.size(); I++) {
+ NewMask |= ((Mask >> I) & 0x1) << ShuffleIdx[I];
+ }
+ return NewMask;
+}
+
+// Analyze the instruction and extract the memory access information.
+// Return false if the instruction cannot be analyzed.
+bool getMemAccess(const TPUSubtarget &ST, const MachineInstr &MI,
+ MemAccess &MA) {
+ const LdStInfoTy *Info = LdStInfo(MI.getOpcode());
+ if (Info == nullptr)
+ return false;
+ // Currently don't model indexed ld/st.
+ if (Info->HasIndex)
+ return false;
+ unsigned OpIdx = 1;
+ if (Info->HasAddress)
+ MA.Base = MI.getOperand(OpIdx++).getReg();
+ MA.Offset = MI.getOperand(OpIdx++).getImm();
+ if (Info->HasMask) {
+ auto &Mask = MI.getOperand(OpIdx++);
+ if (Mask.isImm())
+ MA.Mask = Mask.getImm();
+ }
+ if (Info->HasStride) {
+ auto &Stride = MI.getOperand(OpIdx++);
+ if (Stride.isImm())
+ MA.Stride = Stride.getImm();
+ else
+ MA.DynamicStride = Stride.getReg();
+ }
+ if (Info->HasShuffle) {
+ // We don't need to know the order of accesses, so the shuffle only affects
+ // the mask. If we don't know the shuffle operand, just assume the worst
+ // case and reset to a full mask.
+ auto &Shuffle = MI.getOperand(OpIdx++);
+ if (Shuffle.isImm()) {
+ // The shuffle mask is encoded in 4 bits per sublane on PFC/VFC and 3 bits
+ // per sublane on JFC/DFC.
+ unsigned ShuffleIdxSize =
+ ST.hasPfcTensorCore() || ST.hasVfcTensorCore() ? 4 : 3;
+ std::array<int, MemAccess::Width> ShuffleIdx;
+ for (int I = 0; I < MemAccess::Width; I++)
+ ShuffleIdx[I] = (Shuffle.getImm() >> (ShuffleIdxSize * I)) &
+ ((1 << ShuffleIdxSize) - 1);
+ MA.Mask = ShuffleMask(MA.Mask, ShuffleIdx);
+ } else {
+ MA.Mask = ~0;
+ }
+ }
+ if (Info->HasVMask)
+ OpIdx++;
+ if (Info->HasLdReplicateEvenOdd) {
+ std::array<int, 8> ShuffleIdx = { 0, 1, 2, 3, 0, 1, 2, 3 };
+ MA.Mask = ShuffleMask(MA.Mask, ShuffleIdx);
+ }
+ if (Info->HasVsEvenOdd) {
+ std::array<int, 8> ShuffleIdx = { 0, 2, 4, 6, 1, 3, 5, 7 };
+ MA.Mask = ShuffleMask(MA.Mask, ShuffleIdx);
+ }
+ return true;
+}
+} // namespace
+
+// Alias analysis isn't able to trivially detect that custom memory operands
+// are independent when an instruction has more than one custom memory operand.
+bool TPUInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
+ const MachineInstr &MIb) const {
+ assert((MIa.mayStore() || MIb.mayStore()) &&
+ "This function expects that we already trivially "
+ "checked that load/load don't alias.");
+
+ // FIXME(hgreving): This should not be here. The reason this is added is that
+ // we want to run scoped AA before basic AA. Basic AA runs before scoped AA in
+ // the default AA stack. The problem is that the IR that AA looks at is the IR
+ // after GEP lowering. That IR is basically wrong, and may misguide basic AA
+ // to think memory ops `MustAlias`, while they usually wouldn't. I've seen
+ // cases like this. The workaround is to manually run scoped AA here. The real
+ // fix would be for the pointer analysis in basic AA to understand the
+ // address space's word size, without the GEP lowering pass.
+ if (!MIa.memoperands_empty() && !MIb.memoperands_empty()) {
+ MachineMemOperand *MMOa = *MIa.memoperands().begin();
+ MachineMemOperand *MMOb = *MIb.memoperands().begin();
+ const AAMDNodes &AAMDa = MMOa->getAAInfo();
+ const AAMDNodes &AAMDb = MMOb->getAAInfo();
+ ScopedNoAliasAAResult SAR;
+ TargetLibraryInfoImpl TLII;
+ TargetLibraryInfo TLI(TLII);
+ AAResults AAR(TLI);
+ AAR.addAAResult(SAR);
+ SimpleAAQueryInfo AAQIP(AAR);
+ if (SAR.alias(MemoryLocation(nullptr, LocationSize::beforeOrAfterPointer(),
+ AAMDa),
+ MemoryLocation(nullptr, LocationSize::beforeOrAfterPointer(),
+ AAMDb),
+ AAQIP, nullptr) == AliasResult::NoAlias) {
+ return true;
+ }
+ }
+
+ if (AliasAnalysisForMIRTest) {
+ if (MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) {
+ const DataLayout &DL = MIa.getMF()->getMMI().getModule()->getDataLayout();
+ const Triple T(MIa.getMF()->getMMI().getModule()->getTargetTriple());
+ MachineMemOperand *MMOa = *MIa.memoperands().begin();
+ MachineMemOperand *MMOb = *MIb.memoperands().begin();
+ auto &ST = MIa.getMF()->getSubtarget<TPUSubtarget>();
+ TPUAAResult TAR(DL, T, ST.isSparseCore());
+ if (!TPUInstrInfo::isCbUpd(MIa) && !TPUInstrInfo::isCbUpd(MIb)) {
+ // We can disambiguate easily, but only if no cb.upd semantics, which is
+ // (unfortunately-) modeled through memory as well.
+ if (TAR.aliasFromAddrSpace(MMOa->getAddrSpace(),
+ MMOb->getAddrSpace()) ==
+ AliasResult::NoAlias) {
+ return true;
+ }
+ }
+ }
+ }
+
+ MemAccess MAa;
+ MemAccess MAb;
+ const TPUSubtarget &ST = MIa.getMF()->getSubtarget<TPUSubtarget>();
+ if (!ST.isSparseCore()) {
+ // FIXME(hgreving): This doesn't work for the SparseCore yet.
+ if (getMemAccess(ST, MIa, MAa) && getMemAccess(ST, MIb, MAb)) {
+ if (!MemAccessOverlap(MAa, MAb))
+ return true;
+ }
+ }
+
+ // Assume that when an instruction has a target custom memory operand, it
+ // aliases if and only if the custom memory operands intersect.
+ std::pair</*loop id*/ int, /*iteration id*/ int> CustomPipelinedMIa;
+ std::pair</*loop id*/ int, /*iteration id*/ int> CustomPipelinedMIb;
+ auto GetPipelinedLoopIter = [](const MachineInstr &MI) {
+ for (MachineMemOperand *MMO : MI.memoperands()) {
+ const TPUTargetMachine &TM =
+ static_cast<const TPUTargetMachine &>(MI.getMF()->getTarget());
+ if (const PseudoSourceValue *Pseudo = MMO->getPseudoValue()) {
+ if (TM.isSWPIterationPSV(Pseudo)) {
+ const TPUTargetMachine::SWPTargetPSV *SWPPSV =
+ static_cast<const TPUTargetMachine::SWPTargetPSV *>(Pseudo);
+ return std::make_pair(SWPPSV->getLoop(), SWPPSV->getIteration());
+ }
+ }
+ }
+ return std::make_pair(-1, -1);
+ };
+ CustomPipelinedMIa = GetPipelinedLoopIter(MIa);
+ CustomPipelinedMIb = GetPipelinedLoopIter(MIb);
+
+ // Handle custom operands for prolog and epilog. The semantics are that
+ // operands do not alias if they're from different iterations of the same
+ // loop, indicated by the loop and iteration operands.
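+ // For example (illustrative numbers only), operands tagged (loop 2, iter 0)
+ // and (loop 2, iter 1) belong to different iterations of the same pipelined
+ // loop and are treated as trivially disjoint.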
+ if (CustomPipelinedMIa.first != -1 && CustomPipelinedMIb.first != -1) {
+ if (CustomPipelinedMIa.first == CustomPipelinedMIb.first &&
+ CustomPipelinedMIa.second != CustomPipelinedMIb.second) {
+ return true;
+ }
+ }
+
+ SmallSet<unsigned, 4> CustomMem;
+ SmallSet<std::pair<const MachineInstr *, unsigned>, 4> CustomMemCheck;
+ SmallSet<const MachineInstr *, 4> HasMemOperand;
+ bool DTPrologMem = false;
+ bool DTEpilogMem = false;
+ bool DTHeaderMem = false;
+ for (const MachineInstr *MI : {&MIa, &MIb}) {
+ bool DTPrologMem_Check = false;
+ bool DTEpilogMem_Check = false;
+ bool DTHeaderMem_Check = false;
+ for (MachineMemOperand *MMO : MI->memoperands()) {
+ const TPUTargetMachine &TM =
+ static_cast<const TPUTargetMachine &>(MI->getMF()->getTarget());
+ if (const PseudoSourceValue *Pseudo = MMO->getPseudoValue()) {
+ // Dovetailed prolog/epilog/header PSVs are handled separately. If one
+ // instruction carries a prolog or header PSV and the other an epilog PSV,
+ // the accesses are trivially disjoint; otherwise these PSVs are ignored.
+ // Note that the PSVs specifically match dovetailing semantics: they derive
+ // from the fact that we have been overlapping instructions from different
+ // iterations of a parallel loop, with the header and prolog instructions
+ // being from a different iteration than the epilog.
+ if (TM.isDTPrologPSV(Pseudo)) {
+ if (DTEpilogMem)
+ return true;
+ DTPrologMem = true;
+ DTPrologMem_Check = true;
+ } else if (TM.isDTEpilogPSV(Pseudo)) {
+ if (DTPrologMem || DTHeaderMem)
+ return true;
+ DTEpilogMem = true;
+ DTEpilogMem_Check = true;
+ } else if (TM.isDTHeaderPSV(Pseudo)) {
+ if (DTEpilogMem)
+ return true;
+ DTHeaderMem = true;
+ DTHeaderMem_Check = true;
+ } else if (TM.isSWPIterationPSV(Pseudo)) {
+ assert(!isFifoPush(*MI) && !isFifoPop(*MI));
+ // Do not push into CustomMem.
+ continue;
+ } else if (unsigned Custom = Pseudo->getTargetCustom()) {
+ // Make sure there are no duplicated custom memory operands.
+ assert(CustomMemCheck.insert(std::make_pair(MI, Custom)).second);
+ (void)CustomMemCheck;
+ // Prolog, Epilog, Header PSVs are mutually exclusive.
+ assert(!DTEpilogMem_Check && !DTPrologMem_Check && !DTHeaderMem_Check);
+ if (!CustomMem.insert(Custom).second) {
+ // Found the same custom memory twice, instructions may alias.
+ return false;
+ }
+ }
+ } else {
+ // Mark the instruction as having a normal memory operand.
+ HasMemOperand.insert(MI);
+ }
+ }
+ }
+
+ // If we found custom memory operands without intersections and at least one
+ // of the instructions only has custom memory operands, assume they don't
+ // alias.
+ if (!CustomMem.empty() &&
+ (HasMemOperand.count(&MIa) == 0 || HasMemOperand.count(&MIb) == 0)) {
+ return true;
+ }
+
+ // If not already covered above: FIFO pushes/pops are trivially disjoint
+ // from non-FIFO memory operations.
+ auto ReorderFifoMem = [&](const MachineInstr &A, const MachineInstr &B) {
+ if (isFifoPush(A) || isFifoPop(A)) {
+ if (!isFifoPush(B) && !isFifoPop(B)) {
+ return true;
+ }
+ }
+ return false;
+ };
+ if (ReorderFifoMem(MIa, MIb))
+ return true;
+ if (ReorderFifoMem(MIb, MIa))
+ return true;
+
+ // For BarnaCore, aliaddr loads and stores with different
+ // feature-length-multiples ($flm) trivially never alias.
+ auto GetFlm = [](const MachineInstr &MI) -> int {
+ switch (MI.getOpcode()) {
+ case TPU::bcVLD_aliaddrr:
+ case TPU::bcVLD_aliaddri:
+ case TPU::bcVST_aliaddrr:
+ case TPU::bcVST_aliaddri:
+ return MI.getOperand(3).getImm();
+ default:
+ return -1;
+ }
+ };
+ int FlmA = GetFlm(MIa);
+ int FlmB = GetFlm(MIb);
+ if (FlmA != -1 && FlmB != -1 && FlmA != FlmB)
+ return true;
+ // TODO(jmolloy): This is not fully correct. If both instructions have feature
+ // length multiples and these are the same, the instructions are must-alias
+ // but only within the same loop iteration! These instructions in different
+ // loop iterations are no-alias with respect to each other.
+ //
+ // The software pipeliner calls this function as a proxy for "are these
+ // things no-alias across loop iterations?". The TODO here is to make a better
+ // query function.
+ if (FlmA != -1 && FlmB != -1)
+ return true;
+ return false;
+}
+
+bool TPUInstrInfo::getMemAccessInfo(const TPUSubtarget &ST,
+ const MachineInstr &MI,
+ MemAccessInfo &MA) const {
+ MemAccess M;
+ if (!getMemAccess(ST, MI, M))
+ return false;
+ if (M.DynamicStride != 0)
+ return false;
+ MA.Base = M.Base;
+ MA.Offsets = getRelativeAddresses(M);
+ return true;
+}
+
+MachineInstr *TPUInstrInfo::getFirstFromBundle(MachineInstr *MI,
+ ArrayRef<unsigned> Opcodes) {
+ auto CheckCandidate = [Opcodes](MachineInstr *MI) -> MachineInstr * {
+ for (unsigned Opc : Opcodes) {
+ if (MI->getOpcode() == Opc)
+ return MI;
+ }
+ return nullptr;
+ };
+ if (MI->getOpcode() != TPU::BUNDLE)
+ return CheckCandidate(MI);
+ for (auto I = std::next(MI->getIterator()), E = MI->getParent()->instr_end();
+ I != E; ++I) {
+ if (auto *Cand = CheckCandidate(&*I))
+ return Cand;
+ if (!I->isBundledWithSucc())
+ break;
+ }
+ return nullptr;
+}
+
+void TPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Position,
+ Register SourceRegister, bool IsKill,
+ int FrameIndex,
+ const TargetRegisterClass *RegisterClass,
+ const TargetRegisterInfo *RegisterInfo,
+ Register VReg) const {
+ DebugLoc DL;
+ if (Position != MBB.end()) {
+ DL = Position->getDebugLoc();
+ }
+ MachineFunction *MF = MBB.getParent();
+ const TPUSubtarget &ST = MF->getSubtarget<TPUSubtarget>();
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*MF, FrameIndex),
+ MachineMemOperand::MOStore, MFI.getObjectSize(FrameIndex),
+ MFI.getObjectAlign(FrameIndex));
+ if (/* Includes stack alias registers */ RegisterClass->hasSuperClassEq(
+ &TPU::GPRRegClass)) {
+ auto MIB =
+ BuildMI(MBB, Position, DL,
+ get(!MF->getSubtarget<TPUSubtarget>().isSparseCore() ? TPU::SSTi
+ : ST.isTPUABIEnabled() ? TPU::SPILL_GPRs
+ : TPU::SPILL_GPR))
+ .addReg(SourceRegister, getKillRegState(IsKill));
+ if (ST.isTPUABIEnabled())
+ MIB.addReg(TPU::LR);
+ MIB.addFrameIndex(FrameIndex).addMemOperand(MMO);
+ AddDefaultPred(MIB);
+ if (ST.isTPUABIEnabled()) {
+ AddDefaultPred(
+ BuildMI(MBB, MIB->getIterator(), DL, get(TPU::SPILL_GPR_ADD), TPU::LR)
+ .addReg(TPU::FPS, getKillRegState(false))
+ .addFrameIndex(FrameIndex));
+ }
+ } else if (RegisterClass == &TPU::VPRRegClass) {
+ assert(MF->getSubtarget<TPUSubtarget>().hasVPU());
+ unsigned OpCode;
+ if (!MF->getSubtarget<TPUSubtarget>().isSparseCore()) {
+ // TODO(hgreving): We don't have a TC spill pseudo instruction at present,
+ // because this was introduced for SparseCore. We could add this for TPU
+ // as well.
+ OpCode = TPU::tcVSVi;
+ } else {
+ assert(MF->getSubtarget<TPUSubtarget>().isSparseCore());
+ OpCode = ST.isTPUABIEnabled() ? TPU::SPILL_VPRs : TPU::SPILL_VPR;
+ }
+ auto MIB = BuildMI(MBB, Position, DL, get(OpCode))
+ .addReg(SourceRegister, getKillRegState(IsKill));
+ if (ST.isTPUABIEnabled())
+ MIB.addReg(TPU::FPV);
+ MIB.addFrameIndex(FrameIndex).addMemOperand(MMO);
+ AddDefaultPred(MIB);
+ } else if (RegisterClass == &TPU::MPRRegClass) {
+ assert(MF->getSubtarget<TPUSubtarget>().hasVPU());
+ auto MIB =
+ BuildMI(MBB, Position, DL,
+ get(ST.isTPUABIEnabled() ? TPU::SPILL_MPRs : TPU::SPILL_MPR))
+ .addReg(SourceRegister, getKillRegState(IsKill));
+ if (ST.isTPUABIEnabled())
+ MIB.addReg(TPU::FPV);
+ MIB.addFrameIndex(FrameIndex).addMemOperand(MMO);
+ AddDefaultPred(MIB);
+ } else if (RegisterClass == &TPU::PPRRegClass) {
+ auto MIB =
+ BuildMI(MBB, Position, DL,
+ get(ST.isTPUABIEnabled() ? TPU::SPILL_PPRs : TPU::SPILL_PPR))
+ .addReg(SourceRegister, getKillRegState(IsKill));
+ if (ST.isTPUABIEnabled())
+ MIB.addReg(TPU::LR);
+ MIB.addFrameIndex(FrameIndex).addMemOperand(MMO);
+ AddDefaultPred(MIB);
+ if (ST.isTPUABIEnabled()) {
+ AddDefaultPred(
+ BuildMI(MBB, MIB->getIterator(), DL, get(TPU::SPILL_GPR_ADD), TPU::LR)
+ .addReg(TPU::FPS, getKillRegState(false))
+ .addFrameIndex(FrameIndex));
+ }
+ } else {
+ report_fatal_error(Twine("Can't store this register to stack slot: " +
+ std::to_string(RegisterClass->getID())));
+ }
+}
+
+void TPUInstrInfo::loadRegFromStackSlot(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator Position,
+ Register DestinationRegister, int FrameIndex,
+ const TargetRegisterClass *RegisterClass,
+ const TargetRegisterInfo *RegisterInfo, Register VReg) const {
+ DebugLoc DL;
+ if (Position != MBB.end()) {
+ DL = Position->getDebugLoc();
+ }
+ MachineFunction *MF = MBB.getParent();
+ const TPUSubtarget &ST = MF->getSubtarget<TPUSubtarget>();
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*MF, FrameIndex),
+ MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIndex),
+ MFI.getObjectAlign(FrameIndex));
+ if (/* Includes stack alias registers */ RegisterClass->hasSuperClassEq(
+ &TPU::GPRRegClass)) {
+ auto MIB =
+ BuildMI(MBB, Position, DL,
+ get(!MF->getSubtarget<TPUSubtarget>().isSparseCore() ? TPU::SLDi
+ : ST.isTPUABIEnabled() ? TPU::RESTORE_GPRs
+ : TPU::RESTORE_GPR),
+ DestinationRegister);
+ if (ST.isTPUABIEnabled())
+ MIB.addReg(TPU::FPS);
+ MIB.addFrameIndex(FrameIndex).addMemOperand(MMO);
+ AddDefaultPred(MIB);
+ } else if (RegisterClass == &TPU::VPRRegClass) {
+ assert(MF->getSubtarget<TPUSubtarget>().hasVPU());
+ unsigned OpCode;
+ if (!MF->getSubtarget<TPUSubtarget>().isSparseCore()) {
+ // TODO(hgreving): We don't have a TPU restore pseudo instruction at
+ // present, because this was introduced for SparseCore. We could add this
+ // for TPU as well.
+ OpCode = TPU::tcVLVi;
+ } else {
+ assert(MF->getSubtarget<TPUSubtarget>().isSparseCore());
+ OpCode = ST.isTPUABIEnabled() ? TPU::RESTORE_VPRs : TPU::RESTORE_VPR;
+ }
+ auto MIB = BuildMI(MBB, Position, DL, get(OpCode), DestinationRegister);
+ if (ST.isTPUABIEnabled())
+ MIB.addReg(TPU::FPV);
+ MIB.addFrameIndex(FrameIndex).addMemOperand(MMO);
+ AddDefaultPred(MIB);
+ } else if (RegisterClass == &TPU::MPRRegClass) {
+ assert(MF->getSubtarget<TPUSubtarget>().hasVPU());
+ auto MIB = BuildMI(
+ MBB, Position, DL,
+ get(ST.isTPUABIEnabled() ? TPU::RESTORE_MPRs : TPU::RESTORE_MPR),
+ DestinationRegister);
+ if (ST.isTPUABIEnabled())
+ MIB.addReg(TPU::FPV);
+ MIB.addFrameIndex(FrameIndex).addMemOperand(MMO);
+ AddDefaultPred(MIB);
+ } else if (RegisterClass == &TPU::PPRRegClass) {
+ auto MIB = BuildMI(
+ MBB, Position, DL,
+ get(ST.isTPUABIEnabled() ? TPU::RESTORE_PPRs : TPU::RESTORE_PPR),
+ DestinationRegister);
+ if (ST.isTPUABIEnabled())
+ MIB.addReg(TPU::FPS);
+ MIB.addFrameIndex(FrameIndex).addMemOperand(MMO);
+ AddDefaultPred(MIB);
+ } else {
+ report_fatal_error(Twine("Can't load this register from stack slot: " +
+ std::to_string(RegisterClass->getID())));
+ }
+}
+
+bool TPUInstrInfo::hasLoadFromStackSlot(
+ const MachineInstr &MI,
+ SmallVectorImpl<const MachineMemOperand *> &Accesses) const {
+ if (MI.isBundle()) {
+ const size_t AccessesSize = Accesses.size();
+ // Accumulate all accesses from the bundled instructions.
+ for (auto It = std::next(MI.getIterator());; ++It) {
+ TargetInstrInfo::hasLoadFromStackSlot(*It, Accesses);
+ if (!It->isBundledWithSucc())
+ break;
+ }
+ return Accesses.size() != AccessesSize;
+ }
+ return TargetInstrInfo::hasLoadFromStackSlot(MI, Accesses);
+}
+
+bool TPUInstrInfo::hasStoreToStackSlot(
+ const MachineInstr &MI,
+ SmallVectorImpl<const MachineMemOperand *> &Accesses) const {
+ if (MI.isBundle()) {
+ const size_t AccessesSize = Accesses.size();
+ // Accumulate all accesses from the bundled instructions.
+ for (auto It = std::next(MI.getIterator());; ++It) {
+ TargetInstrInfo::hasStoreToStackSlot(*It, Accesses);
+ if (!It->isBundledWithSucc())
+ break;
+ }
+ return Accesses.size() != AccessesSize;
+ }
+ return TargetInstrInfo::hasStoreToStackSlot(MI, Accesses);
+}
+
+// Return true if we can emit a select with scalar predicate.
+// Note that our hardware doesn't have a select instruction with a scalar
+// predicate, but we emit a pseudo op that gets lowered to a predicated move
+// post-RA.
+bool TPUInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
+ ArrayRef<MachineOperand> Cond,
+ Register DstReg, Register TrueReg,
+ Register FalseReg, int &CondCycles,
+ int &TrueCycles, int &FalseCycles) const {
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ if (MRI.getRegClass(TrueReg) != &TPU::GPRRegClass &&
+ MRI.getRegClass(TrueReg) != &TPU::VPRRegClass &&
+ MRI.getRegClass(TrueReg) != &TPU::MPRRegClass &&
+ MRI.getRegClass(TrueReg) != &TPU::PPRRegClass)
+ return false;
+ if (MRI.getRegClass(TrueReg) != MRI.getRegClass(FalseReg))
+ return false;
+
+ CondCycles = 1;
+ TrueCycles = 1;
+ FalseCycles = 1;
+ return true;
+}
+
+// Insert a pseudo select with scalar predicate. This gets lowered to the
+// corresponding predicated move post-RA.
+void TPUInstrInfo::insertSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, Register DstReg,
+ ArrayRef<MachineOperand> Cond, Register TrueReg,
+ Register FalseReg) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ auto &ST = MBB.getParent()->getSubtarget<TPUSubtarget>();
+ assert(Cond.size() == 2 && "Invalid Cond array");
+ unsigned Opc = TPU::PSEUDO_SELrr;
+ if (MRI.getRegClass(DstReg) == &TPU::GPRRegClass) {
+ Opc = TPU::PSEUDO_SELrr;
+ } else if (MRI.getRegClass(DstReg) == &TPU::MPRRegClass) {
+ Opc = TPU::PSEUDO_MSELrr;
+ } else if (MRI.getRegClass(DstReg) == &TPU::PPRRegClass) {
+ Opc = TPU::PSEUDO_PSELrr;
+ } else if (MRI.getRegClass(DstReg) == &TPU::VPRRegClass) {
+ if (ST.hasVectorPredicateSelect()) {
+ Opc = TPU::VPSELrr;
+ } else {
+ Opc = TPU::PSEUDO_VSELrr;
+ }
+ } else {
+ llvm_unreachable("unexpected register type for select");
+ }
+
+ // Create the right select Machine instruction based on the type.
+ // If the condition is reversed (cond[1]) swap the sources.
+ BuildMI(MBB, I, DL, get(Opc), DstReg)
+ .addReg(Cond[0].getReg())
+ .addReg(Cond[1].getImm() ? FalseReg : TrueReg)
+ .addReg(Cond[1].getImm() ? TrueReg : FalseReg);
+}
+
+bool TPUInstrInfo::isReallyTriviallyReMaterializable(
+ const MachineInstr &MI) const {
+ // This function should only be called for rematerializable instructions.
+ assert(MI.isRematerializable());
+ if (!RdWrDregWellDefined) {
+ if (MI.getOpcode() == TPU::scRDDREG) {
+ return false;
+ }
+ }
+ // Can be trivially rematerialized only if it doesn't have implicit
+ // operands.
+ return MI.getNumImplicitOperands() == 0;
+}
+
+bool TPUInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
+ const MachineBasicBlock *MBB,
+ const MachineFunction &MF) const {
+ // Use the standard target-independent scheduling boundaries plus
+ // CALLSEQ_START and CALLSEQ_END, to make sure we schedule copies to ABI
+ // parameter registers before a call and avoid register pressure failures.
+ auto &ST = MI.getMF()->getSubtarget<TPUSubtarget>();
+ if (!ST.isSparseCore()) {
+ if (MI.getOpcode() == TPU::CALLSEQ_START ||
+ MI.getOpcode() == TPU::CALLSEQ_END)
+ return true;
+ }
+ return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF);
+}
+
+bool TPUInstrInfo::isEvent(const MachineInstr &MI) const {
+ return MI.getOpcode() == TPU::EVENT || MI.getOpcode() == TPU::EVENT_NULLARY;
+}
+
+int TPUInstrInfo::getVRegAddressCalcOpNo(const MachineInstr *MI) const {
+ auto &ST = MI->getMF()->getSubtarget<TPUSubtarget>();
+ if (!ST.isSparseCore())
+ return -1;
+ if (MI->getOpcode() == TPU::BUNDLE)
+ return -1;
+ if (TPUInstrInfo::isIndirectVregStream(*MI))
+ return 4;
+ switch (MI->getOpcode()) {
+ default:
+ return -1;
+ case TPU::scVLD_IDX_MSK:
+ case TPU::scVLD_IDX_MSK_NP:
+ return 2;
+ case TPU::scVST_IDX_MSK:
+ case TPU::scVST_IDX_MSK_NP:
+ case TPU::scVST_IDX_MSK_ADD:
+ case TPU::scVST_IDX_MSK_ADDF:
+ case TPU::scVST_IDX_MSK_ADD_NP:
+ case TPU::scVST_IDX_MSK_ADDF_NP:
+ case TPU::scVST_CB_IDX_MSK_ADD:
+ case TPU::scVST_CB_IDX_MSK_ADDF:
+ case TPU::scVST_CB_IDX_MSK_ADD_NP:
+ case TPU::scVST_CB_IDX_MSK_ADDF_NP:
+ case TPU::scVST_CB_IDX_MSK:
+ case TPU::scVST_CB_IDX_MSK_NP:
+ case TPU::scVLD_IDX_MSK_STRIDEi:
+ case TPU::scVLD_IDX_MSK_STRIDEr:
+ case TPU::scVLD_IDX_MSK_STRIDEi_NP:
+ case TPU::scVLD_IDX_MSK_STRIDEr_NP:
+ case TPU::scVST_IDX_MSK_STRIDEr:
+ case TPU::scVST_IDX_MSK_STRIDEi:
+ case TPU::scVST_IDX_MSK_STRIDEr_NP:
+ case TPU::scVST_IDX_MSK_STRIDEi_NP:
+ return 2;
+ }
+}
+
+int TPUInstrInfo::getVMaskAddressCalcOpNo(const MachineInstr *MI) const {
+ auto &ST = MI->getMF()->getSubtarget<TPUSubtarget>();
+ if (!ST.isSparseCore())
+ return -1;
+ switch (MI->getOpcode()) {
+ default:
+ return -1;
+ case TPU::scVLD_IDX_MSK:
+ case TPU::scVLD_IDX_MSK_NP:
+ case TPU::scVST_IDX_MSK:
+ case TPU::scVST_IDX_MSK_NP:
+ case TPU::scVST_IDX_MSK_ADD:
+ case TPU::scVST_IDX_MSK_ADDF:
+ case TPU::scVST_IDX_MSK_ADD_NP:
+ case TPU::scVST_IDX_MSK_ADDF_NP:
+ case TPU::scVST_CB_IDX_MSK_ADD:
+ case TPU::scVST_CB_IDX_MSK_ADDF:
+ case TPU::scVST_CB_IDX_MSK_ADD_NP:
+ case TPU::scVST_CB_IDX_MSK_ADDF_NP:
+ case TPU::scVST_CB_IDX_MSK:
+ case TPU::scVST_CB_IDX_MSK_NP:
+ case TPU::scVLD_MSK_STRIDEri:
+ case TPU::scVLD_MSK_STRIDErr:
+ case TPU::scVLD_CB_MSK_STRIDEi:
+ case TPU::scVLD_CB_MSK_STRIDEr:
+ case TPU::scVLD_CB_UPD_MSK_STRIDEi:
+ case TPU::scVLD_CB_UPD_MSK_STRIDEr:
+ case TPU::scVLD_IDX_MSK_STRIDEi:
+ case TPU::scVLD_IDX_MSK_STRIDEr:
+ case TPU::scVLD_IDX_MSK_STRIDEi_NP:
+ case TPU::scVLD_IDX_MSK_STRIDEr_NP:
+ case TPU::scVST_MSK_STRIDEr:
+ case TPU::scVST_MSK_STRIDEi:
+ case TPU::scVST_MSK_ADD_STRIDEr:
+ case TPU::scVST_MSK_ADD_STRIDEi:
+ case TPU::scVST_MSK_ADDF_STRIDEr:
+ case TPU::scVST_MSK_ADDF_STRIDEi:
+ case TPU::scVST_IDX_MSK_STRIDEr:
+ case TPU::scVST_IDX_MSK_STRIDEi:
+ case TPU::scVST_IDX_MSK_STRIDEr_NP:
+ case TPU::scVST_IDX_MSK_STRIDEi_NP:
+ case TPU::scVST_CB_MSK_STRIDEr:
+ case TPU::scVST_CB_MSK_STRIDEi:
+ case TPU::scVST_CB_UPD_MSK_STRIDEr:
+ case TPU::scVST_CB_UPD_MSK_STRIDEi:
+ case TPU::scVST_CB_MSK_ADD_STRIDEr:
+ case TPU::scVST_CB_MSK_ADD_STRIDEi:
+ case TPU::scVST_CB_MSK_ADDF_STRIDEr:
+ case TPU::scVST_CB_MSK_ADDF_STRIDEi:
+ case TPU::scVST_CB_UPD_MSK_ADD_STRIDEr:
+ case TPU::scVST_CB_UPD_MSK_ADD_STRIDEi:
+ case TPU::scVST_CB_UPD_MSK_ADDF_STRIDEr:
+ case TPU::scVST_CB_UPD_MSK_ADDF_STRIDEi:
+ return 1;
+ }
+}
+
+int TPUInstrInfo::countVRegSrcs(const MachineInstr &MI) {
+ if (MI.isBranch() || MI.isCall())
+ // Shortcut, and also necessary to avoid counting call operands.
+ return 0;
+ int NumVRegs = 0;
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ for (int N = MI.getNumDefs(); N < MI.getNumOperands(); N++) {
+ if (!MI.getOperand(N).isReg())
+ continue;
+ Register R = MI.getOperand(N).getReg();
+ if (Register::isVirtualRegister(R)) {
+ if (MRI.getRegClass(R) == &TPU::VPRRegClass)
+ NumVRegs++;
+ } else {
+ if (TPU::VPRRegClass.contains(R))
+ NumVRegs++;
+ }
+ }
+ return NumVRegs;
+}
+
+int TPUInstrInfo::getAddressCalcDelay(MachineInstr *DefMI, MachineInstr *MI,
+ Register R) {
+ auto &ST = MI->getMF()->getSubtarget<TPUSubtarget>();
+ if (!ST.isSparseCore())
+ return 0;
+ int OpNo = ST.getInstrInfo()->getVRegAddressCalcOpNo(MI);
+ if (OpNo != -1) {
+ if (R == MI->getOperand(OpNo).getReg())
+ return ST.getVRegAddressCalcLatency(DefMI);
+ }
+ OpNo = ST.getInstrInfo()->getVMaskAddressCalcOpNo(MI);
+ // Please note that for a given register, it will either be a vreg or a
+ // vmask, not both.
+ if (OpNo != -1) {
+ if (R == MI->getOperand(OpNo).getReg())
+ return ST.getVMaskAddressCalcLatency(DefMI);
+ }
+ return 0;
+}
+
+bool TPUInstrInfo::needsPop3Word0Void(const TPUSubtarget &ST,
+ const MachineInstr &MI) const {
+ if (!ST.isVfcSparseCore())
+ return false;
+ if (MI.getOpcode() == TPU::scVDUPCNT || MI.getOpcode() == TPU::scVDUPCNTF ||
+ MI.getOpcode() == TPU::scVUNIQUE || MI.getOpcode() == TPU::scVUNIQUEF)
+ return true;
+ return false;
+}
+
+bool TPUInstrInfo::verifyInstruction(const MachineInstr &MI,
+ StringRef &ErrInfo) const {
+ const MCInstrDesc &MCID = MI.getDesc();
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ if (MI.isCopy() || MI.isPHI() || MI.isImplicitDef())
+ return TargetInstrInfo::verifyInstruction(MI, ErrInfo);
+ if (MCID.OpInfo) {
+ for (unsigned MONum = 0, E = MI.getNumOperands(); MONum != E; ++MONum) {
+ const MachineOperand &MO = MI.getOperand(MONum);
+ unsigned NumDefs = MCID.getNumDefs();
+ if (MONum < NumDefs) {
+ if (MO.isReg()) {
+ Register R = MO.getReg();
+ if (Register::isVirtualRegister(R)) {
+ if (MRI.getRegClass(R) == &TPU::VPRRegClass ||
+ MRI.getRegClass(R) == &TPU::MPRRegClass) {
+ if (!isVectorInstruction(MI.getDesc())) {
+ ErrInfo = "Expected vector instruction.\n";
+ LLVM_DEBUG(MI.dump());
+ return false;
+ }
+ }
+ } else if (TPU::VPRRegClass.contains(R) ||
+ TPU::MPRRegClass.contains(R)) {
+ if (!isVectorInstruction(MI.getDesc())) {
+ ErrInfo = "Expected vector instruction.\n";
+ LLVM_DEBUG(MI.dump());
+ return false;
+ }
+ }
+ }
+ // We let the existing TargetInstrInfo::verifyInstruction check the
+ // destinations.
+ } else if (MONum < MCID.getNumOperands()) {
+ bool IsOptional = MI.isVariadic() && MONum == MCID.getNumOperands() - 1;
+ if (!IsOptional) {
+ const MCOperandInfo &MCOI = MCID.OpInfo[MONum];
+ if (MCOI.OperandType == TPUOp::OPERAND_IMM_SINGLE ||
+ MCOI.OperandType == TPUOp::OPERAND_MEMSTRIDE ||
+ MCOI.OperandType == TPUOp::OPERAND_SUBLANEMASK ||
+ MCOI.OperandType == TPUOp::OPERAND_SCALAR_IMM32 ||
+ MCOI.OperandType == TPUOp::OPERAND_SYNC_IMM_SINGLE ||
+ MCOI.OperandType == TPUOp::OPERAND_TC_VLD_SHUFFLE_IMM32 ||
+ MCOI.OperandType == TPUOp::OPERAND_VECTOR_IMM32 ||
+ MCOI.OperandType == TPUOp::OPERAND_IMM_PLAIN) {
+ if (!MO.isImm() && !MO.isGlobal() && !MO.isFPImm() && !MO.isFI()) {
+ ErrInfo = "Expected an (fp)immediate operand or global.\n";
+ LLVM_DEBUG(MI.dump());
+ return false;
+ }
+ } else if (MCOI.OperandType == TPUOp::OPERAND_MEMOFFSET) {
+ if (!MO.isImm() && !MO.isGlobal()) {
+ ErrInfo = "Expected an immediate operand or global.\n";
+ LLVM_DEBUG(MI.dump());
+ return false;
+ }
+ }
+ }
+ }
+ }
+ }
+ if (isIndirectOrStridedStream(MCID)) {
+ if (MI.getOperand(7).isReg()) {
+ if (MI.getOperand(5).getReg() != MI.getOperand(7).getReg()) {
+ ErrInfo = "Indirect_list_size/length_per_stride and hbm4b offset "
+ "expected to be in the "
+ "same register.\n";
+ LLVM_DEBUG(MI.dump());
+ return false;
+ }
+ }
+ }
+ // On SparseCore, we've added these checks for consistency after finding
+ // mismatches with the expected mayLoad/mayStore properties. Once we're
+ // confident enough, we may decide to remove these checks.
+ bool mayLoad = MI.mayLoad();
+ bool mayStore = MI.mayStore();
+ switch (MI.getOpcode()) {
+ case TPU::scVLDri:
+ case TPU::scVLDi:
+ case TPU::scVLD_MSKi:
+ case TPU::scVLD_MSKri:
+ case TPU::scVLD_IDX_MSK_NP:
+ case TPU::scVLD_IDX_MSK:
+ case TPU::scVLD_CB_MSK:
+ case TPU::scVLD_CB_UPD_MSK:
+ case TPU::scVLD_MSK_STRIDErr:
+ case TPU::scVLD_MSK_STRIDEri:
+ case TPU::scVLD_CB_IDX_MSK_NP:
+ case TPU::scVLD_CB_IDX_MSK:
+ case TPU::scVLD_IDX_MSK_STRIDEr_NP:
+ case TPU::scVLD_IDX_MSK_STRIDEr:
+ case TPU::scVLD_IDX_MSK_STRIDEi_NP:
+ case TPU::scVLD_IDX_MSK_STRIDEi:
+ if (!mayLoad) {
+ ErrInfo = "Instruction should be mayLoad.\n";
+ LLVM_DEBUG(MI.dump());
+ return false;
+ }
+ if (mayStore) {
+ ErrInfo = "Instruction should not be mayStore.\n";
+ LLVM_DEBUG(MI.dump());
+ return false;
+ }
+ break;
+ case TPU::scVSTi:
+ case TPU::scVSTri:
+ case TPU::scVST_MSK:
+ case TPU::scVST_IDX_MSK_NP:
+ case TPU::scVST_IDX_MSK:
+ case TPU::scVST_CB_MSK:
+ case TPU::scVST_CB_UPD_MSK:
+ case TPU::scVST_MSK_STRIDEi:
+ case TPU::scVST_CB_IDX_MSK_NP:
+ case TPU::scVST_CB_IDX_MSK:
+ case TPU::scVST_MSK_STRIDEr:
+ case TPU::scVST_IDX_MSK_STRIDEr_NP:
+ case TPU::scVST_IDX_MSK_STRIDEr:
+ case TPU::scVST_IDX_MSK_STRIDEi_NP:
+ case TPU::scVST_IDX_MSK_STRIDEi:
+ if (mayLoad) {
+ ErrInfo = "Instruction should not be mayLoad.\n";
+ LLVM_DEBUG(MI.dump());
+ return false;
+ }
+ if (!mayStore) {
+ ErrInfo = "Instruction should be mayStore.\n";
+ LLVM_DEBUG(MI.dump());
+ return false;
+ }
+ break;
+ case TPU::scVST_MSK_ADDF:
+ case TPU::scVST_MSK_ADD:
+ case TPU::scVST_IDX_MSK_ADDF_NP:
+ case TPU::scVST_IDX_MSK_ADDF:
+ case TPU::scVST_IDX_MSK_ADD_NP:
+ case TPU::scVST_IDX_MSK_ADD:
+ case TPU::scVST_MSK_ADDF_STRIDEr:
+ case TPU::scVST_MSK_ADD_STRIDEr:
+ case TPU::scVST_MSK_ADDF_STRIDEi:
+ case TPU::scVST_MSK_ADD_STRIDEi:
+ case TPU::scVST_CB_IDX_MSK_ADDF_NP:
+ case TPU::scVST_CB_IDX_MSK_ADDF:
+ case TPU::scVST_CB_IDX_MSK_ADD_NP:
+ case TPU::scVST_CB_IDX_MSK_ADD:
+ if (!mayLoad) {
+ ErrInfo = "Instruction should be mayLoad.\n";
+ LLVM_DEBUG(MI.dump());
+ return false;
+ }
+ if (!mayStore) {
+ ErrInfo = "Instruction should be mayStore.\n";
+ LLVM_DEBUG(MI.dump());
+ return false;
+ }
+ break;
+ default:
+ break;
+ }
+ return TargetInstrInfo::verifyInstruction(MI, ErrInfo);
+}
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUInstrInfo.h b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUInstrInfo.h
new file mode 100644
index 0000000..af7abe6
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUInstrInfo.h
@@ -0,0 +1,932 @@
+//===------- TPUInstrInfo.h - TPU Instruction Information ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the TPU implementation of the TargetInstrInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_TPU_TPUINSTRINFO_H
+#define LLVM_LIB_TARGET_TPU_TPUINSTRINFO_H
+
+#include "MCTargetDesc/TPUBaseInfo.h"
+#include "TPU.h"
+#include "TPUMIRFormatter.h"
+#include "TPURegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "TPUGenInstrInfo.inc"
+
+namespace llvm {
+class TPUSubtarget;
+
+// TODO(hgreving): upstream a change that allows us to remove this second layer
+// class we're using here.
+class TPUPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
+public:
+ virtual MachineInstr *getIVUpdate() const = 0;
+ virtual MachineInstr *getCmp() const = 0;
+ virtual int64_t getTripCount() const = 0;
+ virtual bool isReverseCond() const = 0;
+ // Obsolete and buggy, don't use. FIXME(hgreving): fix upstream.
+ virtual std::optional<bool>
+ createTripCountGreaterCondition(int TC, MachineBasicBlock &MBB,
+ SmallVectorImpl<MachineOperand> &Cond) = 0;
+ virtual std::optional<bool>
+ createTripCountGreaterCondition(int TC, MachineBasicBlock &MBB,
+ MachineInstr *Cmp,
+ SmallVectorImpl<MachineOperand> &Cond) = 0;
+};
+
+class TPUInstrInfo : public TPUGenInstrInfo {
+ const TPURegisterInfo RegisterInfo;
+
+ // Contents of MCInstrDesc's TSFlags field.
+ enum TSFlags : uint64_t {
+ TSF_Reserved0 = 1ULL << 0,
+ TSF_Reserved1 = 1ULL << 1,
+ TSF_IsVectorInstruction = 1ULL << 2,
+ TSF_IsPush = 1ULL << 3,
+ TSF_IsPop = 1ULL << 4,
+ TSF_IsVMemLoadInstr = 1ULL << 5,
+ TSF_IsTransposeEnd = 1ULL << 6,
+ TSF_IsTranspose = 1ULL << 7,
+ TSF_IsPermute = 1ULL << 8,
+ TSF_IsReduce = 1ULL << 9,
+ TSF_IsPacked = 1ULL << 10,
+ TSF_IsSegmented = 1ULL << 11,
+ TSF_IsIndexedLoadStore = 1ULL << 12,
+ TSF_IsDwg = 1ULL << 13,
+ TSF_IsPackedMatMul = 1ULL << 14,
+ TSF_IsInVectorSlot = 1ULL << 15,
+ TSF_IsInScalarSlot = 1ULL << 16,
+ TSF_ImmediateRangeOffset = 17,
+ TSF_ImmediateRangeMask = 0x3FULL << 17,
+ TSF_ImmediateOperandsOffset = 23,
+ TSF_ImmediateOperandsMask = 0xFULL << 23,
+ TSF_IsVMemStoreInstr = 1ULL << 27,
+ TSF_IsFifoPseudoCopy = 1ULL << 28,
+ TSF_IsMXUInst = 1ULL << 29,
+ TSF_IsXLUInst = 1ULL << 30,
+ TSF_SupportsPopVoid = 1ULL << 31,
+ TSF_IsComposedErfFifo = 1ULL << 32,
+ TSF_IsComposedXrf0Fifo = 1ULL << 33,
+ TSF_IsComposedXrf1Fifo = 1ULL << 34,
+ TSF_IsComposedV2SFifo = 1ULL << 35,
+ TSF_IsComposedDrfFifo = 1ULL << 36,
+ TSF_IsStream = 1ULL << 37,
+ TSF_IsIndirectOrStridedStream = 1ULL << 38,
+ TSF_IsIndirectVregCbStream = 1ULL << 39,
+ TSF_IsIndirectVregStream = 1ULL << 40,
+ TSF_SupportsHwMask = 1ULL << 41,
+ TSF_IsNoParallel = 1ULL << 42,
+ TSF_IsDMA = 1ULL << 43,
+ TSF_IsCb = 1ULL << 44,
+ TSF_IsCbUpd = 1ULL << 45,
+ };
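+
+ // The multi-bit fields above are read by mask-and-shift; a minimal sketch,
+ // assuming Desc is the instruction's MCInstrDesc:
+ //   unsigned NumImmOps = (Desc.TSFlags & TSF_ImmediateOperandsMask) >>
+ //                        TSF_ImmediateOperandsOffset;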
+
+public:
+ TPUInstrInfo(const TPUSubtarget *ST, unsigned HwMode);
+
+ // Return true if MI requires immediate slots when bundled. Returns the
+ // mask of immediate slots to choose from and the mask of instruction
+ // operands requiring the slot.
+ static bool requiresImmediateSlots(const MachineInstr &MI, uint64_t &SMask,
+ uint64_t &OMask);
+
+ // Helper function that returns how much a transpose FIFO instruction
+ // increases the consumption of FIFO space.
+ // This is meant to model the weird behavior these instructions have
+ // in Jellyfish with regard to when they allocate the FIFO space for the
+ // result.
+ static int getTransposeFifoUsage(const MachineInstr &MI, int SequenceNum);
+
+ // Return true if the instruction is a vector instruction. This means it
+ // would go through the vector FIFO if the platform has one, and it has to
+ // use vector-to-scalar slots to read scalar registers.
+ static bool isVectorInstruction(const MCInstrDesc &MIDC) {
+ return MIDC.TSFlags & TSF_IsVectorInstruction;
+ }
+
+ // Return true if MIDC is a FIFO push.
+ static bool isFifoPush(const MCInstrDesc &MIDC) {
+ return MIDC.TSFlags & TSF_IsPush;
+ }
+
+ // Return true if MIDC is a FIFO pop.
+ static bool isFifoPop(const MCInstrDesc &MIDC) {
+ return MIDC.TSFlags & TSF_IsPop;
+ }
+
+ // Returns the FIFO operand number for a FIFO pop instruction. Assumes MIDC
+ // is a pop.
+ static int getFifoPopOperandNo(const MCInstrDesc &MIDC) {
+ assert(TPUInstrInfo::isFifoPop(MIDC));
+ switch (MIDC.getOpcode()) {
+ case TPU::scVPOP3_XRF0:
+ case TPU::scVPOP3_XRF1:
+ return 3;
+ case TPU::SPOP_V2SF:
+ case TPU::VPOPCNTr:
+ case TPU::SPOP_DRF:
+ case TPU::SPOP_SFRF:
+ case TPU::tcMXU0MATPOP:
+ case TPU::tcMXU1MATPOP:
+ case TPU::tcMXU2MATPOP:
+ case TPU::tcMXU3MATPOP:
+ case TPU::tcvfMXU0MATPOP:
+ case TPU::tcvfMXU1MATPOP:
+ case TPU::tcvfMXU2MATPOP:
+ case TPU::tcvfMXU3MATPOP:
+ case TPU::tcXLU0Pop:
+ case TPU::tcXLU1Pop:
+ case TPU::tcvfXLU0Pop:
+ case TPU::tcvfXLU1Pop:
+ case TPU::tcvfXLU2Pop:
+ case TPU::VRES_EUP:
+ case TPU::VRES_EUP_VRES0:
+ case TPU::VRES_EUP_VRES0_V0:
+ case TPU::VRES_EUP_VRES0_V1:
+ case TPU::VRES_EUP_VRES0_VAUX:
+ case TPU::VRES_EUP_VRES0_VLD:
+ case TPU::VRES_EUP_VRES1:
+ case TPU::VRES_EUP_VRES1_V0:
+ case TPU::VRES_EUP_VRES1_V1:
+ case TPU::VRES_EUP_VRES1_VAUX:
+ case TPU::VRES_EUP_VRES1_VLD:
+ return 1;
+ default:
+ // Performs a self-check in subtarget.
+ llvm_unreachable("Must handle all pop instructions.");
+ return -1;
+ }
+ }
+
+ // Return true if MIDC is a FIFO pop that supports writing to void.
+ static bool supportsFifoPopVoid(const MCInstrDesc &MIDC) {
+ return MIDC.TSFlags & TSF_SupportsPopVoid;
+ }
+
+ // Return true if MIDC is a composed ERF FIFO instruction.
+ static bool isComposedErfFifo(const MCInstrDesc &MIDC) {
+ return MIDC.TSFlags & TSF_IsComposedErfFifo;
+ }
+
+ // Return true if MIDC is a composed XRF0 FIFO instruction.
+ static bool isComposedXrf0Fifo(const MCInstrDesc &MIDC) {
+ return MIDC.TSFlags & TSF_IsComposedXrf0Fifo;
+ }
+
+ // Return true if MIDC is a composed XRF1 FIFO instruction.
+ static bool isComposedXrf1Fifo(const MCInstrDesc &MIDC) {
+ return MIDC.TSFlags & TSF_IsComposedXrf1Fifo;
+ }
+
+ // Return true if MIDC is a composed V2S FIFO instruction.
+ static bool isComposedV2SFifo(const MCInstrDesc &MIDC) {
+ return MIDC.TSFlags & TSF_IsComposedV2SFifo;
+ }
+
+ // Return true if MIDC is a composed DRF FIFO instruction.
+ static bool isComposedDrfFifo(const MCInstrDesc &MIDC) {
+ return MIDC.TSFlags & TSF_IsComposedDrfFifo;
+ }
+
+ // Return true if MIDC is a stream instruction.
+ static bool isStream(const MCInstrDesc &MIDC) {
+ return MIDC.TSFlags & TSF_IsStream;
+ }
+
+ // Return true if MIDC is a DMA instruction.
+ static bool isDMA(const MCInstrDesc &MIDC) {
+ return MIDC.TSFlags & TSF_IsDMA;
+ }
+
+ // Return true if MIDC is a scalar or vector circular buffer load/store.
+ static bool isCb(const MCInstrDesc &MIDC) { return MIDC.TSFlags & TSF_IsCb; }
+
+ // Return true if MIDC is a scalar or vector circular buffer load/store with
+ // update.
+ static bool isCbUpd(const MCInstrDesc &MIDC) {
+ return MIDC.TSFlags & TSF_IsCbUpd;
+ }
+
+ // Return true if MIDC is an indirect or strided stream instruction.
+ static bool isIndirectOrStridedStream(const MCInstrDesc &MIDC) {
+ assert(MIDC.TSFlags & TSF_IsStream ||
+ !(MIDC.TSFlags & TSF_IsIndirectOrStridedStream));
+ return MIDC.TSFlags & TSF_IsIndirectOrStridedStream;
+ }
+
+ // Return true if MIDC is an indirect_vreg.*.cb.* instruction.
+ static bool isIndirectVregCbStream(const MCInstrDesc &MIDC) {
+ assert(MIDC.TSFlags & TSF_IsStream ||
+ !(MIDC.TSFlags & TSF_IsIndirectVregCbStream));
+ return MIDC.TSFlags & TSF_IsIndirectVregCbStream;
+ }
+
+ // Return true if MIDC is an indirect_vreg.* instruction.
+ static bool isIndirectVregStream(const MCInstrDesc &MIDC) {
+ assert(MIDC.TSFlags & TSF_IsStream ||
+ !(MIDC.TSFlags & TSF_IsIndirectVregStream));
+ return MIDC.TSFlags & TSF_IsIndirectVregStream;
+ }
+
+ // Return true if MIDC supports a fixed hardware mask.
+ static bool supportsEmbeddedMask(const MCInstrDesc &MIDC) {
+ return MIDC.TSFlags & TSF_SupportsHwMask;
+ }
+
+ // Return true if MIDC is marked no parallel.
+ static bool isNoParallel(const MCInstrDesc &MIDC) {
+ return MIDC.TSFlags & TSF_IsNoParallel;
+ }
+
+ // Return true if MIDC is a composed FIFO instruction.
+ static bool isComposedFifo(const MCInstrDesc &MIDC) {
+ return isComposedErfFifo(MIDC) || isComposedXrf0Fifo(MIDC) ||
+ isComposedXrf1Fifo(MIDC) || isComposedV2SFifo(MIDC) ||
+ isComposedDrfFifo(MIDC);
+ }
+
+ static bool isNopBundle(MachineInstr *Bundle, unsigned nop) {
+ if (Bundle->getOpcode() != TPU::BUNDLE)
+ return false;
+ if (!Bundle->isBundledWithSucc())
+ return false;
+ if (Bundle->getBundleSize() > 1)
+ return false;
+ MachineInstr &MI = *std::next(Bundle->getIterator());
+ return MI.getOpcode() == nop;
+ }
+
+ static bool isVNopBundle(MachineInstr *Bundle) {
+ return isNopBundle(Bundle, TPU::VNOP);
+ }
+
+ static bool isVMemLoadInstr(const MCInstrDesc &MIDC) {
+ return MIDC.TSFlags & TSF_IsVMemLoadInstr;
+ }
+
+ static bool isVMemLoadInstr(MachineInstr *MI) {
+ return isVMemLoadInstr(MI->getDesc());
+ }
+
+ static bool isVMemStoreInstr(const MCInstrDesc &MIDC) {
+ return MIDC.TSFlags & TSF_IsVMemStoreInstr;
+ }
+
+ static bool isVMemStoreInstr(MachineInstr *MI) {
+ return isVMemStoreInstr(MI->getDesc());
+ }
+
+ static bool isTransposeEnd(const MCInstrDesc &MIDC) {
+ return MIDC.TSFlags & TSF_IsTransposeEnd;
+ }
+
+ static bool isTranspose(const MCInstrDesc &MIDC) {
+ return MIDC.TSFlags & TSF_IsTranspose;
+ }
+
+ static bool isPermute(const MCInstrDesc &MIDC) {
+ return MIDC.TSFlags & TSF_IsPermute;
+ }
+
+ static bool isReduce(const MCInstrDesc &MIDC) {
+ return MIDC.TSFlags & TSF_IsReduce;
+ }
+
+ static bool isPacked(const MCInstrDesc &MIDC) {
+ return MIDC.TSFlags & TSF_IsPacked;
+ }
+
+ static bool isSegmented(const MCInstrDesc &MIDC) {
+ return MIDC.TSFlags & TSF_IsSegmented;
+ }
+
+ static bool isIndexedLoadStore(const MCInstrDesc &MIDC) {
+ return MIDC.TSFlags & TSF_IsIndexedLoadStore;
+ }
+
+ static bool isPackedMatMul(const MCInstrDesc &MIDC) {
+ return MIDC.TSFlags & TSF_IsPackedMatMul;
+ }
+
+ static bool isInVectorSlot(const MCInstrDesc &MIDC) {
+ return MIDC.TSFlags & TSF_IsInVectorSlot;
+ }
+
+ static bool isInScalarSlot(const MCInstrDesc &MIDC) {
+ return MIDC.TSFlags & TSF_IsInScalarSlot;
+ }
+
+ // Return true if the instruction is a vector instruction.
+ static bool isVectorInstruction(const MachineInstr &MI) {
+ assert(MI.getOpcode() != TPU::BUNDLE);
+ return isVectorInstruction(MI.getDesc());
+ }
+
+ // Return true if MI is a FIFO push.
+ static bool isFifoPush(const MachineInstr &MI) {
+ if (MI.getOpcode() != TPU::BUNDLE)
+ return MI.getDesc().TSFlags & TSF_IsPush;
+ auto E = MI.getParent()->instr_end();
+ for (auto I = std::next(MI.getIterator()); I != E && I->isInsideBundle();
+ ++I)
+ if (isFifoPush(I->getDesc()))
+ return true;
+ return false;
+ }
+
+ // Return true if MI is a FIFO pop.
+ static bool isFifoPop(const MachineInstr &MI) {
+ if (MI.getOpcode() != TPU::BUNDLE)
+ return MI.getDesc().TSFlags & TSF_IsPop;
+ auto E = MI.getParent()->instr_end();
+ for (auto I = std::next(MI.getIterator()); I != E && I->isInsideBundle();
+ ++I)
+ if (isFifoPop(I->getDesc()))
+ return true;
+ return false;
+ }
+
+ // Returns the FIFO operand number for a FIFO pop instruction. Assumes MI is a
+ // pop instruction.
+ static int getFifoPopOperandNo(const MachineInstr &MI) {
+ return TPUInstrInfo::getFifoPopOperandNo(MI.getDesc());
+ }
+
+ // Return true if MI is a FIFO pop that supports writing to void.
+ static bool supportsFifoPopVoid(const MachineInstr &MI) {
+ assert(MI.getOpcode() != TPU::BUNDLE);
+ return supportsFifoPopVoid(MI.getDesc());
+ }
+
+ // Return true if MI is a composed ERF FIFO instruction.
+ static bool isComposedErfFifo(const MachineInstr &MI) {
+ assert(MI.getOpcode() != TPU::BUNDLE);
+ return isComposedErfFifo(MI.getDesc());
+ }
+
+ // Return true if MI is a composed XRF0 FIFO instruction.
+ static bool isComposedXrf0Fifo(const MachineInstr &MI) {
+ assert(MI.getOpcode() != TPU::BUNDLE);
+ return isComposedXrf0Fifo(MI.getDesc());
+ }
+
+ // Return true if MI is a composed XRF1 FIFO instruction.
+ static bool isComposedXrf1Fifo(const MachineInstr &MI) {
+ assert(MI.getOpcode() != TPU::BUNDLE);
+ return isComposedXrf1Fifo(MI.getDesc());
+ }
+
+ // Return true if MI is a composed V2S FIFO instruction.
+ static bool isComposedV2SFifo(const MachineInstr &MI) {
+ assert(MI.getOpcode() != TPU::BUNDLE);
+ return isComposedV2SFifo(MI.getDesc());
+ }
+
+ // Return true if MI is a composed DRF FIFO instruction.
+ static bool isComposedDrfFifo(const MachineInstr &MI) {
+ assert(MI.getOpcode() != TPU::BUNDLE);
+ return isComposedDrfFifo(MI.getDesc());
+ }
+
+ // Return true if MI is a stream instruction.
+ static bool isStream(const MachineInstr &MI) {
+ assert(MI.getOpcode() != TPU::BUNDLE);
+ return isStream(MI.getDesc());
+ }
+
+ // Return true if MI is a DMA instruction.
+ static bool isDMA(const MachineInstr &MI) {
+ assert(MI.getOpcode() != TPU::BUNDLE);
+ return isDMA(MI.getDesc());
+ }
+
+ // Return true if MI is a scalar or vector circular buffer load/store.
+ static bool isCb(const MachineInstr &MI) {
+ assert(MI.getOpcode() != TPU::BUNDLE);
+ return isCb(MI.getDesc());
+ }
+
+ // Return true if MI is a scalar or vector circular buffer load/store with
+ // update.
+ static bool isCbUpd(const MachineInstr &MI) {
+ assert(MI.getOpcode() != TPU::BUNDLE);
+ return isCbUpd(MI.getDesc());
+ }
+
+ // Return true if MI is an indirect or strided stream instruction.
+ static bool isIndirectOrStridedStream(const MachineInstr &MI) {
+ assert(MI.getOpcode() != TPU::BUNDLE);
+ return isIndirectOrStridedStream(MI.getDesc());
+ }
+
+ // Return true if MI is an indirect_vreg.*.cb.* instruction.
+ static bool isIndirectVregCbStream(const MachineInstr &MI) {
+ assert(MI.getOpcode() != TPU::BUNDLE);
+ return isIndirectVregCbStream(MI.getDesc());
+ }
+
+ // Return true if MI is an indirect_vreg.* instruction.
+ static bool isIndirectVregStream(const MachineInstr &MI) {
+ assert(MI.getOpcode() != TPU::BUNDLE);
+ return isIndirectVregStream(MI.getDesc());
+ }
+
+ // Return true if MI supports a fixed hardware mask.
+ static bool supportsEmbeddedMask(const MachineInstr &MI) {
+ assert(MI.getOpcode() != TPU::BUNDLE);
+ return supportsEmbeddedMask(MI.getDesc());
+ }
+
+ // Return true if MI is marked no parallel.
+ static bool isNoParallel(const MachineInstr &MI) {
+ assert(MI.getOpcode() != TPU::BUNDLE);
+ return isNoParallel(MI.getDesc());
+ }
+
+ // Return true if MI is a composed FIFO instruction.
+ static bool isComposedFifo(const MachineInstr &MI) {
+ assert(MI.getOpcode() != TPU::BUNDLE);
+ return isComposedFifo(MI.getDesc());
+ }
+
+ static bool isFifoPseudoCopy(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & TSF_IsFifoPseudoCopy;
+ }
+
+ static bool isScBundleLimiter(const MachineInstr &MI) {
+ return MI.getOpcode() == TPU::scBUNDLE;
+ }
+
+ static bool isScTrap(const MachineInstr &MI) {
+ return MI.getOpcode() == TPU::scTRAPr || MI.getOpcode() == TPU::scTRAPi ||
+ MI.getOpcode() == TPU::scPSEUDO_TRAPr ||
+ MI.getOpcode() == TPU::scPSEUDO_TRAPi;
+ }
+
+ static bool isDWGInst(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & TSF_IsDwg;
+ }
+
+ static bool isMXUInst(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & TSF_IsMXUInst;
+ }
+
+ static bool isXLUInst(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & TSF_IsXLUInst;
+ }
+
+ static bool isSfence(const MachineInstr *MI) {
+ assert(MI->getOpcode() != TPU::BUNDLE);
+ return MI->getOpcode() == TPU::SFENCE ||
+ MI->getOpcode() == TPU::SFENCE_SCMF ||
+ MI->getOpcode() == TPU::SFENCE_SELr ||
+ MI->getOpcode() == TPU::SFENCE_SELi;
+ }
+
+ static bool isSfenceImem(const MachineInstr *MI) {
+ assert(MI->getOpcode() != TPU::BUNDLE);
+ return MI->getOpcode() == TPU::SFENCE_IMEM;
+ }
+
+ static bool isBR(const MachineInstr *MI) {
+ assert(MI->getOpcode() != TPU::BUNDLE);
+ return MI->getOpcode() == TPU::BR || MI->getOpcode() == TPU::BRcond ||
+ MI->getOpcode() == TPU::BRClr || MI->getOpcode() == TPU::BRcondClr ||
+ MI->getOpcode() == TPU::bcBR || MI->getOpcode() == TPU::BRret;
+ }
+
+ // Returns whether opcode is a spill to stack, excluding pre-spiller generated
+ // spills.
+ static bool isSpillStack(const MachineInstr *MI) {
+ assert(MI->getOpcode() != TPU::BUNDLE);
+ return MI->getOpcode() == TPU::SPILL_GPRs ||
+ MI->getOpcode() == TPU::SPILL_MPRs ||
+ MI->getOpcode() == TPU::SPILL_PPRs ||
+ MI->getOpcode() == TPU::SPILL_VPRs;
+ }
+
+ // Returns whether opcode is a spill, excluding pre-spiller generated spills.
+ static bool isSpill(const MachineInstr *MI) {
+ assert(MI->getOpcode() != TPU::BUNDLE);
+ return MI->getOpcode() == TPU::SPILL_GPR ||
+ MI->getOpcode() == TPU::SPILL_MPR ||
+ MI->getOpcode() == TPU::SPILL_PPR ||
+ MI->getOpcode() == TPU::SPILL_VPR || isSpillStack(MI);
+ }
+
+ // Returns whether opcode is a fill from stack, excluding pre-spiller
+ // generated spills.
+ static bool isRestoreStack(const MachineInstr *MI) {
+ assert(MI->getOpcode() != TPU::BUNDLE);
+ return MI->getOpcode() == TPU::RESTORE_GPRs ||
+ MI->getOpcode() == TPU::RESTORE_MPRs ||
+ MI->getOpcode() == TPU::RESTORE_PPRs ||
+ MI->getOpcode() == TPU::RESTORE_VPRs;
+ }
+
+ // Returns whether opcode is a fill, excluding pre-spiller generated spills.
+ static bool isRestore(const MachineInstr *MI) {
+ assert(MI->getOpcode() != TPU::BUNDLE);
+ return MI->getOpcode() == TPU::RESTORE_GPR ||
+ MI->getOpcode() == TPU::RESTORE_MPR ||
+ MI->getOpcode() == TPU::RESTORE_PPR ||
+ MI->getOpcode() == TPU::RESTORE_VPR || isRestoreStack(MI);
+ }
+
+ // Returns whether opcode is a pre-spiller generated stack spill.
+ static bool isPreSpillStack(const MachineInstr *MI) {
+ assert(MI->getOpcode() != TPU::BUNDLE);
+ return MI->getOpcode() == TPU::SPILL_GPR_Ps ||
+ MI->getOpcode() == TPU::SPILL_MPR_Ps ||
+ MI->getOpcode() == TPU::SPILL_PPR_Ps ||
+ MI->getOpcode() == TPU::SPILL_VPR_Ps;
+ }
+
+ // Returns whether opcode is a pre-spiller generated spill.
+ static bool isPreSpill(const MachineInstr *MI) {
+ assert(MI->getOpcode() != TPU::BUNDLE);
+ return MI->getOpcode() == TPU::SPILL_GPR_P ||
+ MI->getOpcode() == TPU::SPILL_MPR_P ||
+ MI->getOpcode() == TPU::SPILL_PPR_P ||
+ MI->getOpcode() == TPU::SPILL_VPR_P || isPreSpillStack(MI);
+ }
+
+ // Returns whether opcode is a pre-spiller generated stack fill.
+ static bool isPreRestoreStack(const MachineInstr *MI) {
+ assert(MI->getOpcode() != TPU::BUNDLE);
+ return MI->getOpcode() == TPU::RESTORE_GPR_Ps ||
+ MI->getOpcode() == TPU::RESTORE_MPR_Ps ||
+ MI->getOpcode() == TPU::RESTORE_PPR_Ps ||
+ MI->getOpcode() == TPU::RESTORE_VPR_Ps;
+ }
+
+ // Returns whether opcode is a pre-spiller generated fill.
+ static bool isPreRestore(const MachineInstr *MI) {
+ assert(MI->getOpcode() != TPU::BUNDLE);
+ return MI->getOpcode() == TPU::RESTORE_GPR_P ||
+ MI->getOpcode() == TPU::RESTORE_MPR_P ||
+ MI->getOpcode() == TPU::RESTORE_PPR_P ||
+ MI->getOpcode() == TPU::RESTORE_VPR_P || isPreRestoreStack(MI);
+ }
+
+ // Return the first instruction in a bundle that has one of the opcodes
+ // in the opcode set.
+ static MachineInstr *getFirstFromBundle(MachineInstr *MI,
+ ArrayRef<unsigned> Opcodes);
+
+ // Returns address calculation delay for a potential register use R in MI
+ // which was defined by DefMI, or 0.
+ static int getAddressCalcDelay(MachineInstr *DefMI, MachineInstr *MI,
+ Register R);
+
+ // Returns true for the opcodes that need word_0 to be unused on the
+ // subtarget. This is a somewhat flaky workaround for being able to reuse the
+ // intrinsics between VFC and GLC on SparseCore.
+ bool needsPop3Word0Void(const TPUSubtarget &ST, const MachineInstr &MI) const;
+
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Position,
+ Register SourceRegister, bool IsKill, int FrameIndex,
+ const TargetRegisterClass *RegisterClass,
+ const TargetRegisterInfo *RegisterInfo,
+ Register VReg) const override;
+
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Position,
+ Register DestinationRegister, int FrameIndex,
+ const TargetRegisterClass *RegisterClass,
+ const TargetRegisterInfo *RegisterInfo,
+ Register VReg) const override;
+
+ bool hasLoadFromStackSlot(
+ const MachineInstr &MI,
+ SmallVectorImpl<const MachineMemOperand *> &Accesses) const override;
+
+ bool hasStoreToStackSlot(
+ const MachineInstr &MI,
+ SmallVectorImpl<const MachineMemOperand *> &Accesses) const override;
+
+ bool eliminateSpillOpcode(MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ case TPU::RESTORE_GPR:
+ case TPU::RESTORE_GPR_P:
+ MI.setDesc(get(TPU::SLDi));
+ break;
+ case TPU::RESTORE_GPR_Ps:
+ case TPU::RESTORE_GPRs:
+ MI.setDesc(get(TPU::SLDri));
+ break;
+ case TPU::RESTORE_VPR:
+ case TPU::RESTORE_VPR_P:
+ MI.setDesc(get(TPU::scVLDi));
+ break;
+ case TPU::RESTORE_VPR_Ps:
+ case TPU::RESTORE_VPRs:
+ MI.setDesc(get(TPU::scVLDri));
+ break;
+ case TPU::SPILL_GPR:
+ case TPU::SPILL_GPR_P:
+ MI.setDesc(get(TPU::SSTi));
+ break;
+ case TPU::SPILL_GPR_Ps:
+ case TPU::SPILL_GPRs:
+ llvm_unreachable("Can't handle this spill opcode here.");
+ break;
+ case TPU::SPILL_VPR:
+ case TPU::SPILL_VPR_P:
+ MI.setDesc(get(TPU::scVSTi));
+ break;
+ case TPU::SPILL_VPR_Ps:
+ case TPU::SPILL_VPRs:
+ MI.setDesc(get(TPU::scVSTri));
+ break;
+ case TPU::SPILL_GPR_ADD:
+ MI.setDesc(get(TPU::ADDri));
+ break;
+ default:
+ return false;
+ }
+ return true;
+ }
+
+ bool convertToPreSpillOpcode(MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ case TPU::RESTORE_GPR:
+ MI.setDesc(get(TPU::RESTORE_GPR_P));
+ break;
+ case TPU::RESTORE_GPRs:
+ MI.setDesc(get(TPU::RESTORE_GPR_Ps));
+ break;
+ case TPU::RESTORE_VPR:
+ MI.setDesc(get(TPU::RESTORE_VPR_P));
+ break;
+ case TPU::RESTORE_VPRs:
+ MI.setDesc(get(TPU::RESTORE_VPR_Ps));
+ break;
+ case TPU::SPILL_GPR:
+ MI.setDesc(get(TPU::SPILL_GPR_P));
+ break;
+ case TPU::SPILL_GPRs:
+ MI.setDesc(get(TPU::SPILL_GPR_Ps));
+ break;
+ case TPU::SPILL_VPR:
+ MI.setDesc(get(TPU::SPILL_VPR_P));
+ break;
+ case TPU::SPILL_VPRs:
+ MI.setDesc(get(TPU::SPILL_VPR_Ps));
+ break;
+ default:
+ return false;
+ }
+ return true;
+ }
+
+ // getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ // such, whenever a client has an instance of instruction info, it should
+ // always be able to get register info as well (through this method).
+ virtual const TPURegisterInfo &getRegisterInfo() const {
+ return RegisterInfo;
+ }
+
+ // Create a ScheduleHazardRecognizer that:
+ // 1. Enforces bundles are correctly packed;
+ // 2. Ensures bundle slots are reserved for a BRrel instruction in all
+ // bundles between a BRreserve and its corresponding BR.
+ ScheduleHazardRecognizer *
+ CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
+ const ScheduleDAG *DAG) const override;
+
+ ScheduleHazardRecognizer *
+ CreateTargetMIHazardRecognizer(const InstrItineraryData *,
+ const ScheduleDAGMI *DAG) const override;
+
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ unsigned isLoadFromStackSlotPostFE(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator Position,
+ const DebugLoc &DL, MCRegister DestinationRegister,
+ MCRegister SourceRegister, bool KillSource) const override;
+
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
+
+ bool getMemOperandsWithOffsetWidth(
+ const MachineInstr &LdSt,
+ SmallVectorImpl<const MachineOperand *> &BaseOps, int64_t &Offset,
+ bool &OffsetIsScalable, unsigned &Width,
+ const TargetRegisterInfo *TRI) const override;
+
+ bool getMemOperandWithOffsetWidth(MachineInstr &LdSt, MachineOperand *&BaseOp,
+ int64_t &Offset, unsigned &Width,
+ const TargetRegisterInfo *TRI) const;
+
+ std::pair<unsigned, unsigned>
+ decomposeMachineOperandsTargetFlags(unsigned TF) const override;
+
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableDirectMachineOperandTargetFlags() const override;
+
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TrueBlock,
+ MachineBasicBlock *&FalseBlock,
+ SmallVectorImpl<MachineOperand> &Condition,
+ bool AllowModify) const override;
+
+ unsigned removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved = nullptr) const override;
+
+ bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg,
+ MachineRegisterInfo *MRI) const override;
+
+ // Returns true if MBB's original basic block's terminator has
+ // llvm.loop.parallel_accesses metadata attached.
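+ // For example (illustrative IR), a loop latch terminator such as
+ //   br i1 %c, label %body, label %exit, !llvm.loop !0
+ // with
+ //   !0 = distinct !{!0, !1}
+ //   !1 = !{!"llvm.loop.parallel_accesses", !2}
+ //   !2 = distinct !{}
+ // makes this return true.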
+ static bool isLoopParallel(MachineBasicBlock &MBB);
+
+ // Returns true if MBB's original basic block's terminator has
+ // llvm.loop.pipeline.disable metadata attached.
+ static bool isLoopPipelineDisabled(MachineBasicBlock &MBB);
+
+ // Updates any terminator with target FromMBB to using ToMBB. Returns true if
+ // terminators were updated.
+ static bool updateTerminator(MachineBasicBlock &MBB,
+ MachineBasicBlock *FromMBB,
+ MachineBasicBlock *ToMBB);
+
+ // For a comparison instruction, return the source registers in SrcReg and
+ // SrcReg2 if having two register operands, and the value it compares against
+ // in CmpValue. Return true if the comparison instruction can be analyzed.
+ bool analyzeCompare(const MachineInstr &MI, Register &SrcReg,
+ Register &SrcReg2, int64_t &CmpMask,
+ int64_t &CmpValue) const override;
+
+ // See if the comparison instruction can be converted into something more
+ // efficient. E.g., on TPU register-register instructions can set the
+ // flag register, obviating the need for a separate compare.
+ bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
+ Register SrcReg2, int64_t CmpMask, int64_t CmpValue,
+ const MachineRegisterInfo *MRI) const override;
+
+ // Analyze the given select instruction, returning true if it cannot be
+ // understood. It is assumed that MI->isSelect() is true.
+ //
+ // When successful, return the controlling condition and the operands that
+ // determine the true and false result values.
+ //
+ // Result = SELECT Cond, TrueOp, FalseOp
+ //
+ // TPU can optimize certain select instructions, for example by predicating
+ // the instruction that defines one of the operands; in that case Optimizable
+ // is set to true.
+ bool analyzeSelect(const MachineInstr &MI,
+ SmallVectorImpl<MachineOperand> &Cond, unsigned &TrueOp,
+ unsigned &FalseOp, bool &Optimizable) const override;
+
+ // Given a select instruction that was understood by analyzeSelect and
+ // returned Optimizable = true, attempt to optimize MI by merging it with one
+ // of its operands. Returns NULL on failure.
+ //
+ // When successful, returns the new select instruction. The client is
+ // responsible for deleting MI.
+ //
+ // If both sides of the select can be optimized, the TrueOp is modified.
+ // PreferFalse is not used.
+ MachineInstr *optimizeSelect(MachineInstr &MI,
+ SmallPtrSetImpl<MachineInstr *> &SeenMIs,
+ bool PreferFalse) const override;
+
+ // Analyze whether the loop is suitable for software pipelining.
+ std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
+ analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override;
+
+ // Analyze whether the loop is suitable for speculative superimposed software
+ // pipelining.
+ std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
+ analyzeLoopForSSIPipelining(MachineBasicBlock *MBB, MachineLoop *Loop) const;
+
+ MachineInstr *analyzeIVUpdateforPipelining(MachineBasicBlock *LoopBB,
+ MachineInstr *CmpInst,
+ MachineLoop *Loop) const;
+
+ bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
+ MachineInstr *&CmpInst) const override;
+
+ unsigned reduceLoopCount(MachineBasicBlock &MBB, MachineBasicBlock &PreHeader,
+ MachineInstr *IndVar, MachineInstr &Cmp,
+ SmallVectorImpl<MachineOperand> &Cond,
+ SmallVectorImpl<MachineInstr *> &PrevInsts,
+ unsigned Iter, unsigned MaxIter) const override;
+
+ bool reverseBranchCondition(
+ SmallVectorImpl<MachineOperand> &Condition) const override;
+
+ unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TrueBlock,
+ MachineBasicBlock *FalseBlock,
+ ArrayRef<MachineOperand> Condition, const DebugLoc &DL,
+ int *BytesAdded = nullptr) const override;
+
+ bool PredicateInstruction(MachineInstr &MI,
+ ArrayRef<MachineOperand> Pred) const override;
+
+ bool isPredicable(const MachineInstr &MI) const override;
+
+ bool canPredicatePredicatedInstr(const MachineInstr &MI) const override;
+
+ void swapSelOperands(MachineInstr &MI) const;
+
+ /// Returns true if the first specified predicate
+ /// subsumes the second, e.g. GE subsumes GT.
+ bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
+ ArrayRef<MachineOperand> Pred2) const override;
+
+ /// If the specified instruction defines any predicate
+ /// or condition code register(s) used for predication, returns true as well
+ /// as the definition predicate(s) by reference.
+ bool ClobbersPredicate(MachineInstr &MI, std::vector<MachineOperand> &Pred,
+ bool SkipDead) const override;
+
+ bool isPredicated(const MachineInstr &MI) const override;
+
+ bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+ unsigned ExtraPredCycles,
+ BranchProbability Probability) const override;
+
+ bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
+ const MachineInstr &MIb) const override;
+
+ bool canInsertSelect(const MachineBasicBlock &MBB,
+ ArrayRef<MachineOperand> Cond, Register DstReg,
+ Register TrueReg, Register FalseReg, int &CondCycles,
+ int &TrueCycles, int &FalseCycles) const override;
+
+ void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, Register DstReg,
+ ArrayRef<MachineOperand> Cond, Register TrueReg,
+ Register FalseReg) const override;
+
+ bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override;
+
+ bool isSchedulingBoundary(const MachineInstr &MI,
+ const MachineBasicBlock *MBB,
+ const MachineFunction &MF) const override;
+
+ bool isEvent(const MachineInstr &MI) const;
+
+ struct MemAccessInfo {
+ Register Base = 0;
+ Register DynamicStride = 0;
+ SmallVector<int, 8> Offsets;
+ };
+
+ bool getMemAccessInfo(const TPUSubtarget &ST, const MachineInstr &MI,
+ MemAccessInfo &MA) const;
+
+ // Returns the operand number if instruction has a vreg operand subject to
+ // address calculation latency. Returns -1 otherwise.
+ int getVRegAddressCalcOpNo(const MachineInstr *MI) const;
+
+ // Returns the operand number if instruction has a mask register operand
+ // subject to address calculation latency. Returns -1 otherwise.
+ int getVMaskAddressCalcOpNo(const MachineInstr *MI) const;
+
+ const MIRFormatter *getMIRFormatter() const override {
+ if (!TPUFormatter)
+ TPUFormatter = std::make_unique<TPUMIRFormatter>();
+ return TPUFormatter.get();
+ }
+
+ // Checks for TPU-specific properties of MI.
+ bool verifyInstruction(const MachineInstr &MI,
+ StringRef &ErrInfo) const override;
+
+ // Returns the number of source operands reading from a VReg. MI's operands
+ // can be either virtual or physical registers.
+ static int countVRegSrcs(const MachineInstr &MI);
+
+private:
+ mutable std::unique_ptr<TPUMIRFormatter> TPUFormatter;
+
+ // Returns true if MBB's original basic block's terminator has
+ // metadata as in MetaString attached.
+ static bool isLoopMetadata(MachineBasicBlock &MBB, StringRef MS);
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_TPU_TPUINSTRINFO_H
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUInstrInfo.td b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUInstrInfo.td
new file mode 100644
index 0000000..a7595c0
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUInstrInfo.td
@@ -0,0 +1,2182 @@
+//===------ TPUInstrInfo.td - Target Description for TPU Target -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the TPU instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Subtarget features
+//===----------------------------------------------------------------------===//
+
+def HasV8 : Predicate<"ST->hasV8()">;
+def HasV16 : Predicate<"ST->hasV16()">;
+def HasV1024 : Predicate<"ST->hasV1024()">;
+def HasLPVF : Predicate<"ST->hasLPVF()">;
+def HasLPGL : Predicate<"ST->hasLPGL()">;
+def HasTranscendental : Predicate<"ST->hasTranscendental()">;
+def HasVPU : Predicate<"ST->hasV1024() || ST->hasV8() || ST->hasV16()">;
+def HasMXU : Predicate<"ST->hasMXU()">;
+def HasPfcJfcDfcTensorCore :
+ Predicate<"ST->hasPfcTensorCore() || ST->hasJfcTensorCore() ||"
+ "ST->hasDfcTensorCore()">,
+ AssemblerPredicate<(any_of FeatureHasPfcTensorCore, FeatureHasJfcTensorCore,
+ FeatureHasDfcTensorCore)>;
+def HasJfcDfcTensorCore :
+ Predicate<"ST->hasJfcTensorCore() || ST->hasDfcTensorCore()">,
+ AssemblerPredicate<(any_of FeatureHasJfcTensorCore, FeatureHasDfcTensorCore)>;
+def HasPxcVPU : Predicate<"ST->hasPxcVPU()">,
+ AssemblerPredicate<(all_of FeatureHasPxcVPU)>;
+def NotPxcVPU : Predicate<"!ST->hasPxcVPU()">,
+ AssemblerPredicate<(all_of (not FeatureHasPxcVPU))>;
+def HasScalarSflags : Predicate<"ST->hasScalarSflags()">;
+def HasVectorSflags : Predicate<"ST->hasVectorSflags()">;
+def HasMaskPermute : Predicate<"ST->hasMaskPermute()">;
+// "Normal" TPU instructions don't work on BarnaCore. In particular BarnaCore
+// has a "pipeline stage" operand on most instructions that has no corollary
+// in TensorCore.
+def NotBC : Predicate<"!ST->isPxcBarnaCore()">,
+ AssemblerPredicate<(all_of (not FeatureHasBcChannelControllerIsa))>;
+def IsBC : Predicate<"ST->isPxcBarnaCore()">,
+ AssemblerPredicate<(all_of FeatureHasBcChannelControllerIsa)>;
+def HasSMul24 : Predicate<"ST->hasSMul24()">,
+ AssemblerPredicate<(all_of FeatureHasSMul24)>;
+def HasSMul32 : Predicate<"ST->hasSMul32()">,
+ AssemblerPredicate<(all_of FeatureHasSMul32)>;
+def UseGsftForXpose : Predicate<"ST->hasGsft()">;
+def UseGsfnForXpose : Predicate<"!ST->hasGsft()">;
+def IsSC : Predicate<"ST->isSparseCore()">;
+def HasVMinMax : Predicate<"ST->hasVMinMax()">;
+def NotSC : Predicate<"!ST->isSparseCore()">;
+def IsVFTC : Predicate<"ST->hasVfcTensorCore()">,
+ AssemblerPredicate<(all_of FeatureHasVfcTensorCore)>;
+def NotVFTC : Predicate<"!ST->hasVfcTensorCore()">,
+ AssemblerPredicate<(all_of (not FeatureHasVfcTensorCore))>;
+def IsVFTCOrSC :
+ Predicate<"ST->hasVfcTensorCore() || ST->isSparseCore()">,
+ AssemblerPredicate<(any_of FeatureHasVfcTensorCore, FeatureHasVfcSparsecoreIsa,
+ FeatureHasGlcSparsecoreIsa, FeatureHasGfcSparsecoreIsa)>;
+def IsVFTCOrVFSC :
+ Predicate<"ST->hasVfcTensorCore() || ST->isVfcSparseCore()">,
+ AssemblerPredicate<(any_of FeatureHasVfcTensorCore, FeatureHasVfcSparsecoreIsa)>;
+def HasFC : Predicate<"ST->isTPUABIEnabled()">;
+def NoFC : Predicate<"!ST->isTPUABIEnabled()">;
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+include "TPUInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// SDNode types
+//===----------------------------------------------------------------------===//
+
+def SDT_Splat : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisEltOfVec<1, 0>]>;
+// This Splat is specifically for MVT::v32i8 splats, since MVT::i8 is not a
+// legal scalar type.
+def SDT_Splatv32i8 : SDTypeProfile<1, 1, [SDTCisVT<0, vNb8>, SDTCisVT<1, i32>]>;
+def SDT_Vrotdown : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<1, 0>,
+ SDTCisVT<2, i32>]>;
+def SDT_Wrapper : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisSameAs<1, 0>]>;
+def SDT_Vbroadcast : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<1, 0>,
+ SDTCisVT<2, i32>]>;
+def SDT_Vpermute : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<1, 0>,
+ SDTCisVT<2, vNi32>]>;
+def SDT_BcInsertValue : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<1, 0>,
+ SDTCisSameAs<1, 0>]>;
+def SDT_BcExtractValue : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<1, 0>]>;
+def SDT_Umul24 : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 0>, SDTCisSameAs<2, 0>]>;
+
+// shalt is used as the return instruction so is variadic and has chain and
+// glue.
+def Halt : SDNode<"TPUISD::HALT", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def Splat : SDNode<"TPUISD::SPLAT", SDT_Splat, []>;
+def Splatv32i8 : SDNode<"TPUISD::SPLAT", SDT_Splatv32i8, []>;
+def Vrotdown : SDNode<"TPUISD::VROTDOWN", SDT_Vrotdown, []>;
+def Wrapper : SDNode<"TPUISD::WRAPPER", SDT_Wrapper, []>;
+def Vbroadcast : SDNode<"TPUISD::VBROADCAST", SDT_Vbroadcast, []>;
+def Vpermute : SDNode<"TPUISD::VPERMUTE", SDT_Vpermute, []>;
+def BcInsertValue : SDNode<"TPUISD::BC_INSERTVALUE", SDT_BcInsertValue, []>;
+def BcExtractValue : SDNode<"TPUISD::BC_EXTRACTVALUE", SDT_BcExtractValue, []>;
+def Umul24 : SDNode<"TPUISD::UMUL24", SDT_Umul24, []>;
+
+def ftoi : SDNodeXForm<fpimm, [{
+ APInt I = N->getValueAPF().bitcastToAPInt();
+ return CurDAG->getTargetConstant(I.getZExtValue(), SDLoc(N),
+ MVT::getIntegerVT(I.getBitWidth()));
+}]>;
+
+// Inspects parent to determine whether an or instruction can be implemented as
+// an add (i.e. whether we know overflow won't occur in the add).
+def AddLikeOrOp : ComplexPattern<i32, 1, "SelectAddLikeOr", [],
+ [SDNPWantParent]>;
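+// For example (illustrative), (or (shl $x, 8), 7) can be selected as an add
+// because the set bits of the two operands cannot overlap.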
+
+//===----------------------------------------------------------------------===//
+// Control flow
+//===----------------------------------------------------------------------===//
+
+let isReturn = 1, isTerminator = 1, isBarrier = 1, hasSideEffects = 1 in {
+def HALT : TPUInstP<(outs), (ins), "_ =\tshalt${pred}", [(Halt)]>,
+ Requires<[NotBC]>, Bundle<B_Sany>, SubUnits<[SU_halt]>;
+} // isReturn = 1, isTerminator = 1, isBarrier = 1, hasSideEffects = 1
+
+let hasSideEffects = 1, isPseudo = 1 in {
+def TRAP : TPUInst<(outs), (ins PPR:$p),
+ "_ =\t#TRAP $p",
+ [(int_tpu_halt_trap PPR:$p)]>, Bundle<B_Sany>,
+ Requires<[NotBC]>, SubUnits<[SU_halt]>;
+} // hasSideEffects = 1, isPseudo = 1
+
+//===----------------------------------------------------------------------===//
+// Scalar ALU ops
+//===----------------------------------------------------------------------===//
+
+// Register-register and register-immediate variants for scalar ALU ops.
+// The ISA defines an ALU op as "OP x, y", where y can be a register or
+// immediate. However, some ops are defined as "OP y, x"; again where y
+// can be an immediate. Most binary ops are commutative apart from SUB,
+// for which this distinction matters.
+//
+// LLVM likes immediates on the RHS of binary ops. Therefore we define the
+// MachineInstr and matchers to have the immediate on the RHS always and print
+// the operands inverted if the ISA wants it that way.
+//
+// The only non-commutative op is SUB, so that's handled specially.
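+//
+// For example (illustrative), SUBir takes the immediate as its first input and
+// prints "ssub.s32 $y, $x", i.e. it computes $y - $x (say, 5 - $x for an
+// immediate of 5), while commutative ops simply print their operands in
+// whichever order the ISA prefers.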
+
+let Predicates = [NotBC] in {
+multiclass IntALUOp<string Name, SDPatternOperator OpNode, BundleSlot Slot, string XY, bits<6> Opcode> {
+ defm ri : TPUInstS<Slot, Opcode, (outs GPR:$d), (ins tsmxgpr:$x, tsimmi:$y),
+ "$d =\t" # Name # "${pred} " # XY,
+ [(set GPR:$d, (OpNode (i32 tsmxgpr:$x), (i32 imm:$y)))]>;
+ defm rr : TPUInstS<Slot, Opcode, (outs GPR:$d), (ins tsmxgpr:$x, tsmygpr:$y),
+ "$d =\t" # Name # "${pred} " # XY,
+ [(set GPR:$d, (OpNode (i32 tsmxgpr:$x), (i32 tsmygpr:$y)))]>;
+}
+multiclass FPALUOp<string Name, SDPatternOperator OpNode, BundleSlot Slot, string XY, bits<6> Opcode> {
+ defm ri : TPUInstS<Slot, Opcode, (outs GPR:$d), (ins GPR:$x, tsimmf:$y),
+ "$d =\t" # Name # "${pred} " # XY,
+ [(set GPR:$d, (OpNode (f32 GPR:$x), (f32 fpimm:$y)))]>;
+ defm rr : TPUInstS<Slot, Opcode, (outs GPR:$d), (ins GPR:$x, GPR:$y),
+ "$d =\t" # Name # "${pred} " # XY,
+ [(set GPR:$d, (OpNode (f32 GPR:$x), (f32 GPR:$y)))]>;
+}
+
+multiclass IntALUOpXY<string Name, SDPatternOperator OpNode, bits<6> Opcode, BundleSlot Slot = B_Sany> :
+ IntALUOp<Name, OpNode, Slot, "$x, $y", Opcode>;
+multiclass IntALUOpYX<string Name, SDPatternOperator OpNode, bits<6> Opcode, BundleSlot Slot = B_Sany> :
+ IntALUOp<Name, OpNode, Slot, "$y, $x", Opcode>;
+multiclass FPALUOpXY<string Name, SDPatternOperator OpNode, bits<6> Opcode, BundleSlot Slot = B_Sany> :
+ FPALUOp<Name, OpNode, Slot, "$x, $y", Opcode>;
+multiclass FPALUOpYX<string Name, SDPatternOperator OpNode, bits<6> Opcode, BundleSlot Slot = B_Sany> :
+ FPALUOp<Name, OpNode, Slot, "$y, $x", Opcode>;
+
+let SubUnits = [SU_scalar_alu, SU_scalar_misc1] in {
+defm SUBir : TPUInstSanyMisc<33, (outs GPR:$d), (ins tsimmi:$y, GPR:$x),
+ "$d =\tssub.s32${pred} $y, $x",
+ [(set GPR:$d, (sub (i32 imm:$y), (i32 GPR:$x)))]>;
+defm SUBrr : TPUInstSanyMisc<33, (outs GPR:$d), (ins GPR:$y, GPR:$x),
+ "$d =\tssub.s32${pred} $y, $x",
+ [(set GPR:$d, (sub (i32 GPR:$y), (i32 GPR:$x)))]>;
+defm ADD : IntALUOpYX<"sadd.s32", add, 32, B_SanyMisc>;
+defm AND : IntALUOpYX<"sand.u32", and, 34, B_SanyMisc>;
+defm OR : IntALUOpYX<"sor.u32", or, 35, B_SanyMisc>;
+defm XOR : IntALUOpYX<"sxor.u32", xor, 36, B_SanyMisc>;
+defm SHL : IntALUOpXY<"sshll.u32", shl, 43, B_SanyMisc>;
+defm SRL : IntALUOpXY<"sshrl.u32", srl, 44, B_SanyMisc>;
+defm SRA : IntALUOpXY<"sshra.s32", sra, 45, B_SanyMisc>;
+} // SubUnits = [SU_scalar_alu, SU_scalar_misc1]
+
+let Predicates = [IsVFTCOrSC] in {
+// TODO(hgreving): I've noticed that certain select patterns do not combine
+// into min/max, specifically when trying LLVM IR with shiftable numbers like
+// 256. We might want to add patterns for those. See scalar_i32_sc.ll.
+defm MAX : IntALUOpXY<"smax.u32", umax, 28, B_SanyMisc>;
+defm MIN : IntALUOpXY<"smin.u32", umin, 29, B_SanyMisc>;
+} // Predicates = [IsVFTCOrSC]
+} // Predicates = [NotBC]
+
+let Predicates = [NotBC] in {
+let SubUnits = [SU_i_multiply] in {
+defm MUL : IntALUOpYX<"smul.u32", mul, 40, B_S0>, Requires<[HasSMul32]>;
+defm MUL24 : IntALUOpYX<"smul.u24", Umul24, 40, B_S0>, Requires<[HasSMul24]>;
+} // SubUnits = [SU_i_multiply]
+
+def : Pat<(i32 (int_tpu_shll (i32 GPR:$lhs), (i32 GPR:$rhs))),
+ (SHLrr GPR:$lhs, GPR:$rhs)>;
+def : Pat<(i32 (int_tpu_shll (i32 GPR:$lhs), (i32 imm:$rhs))),
+ (SHLri GPR:$lhs, (i32 imm:$rhs))>;
+def : Pat<(i32 (int_tpu_shrl (i32 GPR:$lhs), (i32 GPR:$rhs))),
+ (SRLrr GPR:$lhs, GPR:$rhs)>;
+def : Pat<(i32 (int_tpu_shrl (i32 GPR:$lhs), (i32 imm:$rhs))),
+ (SRLri GPR:$lhs, (i32 imm:$rhs))>;
+def : Pat<(i32 (int_tpu_shra (i32 GPR:$lhs), (i32 GPR:$rhs))),
+ (SRArr GPR:$lhs, GPR:$rhs)>;
+def : Pat<(i32 (int_tpu_shra (i32 GPR:$lhs), (i32 imm:$rhs))),
+ (SRAri GPR:$lhs, (i32 imm:$rhs))>;
+
+def FSUBir : TPUInstS1<38, (outs GPR:$d), (ins tsimmf:$y, GPR:$x),
+ "$d =\tssub.f32${pred} $y, $x",
+ [(set GPR:$d, (fsub (f32 fpimm:$y), (f32 GPR:$x)))]>,
+ Sched<[WriteFadd]>, SubUnits<[SU_f_math]>;
+def FSUBrr : TPUInstS1<38, (outs GPR:$d), (ins GPR:$y, GPR:$x),
+ "$d =\tssub.f32${pred} $y, $x",
+ [(set GPR:$d, (fsub (f32 GPR:$y), (f32 GPR:$x)))]>,
+ Sched<[WriteFadd]>, SubUnits<[SU_f_math]>;
+
+// XOR can go in slot0 or 1 while fsub can only go in slot1.
+def : Pat<(f32 (fneg GPR:$x)), (XORri GPR:$x, (i32 0x80000000))>;
+
+defm FADD : FPALUOpYX<"sadd.f32", fadd, 37, B_S1>, Sched<[WriteFadd]>,
+ SubUnits<[SU_f_math]>;
+defm FMUL : FPALUOpYX<"smul.f32", fmul, 39, B_S0>, Sched<[WriteFmul]>,
+ SubUnits<[SU_f_multiply]>;
+defm FMAX : FPALUOpXY<"smax.f32", fmaximum, 41>, SubUnits<[SU_scalar_alu]>;
+defm FMIN : FPALUOpXY<"smin.f32", fminimum, 42>, SubUnits<[SU_scalar_alu]>;
+} // Predicates = [NotBC]
+
+let Predicates = [IsVFTCOrSC] in {
+defm FCEIL : TPUInstS<B_Sany, 15, (outs GPR:$d), (ins GPR:$y),
+ "$d =\tsceil.f32${pred} $y",
+ [(set GPR:$d, (fceil (f32 GPR:$y)))]>,
+ SubUnits<[SU_scalar_alu]>;
+defm FFLOOR : TPUInstS<B_Sany, 16, (outs GPR:$d), (ins GPR:$y),
+ "$d =\tsfloor.f32${pred} $y",
+ [(set GPR:$d, (ffloor (f32 GPR:$y)))]>,
+ SubUnits<[SU_scalar_alu]>;
+} // Predicates = [IsVFTCOrSC]
+
+// This works correctly for all finite and non-finite numbers.
+//
+// NaNs are represented by an exponent field of all ones and a non-zero
+// significand. Masking away the sign bit merely changes the sign of the
+// NaN.
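+//
+// Worked example (illustrative): fabs(-2.0f) = 0xc0000000 & 0x7fffffff =
+// 0x40000000 = 2.0f; a NaN such as 0xffc00000 becomes 0x7fc00000, still a NaN.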
+def : Pat<(f32 (fabs GPR:$x)), (ANDri GPR:$x, (i32 0x7fffffff))>;
+
+def : Pat<(f32 (fcopysign (f32 GPR:$x), (f32 GPR:$y))),
+ (ORrr (ANDri GPR:$x, (i32 0x7fffffff)), (ANDri GPR:$y, (i32 0x80000000)))>;
+
+let Predicates = [NotBC] in {
+defm CLZ : TPUInstSanyMisc<47, (outs GPR:$d), (ins tsmygpr:$y),
+ "$d =\tsclz.u32${pred} $y",
+ [(set GPR:$d, (ctlz (i32 tsmygpr:$y)))]>,
+ SubUnits<[SU_scalar_alu, SU_scalar_misc1]>;
+
+let isMoveReg = 1 in {
+defm MOV : TPUInstSanyMisc<46, (outs GPR:$d), (ins tsmygpr:$y),
+ "$d =\tsmov.u32${pred} $y", []>,
+ SubUnits<[SU_scalar_alu, SU_scalar_misc1]>;
+let isPseudo = 1, hasSideEffects = 1 in {
+// MOV with side effects, used to escape CSE.
+defm MOV_SE : TPUInstSanyMisc<46, (outs GPR:$d), (ins tsmygpr:$y),
+ "$d =\tMOV_SE${pred} $y", []>,
+ SubUnits<[SU_scalar_alu, SU_scalar_misc1]>;
+} // isPseudo = 1, hasSideEffects = 1
+} // isMoveReg = 1
+
+let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+defm IMM : TPUInstSanyMisc<46, (outs GPR:$Sd), (ins tsimmi:$y),
+ "$Sd =\tsimm.s32${pred} $y",
+ [(set GPR:$Sd, (i32 imm:$y))]>,
+ SubUnits<[SU_scalar_alu, SU_scalar_misc1]>;
+defm FIMM : TPUInstSanyMisc<46, (outs GPR:$Sd), (ins tsimmf:$y),
+ "$Sd =\tsimm.f32${pred} $y",
+ [(set GPR:$Sd, (f32 fpimm:$y))]>,
+ SubUnits<[SU_scalar_alu]>;
+} // isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1
+} // Predicates = [NotBC]
+def : Pat<(Wrapper tglobaladdr:$g), (IMM imm:$g)>;
+
+let Predicates = [IsVFTCOrSC] in {
+let mayLoad = 1, mayStore = 1 in {
+
+// The DRF fifo exists on all VF, TAC, TEC, and SCS subtargets.
+let isPush = 1 in {
+multiclass DivRem_<string opcode> {
+def rr : TPUInstP<(outs DRFPR:$drf),
+ (ins GPR:$x, GPR:$y),
+ "$drf =\t"#opcode#".u32${pred} $x, $y",
+ []>,
+ Bundle<B_S0>, Sched<[WriteDrf]>;
+
+def ri : TPUInstP<(outs DRFPR:$drf),
+ (ins GPR:$x, tsimmi:$imm),
+ "$drf =\t"#opcode#".u32${pred} $x, $imm",
+ []>,
+ Bundle<B_S0>, Sched<[WriteDrf]>;
+}
+
+defm SDIVREM : DivRem_<"sdivrem">, SubUnits<[SU_u_divide]>;
+defm SDIV : DivRem_<"sdiv">, SubUnits<[SU_u_divide]>;
+defm SREM : DivRem_<"srem">, SubUnits<[SU_u_divide]>;
+} // isPush = 1
+
+let isPop = 1 in {
+// spop (drf) does not support a void destination.
+def SPOP_DRF : TPUInstP<(outs GPR:$sdst), (ins DRFPR:$drf),
+ "$sdst =\tspop${pred} $drf", []>,
+ Bundle<B_Sany>, Sched<[WriteV2SFPop]>, SubUnits<[SU_pop]>;
+} // isPop = 1
+
+} // mayLoad = 1, mayStore = 1
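+
+// For example (illustrative), a scalar division is a push into the DRF fifo
+// followed by a pop of the result:
+//   $drf = sdiv.u32 $x, $y
+//   $d   = spop $drf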
+
+let isComposedDrfFifo = 1 in {
+multiclass DivRemCF_<string type> {
+def rr : TPUInstP<(outs GPR:$sdst),
+ (ins GPR:$x, GPR:$y),
+ "$sdst =\t"#type#"_CFrr.u32${pred} $x, $y",
+ []>,
+ Bundle<B_S0>, Sched<[WriteDrf]>;
+
+def ri : TPUInstP<(outs GPR:$sdst),
+ (ins GPR:$x, tsimmi:$imm),
+ "$sdst =\t"#type#"_CFri.u32${pred} $x, $imm",
+ []>,
+ Bundle<B_S0>, Sched<[WriteDrf]>;
+}
+
+// We do not currently support SDIVREM as a composed version, due to its multiple pops. We
+// may be able to make this work with our current system, but it needs a closer look.
+defm SDIV_CF : DivRemCF_<"SDIV">, SubUnits<[SU_u_divide]>;
+defm SREM_CF : DivRemCF_<"SREM">, SubUnits<[SU_u_divide]>;
+} // isComposedDrfFifo = 1
+
+} // Predicates = [IsVFTCOrSC]
+
+let Predicates = [IsVFTCOrSC] in {
+defm SMULHIOV : IntALUOpYX<"smulhi.u32", int_tpu_smulhi, 21, B_S0>,
+ SubUnits<[SU_i_multiply]>;
+defm SSHLAOV : IntALUOpXY<"sshla.ov.s32", int_tpu_sshla_ov, 49, B_SanyMisc>,
+ SubUnits<[SU_scalar_alu, SU_scalar_misc1]>;
+def SADDOVri : TPUInstP<(outs GPR:$d), (ins tsmxgpr:$x, tsimmi:$y),
+ "$d =\tsadd.ov.s32${pred} $y, $x",
+ [(set GPR:$d, (int_tpu_sadd_ov (i32 tsmxgpr:$x), (i32 imm:$y)))]>,
+ Bundle<B_ALUOV>, SubUnits<[SU_scalar_alu, SU_scalar_misc1]>;
+def SADDOVrr : TPUInstP<(outs GPR:$d), (ins tsmxgpr:$x, tsmygpr:$y),
+ "$d =\tsadd.ov.s32${pred} $y, $x",
+ [(set GPR:$d, (int_tpu_sadd_ov (i32 tsmxgpr:$x), (i32 tsmygpr:$y)))]>,
+ Bundle<B_ALUOV>, SubUnits<[SU_scalar_alu, SU_scalar_misc1]>;
+def SSUBOVir : TPUInstP<(outs GPR:$d), (ins tsimmi:$y, GPR:$x),
+ "$d =\tssub.ov.s32${pred} $y, $x",
+ [(set GPR:$d, (int_tpu_ssub_ov (i32 imm:$y), (i32 GPR:$x)))]>,
+ Bundle<B_ALUOV>, SubUnits<[SU_scalar_alu, SU_scalar_misc1]>;
+def SSUBOVrr : TPUInstP<(outs GPR:$d), (ins GPR:$y, GPR:$x),
+ "$d =\tssub.ov.s32${pred} $y, $x",
+ [(set GPR:$d, (int_tpu_ssub_ov (i32 GPR:$y), (i32 GPR:$x)))]>,
+ Bundle<B_ALUOV>, SubUnits<[SU_scalar_alu, SU_scalar_misc1]>;
+} // Predicates = [IsVFTCOrSC]
+
+
+//===----------------------------------------------------------------------===//
+// Scalar comparison ops
+//===----------------------------------------------------------------------===//
+
+let Predicates = [NotBC] in {
+multiclass IntCompareOp<string Name, SDPatternOperator OpNode, bits<6> Opcode> {
+ defm ri : TPUInstSanyMisc<Opcode, (outs PPR:$d), (ins tsmxgpr:$x, tsimmi:$y),
+ "$d =\t" # Name # "${pred} $x, $y",
+ [(set PPR:$d, (OpNode (i32 tsmxgpr:$x), (i32 imm:$y)))]>;
+ defm rr : TPUInstSanyMisc<Opcode, (outs PPR:$d), (ins tsmxgpr:$x, tsmygpr:$y),
+ "$d =\t" # Name # "${pred} $x, $y",
+ [(set PPR:$d, (OpNode (i32 tsmxgpr:$x), (i32 tsmygpr:$y)))]>;
+}
+
+multiclass FPCompareOp<string Name, SDPatternOperator OpNode, bits<6> Opcode> {
+ defm ri : TPUInstSany<Opcode, (outs PPR:$d), (ins GPR:$x, tsimmf:$y),
+ "$d =\t" # Name # "${pred} $x, $y",
+ [(set PPR:$d, (OpNode (f32 GPR:$x), (f32 fpimm:$y)))]>;
+ defm rr : TPUInstSany<Opcode, (outs PPR:$d), (ins GPR:$x, GPR:$y),
+ "$d =\t" # Name # "${pred} $x, $y",
+ [(set PPR:$d, (OpNode (f32 GPR:$x), (f32 GPR:$y)))]>;
+}
+
+let isCompare = 1 in {
+let SubUnits = [SU_scalar_cmp, SU_scalar_misc1] in {
+defm CMPEQ : IntCompareOp<"seq.s32", seteq, 48>;
+defm CMPNE : IntCompareOp<"sne.s32", setne, 49>;
+defm CMPGT : IntCompareOp<"sgt.s32", setgt, 50>;
+defm CMPGE : IntCompareOp<"sge.s32", setge, 51>;
+defm CMPLT : IntCompareOp<"slt.s32", setlt, 52>;
+defm CMPLE : IntCompareOp<"sle.s32", setle, 53>;
+} // SubUnits = [SU_scalar_cmp, SU_scalar_misc1]
+
+let SubUnits = [SU_scalar_cmp_ordered] in {
+defm CMPUGT : IntCompareOp<"sgt.u32", setugt, 36>, Requires<[IsVFTCOrSC]>;
+defm CMPUGE : IntCompareOp<"sge.u32", setuge, 37>, Requires<[IsVFTCOrSC]>;
+defm CMPULT : IntCompareOp<"slt.u32", setult, 38>, Requires<[IsVFTCOrSC]>;
+defm CMPULE : IntCompareOp<"sle.u32", setule, 39>, Requires<[IsVFTCOrSC]>;
+defm FCMPEQ : FPCompareOp<"seq.f32", setoeq, 56>, Sched<[WriteSFCmp]>;
+defm FCMPNE : FPCompareOp<"sne.f32", setune, 57>, Sched<[WriteSFCmp]>;
+defm FCMPGT : FPCompareOp<"sgt.f32", setogt, 58>, Sched<[WriteSFCmp]>;
+defm FCMPGE : FPCompareOp<"sge.f32", setoge, 59>, Sched<[WriteSFCmp]>;
+defm FCMPLT : FPCompareOp<"slt.f32", setolt, 60>, Sched<[WriteSFCmp]>;
+defm FCMPLE : FPCompareOp<"sle.f32", setole, 61>, Sched<[WriteSFCmp]>;
+} // SubUnits = [SU_scalar_cmp_ordered]
+} // isCompare = 1
+
+multiclass FPComparePat<string OpName, SDPatternOperator OpNode> {
+ def : Pat<(OpNode (f32 GPR:$x), (f32 fpimm:$y)),
+ (!cast<Instruction>(OpName#"ri") GPR:$x, tsimmf:$y)>;
+ def : Pat<(OpNode (f32 GPR:$x), (f32 GPR:$y)),
+ (!cast<Instruction>(OpName#"rr") GPR:$x, GPR:$y)>;
+}
+// Patterns for the cases where we don't care about unordered.
+defm : FPComparePat<"FCMPEQ", seteq>;
+defm : FPComparePat<"FCMPNE", setne>;
+defm : FPComparePat<"FCMPGT", setgt>;
+defm : FPComparePat<"FCMPGE", setge>;
+defm : FPComparePat<"FCMPLT", setlt>;
+defm : FPComparePat<"FCMPLE", setle>;
+
+defm CARRYOUT : IntCompareOp<"sc.u32", int_tpu_addcarry, 54>,
+ SubUnits<[SU_scalar_cmp]>;
+} // Predicates = [NotBC]
+
+defm WEIRD : TPUInstSany<62, (outs PPR:$Pd), (ins GPR:$Ss),
+ "$Pd =\tsweird.f32${pred} $Ss",
+ [(set PPR:$Pd, (int_tpu_weird_f32 (f32 GPR:$Ss)))]>,
+ Requires<[NotBC]>, SubUnits<[SU_scalar_cmp_ordered]>;
+
+//===----------------------------------------------------------------------===//
+// Scalar conversion ops
+//===----------------------------------------------------------------------===//
+let Predicates = [NotBC, NotVFTC, NotSC] in {
+def FPTOSIrr : TPUInstP<(outs GPR:$Sd), (ins GPR:$x, GPR:$y),
+ "$Sd =\tscvt.f32.s32${pred} $x, $y",
+ [(set GPR:$Sd,
+ (int_tpu_cvt_pr_fptosi (f32 GPR:$x), (i32 GPR:$y)))]>,
+ Bundle<B_Sany>, Sched<[WriteFPConvert]>, SubUnits<[SU_f_convert]>;
+def FPTOSIri : TPUInstP<(outs GPR:$Sd), (ins GPR:$x, tsimmi:$y),
+ "$Sd =\tscvt.f32.s32${pred} $x, $y",
+ [(set GPR:$Sd,
+ (int_tpu_cvt_pr_fptosi (f32 GPR:$x), (i32 imm:$y)))]>,
+ Bundle<B_Sany>, Sched<[WriteFPConvert]>, SubUnits<[SU_f_convert]>;
+
+def : Pat<(i32 (fp_to_sint (f32 GPR:$x))), (FPTOSIri GPR:$x, (i32 -1))>;
+} // Predicates = [NotBC, NotVFTC, NotSC]
+
+let Predicates = [IsVFTCOrSC] in {
+def FPTOSIr : TPUInstP<(outs GPR:$Sd), (ins GPR:$y),
+ "$Sd =\tscvt.f32.s32${pred} $y",
+ [(set GPR:$Sd,
+ (int_tpu_cvt_fptosi (f32 GPR:$y)))]>,
+ Bundle<B_Sany>, Sched<[WriteFPConvert]>, SubUnits<[SU_f_convert]>;
+def FPTOSIi : TPUInstP<(outs GPR:$Sd), (ins tsimmf:$y),
+ "$Sd =\tscvt.f32.s32${pred} $y",
+ [(set GPR:$Sd,
+ (int_tpu_cvt_fptosi (f32 imm:$y)))]>,
+ Bundle<B_Sany>, Sched<[WriteFPConvert]>, SubUnits<[SU_f_convert]>;
+
+def : Pat<(i32 (fp_to_sint (f32 GPR:$x))), (FPTOSIr GPR:$x)>;
+} // Predicates = [IsVFTCOrSC]
+
+let Predicates = [NotBC] in {
+def SITOFPr : TPUInstP<(outs GPR:$Sd), (ins GPR:$y),
+ "$Sd =\tscvt.s32.f32${pred} $y",
+ [(set (f32 GPR:$Sd), (sint_to_fp (i32 GPR:$y)))]>,
+ Bundle<B_Sany>, Sched<[WriteFPConvert]>, SubUnits<[SU_f_convert]>;
+def SITOFPi : TPUInstP<(outs GPR:$Sd), (ins tsimmi:$y),
+ "$Sd =\tscvt.s32.f32${pred} $y",
+ [(set (f32 GPR:$Sd), (sint_to_fp (i32 imm:$y)))]>,
+ Bundle<B_Sany>, Sched<[WriteFPConvert]>, SubUnits<[SU_f_convert]>;
+} // Predicates = [NotBC]
+
+//===----------------------------------------------------------------------===//
+// Predicate manipulation ops
+//===----------------------------------------------------------------------===//
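+// Both forms below are equivalent on i1: adding -1 flips the bit exactly like
+// xor'ing with -1, so either one matches a predicate negation.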
+def pnot : PatFrags<(ops node:$a),
+ [(xor node:$a, (i1 -1)),
+ (add node:$a, (i1 -1))]>;
+
+let Predicates = [NotBC] in {
+// The only predicate op is POR. It can negate any of its operands, so we can
+// create PMOV, PNOT, PSET and PCLEAR
+
+def POR : TPUInstP<(outs PPR:$Pd), (ins predM:$Ps, predM:$Pt),
+ "$Pd =\tpor${pred} $Ps, $Pt",
+ []>, Bundle<B_SanyMisc>, SubUnits<[SU_scalar_cmp, SU_scalar_misc1]>;
+
+let isPseudo = 1 in {
+// This is a convenience instruction that simplifies handling predicate movs.
+def PMOV : TPUInstP<(outs PPR:$Pd), (ins PPR:$Ps),
+ "$Pd =\tPMOV${pred} $Ps, $Ps", []>,
+ Bundle<B_SanyMisc>, SubUnits<[SU_scalar_cmp, SU_scalar_misc1]>;
+let hasSideEffects = 1 in {
+// PMOV with side effects, used to escape CSE.
+def PMOV_SE : TPUInstP<(outs PPR:$Pd), (ins PPR:$Ps),
+ "$Pd =\tPMOV_SE${pred} $Ps, $Ps", []>,
+ Bundle<B_SanyMisc>, SubUnits<[SU_scalar_cmp, SU_scalar_misc1]>;
+} // hasSideEffects = 1
+} // isPseudo = 1
+
+def : Pat<(or PPR:$Ps, PPR:$Pt), (POR PPR:$Ps, (i32 0), PPR:$Pt, (i32 0))>;
+def : Pat<(pnot PPR:$x), (POR PPR:$x, (i32 1), PPR:$x, (i32 1))>;
+def : Pat<(setne PPR:$x, (i1 -1)), (POR PPR:$x, (i32 1), PPR:$x, (i32 1))>;
+
+// PORii takes two immediates; it is used for PSET or PCLEAR. We need two
+// operands for the MCParser to work correctly.
+def PORii : TPUInstP<(outs PPR:$Pd), (ins i1imm:$val0, i1imm:$val1),
+ "$Pd =\tpor${pred} $val0, $val1",
+ []>, Bundle<B_SanyMisc>,
+ SubUnits<[SU_scalar_cmp, SU_scalar_misc1]>;
+
+def : Pat<(i1 imm:$val), (PORii imm:$val, imm:$val)>;
+def : Pat<(i1 (trunc (i32 GPR:$x))), (CMPEQri (ANDri $x, (i32 1)), (i32 1))>;
+def : Pat<(i1 (and PPR:$Ps, PPR:$Pt)),
+ (POR (POR $Ps, (i32 1), $Pt, (i32 1)), (i32 1),
+ (POR $Ps, (i32 1), $Pt, (i32 1)), (i32 1)
+ )>;
+def : Pat<(i1 (and (pnot PPR:$Ps), (pnot PPR:$Pt))),
+ (POR (POR $Ps, (i32 0), $Pt, (i32 0)), (i32 1),
+ (POR $Ps, (i32 0), $Pt, (i32 0)), (i32 1)
+ )>;
+
+// DAG combine may convert XOR i1 %x, -1 to setcc.
+def : Pat<(i1 (setcc (i32 (zext PPR:$p)), (i32 1), SETNE)),
+ (POR $p, (i32 1), $p, (i32 1))>;
+
+let Constraints = "$d = $a", isPseudo = 1 in {
+ def PSEUDO_PSELrr : TPUInst<(outs PPR:$d), (ins PPR:$p, PPR:$a, PPR:$b),
+ "$d =\t#PSEL $p, $a, $b",
+ [(set PPR:$d, (select PPR:$p, PPR:$a, PPR:$b))]>,
+ Bundle<B_Sany>;
+}
+} // Predicates = [NotBC]
+
+def FPZero : PatFrag<(ops), (fpimm), [{
+ return cast<ConstantFPSDNode>(N)->isZero();
+}]>;
+
+// Match clamp(a, 0, b) - clamp a between [0, b].
+def Relu : PatFrag<(ops node:$a, node:$b),
+ (fmaximum (fminimum node:$a, node:$b),
+ (Splat FPZero))>;
+
+//===----------------------------------------------------------------------===//
+// Load/store ops
+//===----------------------------------------------------------------------===//
+let Predicates = [NotBC] in {
+
+// Scalar load from smem.
+let mayLoad = 1 in {
+let SubUnits = [SU_load] in {
+def SLDri : TPUInstP<(outs GPR:$Sd), (ins GPR:$Sx, i32imm:$imm),
+ "$Sd =\tsld${pred} [smem:${Sx}+$imm]",
+ [(set GPR:$Sd, (i32 (load_smem (add GPR:$Sx, imm:$imm))))]>,
+ Bundle<B_SLD>, BundleImmSy, Sched<[WriteSld]>;
+def SLDi : TPUInstP<(outs GPR:$Sd), (ins i32imm:$imm),
+ "$Sd =\tsld${pred} [smem:$imm]",
+ [(set GPR:$Sd, (i32 (load_smem (Wrapper tglobaladdr:$imm))))]>,
+ Bundle<B_SLD>, BundleImmSy, Sched<[WriteSld]>;
+def SLDrr : TPUInstP<(outs GPR:$Sd), (ins GPR:$Sx, GPR:$Sy),
+ "$Sd =\tsld${pred} [smem:${Sx}+${Sy}]",
+ [(set GPR:$Sd, (i32 (load_smem (add GPR:$Sx, GPR:$Sy))))]>,
+ Bundle<B_SLD>, Sched<[WriteSld]>;
+} // SubUnits = [SU_load]
+} // mayLoad = 1
+// Provide patterns for the no-immediate and floating-point equivalents, which
+// are bitwise identical.
+def : Pat<(f32 (load_smem (Wrapper tglobaladdr:$imm))), (SLDi imm:$imm)>;
+def : Pat<(f32 (load_smem (imm:$imm))), (SLDi imm:$imm)>;
+def : Pat<(i32 (load_smem (imm:$imm))), (SLDi imm:$imm)>;
+def : Pat<(i32 (load_smem GPR:$Sx)), (SLDri GPR:$Sx, (i32 0))>;
+def : Pat<(f32 (load_smem (add GPR:$Sx, imm:$imm))), (SLDri GPR:$Sx, imm:$imm)>;
+def : Pat<(f32 (load_smem (or AddLikeOrOp:$Sx, imm:$imm))), (SLDri GPR:$Sx, imm:$imm)>;
+def : Pat<(f32 (load_smem (add GPR:$Sx, GPR:$Sy))), (SLDrr GPR:$Sx, GPR:$Sy)>;
+def : Pat<(f32 (load_smem GPR:$Sx)), (SLDri GPR:$Sx, (i32 0))>;
+
+def : Pat<(i32 (extload_smem (Wrapper tglobaladdr:$imm))), (SLDi imm:$imm)>;
+def : Pat<(i32 (extload_smem (imm:$imm))), (SLDi imm:$imm)>;
+def : Pat<(f32 (extload_smem (imm:$imm))), (SLDi imm:$imm)>;
+def : Pat<(i32 (extload_smem GPR:$Sx)), (SLDri GPR:$Sx, (i32 0))>;
+def : Pat<(i32 (extload_smem (add GPR:$Sx, imm:$imm))), (SLDri GPR:$Sx, imm:$imm)>;
+def : Pat<(i32 (extload_smem (or AddLikeOrOp:$Sx, imm:$imm))), (SLDri GPR:$Sx, imm:$imm)>;
+def : Pat<(i32 (extload_smem (add GPR:$Sx, GPR:$Sy))), (SLDrr GPR:$Sx, GPR:$Sy)>;
+
+// Scalar store to smem.
+let mayStore = 1 in {
+let SubUnits = [SU_store] in {
+def SSTr : TPUInstP<(outs), (ins GPR:$Sx, GPR:$Sy),
+ "[smem:${Sy}] =\tsst${pred} $Sx",
+ [(store_smem (i32 GPR:$Sx), GPR:$Sy)]>,
+ Bundle<B_SST>;
+def SSTi : TPUInstP<(outs), (ins GPR:$Sx, i32imm:$imm),
+ "[smem:${imm}] =\tsst${pred} $Sx",
+ [(store_smem (i32 GPR:$Sx), (Wrapper tglobaladdr:$imm))]>,
+ Bundle<B_SST>, BundleImmSy;
+} // SubUnits = [SU_store]
+} // mayStore = 1
+
+def : Pat<(store_smem (f32 GPR:$Sx), (Wrapper tglobaladdr:$imm)),
+ (SSTi GPR:$Sx, imm:$imm)>;
+def : Pat<(store_smem (f32 GPR:$Sval), (imm:$imm)),
+ (SSTi GPR:$Sval, imm:$imm)>;
+def : Pat<(store_smem (i32 GPR:$Sval), (imm:$imm)),
+ (SSTi GPR:$Sval, imm:$imm)>;
+def : Pat<(store_smem (f32 GPR:$Sx), (i32 GPR:$Sy)),
+ (SSTr GPR:$Sx, GPR:$Sy)>;
+
+//===----------------------------------------------------------------------===//
+// Spilling related pseudo instructions
+//===----------------------------------------------------------------------===//
+
+// MPR register spill pseudos:
+//
+// We spill vector mask registers with three instructions:
+//
+// scratch_vreg = 0
+// scratch_vreg = vselect vmreg, 0xFFFFFFFF, scratch_vreg
+// vst scratch_vreg
+//
+// We restore vector mask registers with two instructions:
+//
+// scratch_vreg = vld
+// vmreg = vcmp.ne scratch_vreg, 0
+//
+// N.B. We use 0xFFFFFFFF instead of 1 for truthy values because PxC has two
+// bits per sublane per lane. A vselect between 0xFFFFFFFF and 0 sets each
+// 16-bit half-word of the Vreg to either 0xFFFF or 0.
+//
+// Since TargetInstrInfo::storeRegToStackSlot(...) assumes that only one store
+// instruction will be added, we have to create a pseudo instruction and expand
+// it later.
+
+// PPR register spill pseudos:
+//
+// We spill predicate registers with three instructions:
+//
+// scratch_sreg = simm{preg}, 0x1;
+// scratch_sreg = simm{!preg}, 0x0;
+// sst scratch_sreg
+//
+// We restore predicate registers with two instructions:
+//
+// scratch_sreg = sld
+// preg = seq scratch_sreg, $0x1
+//
+// Since TargetInstrInfo::storeRegToStackSlot(...) assumes that only one store
+// instruction will be added, we have to create a pseudo instruction and expand
+// it later.
+//
+// Explanation of the s and _P variants:
+//
+// * The _P (pre-spill) variants are used by the pre-spiller and are otherwise
+// equivalent to the non-_P versions. The purpose of this is to be able to
+// distinguish them from real spills and frame-setup spills when e.g. analyzing
+// code in the super pass during software pipelining. The pre-spiller only
+// pre-spills GPR and VPR live-ranges.
+// * The s (stack) versions of the opcode are versions of the spill instructions
+// that are relative to a register (usually frame- or stack pointer). Since
+// on current architectures there are no base+offset variants of scalar store,
+// the PPRs and GPRs variants will ultimately drop the frame index, and require
+// the presence of a SPILL_GPR_ADD instruction that adds the frame index to the
+// register.
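+//
+// For example (an illustrative sketch; the actual expansion is done elsewhere),
+// a stack-relative scalar spill
+//   SPILL_GPRs $val, $fp, <fi>
+// conceptually ends up as
+//   $addr = sadd.s32 <fi>, $fp   ; via SPILL_GPR_ADD, later an ADDri
+//   [smem:$addr] = sst $val
+// because there is no base+offset form of the scalar store.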
+
+class Spill_<RegisterClass RC, Operand Op> : TPUInstP<(outs), (ins RC:$val, Op:$FI),
+ "\t"# NAME # "${pred} $val, $$FI", []>;
+class SpillStack_<RegisterClass RC, Operand Op> : TPUInstP<(outs), (ins RC:$val, GPR:$s, Op:$FI),
+ "\t"# NAME # "${pred} $val, $s, $$FI", []>;
+
+multiclass SpillAll_<RegisterClass RC, Operand Op> {
+ def "" : Spill_<RC, Op>;
+ def _P : Spill_<RC, Op>;
+ def s : SpillStack_<RC, Op>;
+ def _Ps : SpillStack_<RC, Op>;
+}
+
+class Restore_<RegisterClass RC, Operand Op> : TPUInstP<(outs RC:$dst), (ins Op:$FI),
+ "$dst =\t" # NAME # "${pred} $$FI", []>;
+class RestoreStack_<RegisterClass RC, Operand Op> : TPUInstP<(outs RC:$dst), (ins GPR:$s, Op:$FI),
+ "$dst =\t" # NAME # "${pred} $s, $$FI", []>;
+
+multiclass RestoreAll_<RegisterClass RC, Operand Op> {
+ def "" : Restore_<RC, Op>;
+ def _P : Restore_<RC, Op>;
+ def s : RestoreStack_<RC, Op>;
+ def _Ps : RestoreStack_<RC, Op>;
+}
+
+let isPseudo = 1 in {
+let mayStore = 1 in {
+defm SPILL_GPR : SpillAll_<GPR, tsimmi>,
+ Bundle<B_SST>, BundleImmSy, SubUnits<[SU_store]>;
+defm SPILL_VPR : SpillAll_<VPR, tvimmi>,
+ Bundle<B_VST>, IsVectorInstruction;
+defm SPILL_MPR : SpillAll_<MPR, tvimmi>,
+ Bundle<B_Vany>;
+defm SPILL_PPR : SpillAll_<PPR, tsimmi>,
+ Bundle<B_Sany>;
+// Add frame index
+def SPILL_GPR_ADD : TPUInstP<(outs GPR:$d), (ins tsmxgpr:$x, i32imm:$FI),
+ "$d =\t" # Name # "${pred} " # "$x, $FI",
+ []>,
+ Bundle<B_Sany>;
+} // mayStore = 1
+} // isPseudo = 1
+
+let mayLoad = 1 in {
+let isPseudo = 1 in {
+defm RESTORE_GPR : RestoreAll_<GPR, tsimmi>,
+ Bundle<B_SLD>, Sched<[WriteSld]>;
+defm RESTORE_VPR : RestoreAll_<VPR, tvimmi>,
+ Bundle<B_VLD>, Sched<[WriteVLD]>, IsVectorInstruction;
+defm RESTORE_MPR : RestoreAll_<MPR, tvimmi>,
+ Bundle<B_Vany>, Sched<[WriteVLD]>, IsVectorInstruction;
+defm RESTORE_PPR : RestoreAll_<PPR, tsimmi>,
+ Bundle<B_Sany>, Sched<[WriteSld]>;
+} // isPseudo = 1
+} // mayLoad = 1
+
+} // Predicates = [NotBC]
+
+let hasSideEffects = 1 in {
+let mayStore = 1, mayLoad = 1 in {
+let isPseudo = 1 in {
+def SPILL_GPR_DEBUG : TPUInstP<(outs), (ins GPR:$v),
+ "\tSPILL_GPR_DEBUG${pred} $v",
+ [(int_tpu_spill_debug (i32 GPR:$v))]>;
+def SPILL_VPR_DEBUG : TPUInstP<(outs), (ins VPR:$v),
+ "\tSPILL_VPR_DEBUG${pred} $v",
+ [(int_tpu_spill_debug (vNi32 VPR:$v))]>;
+} // isPseudo = 1
+} // mayStore = 1, mayLoad = 1
+} // hasSideEffects = 1
+
+//===----------------------------------------------------------------------===//
+// Branch ops
+//===----------------------------------------------------------------------===//
+// The BR pseudo is used from early to late code generation to represent the
+// branching point: the point at which control flow changes.
+//
+// The BRrel instruction is the actual branch instruction; it has a delay slot,
+// so it is inserted N cycles before the point where the BR pseudo was
+// scheduled.
+//
+// (brcond) SDNodes are custom selected to a BR.
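+//
+// For example (illustrative), with an N-cycle branch delay the schedule looks
+// roughly like:
+//   { (pc) = sbr.rel target }   // BRrel issues here
+//   ... N bundles of delay-slot work ...
+//   { #BR target }              // control flow actually changes here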
+//===----------------------------------------------------------------------===//
+
+// Relative branch. This is used for both unconditional and conditional jumps.
+class RelTargetOperand<ValueType VT> : Operand<VT> {
+ let PrintMethod = "printPCRelImm";
+ let OperandType = "OPERAND_PCREL";
+}
+
+let Predicates = [NotBC], SubUnits = [SU_control] in {
+let isBranch = 1, isTerminator = 1 in {
+def BRrel : TPUInstP<(outs), (ins RelTargetOperand<OtherVT>:$target),
+ "(pc) =\tsbr.rel${pred} $target",
+ []>, Bundle<B_S0>, BundleImm<IMM_0>;
+def BRabs : TPUInstP<(outs), (ins Operand<OtherVT>:$target),
+ "(pc) =\tsbr.abs${pred} $target",
+ []>, Bundle<B_S0>, BundleImm<IMM_0>;
+def BRabsClr : TPUInstP<(outs), (ins Operand<OtherVT>:$target),
+ "(pc) =\tsbr.abs.clr${pred} $target",
+ []>, Bundle<B_S0>, BundleImm<IMM_0>;
+def BRind : TPUInstP<(outs), (ins GPR:$target),
+ "(pc) =\tsbr.ind${pred} $target",
+ []>, Bundle<B_S0>, BundleImm<IMM_0>;
+} // isBranch = 1, isTerminator = 1
+
+let isPseudo = 1, isBranch = 1 in {
+let isTerminator = 1 in {
+let isBarrier = 1 in {
+// Pseudo to model the actual change in control flow, after the delay slot ends.
+def BR : TPUInst<(outs), (ins Operand<OtherVT>:$target),
+ "(pc) = \t#BR $target", [(br bb:$target)]>,
+ Bundle<B_S0>, BundleImm<IMM_0>;
+def BRClr : TPUInst<(outs), (ins Operand<OtherVT>:$target),
+ "(pc) = \t#BR $target", [(br bb:$target)]>,
+ Bundle<B_S0>, BundleImm<IMM_0>;
+} // isBarrier
+def BRcond : TPUInstP<(outs), (ins Operand<OtherVT>:$target),
+ "(pc) = \t#BRcond${pred} $target", []>,
+ Bundle<B_S0>, BundleImm<IMM_0>;
+def BRcondClr : TPUInstP<(outs), (ins Operand<OtherVT>:$target),
+ "(pc) = \t#BRcond${pred} $target", []>,
+ Bundle<B_S0>, BundleImm<IMM_0>;
+} // isTerminator
+// Used only for MIR tests.
+def BRcondT : TPUInstP<(outs), (ins Operand<OtherVT>:$target),
+ "(pc) = \t#BRcondT${pred} $target", []>,
+ Bundle<B_S0>, BundleImm<IMM_0>;
+} // isPseudo = 1, isBranch = 1
+} // Predicates = [NotBC], SubUnits = [SU_control]
+
+// Special placeholder instruction that isn't isBranch but uses the same
+// resources, used for scheduling.
+let isPseudo = 1 in {
+def BRSched : TPUInstP<(outs), (ins Operand<OtherVT>:$target),
+ "\tBRSched${pred} $target",
+ []>, Bundle<B_S0>, BundleImm<IMM_0>;
+} // isPseudo = 1
+
+//===----------------------------------------------------------------------===//
+// Syncflag ops
+//===----------------------------------------------------------------------===//
+
+// i32 that must be a sign-extended i16.
+def imm16 : Operand<i32>, PatLeaf<(imm), [{
+ return isInt<16>(N->getSExtValue()); }]> {
+}
+
+//===----------------------------------------------------------------------===//
+// Pseudo ops
+//===----------------------------------------------------------------------===//
+let Predicates = [NotBC] in {
+let isPseudo = 1, isSelect = 1 in {
+ // Pseudo-select instruction. Note that this is lowered to either a predicated
+ // IMM or MOV, so we don't have an ii version. The instruction is predicable,
+ // but actual predication is emulated through predicate manipulation.
+ let Constraints = "$d = $a" in {
+ def PSEUDO_SELrr : TPUInst<(outs GPR:$d), (ins PPR:$p, GPR:$a, GPR:$b),
+ "$d =\t#PSEUDO_SELrr $p, $a, $b",
+ [(set GPR:$d, (select PPR:$p, (i32 GPR:$a), (i32 GPR:$b)))]>,
+ Bundle<B_Sany>,
+ SubUnits<[SU_scalar_alu, SU_scalar_misc1]>;
+ def PSEUDO_SELri : TPUInst<(outs GPR:$d), (ins PPR:$p, GPR:$a, i32imm:$b),
+ "$d =\t#PSEUDO_SELri $p, $a, $b",
+ [(set GPR:$d, (select PPR:$p, (i32 GPR:$a), (i32 imm:$b)))]>,
+ Bundle<B_Sany>, SubUnits<[SU_scalar_alu, SU_scalar_misc1]>;
+ } // Constraints = "$d = $a"
+ let Constraints = "$Md = $Ms" in {
+ def PSEUDO_MSELrr : TPUInst<(outs MPR:$Md), (ins PPR:$p, MPR:$Ms, MPR:$Mt),
+ "$Md =\t#PSEUDO_MSELrr $p, $Ms, $Mt",
+ [(set MPR:$Md, (select PPR:$p, (i32 MPR:$Ms), (i32 MPR:$Mt)))]>,
+ Bundle<B_Vany>, IsVectorInstruction,
+ SubUnits<[SU_vmask]>;
+ // We also support PSEUDO_CSEL, defined in TPUInstrSparseCore.td.
+ } // Constraints = "$Md = $Ms"
+ let Constraints = "$d = $b" in {
+ def PSEUDO_SELir : TPUInst<(outs GPR:$d), (ins PPR:$p, i32imm:$a, GPR:$b),
+ "$d =\t#SEL $p, $a, $b",
+ [(set GPR:$d, (select PPR:$p, (i32 imm:$a), (i32 GPR:$b)))]>,
+ Bundle<B_Sany>,
+ SubUnits<[SU_scalar_alu, SU_scalar_misc1]>;
+ } // Constraints = "$d = $b"
+} // isPseudo = 1, isSelect = 1
+
+def : Pat<(select PPR:$p, (f32 GPR:$a), (f32 GPR:$b)),
+ (PSEUDO_SELrr PPR:$p, GPR:$a, GPR:$b)>;
+def : Pat<(select PPR:$p, (f32 GPR:$a), (f32 fpimm:$b)),
+ (PSEUDO_SELri PPR:$p, GPR:$a, (ftoi $b))>;
+def : Pat<(select PPR:$p, (f32 fpimm:$a), (f32 GPR:$b)),
+ (PSEUDO_SELir PPR:$p, (ftoi $a), GPR:$b)>;
+
+def : Pat<(select PPR:$p, (vNi1 MPR:$a), (vNi1 MPR:$b)),
+ (PSEUDO_MSELrr PPR:$p, MPR:$a, MPR:$b)>;
+
+def : Pat<(i32 (zext PPR:$x)), (PSEUDO_SELri PPR:$x, (IMM 1), 0)>;
+def : Pat<(i32 (anyext PPR:$x)), (PSEUDO_SELri PPR:$x, (IMM 1), 0)>;
+def : Pat<(i32 (sext PPR:$x)), (PSEUDO_SELri PPR:$x, (IMM -1), 0)>;
+
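+// 0x3f800000 and 0xbf800000 are the bit patterns of 1.0f and -1.0f, so an i1
+// converts to 1.0/-1.0 when true and to 0.0 when false.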
+def : Pat<(f32 (uint_to_fp PPR:$x)), (PSEUDO_SELri PPR:$x, (IMM 0x3f800000), 0)>;
+def : Pat<(f32 (sint_to_fp PPR:$x)), (PSEUDO_SELri PPR:$x, (IMM 0xbf800000), 0)>;
+} // Predicates = [NotBC]
+
+//===----------------------------------------------------------------------===//
+// Misc ops
+//===----------------------------------------------------------------------===//
+let Predicates = [NotBC] in {
+
+let hasSideEffects = 1 in {
+def SNOP : TPUInstP<(outs), (ins), "_ = \tsnop${pred}", [(int_tpu_nop)]>;
+// The hardware doesn't have a vnop; we encode it as vm0 vmov vm0, so it needs
+// a misc slot.
+def VNOP : TPUInstP<(outs), (ins), "_ = \tvnop${pred}", []>, Bundle<B_SM>,
+ IsVectorInstruction, SubUnits<[SU_vmisc]>;
+}
+
+let hasSideEffects = 1, mayLoad = 1, SubUnits = [SU_fence] in {
+def SFENCE : TPUInstP<(outs), (ins),
+ "_ =\tsfence${pred}",
+ [(atomic_fence (timm), (timm))]>,
+ Sched<[WriteSfence]>, Bundle<B_Sany>;
+}
+let hasSideEffects = 1, mayLoad = 1, SubUnits = [SU_fence] in {
+let isPseudo = 1 in {
+// See the int.tpu.sfence.imem intrinsic description for how this is being used.
+def SFENCE_IMEM : TPUInstP<(outs), (ins),
+ "_ =\tSFENCE_IMEM${pred}",
+ [(int_tpu_sfence_imem)]>,
+ Sched<[WriteSfence]>, Bundle<B_Sany>;
+} // isPseudo = 1
+def SFENCE_SCMF : TPUInstP<(outs), (ins),
+ "_ =\tsfence.scmf${pred}",
+ [(int_tpu_sfence_scmf)]>,
+ Sched<[WriteSfence]>, Bundle<B_Sany>;
+def SFENCE_SELr : TPUInstP<(outs), (ins GPR:$s),
+ "_ =\tsfence.sel${pred} $s",
+ [(int_tpu_sfence_sel GPR:$s)]>,
+ Sched<[WriteSfence]>, Bundle<B_Sany>;
+def SFENCE_SELi : TPUInstP<(outs), (ins tsimmi:$bm),
+ "_ =\tsfence.sel${pred} $bm",
+ [(int_tpu_sfence_sel (i32 imm:$bm))]>,
+ Sched<[WriteSfence]>, Bundle<B_Sany>;
+} // hasSideEffects = 1, mayLoad = 1, SubUnits = [SU_fence]
+
+} // Predicates = [NotBC]
+
+//===----------------------------------------------------------------------===//
+// Local DMA Operations for TensorCore
+//===----------------------------------------------------------------------===//
+
+let Predicates = [NotBC, NotSC] in {
+
+// TODO(thomasraoux): Mark those instructions as using all Vs slots.
+multiclass DMAStrided<string srcmem, string dstmem, DAGOperand FlagT,
+ dag PatFlagType, DAGOperand LenT, int PatLenType_isimm,
+ Intrinsic strided_intr =
+ !cast<Intrinsic>("int_tpu_dma_"#srcmem#"_to_"#dstmem#"_single_strided")> {
+ def "" : TPUInstP<(outs), (ins GPR:$dst, FlagT:$sflag, GPR:$src, LenT:$len,
+ GPR:$srcs, GPR:$dsts, GPR:$els),
+ "["#dstmem#":${dst}@${dsts}], [sflag:${sflag}] =\tdma.strided${pred} ["
+ #srcmem#":${src}@${srcs}], length:${len}, elements_per_stride:${els}",
+ [(strided_intr PatFlagType, GPR:$src, GPR:$dst, !if(PatLenType_isimm, imm, GPR):$len,
+ GPR:$srcs, GPR:$dsts, GPR:$els)]>,
+ Bundle<B_Sboth>, Sched<[WriteDmaLocal]>, SubUnits<[SU_dma]>;
+}
+
+multiclass DMAGeneral<string srcmem, string dstmem, DAGOperand LenT,
+ int PatLenType_isimm, DAGOperand DescT, int PatDescType_isimm,
+ Intrinsic general_intr =
+ !cast<Intrinsic>("int_tpu_dma_"#srcmem#"_to_"#dstmem#"_general")> {
+ def "" : TPUInstP<(outs), (ins GPR:$dst, GPR:$dstflags, GPR:$src,
+ GPR:$sflag, LenT:$len, DescT:$desc,
+ i32imm:$scount, GPR:$override),
+ "["#dstmem#":${dst}], [sflag:${dstflags}] =\tdma.general${pred} ["#srcmem
+ #":${src}], [sflag:${sflag}], length:${len}, [smem:${desc}], stride_count:${scount}, ici_dest:${override}",
+ [(general_intr GPR:$dstflags, GPR:$src, GPR:$dst, !if(PatLenType_isimm, imm, GPR):$len, GPR:$sflag,
+ !if(PatDescType_isimm, imm, GPR):$desc, (i32 imm:$scount), GPR:$override)]>,
+ Bundle<B_Sboth>, Sched<[WriteDmaGeneral]>, SubUnits<[SU_dma]>;
+}
+
+multiclass DMA_Extended_<string srcmem, string dstmem> {
+defm _STRIDEDrr : DMAStrided<srcmem, dstmem, GPR, (i32 GPR:$sflag), GPR, 0 /* !isimm */>;
+defm _STRIDEDri :
+ DMAStrided<srcmem, dstmem, i32imm, (Wrapper tglobaladdr:$sflag), GPR, 0 /* !isimm */>,
+ BundleImmSy;
+defm _STRIDEDir : DMAStrided<srcmem, dstmem, GPR, (i32 GPR:$sflag), i32imm, 1 /* isimm */>,
+ BundleImmSy;
+defm _STRIDEDii :
+ DMAStrided<srcmem, dstmem, i32imm, (Wrapper tglobaladdr:$sflag), i32imm, 1 /* isimm */>,
+ BundleImmSy<[IMM_OP_0, IMM_OP_1]>;
+
+defm _GENERALrr : DMAGeneral<srcmem, dstmem, GPR, 0 /* !isimm */, GPR, 0 /* !isimm */>;
+defm _GENERALri : DMAGeneral<srcmem, dstmem, i32imm, 1 /* isimm */, GPR, 0 /* !isimm */>,
+ BundleImmSy;
+defm _GENERALir : DMAGeneral<srcmem, dstmem, GPR, 0 /* !isimm */, i32imm, 1 /* isimm */>,
+ BundleImmSy;
+defm _GENERALii : DMAGeneral<srcmem, dstmem, i32imm, 1 /* isimm */, i32imm, 1 /* isimm */>,
+ BundleImmSy<[IMM_OP_0, IMM_OP_1]>;
+}
+
+defm DMA_HBM_TO_SMEM : DMA_Extended_<"hbm", "smem">;
+defm DMA_HBM_TO_VMEM : DMA_Extended_<"hbm", "vmem">;
+defm DMA_SMEM_TO_HBM : DMA_Extended_<"smem", "hbm">;
+defm DMA_VMEM_TO_HBM : DMA_Extended_<"vmem", "hbm">;
+} // Predicates = [NotBC, NotSC]
+
+let Predicates = [NotBC] in {
+let hasSideEffects = 1, mayLoad = 1, mayStore = 1, SubUnits = [SU_descriptor_dma] in {
+// DMA Descriptor
+def DMADescr : TPUInstP<(outs), (ins GPR:$desc),
+ "_ =\tdma.desc${pred} [smem:${desc}]",
+ [(int_tpu_dma_descriptor GPR:$desc)]>,
+ Bundle<B_S1>, Sched<[WriteDmaLocal]>;
+def DMADesci : TPUInstP<(outs), (ins i32imm:$desc),
+ "_ =\tdma.desc${pred} [smem:${desc}]",
+ [(int_tpu_dma_descriptor (i32 imm:$desc))]>,
+ Bundle<B_S1>, BundleImmSy, Sched<[WriteDmaLocal]>;
+} // hasSideEffects = 1, mayLoad = 1, mayStore = 1, SubUnits = [SU_descriptor_dma]
+
+let hasSideEffects = 1, mayLoad = 1, mayStore = 1 in {
+//
+// Note that these instructions are not predicated during lowering: the
+// predicate goes at the end of the operand list, and the length of the
+// variadic list is not known at compile time. The LOWER versions are temporary
+// unpredicated instructions so that lowering works with the variadic reglist;
+// this allows us to keep the predicate at the end of the operand list as
+// usual. They should get converted into EVENT early after isel.
+//
+def EVENT_LOWER : TPUInst<(outs), (ins i32imm:$tag, reglist:$regs, variable_ops),
+ "_ =\tEVENT_LOWER $tag$regs", []>;
+def EVENT_NULLARY_LOWER : TPUInst<(outs), (ins i32imm:$tag),
+ "_ =\tEVENT_NULLARY_LOWER $tag", []>;
+def EVENT : TPUInstP<(outs), (ins i32imm:$tag, reglist:$regs, variable_ops),
+ "_ =\tevent${pred} $tag$regs", []>;
+def EVENT_NULLARY : TPUInstP<(outs), (ins i32imm:$tag),
+ "_ =\tevent${pred} $tag", []>;
+} // hasSideEffects = 1, mayLoad = 1, mayStore = 1
+
+def SDELAY : TPUInstP<(outs), (ins i32imm:$cycles),
+ "_ =\tsdelay${pred} $cycles", []>, Bundle<B_Sany>,
+ SubUnits<[SU_delay]>;
+
+} // Predicates = [NotBC]
+
+let Predicates = [NotBC, NotSC] in {
+def VDELAY : TPUInstP<(outs), (ins i32imm:$cycles),
+ "_ =\tvdelay${pred} $cycles", []>, Bundle<B_SM>,
+ IsVectorInstruction, SubUnits<[SU_vdelay]>;
+// For delay longer than 8 cycles we need to use immediate slots.
+def VDELAY_LONG : TPUInstP<(outs), (ins i32imm:$cycles),
+ "_ =\tvdelay${pred} $cycles", []>, Bundle<B_SM>,
+ BundleImmVy<[IMM_OP_0], IMM_2_to_5>, IsVectorInstruction,
+ SubUnits<[SU_vdelay]>;
+} // Predicates = [NotBC, NotSC]
+
+//===----------------------------------------------------------------------===//
+// Vector ALU ops
+//===----------------------------------------------------------------------===//
+
+multiclass UnaryOp<string Name, SDPatternOperator Intr, ValueTypeByHwMode VDstType,
+ ValueTypeByHwMode SrcType, TPUInstVEncoding enc = VIntALUEupOpEncoding<0>> {
+ defm "" : TPUInstVany<B_Vany, enc, (outs VPR:$Vd), (ins VPR:$x), "$Vd =\t"#Name#"${pred} $x",
+ [(set (VDstType VPR:$Vd), (Intr (SrcType VPR:$x)))], YOpIdxNone>,
+ IsVectorInstruction;
+}
+
+multiclass UnaryVFOp<string Name, SDPatternOperator Intr, TPUInstVEncoding enc = VIntALUEupOpEncoding<0>> {
+ defm "" : UnaryOp<Name, Intr, vNf32, vNf32, enc>;
+}
+
+multiclass VIntALUOp<VIntALUOpEncoding enc, string Name, SDPatternOperator OpNode, ValueTypeByHwMode VDstAluType,
+ ValueTypeByHwMode VSrcAluType, BundleSlot Slot, string XY, int HasSplat = 1> {
+ if !eq(HasSplat, 1) then {
+ // Register-immediate - full 32-bit immediate.
+ defm ri : TPUInst<Slot, enc, (outs VPR_AGG:$Vd), (ins VPR_AGG:$x, tvimmi:$y),
+ "$Vd =\t" # Name # "${pred} " # XY,
+ [(set (VDstAluType VPR_AGG:$Vd), (OpNode (VSrcAluType VPR_AGG:$x),
+ (VSrcAluType (Splat imm:$y))))]>,
+ IsVectorInstruction;
+ // Register-scalar - splat a scalar into all lanes of a vector.
+ defm rs : TPUInst<Slot, enc, (outs VPR_AGG:$Vd), (ins VPR_AGG:$x, GPR:$y),
+ "$Vd =\t" # Name # "${pred} " # XY,
+ [(set (VDstAluType VPR_AGG:$Vd), (OpNode (VSrcAluType VPR_AGG:$x),
+ (VSrcAluType (Splat (i32 GPR:$y)))))]>,
+ IsVectorInstruction;
+ }
+ // Register-register.
+ defm rr : TPUInst<Slot, enc, (outs VPR_AGG:$Vd), (ins VPR_AGG:$x, VPR_AGG:$y),
+ "$Vd =\t" # Name # "${pred} " # XY,
+ [(set (VDstAluType VPR_AGG:$Vd), (OpNode (VSrcAluType VPR_AGG:$x),
+ (VSrcAluType VPR_AGG:$y)))]>,
+ IsVectorInstruction;
+}
+
+multiclass VFPALUOp<VIntALUOpEncoding enc, string Name, SDPatternOperator OpNode, ValueTypeByHwMode VDstAluType,
+ ValueTypeByHwMode VSrcAluType, BundleSlot Slot, string XY, int HasSplat,
+ DAGOperand SplatImmTy> {
+ if !eq(HasSplat, 1) then {
+ defm ri : TPUInst<Slot, enc, (outs VPR_AGG:$Vd), (ins VPR_AGG:$x, SplatImmTy:$y),
+ "$Vd =\t" # Name # "${pred} " # XY,
+ [(set (VDstAluType VPR_AGG:$Vd), (OpNode (VSrcAluType VPR_AGG:$x),
+ (VSrcAluType (Splat fpimm:$y))))]>,
+ IsVectorInstruction;
+ defm rs : TPUInst<Slot, enc, (outs VPR_AGG:$Vd), (ins VPR_AGG:$x, GPR:$y),
+ "$Vd =\t" # Name # "${pred} " # XY,
+ [(set (VDstAluType VPR_AGG:$Vd), (OpNode (VSrcAluType VPR_AGG:$x),
+ (VSrcAluType (Splat (f32 GPR:$y)))))]>,
+ IsVectorInstruction;
+ }
+ defm rr : TPUInst<Slot, enc, (outs VPR_AGG:$Vd), (ins VPR_AGG:$x, VPR_AGG:$y),
+ "$Vd =\t" # Name # "${pred} " # XY,
+ [(set (VDstAluType VPR_AGG:$Vd), (OpNode (VSrcAluType VPR_AGG:$x),
+ (VSrcAluType VPR_AGG:$y)))]>,
+ IsVectorInstruction;
+}
+
+multiclass VIntALUOpXY<bits<6> opc, string Name, SDPatternOperator OpNode, ValueTypeByHwMode VDstAluType,
+ ValueTypeByHwMode VSrcAluType, BundleSlot Slot = B_Vany, int HasSplat = 1> :
+ VIntALUOp<VIntALUOpEncoding<opc>, Name, OpNode, VDstAluType, VSrcAluType, Slot, "$x, $y", HasSplat>;
+multiclass VIntALUOpYX<bits<6> opc, string Name, SDPatternOperator OpNode, ValueTypeByHwMode VDstAluType,
+ ValueTypeByHwMode VSrcAluType, BundleSlot Slot = B_Vany, int HasSplat = 1> :
+ VIntALUOp<VIntALUOpEncoding<opc>, Name, OpNode, VDstAluType, VSrcAluType, Slot, "$y, $x", HasSplat>;
+multiclass VFPALUOpXY<bits<6> opc, string Name, SDPatternOperator OpNode, ValueTypeByHwMode VDstAluType,
+ ValueTypeByHwMode VSrcAluType, BundleSlot Slot = B_Vany, int HasSplat = 1,
+ DAGOperand SplatImmTy = tvimmf> :
+ VFPALUOp<VIntALUOpEncoding<opc>, Name, OpNode, VDstAluType, VSrcAluType, Slot, "$x, $y", HasSplat,
+ SplatImmTy>;
+multiclass VFPALUOpYX<bits<6> opc, string Name, SDPatternOperator OpNode, ValueTypeByHwMode VDstAluType,
+ ValueTypeByHwMode VSrcAluType, BundleSlot Slot = B_Vany, int HasSplat = 1,
+ DAGOperand SplatImmTy = tvimmf> :
+ VFPALUOp<VIntALUOpEncoding<opc>, Name, OpNode, VDstAluType, VSrcAluType, Slot, "$y, $x", HasSplat,
+ SplatImmTy>;
+
+// Non-commutative version of VFPALUOpYX.
+multiclass VFPALUOpYX_NC<bits<6> opc, string Name, SDPatternOperator OpNode, ValueTypeByHwMode VDstAluType,
+ ValueTypeByHwMode VSrcAluType, BundleSlot Slot = B_Vany, int HasSplat = 1> {
+if !eq(HasSplat, 1) then {
+ defm ir : TPUInst<Slot, VIntALUOpEncoding<opc>, (outs VPR_AGG:$Vd), (ins tvimmf:$y, VPR_AGG:$x),
+ "$Vd =\t" # Name # "${pred} $y, $x",
+ [(set (VDstAluType VPR_AGG:$Vd), (OpNode (VSrcAluType (Splat fpimm:$y)),
+ (VSrcAluType VPR_AGG:$x)))],
+ YOpIdx1>, IsVectorInstruction;
+ defm sr : TPUInst<Slot, VIntALUOpEncoding<opc>, (outs VPR_AGG:$Vd), (ins GPR:$y, VPR_AGG:$x),
+ "$Vd =\t" # Name # "${pred} $y, $x",
+ [(set (VDstAluType VPR_AGG:$Vd), (OpNode (VSrcAluType (Splat GPR:$y)),
+ (VSrcAluType VPR_AGG:$x)))],
+ YOpIdx1>, IsVectorInstruction;
+}
+defm rr : TPUInst<Slot, VIntALUOpEncoding<opc>, (outs VPR_AGG:$Vd), (ins VPR_AGG:$y, VPR_AGG:$x),
+ "$Vd =\t" # Name # "${pred} $y, $x",
+ [(set (VDstAluType VPR_AGG:$Vd), (OpNode (VSrcAluType VPR_AGG:$y),
+ (VSrcAluType VPR_AGG:$x)))],
+ YOpIdx1>, IsVectorInstruction;
+}
+
+// Non-commutative version of VIntALUOpYX.
+multiclass VIntALUOpYX_NC<bits<6> opc, string Name, SDPatternOperator OpNode, ValueTypeByHwMode VDstAluType,
+ ValueTypeByHwMode VSrcAluType, BundleSlot Slot = B_Vany, int HasSplat = 1> {
+if !eq(HasSplat, 1) then {
+ defm ir : TPUInst<Slot, VIntALUOpEncoding<opc>, (outs VPR_AGG:$Vd), (ins tvimmi:$y, VPR_AGG:$x),
+ "$Vd =\t" # Name # "${pred} $y, $x",
+ [(set (VDstAluType VPR_AGG:$Vd), (OpNode (VSrcAluType (Splat imm:$y)),
+ (VSrcAluType VPR_AGG:$x)))],
+ YOpIdx1>, IsVectorInstruction;
+ defm sr : TPUInst<Slot, VIntALUOpEncoding<opc>, (outs VPR_AGG:$Vd), (ins GPR:$y, VPR_AGG:$x),
+ "$Vd =\t" # Name # "${pred} $y, $x",
+ [(set (VDstAluType VPR_AGG:$Vd), (OpNode (VSrcAluType (Splat GPR:$y)),
+ (VSrcAluType VPR_AGG:$x)))],
+ YOpIdx1>, IsVectorInstruction;
+}
+defm rr : TPUInst<Slot, VIntALUOpEncoding<opc>, (outs VPR_AGG:$Vd), (ins VPR_AGG:$y, VPR_AGG:$x),
+ "$Vd =\t" # Name # "${pred} $y, $x",
+ [(set (VDstAluType VPR_AGG:$Vd), (OpNode (VSrcAluType VPR_AGG:$y),
+ (VSrcAluType VPR_AGG:$x)))],
+ YOpIdx1>, IsVectorInstruction;
+}
+
+let SubUnits = [SU_vector_move, SU_xrf_result] in {
+let isMoveReg = 1 in {
+defm VMOVr : TPUInstVany<B_VMOVR, VIntALUUnOpEncoding<31>, (outs VPR:$Vd), (ins VPR:$y),
+ "$Vd =\tvmov${pred} $y", [], YOpIdx1>, IsVectorInstruction;
+let isPseudo = 1, hasSideEffects = 1 in {
+// VMOVr with side effects, used to escape CSE.
+defm VMOV_SEr : TPUInstVany<B_VMOVR, VIntALUUnOpEncoding<31>, (outs VPR:$Vd), (ins VPR:$y),
+ "$Vd =\tVMOV_SEr${pred} $y", [], YOpIdx1>, IsVectorInstruction;
+} // isPseudo = 1, hasSideEffects = 1
+} // let isMoveReg = 1
+} // let SubUnits = [SU_vector_move, SU_xrf_result]
+
+let SubUnits = [SU_vector_move] in {
+defm VMOVs : TPUInstVany<B_Vany, VIntALUUnOpEncoding<31>, (outs VPR:$Vd), (ins GPR:$y),
+ "$Vd =\tvmov${pred} $y",
+ [(set VPR:$Vd, (vNf32 (Splat GPR:$y)))], YOpIdx1>, IsVectorInstruction;
+let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+defm VIMMF : TPUInstVany<B_Vany, VIntALUUnOpEncoding<31>, (outs VPR:$Vd), (ins tvimmf:$y),
+ "$Vd =\tvimm.f32${pred} $y",
+ [(set VPR:$Vd, (vNf32 (Splat fpimm:$y)))], YOpIdx1>,
+ IsVectorInstruction;
+// The integer form needs its own definition, as integer immediates are handled differently.
+defm VIMMI : TPUInstVany<B_Vany, VIntALUUnOpEncoding<31>, (outs VPR:$Vd), (ins tvimmi:$y),
+ "$Vd =\tvimm.s32${pred} $y",
+ [(set VPR:$Vd, (vNi32 (Splat (i32 imm:$y))))], YOpIdx1>,
+ IsVectorInstruction;
+} // let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1
+} // let SubUnits = [SU_vector_move]
+
+def : Pat<(vNi32 (Splat GPR:$y)), (VMOVs GPR:$y)>;
+
+let Predicates = [HasVPU] in {
+defm VADD : VIntALUOpYX<0, "vadd.s32", add, vNi32, vNi32>, SubUnits<[SU_vector_op]>;
+defm VSUB : VIntALUOpYX_NC<0, "vsub.s32", sub, vNi32, vNi32>, SubUnits<[SU_vector_op]>;
+// The pattern is unpredicated; targets without support emulate it.
+defm VMUL : VIntALUOpYX<0, "vmul.u32", mul, vNi32, vNi32>, SubUnits<[SU_vector_op]>;
+defm VFADD : VFPALUOpYX<5, "vadd.f32", fadd, vNf32, vNf32, B_VFADD>, Sched<[WriteFadd]>,
+ SubUnits<[SU_vector_float]>;
+defm VFMUL : VFPALUOpYX<7, "vmul.f32", fmul, vNf32, vNf32, B_VFMUL>, Sched<[WriteFmul]>,
+ SubUnits<[SU_vector_fmul]>;
+defm VFSUB : VFPALUOpYX_NC<6, "vsub.f32", fsub, vNf32, vNf32, B_VFADD>, Sched<[WriteFadd]>,
+ SubUnits<[SU_vector_float]>;
+
+let SubUnits = [SU_vector_op] in {
+defm VAND : VIntALUOpYX<2, "vand.u32", and, vNi32, vNi32>;
+defm VOR : VIntALUOpYX<3, "vor.u32", or, vNi32, vNi32>;
+defm VXOR : VIntALUOpYX<4, "vxor.u32", xor, vNi32, vNi32>;
+defm VFMAX : VFPALUOpXY<8, "vmax.f32", fmaximum, vNf32, vNf32>;
+defm VFMIN : VFPALUOpXY<9, "vmin.f32", fminimum, vNf32, vNf32>;
+} // let SubUnits = [SU_vector_op]
+
+let SubUnits = [SU_vector_shift] in {
+defm VSHL : VIntALUOpXY<10, "vshll.u32", shl, vNi32, vNi32, B_VARI>;
+defm VSRL : VIntALUOpXY<11, "vshrl.u32", srl, vNi32, vNi32, B_VARI>;
+defm VSRA : VIntALUOpXY<12, "vshra.s32", sra, vNi32, vNi32, B_VARI>;
+defm VRSRA : VIntALUOpXY<13, "vrshra.s32", int_tpu_vrshra, vNi32, vNi32, B_VARI>,
+ Sched<[WriteVrshra]>;
+} // let SubUnits = [SU_vector_shift]
+
+// This works correctly for all finite and non-finite numbers.
+//
+// NaNs are represented by an exponent field of all ones and a non-zero
+// significand. Masking away the sign bit merely changes the sign of the
+// NaN.
+def : Pat<(vNf32 (fabs VPR:$x)), (VANDri VPR:$x, (i32 0x7fffffff))>;
+
+def : Pat<(vNf32 (fcopysign (vNf32 VPR:$x), (vNf32 VPR:$y))),
+ (VORrr (VANDri VPR:$x, (i32 0x7fffffff)), (VANDri VPR:$y, (i32 0x80000000)))>;
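
These patterns (and the fneg pattern further below) rely only on the IEEE-754 sign bit being bit 31. A minimal scalar sketch of the same bit manipulation, assuming C++20 std::bit_cast:

#include <bit>
#include <cstdint>

float FabsBits(float x) {               // VANDri with 0x7fffffff
  return std::bit_cast<float>(std::bit_cast<uint32_t>(x) & 0x7fffffffu);
}
float CopysignBits(float x, float y) {  // VORrr of the two masked values
  return std::bit_cast<float>((std::bit_cast<uint32_t>(x) & 0x7fffffffu) |
                              (std::bit_cast<uint32_t>(y) & 0x80000000u));
}
float FnegBits(float x) {               // VXORri with 0x80000000, defined below
  return std::bit_cast<float>(std::bit_cast<uint32_t>(x) ^ 0x80000000u);
}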
+
+def : Pat<(vNi32 (int_tpu_shll (vNi32 VPR:$lhs), (vNi32 VPR:$rhs))),
+ (VSHLrr VPR:$lhs, VPR:$rhs)>;
+def : Pat<(vNi32 (int_tpu_shll (vNi32 VPR:$lhs), (vNi32 (Splat (i32 imm:$rhs))))),
+ (VSHLri VPR:$lhs, (i32 imm:$rhs))>;
+def : Pat<(vNi32 (int_tpu_shll (vNi32 VPR:$lhs), (vNi32 (Splat (i32 GPR:$rhs))))),
+ (VSHLrs VPR:$lhs, GPR:$rhs)>;
+def : Pat<(vNi32 (int_tpu_shrl (vNi32 VPR:$lhs), (vNi32 VPR:$rhs))),
+ (VSRLrr VPR:$lhs, VPR:$rhs)>;
+def : Pat<(vNi32 (int_tpu_shrl (vNi32 VPR:$lhs), (vNi32 (Splat (i32 imm:$rhs))))),
+ (VSRLri VPR:$lhs, (i32 imm:$rhs))>;
+def : Pat<(vNi32 (int_tpu_shrl (vNi32 VPR:$lhs), (vNi32 (Splat (i32 GPR:$rhs))))),
+ (VSRLrs VPR:$lhs, GPR:$rhs)>;
+def : Pat<(vNi32 (int_tpu_shra (vNi32 VPR:$lhs), (vNi32 VPR:$rhs))),
+ (VSRArr VPR:$lhs, VPR:$rhs)>;
+def : Pat<(vNi32 (int_tpu_shra (vNi32 VPR:$lhs), (vNi32 (Splat (i32 imm:$rhs))))),
+ (VSRAri VPR:$lhs, (i32 imm:$rhs))>;
+def : Pat<(vNi32 (int_tpu_shra (vNi32 VPR:$lhs), (vNi32 (Splat (i32 GPR:$rhs))))),
+ (VSRArs VPR:$lhs, (i32 GPR:$rhs))>;
+
+defm VCLAMPZ : VFPALUOpXY<30, "vclamp.gez.f32", Relu, vNf32, vNf32, B_VCLAMP>,
+ SubUnits<[SU_vector_math]>;
+} // let Predicates = [HasVPU]
+
+let Predicates = [HasPxcVPU] in {
+defm VCLAMPS : VFPALUOpXY<54, "vclamps.f32", int_tpu_clamp_symmetric, vNf32, vNf32, B_VCLAMP>,
+ SubUnits<[SU_vector_math]>;
+} // let Predicates = [HasPxcVPU]
+
+// XOR can go in slot 0 or 1 while fsub can only go in slot 1.
+def : Pat<(vNf32 (fneg VPR:$x)), (VXORri VPR:$x, (i32 0x80000000))>;
+
+let Predicates = [IsVFTCOrSC, HasVPU] in {
+defm VCEIL : UnaryVFOp<"vceil.f32", fceil>, SubUnits<[SU_vector_math_ext]>;
+defm VFLOOR : UnaryVFOp<"vfloor.f32", ffloor>, SubUnits<[SU_vector_math_ext]>;
+} // Predicates = [IsVFTCOrSC, HasVPU]
+
+//===----------------------------------------------------------------------===//
+// Vector conversion ops
+//===----------------------------------------------------------------------===//
+let Predicates = [HasVPU, NotBC, NotVFTC, NotSC] in {
+let SubUnits = [SU_vector_cvt] in {
+def VFPTOSIrr : TPUInstP<(outs VPR:$Vd), (ins VPR:$x, VPR:$y),
+ "$Vd =\tvcvt.f32.s32${pred} $x, $y",
+ [(set (vNi32 VPR:$Vd),
+ (int_tpu_cvt_pr_fptosi (vNf32 VPR:$x), (vNi32 VPR:$y)))]>,
+ Bundle<B_VCVT>, Sched<[WriteFPConvert]>, IsVectorInstruction;
+def VFPTOSIrs : TPUInstP<(outs VPR:$Vd), (ins VPR:$x, GPR:$y),
+ "$Vd =\tvcvt.f32.s32${pred} $x, $y",
+ [(set (vNi32 VPR:$Vd),
+ (int_tpu_cvt_pr_fptosi (vNf32 VPR:$x), (vNi32 (Splat (i32 GPR:$y)))))]>,
+ Bundle<B_VCVT>, Sched<[WriteFPConvert]>, IsVectorInstruction;
+def VFPTOSIri : TPUInstP<(outs VPR:$Vd), (ins VPR:$x, tvimmi:$y),
+ "$Vd =\tvcvt.f32.s32${pred} $x, $y",
+ [(set (vNi32 VPR:$Vd),
+ (int_tpu_cvt_pr_fptosi (vNf32 VPR:$x), (vNi32 (Splat (i32 imm:$y)))))]>,
+ Bundle<B_VCVT>, Sched<[WriteFPConvert]>, IsVectorInstruction;
+} // SubUnits = [SU_vector_cvt]
+def : Pat<(vNi32 (fp_to_sint (vNf32 VPR:$x))), (VFPTOSIri VPR:$x, (i32 -1))>;
+} // Predicates = [HasVPU, NotBC, NotVFTC, NotSC]
+
+let Predicates = [HasVPU, IsVFTCOrSC] in {
+def VFPTOSIr : TPUInstP<(outs VPR:$Vd), (ins VPR:$x),
+ "$Vd =\tvcvt.f32.s32${pred} $x",
+ [(set (vNi32 VPR:$Vd),
+ (int_tpu_cvt_fptosi (vNf32 VPR:$x)))]>,
+ Bundle<B_VCVT>, Sched<[WriteFPConvert]>,
+ IsVectorInstruction, SubUnits<[SU_vector_cvt_ext]>;
+def : Pat<(vNi32 (fp_to_sint (vNf32 VPR:$x))), (VFPTOSIr VPR:$x)>;
+} // Predicates = [HasVPU, IsVFTCOrSC]
+
+let Predicates = [NotBC, HasVPU] in {
+let SubUnits = [SU_vector_cvt] in {
+def VSITOFPr : TPUInstP<(outs VPR:$Vd), (ins VPR:$x),
+ "$Vd =\tvcvt.s32.f32${pred} $x",
+ [(set VPR:$Vd, (sint_to_fp (vNi32 VPR:$x)))]>,
+ Bundle<B_VCVT>, Sched<[WriteFPConvert]>, IsVectorInstruction;
+} // let SubUnits = [SU_vector_cvt]
+} // let Predicates = [NotBC, HasVPU]
+
+let Predicates = [IsVFTCOrSC, HasVPU], SubUnits = [SU_vector_cvt_ext] in {
+def VFPTOBF8 : TPUInstP<(outs VPR:$Vd), (ins VPR:$x),
+ "$Vd =\tvcvt.f32.bf8${pred} $x",
+ [(set (vNf32 VPR:$Vd), (int_tpu_vcvt_fptobf8 (vNf32 VPR:$x)))]>,
+ Bundle<B_VCVT>, Sched<[WriteFPConvert]>, IsVectorInstruction;
+def VFPTOIF8 : TPUInstP<(outs VPR:$Vd), (ins VPR:$x),
+ "$Vd =\tvcvt.f32.if8${pred} $x",
+ [(set (vNf32 VPR:$Vd), (int_tpu_vcvt_fptoif8 (vNf32 VPR:$x)))]>,
+ Bundle<B_VCVT>, Sched<[WriteFPConvert]>, IsVectorInstruction;
+def VFPTOBF16 : TPUInstP<(outs VPR:$Vd), (ins VPR:$x),
+ "$Vd =\tvcvt.f32.bf16${pred} $x",
+ [(set (vNf32 VPR:$Vd), (int_tpu_vcvt_fptobf16 (vNf32 VPR:$x)))]>,
+ Bundle<B_VCVT>, Sched<[WriteFPConvert]>, IsVectorInstruction;
+def VFPTOHF16 : TPUInstP<(outs VPR:$Vd), (ins VPR:$x),
+ "$Vd =\tvcvt.f32.hf16${pred} $x",
+ [(set (vNf32 VPR:$Vd), (int_tpu_vcvt_fptohf16 (vNf32 VPR:$x)))]>,
+ Bundle<B_VCVT>, Sched<[WriteFPConvert]>, IsVectorInstruction;
+
+multiclass VFloatConvertSrOp<string Name, SDPatternOperator OpNode> {
+ def rr : TPUInstP<(outs VPR:$Vd), (ins VPR:$y, VPR:$x),
+ "$Vd =\t" # Name # "${pred} $y, $x",
+ [(set (vNf32 VPR:$Vd), (OpNode (vNi32 VPR:$y), (vNf32 VPR:$x)))]>,
+ Bundle<B_VCVT>, Sched<[WriteFPConvert]>, IsVectorInstruction;
+
+ def ir : TPUInstP<(outs VPR:$Vd), (ins i32imm:$y, VPR:$x),
+ "$Vd =\t" # Name # "${pred} $y, $x",
+ [(set (vNf32 VPR:$Vd), (OpNode (vNi32 (Splat (i32 imm:$y))), (vNf32 VPR:$x)))]>,
+ Bundle<B_VCVT>, Sched<[WriteFPConvert]>, IsVectorInstruction;
+
+ def sr : TPUInstP<(outs VPR:$Vd), (ins GPR:$y, VPR:$x),
+ "$Vd =\t" # Name # "${pred} $y, $x",
+ [(set (vNf32 VPR:$Vd), (OpNode (vNi32 (Splat (i32 GPR:$y))), (vNf32 VPR:$x)))]>,
+ Bundle<B_VCVT>, Sched<[WriteFPConvert]>, IsVectorInstruction;
+}
+
+defm VFPTOBF8_SR : VFloatConvertSrOp<"vcvt.sr.f32.bf8", int_tpu_vcvt_sr_fptobf8>;
+defm VFPTOIF8_SR : VFloatConvertSrOp<"vcvt.sr.f32.if8", int_tpu_vcvt_sr_fptoif8>;
+defm VFPTOBF16_SR : VFloatConvertSrOp<"vcvt.sr.f32.bf16", int_tpu_vcvt_sr_fptobf16>;
+defm VFPTOHF16_SR : VFloatConvertSrOp<"vcvt.sr.f32.hf16", int_tpu_vcvt_sr_fptohf16>;
+} // let Predicates = [IsVFTCOrSC, HasVPU], SubUnits = [SU_vector_cvt_ext]
+
+//===----------------------------------------------------------------------===//
+// Vector comparison ops
+//===----------------------------------------------------------------------===//
+multiclass VIntCompareOp<bits<6> opc, string Name, SDPatternOperator OpNode,
+ ValueTypeByHwMode VMskType, ValueTypeByHwMode VCmpType> {
+ defm ri : TPUInst<B_Vany, VIntALUOpEncoding<opc>, (outs MPR:$Md), (ins VPR_AGG:$x, tvimmi:$y),
+ "$Md =\t" # Name # "${pred} $x, $y",
+ [(set (VMskType MPR:$Md), (OpNode (VCmpType VPR_AGG:$x), (VCmpType (Splat (i32 imm:$y)))))]>,
+ IsVectorInstruction;
+ defm rs : TPUInst<B_Vany, VIntALUOpEncoding<opc>, (outs MPR:$Md), (ins VPR_AGG:$x, GPR:$y),
+ "$Md =\t" # Name # "${pred} $x, $y",
+ [(set (VMskType MPR:$Md), (OpNode (VCmpType VPR_AGG:$x),
+ (VCmpType (Splat (i32 GPR:$y)))))]>,
+ IsVectorInstruction;
+ defm rr : TPUInst<B_Vany, VIntALUOpEncoding<opc>, (outs MPR:$Md), (ins VPR_AGG:$x, VPR_AGG:$y),
+ "$Md =\t" # Name # "${pred} $x, $y",
+ [(set (VMskType MPR:$Md), (OpNode (VCmpType VPR_AGG:$x), (VCmpType VPR_AGG:$y)))]>,
+ IsVectorInstruction;
+}
+
+multiclass VIntUnaryCompareOp<string Name, SDPatternOperator OpNode,
+ ValueTypeByHwMode VMskType, ValueTypeByHwMode VCmpType> {
+ def s : TPUInstP<(outs MPR:$Md), (ins GPR:$y),
+ "$Md =\t" # Name # "${pred} $y",
+ [(set (VMskType MPR:$Md), (OpNode (VCmpType (Splat GPR:$y))))]>,
+ Bundle<B_Vany>, IsVectorInstruction;
+ def r : TPUInstP<(outs MPR:$Md), (ins VPR_AGG:$y),
+ "$Md =\t" # Name # "${pred} $y",
+ [(set (VMskType MPR:$Md), (OpNode (VCmpType VPR_AGG:$y)))]>,
+ Bundle<B_Vany>, IsVectorInstruction;
+}
+
+multiclass VFPCompareOp<bits<6> opc, string Name, SDPatternOperator OpNode,
+ ValueTypeByHwMode VMskType, ValueTypeByHwMode VCmpType> {
+ defm ri : TPUInst<B_Vany, VIntALUOpEncoding<opc>, (outs MPR:$Md), (ins VPR_AGG:$x, tvimmf:$y),
+ "$Md =\t" # Name # "${pred} $x, $y",
+ [(set (VMskType MPR:$Md), (OpNode (VCmpType VPR_AGG:$x),
+ (VCmpType (Splat (f32 fpimm:$y)))))]>,
+ IsVectorInstruction;
+ defm rs : TPUInst<B_Vany, VIntALUOpEncoding<opc>, (outs MPR:$Md), (ins VPR_AGG:$x, GPR:$y),
+ "$Md =\t" # Name # "${pred} $x, $y",
+ [(set (VMskType MPR:$Md), (OpNode (VCmpType VPR_AGG:$x),
+ (VCmpType (Splat (f32 GPR:$y)))))]>,
+ IsVectorInstruction;
+ defm rr : TPUInst<B_Vany, VIntALUOpEncoding<opc>, (outs MPR:$Md), (ins VPR_AGG:$x, VPR_AGG:$y),
+ "$Md =\t" # Name # "${pred} $x, $y",
+ [(set (VMskType MPR:$Md), (OpNode (VCmpType VPR_AGG:$x), (VCmpType VPR_AGG:$y)))]>,
+ IsVectorInstruction;
+}
+
+let SubUnits = [SU_vector_cmp] in {
+let Predicates = [HasVPU] in {
+let isCompare = 1 in {
+defm VCMPEQ : VIntCompareOp<32, "veq.s32", seteq, vNi1, vNi32>;
+defm VCMPNE : VIntCompareOp<33, "vne.s32", setne, vNi1, vNi32>;
+defm VCMPGT : VIntCompareOp<34, "vgt.s32", setgt, vNi1, vNi32>;
+defm VCMPGE : VIntCompareOp<35, "vge.s32", setge, vNi1, vNi32>;
+defm VCMPLT : VIntCompareOp<36, "vlt.s32", setlt, vNi1, vNi32>;
+defm VCMPLE : VIntCompareOp<37, "vle.s32", setle, vNi1, vNi32>;
+defm VCMPUGT : VIntCompareOp<80, "vgt.u32", setugt, vNi1, vNi32>, Requires<[IsSC]>;
+defm VCMPUGE : VIntCompareOp<81, "vge.u32", setuge, vNi1, vNi32>, Requires<[IsSC]>;
+defm VCMPULT : VIntCompareOp<82, "vlt.u32", setult, vNi1, vNi32>, Requires<[IsSC]>;
+defm VCMPULE : VIntCompareOp<83, "vle.u32", setule, vNi1, vNi32>, Requires<[IsSC]>;
+defm VFCMPEQ : VFPCompareOp<40, "veq.f32", setoeq, vNi1, vNf32>;
+defm VFCMPNE : VFPCompareOp<41, "vne.f32", setune, vNi1, vNf32>;
+defm VFCMPGT : VFPCompareOp<42, "vgt.f32", setogt, vNi1, vNf32>;
+defm VFCMPGE : VFPCompareOp<43, "vge.f32", setoge, vNi1, vNf32>;
+defm VFCMPLT : VFPCompareOp<44, "vlt.f32", setolt, vNi1, vNf32>;
+defm VFCMPLE : VFPCompareOp<45, "vle.f32", setole, vNi1, vNf32>;
+} // isCompare = 1
+defm VCARRYOUT : VIntCompareOp<38, "vc.u32", int_tpu_addcarry, vNi1, vNi32>;
+} // Predicates = [HasVPU]
+
+let Predicates = [NotBC, HasVPU] in {
+def VWEIRD : TPUInstP<(outs MPR:$Md), (ins VPR:$Vs),
+ "$Md =\tvweird.f32${pred} $Vs",
+ [(set (vNi1 MPR:$Md),
+ (int_tpu_weird (vNf32 VPR:$Vs)))]>,
+ Bundle<B_Vany>, IsVectorInstruction;
+} // let Predicates = [NotBC, HasVPU]
+} // SubUnits = [SU_vector_cmp]
+
+multiclass VFPComparePat<string OpName, SDPatternOperator OpNode> {
+ def : Pat<(vNi1 (OpNode (vNf32 VPR:$x), (Splat (f32 fpimm:$y)))),
+ (!cast<Instruction>(OpName#"ri") VPR:$x, tvimmf:$y)>;
+ def : Pat<(vNi1 (OpNode (vNf32 VPR:$x), (Splat (f32 GPR:$y)))),
+ (!cast<Instruction>(OpName#"rs") VPR:$x, GPR:$y)>;
+ def : Pat<(vNi1 (OpNode (vNf32 VPR:$x), (vNf32 VPR:$y))),
+ (!cast<Instruction>(OpName#"rr") VPR:$x, VPR:$y)>;
+}
+
+// Patterns for the cases where we don't care about (un)ordered.
+defm : VFPComparePat<"VFCMPEQ", seteq>;
+defm : VFPComparePat<"VFCMPNE", setne>;
+defm : VFPComparePat<"VFCMPGT", setgt>;
+defm : VFPComparePat<"VFCMPGE", setge>;
+defm : VFPComparePat<"VFCMPLT", setlt>;
+defm : VFPComparePat<"VFCMPLE", setle>;
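
The only difference between these generic setcc nodes and the ordered/unordered instructions they map to is NaN handling, which the generic nodes leave unspecified. A small illustration in plain C++, where == is an ordered compare and != an unordered one:

#include <cassert>
#include <cmath>

int main() {
  float nan = std::nanf("");
  assert(!(nan == 1.0f));  // ordered equal is false on NaN, like veq.f32/setoeq
  assert(nan != 1.0f);     // unordered not-equal is true on NaN, like vne.f32/setune
  return 0;
}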
+
+let Predicates = [IsVFTCOrSC], SubUnits = [SU_vector_cmp] in {
+defm VBYTE_NOTZERO : VIntUnaryCompareOp<"vnez.u8", int_tpu_byte_not_zero, vNi1, vNi32>;
+defm VLTTO : VFPCompareOp<37, "vlt.to.f32", int_tpu_vlt_to, vNi1, vNf32>;
+defm VLETO : VFPCompareOp<53, "vle.to.f32", int_tpu_vle_to, vNi1, vNf32>;
+defm VCLASS : VFPCompareOp<54, "vclass.f32", int_tpu_vclass, vNi1, vNf32>;
+
+let isReMaterializable = 1 in {
+def VCMASKi : TPUInstP<(outs MPR:$Md), (ins tvimmi:$y),
+ "$Md =\tvcmask${pred} $y",
+ []>,
+ Bundle<B_Vany>, IsVectorInstruction;
+} // isReMaterializable = 1
+def VCMASKs : TPUInstP<(outs MPR:$Md), (ins GPR:$y),
+ "$Md =\tvcmask${pred} $y",
+ []>,
+ Bundle<B_Vany>, IsVectorInstruction;
+} // let Predicates = [IsVFTCOrSC], SubUnits = [SU_vector_cmp]
+
+//===----------------------------------------------------------------------===//
+// Vector select and mask manipulation ops
+//===----------------------------------------------------------------------===//
+let SubUnits = [SU_vector_move] in {
+let Predicates = [HasVPU] in {
+// Note the order of these operands - the first operand is a VPR/GPR/imm32,
+// the second operand is always a VPR.
+defm VSELir : TPUInst<B_Vany, VIntALUVmselEncoding, (outs VPR:$Vd), (ins MPR:$m, tvimmi:$y, VPR:$x),
+ "$Vd =\tvsel${pred} $m, $y, $x",
+ [(set VPR:$Vd, (vselect (vNi1 MPR:$m), (vNi32 (Splat imm:$y)),
+ (vNi32 VPR:$x)))]>,
+ IsVectorInstruction;
+defm VSELsr : TPUInst<B_Vany, VIntALUVmselEncoding, (outs VPR:$Vd), (ins MPR:$m, GPR:$y, VPR:$x),
+ "$Vd =\tvsel${pred} $m, $y, $x",
+ [(set VPR:$Vd, (vselect (vNi1 MPR:$m), (vNi32 (Splat GPR:$y)),
+ (vNi32 VPR:$x)))]>,
+ IsVectorInstruction;
+defm VSELrr : TPUInst<B_Vany, VIntALUVmselEncoding, (outs VPR:$Vd), (ins MPR:$m, VPR:$y, VPR:$x),
+ "$Vd =\tvsel${pred} $m, $y, $x",
+ [(set VPR:$Vd, (vselect (vNi1 MPR:$m), (vNi32 VPR:$y),
+ (vNi32 VPR:$x)))]>,
+ IsVectorInstruction;
+} // let Predicates = [HasVPU]
+} // SubUnits = [SU_vector_move]
+
+let SubUnits = [SU_vector_move] in {
+let Predicates = [IsVFTCOrSC] in {
+defm VNSELir : TPUInst<B_Vany, VIntALUVmselEncoding, (outs VPR:$Vd), (ins MPR:$m, tvimmi:$y, VPR:$x),
+ "$Vd =\tvnsel${pred} $m, $y, $x",
+ [(set VPR:$Vd, (vselect MPR:$m, (vNi32 VPR:$x),
+ (vNi32 (Splat imm:$y))))]>,
+ IsVectorInstruction;
+defm VNSELsr : TPUInst<B_Vany, VIntALUVmselEncoding, (outs VPR:$Vd), (ins MPR:$m, GPR:$y, VPR:$x),
+ "$Vd =\tvnsel${pred} $m, $y, $x",
+ [(set VPR:$Vd, (vselect MPR:$m, (vNi32 VPR:$x),
+ (vNi32 (Splat GPR:$y))))]>,
+ IsVectorInstruction;
+} // let Predicates = [IsVFTCOrSC]
+} // SubUnits = [SU_vector_move]
+
+// Always make vsel use integer immediate since it is type agnostic and we don't
+// want to print an integer as float.
+def : Pat<(vNf32 (vselect MPR:$m, (vNf32 (Splat fpimm:$y)), VPR:$x)),
+ (VSELir MPR:$m, (ftoi $y), VPR:$x)>;
+// Because rr and sr are type agnostic, define the vNf32 variants as patterns.
+def : Pat<(vNf32 (vselect MPR:$m, VPR:$y, VPR:$x)),
+ (VSELrr MPR:$m, VPR:$y, VPR:$x)>;
+def : Pat<(vNf32 (vselect MPR:$m, (Splat GPR:$y), VPR:$x)),
+ (VSELsr MPR:$m, GPR:$y, VPR:$x)>;
+let Predicates = [IsVFTCOrSC] in {
+// Same patterns as above, for vnsel.
+def : Pat<(vNf32 (vselect MPR:$m, VPR:$x, (vNf32 (Splat fpimm:$y)))),
+ (VNSELir MPR:$m, (ftoi $y), VPR:$x)>;
+def : Pat<(vNf32 (vselect MPR:$m, VPR:$x, (Splat GPR:$y))),
+ (VNSELsr MPR:$m, GPR:$y, VPR:$x)>;
+} // Predicates = [IsVFTCOrSC]
+
+let Predicates = [NotBC, NotSC] in {
+// Vector select with scalar predicate.
+let isPseudo = 1, isSelect = 1, SubUnits = [SU_vector_move] in {
+ // Pseudo-select instruction. Note that this is lowered to either a predicated
+ // VMOVr or VMOVi.
+ let Constraints = "$d = $a" in {
+ def PSEUDO_VSELrr : TPUInst<(outs VPR:$d), (ins PPR:$p, VPR:$a, VPR:$b),
+ "$d =\t#VSEL $p, $a, $b",
+ [(set VPR:$d, (select PPR:$p,
+ (vNi32 VPR:$a), (vNi32 VPR:$b)))]>,
+ Bundle<B_Vany>, IsVectorInstruction;
+ def PSEUDO_VSELri : TPUInst<(outs VPR:$d), (ins PPR:$p, VPR:$a, i32imm:$b),
+ "$d =\t#VSEL $p, $a, $b",
+ [(set VPR:$d, (select PPR:$p, (vNi32 VPR:$a),
+ (vNi32 (Splat imm:$b))))]>,
+ Bundle<B_Vany>, IsVectorInstruction;
+ def PSEUDO_VSELrif : TPUInst<(outs VPR:$d), (ins PPR:$p, VPR:$a, tvimmf:$b),
+ "$d =\t#VSEL $p, $a, $b",
+ [(set VPR:$d, (select PPR:$p, (vNf32 VPR:$a),
+ (Splat (f32 fpimm:$b))))]>,
+ Bundle<B_Vany>, IsVectorInstruction;
+ }
+ let Constraints = "$d = $b" in {
+ def PSEUDO_VSELir : TPUInst<(outs VPR:$d), (ins PPR:$p, i32imm:$a, VPR:$b),
+ "$d =\t#VSEL $p, $a, $b",
+ [(set VPR:$d, (select PPR:$p,
+ (vNi32 (Splat imm:$a)), (vNi32 VPR:$b)))]>,
+ Bundle<B_Vany>, IsVectorInstruction;
+ def PSEUDO_VSELirf : TPUInst<(outs VPR:$d), (ins PPR:$p, tvimmf:$a, VPR:$b),
+ "$d =\t#VSEL $p, $a, $b",
+ [(set VPR:$d, (select PPR:$p,
+ (vNf32 (Splat (f32 fpimm:$a))), (vNf32 VPR:$b)))]>,
+ Bundle<B_Vany>, IsVectorInstruction;
+ }
+} // isPseudo = 1, isSelect = 1, SubUnits = [SU_vector_move]
+
+def : Pat<(vNf32 (select PPR:$p, VPR:$y, VPR:$x)),
+ (PSEUDO_VSELrr PPR:$p, VPR:$y, VPR:$x)>;
+} // Predicates = [NotBC, NotSC]
+
+let Predicates = [IsSC] in {
+let SubUnits = [SU_vector_move] in {
+let isSelect = 1 in {
+def VPSELrr : TPUInst<(outs VPR:$d), (ins PPR:$p, VPR:$y, VPR:$x),
+ "$d =\tvpsel ${p}, $y, $x",
+ [(set VPR:$d, (select PPR:$p,
+ (vNi32 VPR:$y), (vNi32 VPR:$x)))]>,
+ Bundle<B_Vany>, IsVectorInstruction;
+// NOTE: The following instruction inverts its predicate.
+def VPSELri : TPUInst<(outs VPR:$d), (ins PPR:$p, VPR:$x, tvimmi:$y),
+ "$d =\tvpsel !${p}, $y, $x",
+ [(set VPR:$d, (select PPR:$p, (vNi32 VPR:$x),
+ (vNi32 (Splat imm:$y))))]>,
+ Bundle<B_Vany>, IsVectorInstruction;
+// NOTE: The following instruction inverts its predicate.
+def VPSELrs : TPUInst<(outs VPR:$d), (ins PPR:$p, VPR:$x, GPR:$y),
+ "$d =\tvpsel !${p}, $y, $x",
+ [(set VPR:$d, (select PPR:$p, (vNi32 VPR:$x),
+ (vNi32 (Splat GPR:$y))))]>,
+ Bundle<B_Vany>, IsVectorInstruction;
+def VPSELir : TPUInst<(outs VPR:$d), (ins PPR:$p, tvimmi:$y, VPR:$x),
+ "$d =\tvpsel ${p}, $y, $x",
+ [(set VPR:$d, (select PPR:$p,
+ (vNi32 (Splat imm:$y)), (vNi32 VPR:$x)))]>,
+ Bundle<B_Vany>, IsVectorInstruction;
+def VPSELsr : TPUInst<(outs VPR:$d), (ins PPR:$p, GPR:$y, VPR:$x),
+ "$d =\tvpsel ${p}, $y, $x",
+ [(set VPR:$d, (select PPR:$p,
+ (vNi32 (Splat GPR:$y)), (vNi32 VPR:$x)))]>,
+ Bundle<B_Vany>, IsVectorInstruction;
+} // isSelect = 1
+} // SubUnits = [SU_vector_move]
+
+def : Pat<(vNf32 (select PPR:$p, VPR:$y, VPR:$x)),
+ (VPSELrr PPR:$p, VPR:$y, VPR:$x)>;
+def : Pat<(vNf32 (select PPR:$p, (vNf32 VPR:$x), (Splat (f32 fpimm:$y)))),
+ (VPSELri PPR:$p, VPR:$x, (ftoi $y))>;
+def : Pat<(vNf32 (select PPR:$p, (Splat (f32 fpimm:$y)), (vNf32 VPR:$x))),
+ (VPSELir PPR:$p, (ftoi $y), VPR:$x)>;
+
+} // Predicates = [IsSC]
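
The inverted-predicate vpsel forms noted above rely on the identity select(p, a, b) == select(!p, b, a): flipping the predicate lets the immediate or scalar operand stay in the $y position of the asm string while the vector operand stays in $x. A hedged scalar sketch:

#include <cassert>

int SelectPXI(bool p, int x, int imm) { return p ? x : imm; }       // select p, $x, imm
int VpselInverted(bool p, int x, int imm) { return !p ? imm : x; }  // vpsel !p, imm, $x

int main() {
  for (bool p : {false, true})
    assert(SelectPXI(p, 7, 42) == VpselInverted(p, 7, 42));
  return 0;
}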
+
+let Predicates = [NotBC] in {
+let supportsEmbeddedMask = 1 in {
+def VMMOV : TPUInstP<(outs MPR:$Md), (ins MPR:$Ms), "$Md =\tvmmov${pred} $Ms",
+ []>,
+ Bundle<B_VM_OP>, IsVectorInstruction, SubUnits<[SU_vmask]>;
+let isPseudo = 1, hasSideEffects = 1 in {
+// VMMOV with side effects, used to escape CSE.
+def VMMOV_SE : TPUInstP<(outs MPR:$Md), (ins MPR:$Ms),
+ "$Md =\tVMMOV_SE${pred} $Ms", []>,
+ Bundle<B_VM_OP>, IsVectorInstruction, SubUnits<[SU_vmask]>;
+} // isPseudo = 1, hasSideEffects = 1
+} // supportsEmbeddedMask = 1
+
+class VMaskOp<string Name, SDPatternOperator OpNode, ValueTypeByHwMode VMskType> :
+ TPUInstP<(outs MPR:$Md), (ins MPR:$Ms, MPR:$Mt),
+ "$Md =\t" # Name # "${pred} $Ms, $Mt",
+ [(set (VMskType MPR:$Md), (OpNode (VMskType MPR:$Ms), (VMskType MPR:$Mt)))]>,
+ Bundle<B_VM_OP>, IsVectorInstruction;
+
+class VUnaryMaskOp<string Name, SDPatternOperator OpNode, ValueTypeByHwMode VMskType> :
+ TPUInstP<(outs MPR:$Md), (ins MPR:$Ms), "$Md =\t" # Name # "${pred} $Ms",
+ [(set (VMskType MPR:$Md), (OpNode (VMskType MPR:$Ms)))]>,
+ Bundle<B_VM_OP>, IsVectorInstruction;
+
+let SubUnits = [SU_vmask] in {
+def VMNEG : VUnaryMaskOp<"vmneg", vnot, vNi1>;
+def VMAND : VMaskOp<"vmand", and, vNi1>;
+def VMOR : VMaskOp<"vmor", or, vNi1>;
+def VMXOR : VMaskOp<"vmxor", xor, vNi1>;
+
+let Predicates = [NotBC] in {
+let isPseudo = 1 in {
+let isReMaterializable = 1 in {
+def VMZERO : TPUInstP<(outs MPR:$Md), (ins),
+ "$Md =\t#VMZERO${pred}",
+ [(set (vNi1 MPR:$Md), (vNi1 (Splat 0)))]>,
+ Bundle<B_VM_OP>, IsVectorInstruction;
+} // isReMaterializable = 1
+let Constraints = "$Md = $Ms" in {
+def CVMNEG : TPUInst<(outs MPR:$Md), (ins MPR:$Ms, PPR:$p),
+ "$Md = \t#CVMNEG $Ms, $p", []>,
+ Bundle<B_VM_OP>, IsVectorInstruction;
+} // Constraints = "$Md = $Ms"
+} // isPseudo = 1
+} // Predicates = [NotBC]
+} // SubUnits = [SU_vmask]
+
+let AddedComplexity = 1 in {
+def : Pat<(xor (vNi1 MPR:$m), (Splat -1)), (VMNEG (vNi1 MPR:$m))>;
+let Predicates = [IsSC] in {
+def : Pat<(xor (vNi1 MPR:$m), (vNi1 M16)), (VMNEG (vNi1 MPR:$m))>;
+} // Predicates = [IsSC]
+} // AddedComplexity = 1
+def : Pat<(vNi1 (Splat -1)), (VMNEG (VMZERO))>;
+def : Pat<(vNi1 (Splat PPR:$p)), (CVMNEG (VMZERO), PPR:$p)>;
+
+let isPseudo = 1, usesCustomInserter = 1 in {
+def VMREAD : TPUInstP<(outs VPR:$d), (ins MPR:$m),
+ "$d =\t#VMREAD${pred} $m",
+ [(set VPR:$d, (vNi32 (zext MPR:$m)))]>;
+} // isPseudo = 1, usesCustomInserter = 1
+
+let isPseudo = 1, isAsCheapAsAMove = 1 in {
+let isReMaterializable = 1 in {
+def VMLANEi : TPUInstP<(outs MPR:$m), (ins VPR:$vseq, i32imm:$lane),
+ "$m =\t#VMLANEi${pred} $vseq, $lane",
+ []>, IsVectorInstruction;
+} // isReMaterializable = 1
+def VMLANEr : TPUInstP<(outs MPR:$m), (ins VPR:$vseq, GPR:$lane),
+ "$m =\t#VMLANEr${pred} $vseq, $lane",
+ []>, IsVectorInstruction;
+} // let isPseudo = 1, isAsCheapAsAMove = 1
+
+let mayLoad = 1, mayStore = 1 in {
+let isPush = 1 in {
+ def VPUSH : TPUInstP<(outs V2SFPR:$v2s), (ins VPR:$v),
+ "$v2s =\tvpush${pred} $v", []>,
+ Bundle<B_VPUSH>, Sched<[WriteV2SF]>, SubUnits<[SU_v2s_push]>;
+} // isPush = 1
+
+let isPop = 1 in {
+ def SPOP_V2SF : TPUInstP<(outs GPR:$sdst), (ins V2SFPR:$v2s),
+ "$sdst =\tspop${pred} $v2s", []>,
+ Bundle<B_Sany>, Sched<[WriteV2SFPop]>, SubUnits<[SU_pop]>;
+} // isPop = 1
+} // mayLoad = 1, mayStore = 1
+} // Predicates = [NotBC]
+
+let Predicates = [IsSC] in {
+ let isComposedV2SFifo = 1 in {
+ def VPUSH_CF : TPUInstP<(outs GPR:$sdst), (ins VPR:$v),
+ "$sdst =\tVPUSH_CF${pred} $v", []>,
+ Bundle<B_VPUSH>, Sched<[WriteV2SF]>, SubUnits<[SU_v2s_push]>;
+ } // isComposedV2SFifo = 1
+} // Predicates = [IsSC]
+
+//===----------------------------------------------------------------------===//
+// Vector manipulation ops
+//===----------------------------------------------------------------------===//
+multiclass UnaryVIOp<string Name, SDPatternOperator Intr, TPUInstVEncoding enc = VIntALUEupOpEncoding<0>> {
+ defm "" : UnaryOp<Name, Intr, vNi32, vNi32, enc>;
+}
+
+let Predicates = [NotBC, HasVPU] in {
+defm VPOPCNTr : UnaryVIOp<"vpcnt", ctpop>, SubUnits<[SU_vector_op]>;
+defm VCLZr : UnaryVIOp<"vclz", ctlz>, SubUnits<[SU_vector_op]>;
+defm VEXPONENTr : UnaryOp<"vf32.e.s32", int_tpu_exponent, vNi32, vNf32>,
+ SubUnits<[SU_vector_op]>;
+defm VCOMPOSE : VFPALUOpYX_NC<27, "vf32.f32", int_tpu_compose, vNf32, vNf32>, Sched<[WriteFloatCompose]>,
+ SubUnits<[SU_vector_compose]>;
+defm VSIGNIFICANDr : UnaryOp<"vf32.s.s32", int_tpu_significand, vNi32, vNf32>,
+ SubUnits<[SU_vector_op]>;
+
+// Pseudo instruction to read the zeroth vector element. Will be expanded to
+// vpush; spop. Sparsecore has its own version scVREAD.
+let Predicates = [NotSC] in {
+let isCodeGenOnly = 1, usesCustomInserter = 1 in {
+def VREAD : TPUInstP<(outs GPR:$d), (ins VPR:$v),
+ "$d =\tvread${pred} $v",
+ [(set GPR:$d, (f32 (extractelt (vNf32 VPR:$v), (i32 0))))]>;
+}
+} // Predicates = [NotSC]
+
+// Remove unnecessary bitcasts.
+def : Pat<(vNi32 (bitconvert (vNf32 VPR:$value))), (vNi32 VPR:$value)>;
+def : Pat<(vNf32 (bitconvert (vNi32 VPR:$value))), (vNf32 VPR:$value)>;
+def : Pat<(i32 (bitconvert (f32 GPR:$value))), (i32 GPR:$value)>;
+def : Pat<(f32 (bitconvert (i32 GPR:$value))), (f32 GPR:$value)>;
+
+def : Pat<(vNi32 (zext (vNi1 MPR:$x))), (VSELir MPR:$x, (i32 1), (VIMMI 0))>;
+def : Pat<(vNi32 (anyext (vNi1 MPR:$x))), (VSELir MPR:$x, (i32 1), (VIMMI 0))>;
+def : Pat<(vNi32 (sext (vNi1 MPR:$x))), (VSELir MPR:$x, (i32 -1), (VIMMI 0))>;
+def : Pat<(vNi1 (trunc (vNi32 VPR:$x))),
+ (VCMPEQri (vNi32 (VANDri $x, (i32 1))), (i32 1))>;
+
+def : Pat<(vNf32 (uint_to_fp (vNi1 MPR:$x))),
+ (VSELir MPR:$x, (i32 0x3f800000), (VIMMI 0))>;
+def : Pat<(vNf32 (sint_to_fp (vNi1 MPR:$x))),
+ (VSELir MPR:$x, (i32 0xbf800000), (VIMMI 0))>;
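
These i1-mask conversions all lower to a select between two constants; the scalar meaning, with the bit patterns spelled out (0x3f800000 is 1.0f and 0xbf800000 is -1.0f, since a "true" i1 sign-extends to -1), is roughly:

#include <cstdint>

int32_t ZextI1(bool m)       { return m ? 1 : 0; }        // VSELir m, 1, (VIMMI 0)
int32_t SextI1(bool m)       { return m ? -1 : 0; }       // VSELir m, -1, (VIMMI 0)
bool    TruncToI1(int32_t x) { return (x & 1) == 1; }     // VANDri + VCMPEQri
float   UintToFpI1(bool m)   { return m ? 1.0f : 0.0f; }  // selects 0x3f800000
float   SintToFpI1(bool m)   { return m ? -1.0f : 0.0f; } // selects 0xbf800000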
+
+def : Pat<(int_tpu_make_restrict_ptr (i32 GPR:$value)),
+ (i32 GPR:$value)>;
+} // Predicates = [NotBC, HasVPU]
+
+let Predicates = [HasVPU] in {
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+// Iota for vectors - produces {0, 1, 2, 3, 4, 5, 6, 7, ...}
+defm VLANESEQ : TPUInstVany<B_Vany, VIntALUEupOpEncoding<24>, (outs VPR:$Vd), (ins),
+ "$Vd =\tvlaneseq.u32${pred}",
+ [(set (vNi32 VPR:$Vd), (int_tpu_vlaneseq))],
+ YOpIdxNone>,
+ IsVectorInstruction, SubUnits<[SU_vector_op]>;
+} // let isReMaterializable = 1, isAsCheapAsAMove = 1
+
+// There is no VROTDOWNri instruction in the ISA yet. This is a pseudo that will
+// be expanded to N VROTDOWNr's early.
+let isCodeGenOnly = 1, usesCustomInserter = 1 in {
+def VROTDOWNri : TPUInstP<(outs VPR:$Vd), (ins VPR:$x, i32imm:$n),
+ "$Vd =\t#VROTDOWNri${pred} $x, $n",
+ []>,
+ IsVectorInstruction, SubUnits<[SU_vector_rotate]>;
+}
+def : Pat<(Vrotdown (vNf32 VPR:$x), imm:$n), (VROTDOWNri VPR:$x, imm:$n)>;
+def : Pat<(Vrotdown (vNi32 VPR:$x), imm:$n), (VROTDOWNri VPR:$x, imm:$n)>;
+
+defm VROTDOWNr : UnaryVIOp<"vrot.slane.down", int_tpu_vrot_sublane_down,
+ VIntALUEupOpEncoding<29>>,
+ Sched<[WriteRotateSLane]>, SubUnits<[SU_vector_rotate]>;
+
+def : Pat<(vNf32 (int_tpu_vrot_sublane_down (vNf32 VPR:$x))),
+ (VROTDOWNr (vNf32 VPR:$x))>;
+} // Predicates = [HasVPU]
+
+//===----------------------------------------------------------------------===//
+// Vector extended unary (EUP)
+//===----------------------------------------------------------------------===//
+let usesCustomInserter = 1 in {
+let isPush = 1 in {
+let SubUnits = [SU_extended_unary] in {
+defm VRSQRT : TPUInstVany<B_EUP_OP, VIntALUEupOpEncoding<48>, (outs ERFPR:$eup), (ins VPR:$x),
+ "$eup = vrsqrt.f32${pred} $x",
+ [(set ERFPR:$eup, (int_tpu_rsqrt (vNf32 VPR:$x)))],
+ YOpIdxNone>,
+ Sched<[WriteEup]>, IsVectorInstruction;
+defm VPOW2 : TPUInstVany<B_EUP_OP, VIntALUEupOpEncoding<49>, (outs ERFPR:$eup), (ins VPR:$x),
+ "$eup = vpow2.f32${pred} $x",
+ [(set ERFPR:$eup, (int_tpu_pow2 (vNf32 VPR:$x)))],
+ YOpIdxNone>,
+ Sched<[WriteEup]>, IsVectorInstruction;
+defm VLOG2 : TPUInstVany<B_EUP_OP, VIntALUEupOpEncoding<50>, (outs ERFPR:$eup), (ins VPR:$x),
+ "$eup = vlog2.f32${pred} $x",
+ [(set ERFPR:$eup, (int_tpu_log2 (vNf32 VPR:$x)))],
+ YOpIdxNone>,
+ Sched<[WriteEup]>, IsVectorInstruction;
+defm VTANH : TPUInstVany<B_EUP_OP, VIntALUEupOpEncoding<51>, (outs ERFPR:$eup), (ins VPR:$x),
+ "$eup = vtanh.f32${pred} $x",
+ [(set ERFPR:$eup, (int_tpu_tanh (vNf32 VPR:$x)))],
+ YOpIdxNone>,
+ Sched<[WriteEup]>, IsVectorInstruction;
+defm VRCP : TPUInstVany<B_EUP_OP, VIntALUEupOpEncoding<52>, (outs ERFPR:$eup), (ins VPR:$x),
+ "$eup = vrcp.f32${pred} $x",
+ [(set ERFPR:$eup, (int_tpu_rcp (vNf32 VPR:$x)))],
+ YOpIdxNone>,
+ Sched<[WriteEup]>, IsVectorInstruction;
+defm VSIGSHFT : TPUInstVany<B_EUP_OP, VIntALUEupOpEncoding<51>, (outs ERFPR:$eup), (ins VPR:$x),
+ "$eup = vsigshft.f32${pred} $x",
+ [(set ERFPR:$eup, (int_tpu_sigshft (vNf32 VPR:$x)))],
+ YOpIdxNone>,
+ Sched<[WriteEup]>, IsVectorInstruction, Requires<[IsVFTCOrSC]>;
+} // let SubUnits = [SU_extended_unary]
+
+defm VPUSH_EUP : TPUInstVany<B_Vany, VIntALUEupOpEncoding<53>, (outs ERFPR:$eup), (ins VPR:$x),
+ "$eup = vpush${pred} $x",
+ [(set ERFPR:$eup, (int_tpu_eup_push (vNf32 VPR:$x)))],
+ YOpIdxNone>,
+ Sched<[WriteEup]>, IsVectorInstruction,
+ SubUnits<[SU_extended_unary]>;
+} // isPush = 1
+
+let isPop = 1 in {
+
+defm VRES_EUP :
+ TPUInstVResAny<
+ VRES_EUPEncoding,
+ (outs VPR:$Vd), (ins ERFPR:$eup), "$Vd =\tvpop${pred} $eup",
+ [(set (vNf32 VPR:$Vd), (int_tpu_eup_pop (i32 ERFPR:$eup)))]>,
+ Sched<[WriteEupPop]>, IsVectorInstruction, SubUnits<[SU_eup_result]>;
+
+} // isPop = 1
+} // usesCustomInserter = 1
+
+let isComposedErfFifo = 1, SubUnits = [SU_extended_unary] in {
+defm VRSQRT_CF : TPUInstVany<B_EUP_OP, VIntALUEupOpEncoding<48>, (outs VPR:$v), (ins VPR:$x),
+ "!invalid",
+ [],
+ YOpIdxNone>,
+ Sched<[WriteEup]>, IsVectorInstruction;
+defm VPOW2_CF : TPUInstVany<B_EUP_OP, VIntALUEupOpEncoding<49>, (outs VPR:$v), (ins VPR:$x),
+ "!invalid",
+ [],
+ YOpIdxNone>,
+ Sched<[WriteEup]>, IsVectorInstruction;
+defm VLOG2_CF : TPUInstVany<B_EUP_OP, VIntALUEupOpEncoding<50>, (outs VPR:$v), (ins VPR:$x),
+ "!invalid",
+ [],
+ YOpIdxNone>,
+ Sched<[WriteEup]>, IsVectorInstruction;
+defm VTANH_CF : TPUInstVany<B_EUP_OP, VIntALUEupOpEncoding<51>, (outs VPR:$v), (ins VPR:$x),
+ "!invalid",
+ [],
+ YOpIdxNone>,
+ Sched<[WriteEup]>, IsVectorInstruction;
+defm VRCP_CF : TPUInstVany<B_EUP_OP, VIntALUEupOpEncoding<52>, (outs VPR:$v), (ins VPR:$x),
+ "!invalid",
+ [],
+ YOpIdxNone>,
+ Sched<[WriteEup]>, IsVectorInstruction;
+defm VSIGSHFT_CF : TPUInstVany<B_EUP_OP, VIntALUEupOpEncoding<51>, (outs VPR:$v), (ins VPR:$x),
+ "!invalid",
+ [],
+ YOpIdxNone>,
+ Sched<[WriteEup]>, IsVectorInstruction;
+defm VPUSH_EUP_CF : TPUInstVany<B_EUP_OP, VIntALUEupOpEncoding<53>, (outs VPR:$v), (ins VPR:$x),
+ "!invalid",
+ [],
+ YOpIdxNone>,
+ Sched<[WriteEup]>, IsVectorInstruction;
+} // isComposedErfFifo = 1, SubUnits = [SU_extended_unary]
+
+def : Pat<(fdiv (vNf32 (Splat (f32 fpimm:$f))), (vNf32 VPR:$v)), (VFMULri (VRES_EUP (VRCP VPR:$v)), fpimm:$f)>;
+def : Pat<(fdiv (vNf32 (Splat (f32 GPR:$f))), (vNf32 VPR:$v)), (VFMULrs (VRES_EUP (VRCP VPR:$v)), GPR:$f)>;
+def : Pat<(fdiv (vNf32 VPR:$vx), (vNf32 VPR:$vy)), (VFMULrr (VRES_EUP (VRCP VPR:$vy)), VPR:$vx)>;
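
Note that these fdiv patterns never emit a true divide: the divisor is pushed through the EUP reciprocal pipe and the quotient is rebuilt with a multiply. A scalar sketch of the rewrite (the hardware vrcp is an approximation, so this is not bit-exact division; the helper name below is made up for illustration):

// Stand-in for the value popped from the EUP after vrcp; the real result is an
// approximate reciprocal, not an exact one.
float ApproxRcp(float y) { return 1.0f / y; }

float FdivViaRcp(float x, float y) { return x * ApproxRcp(y); }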
+
+//===----------------------------------------------------------------------===//
+// SFlag instructions
+//===----------------------------------------------------------------------===//
+// Workaround so that patterns can be defined alongside instructions within a
+// multiclass. Some instruction properties are only set on the final
+// definition, so the patterns inherit from this structure in order to compile.
+// It declares the instruction attributes that are set at the top-level
+// definitions; add them to a pattern so that it compiles.
+class DummyInfo {
+ bit hasSideEffects;
+ bit mayStore;
+ bit mayLoad;
+}
+
+multiclass BaseSflagIntrinsicInst_<string instr, Intrinsic intrinsic, int num_trailing_zeros = 0> {
+ defvar trailing_zeros = !dag(intrinsic,
+ !listsplat(0, num_trailing_zeros),
+ !listsplat("", num_trailing_zeros));
+ def ii : TPUInstP<(outs), (ins tsyncimmsi:$targ, tsyncimmsi:$val),
+ instr,
+ [!con((intrinsic (Wrapper tglobaladdr:$targ), (i32 tsyncimmsi:$val)), trailing_zeros)]>;
+ def : Pat<!con((intrinsic (i32 tsyncimmsi:$targ), (i32 tsyncimmsi:$val)), trailing_zeros),
+ (!cast<Instruction>(NAME#"ii") (i32 imm:$targ), (i32 imm:$val))>,
+ DummyInfo;
+ def ri : TPUInstP<(outs), (ins GPR:$targ, tsyncimmsi:$val),
+ instr,
+ [!con((intrinsic (i32 GPR:$targ), (i32 tsyncimmsi:$val)), trailing_zeros)]>;
+
+ def ir : TPUInstP<(outs), (ins tsyncimmsi:$targ, GPR:$val),
+ instr,
+ [!con((intrinsic (Wrapper tglobaladdr:$targ), (i32 GPR:$val)), trailing_zeros)]>;
+ def : Pat<!con((intrinsic (i32 tsyncimmsi:$targ), (i32 GPR:$val)), trailing_zeros),
+ (!cast<Instruction>(NAME#"ir") (i32 imm:$targ), (i32 GPR:$val))>,
+ DummyInfo;
+ def rr : TPUInstP<(outs), (ins GPR:$targ, GPR:$val),
+ instr,
+ [!con((intrinsic (i32 GPR:$targ), (i32 GPR:$val)), trailing_zeros)]>;
+}
+
+// SflagIntrinsicInst_ represents any other VSYNCSET/VSYNCADD that
+// is mapped to an intrinsic.
+multiclass SflagIntrinsicInst_<string mnemonic, Intrinsic intrinsic> :
+ BaseSflagIntrinsicInst_<"[sflag:${targ}] =\t" # mnemonic # "${pred} $val",
+ intrinsic>;
+
+// SflagIntrinsicInstRemote_ represents any other VSYNCSET/VSYNCADD that
+// is mapped to an Atomic Remote Set/Add intrinsic.
+multiclass SflagIntrinsicInstRemote_<string mnemonic, Intrinsic intrinsic> :
+ BaseSflagIntrinsicInst_<"[sflag:${targ}] =\t" # mnemonic # "${pred} $val",
+ intrinsic, 2>;
+
+// SflagIntrinsicInstRemoteDone_ represents any other VSYNCSET/VSYNCADD that
+// is mapped to an Atomic Remote Set/Add Done intrinsic.
+multiclass SflagIntrinsicInstRemoteDone_<string mnemonic, Intrinsic intrinsic> :
+ BaseSflagIntrinsicInst_<"[sflag:${targ}] =\t" # mnemonic # "${pred} $val",
+ intrinsic, 3>;
+
+multiclass WaitInst_<string mnemonic, Intrinsic intrinsic> :
+ BaseSflagIntrinsicInst_<"_ =\t" # mnemonic # "${pred} [sflag:${targ}], $val",
+ intrinsic>;
+
+multiclass WaitDoneInst_<string mnemonic, Intrinsic intrinsic> {
+ def i : TPUInstP<(outs), (ins tsyncimmsi:$imm),
+ "_ =\t" # mnemonic # "${pred} [sflag:$imm]",
+ [(intrinsic (Wrapper tglobaladdr:$imm))]>;
+ def : Pat<(intrinsic (i32 tsyncimmsi:$imm)),
+ (!cast<Instruction>(NAME#"i") (i32 imm:$imm))>, DummyInfo;
+ def r : TPUInstP<(outs), (ins GPR:$targ),
+ "_ =\t" # mnemonic # "${pred} [sflag:$targ]",
+ [(intrinsic GPR:$targ)]>;
+}
+
+// SflagStoreInst_ represents a pure VSYNCSET that maps to a store_sflag.
+multiclass SflagStoreInst_<string mnemonic> {
+ def ii : TPUInstP<(outs), (ins tsyncimmsi:$targ, tsyncimmsi:$imm),
+ "[sflag:${targ}] =\t" # mnemonic # "${pred} $imm",
+ [(store_sflag (i32 tsyncimmsi:$imm), (Wrapper tglobaladdr:$targ))]>;
+ def : Pat<(store_sflag (i32 tsyncimmsi:$imm), (i32 tsyncimmsi:$targ)),
+ (!cast<Instruction>(NAME#"ii") (i32 imm:$targ), (i32 imm:$imm))>,
+ DummyInfo;
+ def ri : TPUInstP<(outs), (ins GPR:$targ, tsyncimmsi:$imm),
+ "[sflag:${targ}] =\t" # mnemonic # "${pred} $imm",
+ [(store_sflag (i32 tsyncimmsi:$imm), (i32 GPR:$targ))]>;
+ def ir : TPUInstP<(outs), (ins tsyncimmsi:$targ, GPR:$val),
+ "[sflag:${targ}] =\t" # mnemonic # "${pred} $val",
+ [(store_sflag (i32 GPR:$val), (Wrapper tglobaladdr:$targ))]>;
+ def : Pat<(store_sflag (i32 GPR:$val), (i32 tsyncimmsi:$targ)),
+ (!cast<Instruction>(NAME#"ir") (i32 tsyncimmsi:$targ), (i32 GPR:$val))>,
+ DummyInfo;
+ def rr : TPUInstP<(outs), (ins GPR:$targ, GPR:$val),
+ "[sflag:${targ}] =\t" # mnemonic # "${pred} $val",
+ [(store_sflag (i32 GPR:$val), (i32 GPR:$targ))]>;
+}
+
+// TODO(b/174059363): Wrong mnemonic selected for "Read Sync Done" instruction.
+// As per the ISA documentation, it should be vsyncdonemov.
+multiclass MoveSyncFlag_<dag oops, string mnemonicDest> {
+let mayLoad = 1, mayStore = 1, isPush = 1 in {
+ def i : TPUInstP<oops, (ins tsyncimmsi:$imm),
+ mnemonicDest#" =\tvsyncmov${pred} [sflag:$imm]", []>,
+ Bundle<B_SM>, Sched<[WriteSFlagV2SF]>, IsVectorInstruction;
+ def r : TPUInstP<oops, (ins GPR:$r),
+ mnemonicDest#" =\tvsyncmov${pred} [sflag:$r]", []>,
+ Bundle<B_SM>, Sched<[WriteSFlagV2SF]>, IsVectorInstruction;
+ def DONEi : TPUInstP<oops, (ins tsyncimmsi:$imm),
+ mnemonicDest#" =\tvsyncmov.done${pred} [sflag:$imm]", []>,
+ Bundle<B_SM>, Sched<[WriteSFlagV2SF]>, IsVectorInstruction;
+ def DONEr : TPUInstP<oops, (ins GPR:$r),
+ mnemonicDest#" =\tvsyncmov.done${pred} [sflag:$r]", []>,
+ Bundle<B_SM>, Sched<[WriteSFlagV2SF]>, IsVectorInstruction;
+} // mayLoad = 1, mayStore = 1, isPush = 1
+}
+
+//===----------------------------------------------------------------------===//
+// Call related instructions.
+//===----------------------------------------------------------------------===//
+def SDT_TPUCall : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>;
+def SDT_TPUCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
+def SDT_TPUCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_TPUCallSeqStart,
+ [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_TPUCallSeqEnd,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPSideEffect]>;
+
+def call : SDNode<"TPUISD::CALL", SDT_TPUCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+def call_fast : SDNode<"TPUISD::CALL_FAST", SDT_TPUCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+
+def ret : SDNode<"TPUISD::RET", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+let isCall = 1 in {
+ let Defs = [LR] in {
+ def CALLrel : TPUInstP<(outs), (ins RelTargetOperand<OtherVT>:$target),
+ "lr =\tscall.rel${pred} $target",
+ []>, Bundle<B_S0>, BundleImm<IMM_0>;
+ def CALLabs : TPUInstP<(outs), (ins Operand<OtherVT>:$target),
+ "lr =\tscall.abs${pred} $target",
+ []>, Bundle<B_S0>;
+
+ def CALL : TPUInstP<(outs), (ins tsimmi:$dst),
+ "lr =\tcall${pred} $dst", []>, SubUnits<[SU_control]>,
+ Bundle<B_S0>, BundleImm<IMM_0>;
+ } // Defs = [LR]
+ def CALL_FAST : TPUInstP<(outs), (ins tsimmi:$dst),
+ "_ =\tcall${pred} $dst", []>, SubUnits<[SU_control]>,
+ Bundle<B_S0>, BundleImm<IMM_0>;
+} // isCall = 1
+
+def : Pat<(call (Wrapper tglobaladdr:$dst)), (CALL tglobaladdr:$dst)>;
+def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>;
+def : Pat<(call_fast (Wrapper tglobaladdr:$dst)), (CALL_FAST tglobaladdr:$dst)>;
+def : Pat<(call_fast texternalsym:$dst), (CALL_FAST texternalsym:$dst)>;
+
+let isReturn = 1, isTerminator = 1, isBarrier = 1, isPseudo = 1 in {
+let Uses = [LR] in {
+def BRret : TPUInstP<(outs), (ins),
+ "(pc) =\tBRret${pred}",
+ [(ret)]>, Bundle<B_S0>;
+} // Uses = [LR]
+} // isReturn = 1, isTerminator = 1, isBarrier = 1, isPseudo = 1
+
+let hasSideEffects = 1, isCodeGenOnly = 1 in {
+def ADJCALLSTACKDOWN :
+ TPUInst<(outs), (ins i32imm:$amts, i32imm:$amt, i32imm:$amtv),
+ "ADJCALLSTACKDOWN $amts, $amtv",
+ []>, Requires<[HasFC]>;
+def ADJCALLSTACKUP :
+ TPUInst<(outs), (ins i32imm:$amts, i32imm:$amt, i32imm:$amtv),
+ "ADJCALLSTACKUP $amts, $amtv",
+ []>, Requires<[HasFC]>;
+
+def CALLSEQ_START :
+ TPUInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "Callseq_start $amt1",
+ [(callseq_start timm:$amt1, timm:$amt2)]>, Requires<[NoFC]>;
+def CALLSEQ_END :
+ TPUInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "Callseq_end $amt1",
+ [(callseq_end timm:$amt1, timm:$amt2)]>, Requires<[NoFC]>;
+} // hasSideEffects = 1, isCodeGenOnly = 1
+
+def : Pat<(callseq_start timm:$amts, timm:$amtv),
+ (ADJCALLSTACKDOWN i32imm:$amts, (i32 0), i32imm:$amtv)>;
+def : Pat<(callseq_end timm:$amts, timm:$amtv),
+ (ADJCALLSTACKUP i32imm:$amts, (i32 0), i32imm:$amtv)>;
+
+//===----------------------------------------------------------------------===//
+// FIFO pseudo physical register copy instructions.
+//===----------------------------------------------------------------------===//
+
+let isMoveReg = 1, isPseudo = 1, isFifoPseudoCopy = 1 in {
+ def COPY_ERFPR : TPUInst<(outs ERFPR:$Sd), (ins ERFPR:$y), "(invalid)", []>;
+ def COPY_V2SFPR : TPUInst<(outs V2SFPR:$Sd), (ins V2SFPR:$y), "(invalid)", []>;
+ def COPY_SFRFPR : TPUInst<(outs SFRFPR:$Sd), (ins SFRFPR:$y), "(invalid)", []>;
+ def COPY_MRFPR0 : TPUInst<(outs MRFPR0:$Sd), (ins MRFPR0:$y), "(invalid)", []>;
+ def COPY_MRFPR1 : TPUInst<(outs MRFPR1:$Sd), (ins MRFPR1:$y), "(invalid)", []>;
+ def COPY_MRFPR2 : TPUInst<(outs MRFPR2:$Sd), (ins MRFPR2:$y), "(invalid)", []>;
+ def COPY_MRFPR3 : TPUInst<(outs MRFPR3:$Sd), (ins MRFPR3:$y), "(invalid)", []>;
+ def COPY_TRFPR0 : TPUInst<(outs TRFPR0:$Sd), (ins TRFPR0:$y), "(invalid)", []>;
+ def COPY_TRFPR1 : TPUInst<(outs TRFPR1:$Sd), (ins TRFPR1:$y), "(invalid)", []>;
+ def COPY_TRFPR2 : TPUInst<(outs TRFPR2:$Sd), (ins TRFPR2:$y), "(invalid)", []>;
+ def COPY_XRFPR0 : TPUInst<(outs XRFPR0:$Sd), (ins XRFPR0:$y), "(invalid)", []>;
+ def COPY_XRFPR1 : TPUInst<(outs XRFPR1:$Sd), (ins XRFPR1:$y), "(invalid)", []>;
+} // isMoveReg = 1, isPseudo = 1, isFifoPseudoCopy = 1
+
+//===----------------------------------------------------------------------===//
+// Read Register Value instructions.
+//===----------------------------------------------------------------------===//
+multiclass RdReg<string Suffix, SDPatternOperator OpNode> :
+ TPUInstSany<29, (outs GPR:$d), (ins),
+ "$d =\tsrdreg."#Suffix#"${pred}",
+ [(set GPR:$d, (OpNode))]>;
+
+let hasSideEffects = 1, SubUnits = [SU_scalar_alu] in {
+defm SRDREG_LCCLO : RdReg<"lcclo", int_tpu_rdreg_lcc_lo>;
+defm SRDREG_LCCHI : RdReg<"lcchi", int_tpu_rdreg_lcc_hi>;
+defm SRDREG_GTCLO : RdReg<"gtclo", int_tpu_rdreg_gtc_lo>;
+defm SRDREG_GTCHI : RdReg<"gtchi", int_tpu_rdreg_gtc_hi>;
+defm SRDREG_TAG : RdReg<"tag", int_tpu_rdreg_tag>;
+defm SRDREG_TM : RdReg<"tm", int_tpu_rdreg_tm>;
+defm SRDREG_FSR : RdReg<"fsr", int_tpu_rdreg_fsr>,
+ Requires<[IsSC]>;
+defm SRDREG_YIELDREQ : RdReg<"yieldreq", int_tpu_rdreg_yieldreq>,
+ Requires<[IsVFTCOrSC]>;
+defm SRDREG_TCID : RdReg<"tcid", int_tpu_rdreg_tcid>,
+ Requires<[IsVFTC]>;
+defm SRDREG_SCID : RdReg<"scid", int_tpu_rdreg_scid>,
+ Requires<[IsSC]>;
+defm SRDREG_CRRLO : RdReg<"crrlo", int_tpu_rdreg_crr_lo>,
+ Requires<[HasJfcDfcTensorCore]>;
+defm SRDREG_CRRHI : RdReg<"crrhi", int_tpu_rdreg_crr_hi>,
+ Requires<[HasJfcDfcTensorCore]>;
+defm SRDREG_BTR : RdReg<"btr", int_tpu_rdreg_btr>,
+ Requires<[HasJfcDfcTensorCore]>;
+defm SRDREG_TBM : RdReg<"tbm", int_tpu_rdreg_tbm>,
+ Requires<[IsSC]>;
+defm SRDREG_DDR : RdReg<"ddr", int_tpu_rdreg_ddr>,
+ Requires<[IsSC]>;
+defm SRDREG_DMACRDT : RdReg<"dmacrdt", int_tpu_rdreg_dmacrdt>,
+ Requires<[IsSC]>;
+} // let hasSideEffects = 1, SubUnits = [SU_scalar_alu]
+
+let hasSideEffects = 1, isPseudo = 1 in {
+def LCC_READ : TPUInst<(outs GPR:$d0, GPR:$d1), (ins),
+ "_ =\t#LCC_READ",
+ []>, Bundle<B_Sboth>, SubUnits<[SU_scalar_alu_both]>;
+def GTC_READ : TPUInst<(outs GPR:$d0, GPR:$d1), (ins),
+ "_ =\t#GTC_READ",
+ []>, Bundle<B_Sboth>, SubUnits<[SU_scalar_alu_both]>;
+} // let hasSideEffects = 1, isPseudo = 1
+
+//===----------------------------------------------------------------------===//
+// Set Register Value instructions.
+//===----------------------------------------------------------------------===//
+
+multiclass SetRegSc_<string Reg, Intrinsic Intr> {
+ def r : TPUInstP<(outs), (ins GPR:$x),
+ "(" # Reg # ") =\tsset" # Reg # "${pred} $x",
+ [(Intr (i32 GPR:$x))]>, Bundle<B_Sany>;
+ def i : TPUInstP<(outs), (ins tsimmi:$x),
+ "(" # Reg # ") =\tsset" # Reg # "${pred} $x",
+ [(Intr (i32 imm:$x))]>, Bundle<B_Sany>;
+}
+
+let hasSideEffects = 1, SubUnits = [SU_scalar_alu] in {
+defm SSETPDEPTH : SetRegSc_<"pdepth", int_tpu_setreg_pdepth>;
+defm SSETTAG : SetRegSc_<"tag", int_tpu_setreg_tag>;
+defm SSETIFVALUE : SetRegSc_<"ifvalue", int_tpu_setreg_ifvalue>;
+defm SSETDMACRDT : SetRegSc_<"dmacrdt", int_tpu_setreg_dmacrdt>;
+defm SSETSFLAGRANGE : SetRegSc_<"sflagrange", int_tpu_setreg_sflagrange>;
+} // hasSideEffects = 1, SubUnits = [SU_scalar_alu]
+
+//===----------------------------------------------------------------------===//
+// MXU operations
+//===----------------------------------------------------------------------===//
+// Mat push/mul can be masked or not (for JF/DF/PF).
+multiclass MatOpMasked<int i, string Name, Intrinsic Intr, dag Iops, dag Oops,
+ string AsmDstOperand, string AsmPrefix, SchedWrite Schedule,
+ list<Predicate> PredList> {
+ def "" : TPUInstP<Oops, !con((ins VPR:$Vs), Iops),
+ AsmDstOperand # " =\t" # Name # AsmPrefix # "${pred} $Vs",
+ [!con(!foreach(v, Oops, !subst(outs, set, v)),
+ (set (Intr (vNf32 VPR:$Vs), (vNi1 (Splat -1)), (i32 i),
+ !foreach(v, Iops, !subst(ins, i32, v)))))]>,
+ Sched<[Schedule]>, ExtraPredicates<PredList>;
+ def m : TPUInstP<Oops, !con((ins VPR:$Vs, MPR:$m), Iops),
+ AsmDstOperand # " =\t" # Name # ".msk" # AsmPrefix # "${pred} $m, $Vs",
+ [!con(!foreach(v, Oops, !subst(outs, set, v)),
+ (set (Intr (vNf32 VPR:$Vs), (vNi1 MPR:$m), (i32 i),
+ !foreach(v, Iops, !subst(ins, i32, v)))))]>,
+ Sched<[Schedule]>, ExtraPredicates<PredList>;
+}
+
+//===----------------------------------------------------------------------===//
+// Subsystems.
+//===----------------------------------------------------------------------===//
+
+include "TPUInstrSparseCore.td"
+include "TPUInstrSparseCoreFIFO.td"
+include "TPUInstrPackedALU.td"
+include "TPUInstrPackUnpack.td"
+include "TPUInstrPackedCmp.td"
+include "TPUInstrPackedLoadStore.td"
+include "TPUInstrTensorCore.td"
+include "TPUInstrBarnaCore.td"
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUInstrPackUnpack.td b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUInstrPackUnpack.td
new file mode 100644
index 0000000..168352d
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUInstrPackUnpack.td
@@ -0,0 +1,376 @@
+//===--- TPUInstrPackUnpack.td - Target Description for TPU Target ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the TPU instructions specific to pack/unpack support for
+// packed low- and full-precision data.
+//
+//===----------------------------------------------------------------------===//
+
+// -------------------------
+// | PFC/JFC/DFC pack/unpack |
+// -------------------------
+
+let Predicates = [HasPfcJfcDfcTensorCore, HasVPU] in {
+
+let SubUnits = [SU_vector_pack] in {
+defm VPACK : VFPALUOpYX_NC<28, "vpack.f32.f16", int_tpu_pack, vNf32, vNf32>,
+ Sched<[WritePackingInst]>;
+defm VPACKC : VFPALUOpYX_NC<55, "vpackc.f32.f16", int_tpu_packc, vNf32, vNf32>,
+ Sched<[WritePackingInst]>;
+
+defm VUNPACKU : UnaryVFOp<"vunpacku", int_tpu_unpacku>, Sched<[WritePackingInst]>;
+defm VUNPACKL : UnaryVFOp<"vunpackl", int_tpu_unpackl>, Sched<[WritePackingInst]>;
+
+} // SubUnits = [SU_vector_pack]
+
+} // Predicates = [HasPfcJfcDfcTensorCore, HasVPU]
+
+// ------------------------------------
+// | Deprecated untyped VFC pack/unpack |
+// | These will get removed. |
+// ------------------------------------
+
+let Predicates = [IsVFTCOrVFSC, HasVPU] in {
+
+let SubUnits = [SU_vector_pack] in {
+defm VPACK_I_BF16 : VFPALUOpYX_NC<27, "vpack.i.bf16", int_tpu_deprecated_pack_i_bf16, vNf32, vNf32>,
+ Sched<[WriteIPackingInst]>;
+defm VPACK_C_BF16 : VFPALUOpYX_NC<29, "vpack.c.bf16", int_tpu_deprecated_pack_c_bf16, vNf32, vNf32>,
+ Sched<[WriteCPackingInst]>;
+} // SubUnits = [SU_vector_pack]
+
+multiclass UnpackCompressedDataTypesCatA<string Name, string IntrName> {
+ defm _BF16: UnaryVFOp<Name#".bf16", !cast<Intrinsic>(IntrName#"_bf16")>,
+ Sched<[WriteCPackingInst]>;
+}
+
+multiclass UnpackInterleavedDataTypes<string Name, string IntrName> {
+ defm _BF16: UnaryVFOp<Name#".bf16", !cast<Intrinsic>(IntrName#"_bf16")>,
+ Sched<[WriteIPackingInst]>;
+}
+
+multiclass UnpackCompressedLowerUpperSublanes<string Name, string IntrName> {
+ defm _LOWER: UnpackCompressedDataTypesCatA<Name#".l", IntrName#"_l">;
+ defm _UPPER: UnpackCompressedDataTypesCatA<Name#".u", IntrName#"_u">;
+}
+
+multiclass UnpackInterleavedLowerUpperSublanes<string Name, string IntrName> {
+ defm _LOWER: UnpackInterleavedDataTypes<Name#".l", IntrName#"_l">;
+ defm _UPPER: UnpackInterleavedDataTypes<Name#".u", IntrName#"_u">;
+}
+
+multiclass UnpackCompressedInterleaved<string Name, string IntrName> {
+ defm _I: UnpackInterleavedLowerUpperSublanes<Name#".i", IntrName#"_i">;
+ defm _C: UnpackCompressedLowerUpperSublanes<Name#".c", IntrName#"_c">;
+}
+
+defm VUNPACK_DEPRECATED : UnpackCompressedInterleaved<"vunpack",
+ "int_tpu_deprecated_unpack">,
+ SubUnits<[SU_vector_pack]>;
+
+} // Predicates = [IsVFTCOrVFSC, HasVPU]
+
+// -----------------------------------
+// | Low precision pack/unpack VFC/GLC |
+// -----------------------------------
+
+class PackOpSc_<string Name, Intrinsic Intr,
+ ValueType DstType, ValueType SrcType> :
+ TPUInstP<(outs VPR:$Vd), (ins VPR:$y, VPR:$x),
+ "$Vd =\t"#Name#"${pred} $y, $x",
+ [(set (DstType VPR:$Vd),
+ (Intr (SrcType VPR:$y), (SrcType VPR:$x)))]>,
+ Bundle<B_PACK>, IsVectorInstruction;
+
+class UnpackOpSc_<string Name, Intrinsic Intr,
+ ValueType DstType, ValueType SrcType> :
+ TPUInstP<(outs VPR:$Vd), (ins VPR:$y),
+ "$Vd =\t"#Name#"${pred} $y",
+ [(set (DstType VPR:$Vd), (Intr (SrcType VPR:$y)))]>,
+ Bundle<B_UNPACK>, IsVectorInstruction;
+
+let Predicates = [IsSC, HasLPVF] in {
+
+def VPACK_I_F32_BF16 : PackOpSc_<"vpack.i.f32.bf16",
+ int_tpu_pack_i_f32_bf16, v16bf16, v8f32>,
+ Sched<[WriteIPackingInst]>;
+def VPACK_I_B32_B16 : PackOpSc_<"vpack.i.b32.b16",
+ int_tpu_pack_i_b32_b16, v16i16, v8i32>,
+ Sched<[WriteIPackingInst]>;
+def VPACK_I_B16_B8 : PackOpSc_<"vpack.i.b16.b8",
+ int_tpu_pack_i_b16_b8, v32i8, v16i16>,
+ Sched<[WriteIPackingInst]>;
+def VPACK_I_B8_B4 : PackOpSc_<"vpack.i.b8.b4",
+ int_tpu_pack_i_b8_b4, v64i4, v32i8>,
+ Sched<[WriteIPackingInst]>;
+
+def VPACK_C_F32_BF16 : PackOpSc_<"vpack.c.f32.bf16",
+ int_tpu_pack_c_f32_bf16, v16bf16, v8f32>,
+ Sched<[WriteCPackingInst]>;
+def VPACK_C_B32_B16 : PackOpSc_<"vpack.c.b32.b16",
+ int_tpu_pack_c_b32_b16, v16i16, v8i32>,
+ Sched<[WriteCPackingInst]>;
+def VPACK_C_B16_B8 : PackOpSc_<"vpack.c.b16.b8",
+ int_tpu_pack_c_b16_b8, v32i8, v16i16>,
+ Sched<[WriteCPackingInst]>;
+def VPACK_C_B8_B4 : PackOpSc_<"vpack.c.b8.b4",
+ int_tpu_pack_c_b8_b4, v64i4, v32i8>,
+ Sched<[WriteCPackingInst]>;
+def VPACK_C_B4_B2 : PackOpSc_<"vpack.c.b4.b2",
+ int_tpu_pack_c_b4_b2, v128i2, v64i4>,
+ Sched<[WriteCPackingInst]>;
+def VPACK_C_B2_B1 : PackOpSc_<"vpack.c.b2.b1",
+ int_tpu_pack_c_b2_b1, v256i1, v128i2>,
+ Sched<[WriteCPackingInst]>;
+
+} // Predicates = [IsSC, HasLPVF]
+
+let Predicates = [IsSC, HasLPGL] in {
+
+def VPACK_C_BF16_S8 : PackOpSc_<"vpack.c.bf16.s8",
+ int_tpu_pack_c_bf16_s8, v32i8, v16bf16>,
+ Sched<[WriteCPackingInst]>;
+def VPACK_C_BF16_U8 : PackOpSc_<"vpack.c.bf16.u8",
+ int_tpu_pack_c_bf16_u8, v32i8, v16bf16>,
+ Sched<[WriteCPackingInst]>;
+def VPACK_I_BF16_S8 : PackOpSc_<"vpack.i.bf16.s8",
+ int_tpu_pack_i_bf16_s8, v32i8, v16bf16>,
+ Sched<[WriteIPackingInst]>;
+def VPACK_I_BF16_U8 : PackOpSc_<"vpack.i.bf16.u8",
+ int_tpu_pack_i_bf16_u8, v32i8, v16bf16>,
+ Sched<[WriteIPackingInst]>;
+
+} // Predicates = [IsSC, HasLPGL]
+
+let Predicates = [IsSC, HasLPVF] in {
+
+def VUNPACK_I_L_BF16_F32 : UnpackOpSc_<"vunpack.i.l.bf16.f32",
+ int_tpu_unpack_i_l_bf16_f32, v8f32, v16bf16>,
+ Sched<[WriteIPackingInst]>;
+def VUNPACK_I_U_BF16_F32 : UnpackOpSc_<"vunpack.i.u.bf16.f32",
+ int_tpu_unpack_i_u_bf16_f32, v8f32, v16bf16>,
+ Sched<[WriteIPackingInst]>;
+
+def VUNPACK_I_L_S16_S32 : UnpackOpSc_<"vunpack.i.l.s16.s32",
+ int_tpu_unpack_i_l_s16_s32, v8i32, v16i16>,
+ Sched<[WriteIPackingInst]>;
+def VUNPACK_I_U_S16_S32 : UnpackOpSc_<"vunpack.i.u.s16.s32",
+ int_tpu_unpack_i_u_s16_s32, v8i32, v16i16>,
+ Sched<[WriteIPackingInst]>;
+def VUNPACK_C_L_BF16_F32 : UnpackOpSc_<"vunpack.c.l.bf16.f32",
+ int_tpu_unpack_c_l_bf16_f32, v8f32, v16bf16>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_U_BF16_F32 : UnpackOpSc_<"vunpack.c.u.bf16.f32",
+ int_tpu_unpack_c_u_bf16_f32, v8f32, v16bf16>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_L_HF16_F32 : UnpackOpSc_<"vunpack.c.l.hf16.f32",
+ int_tpu_unpack_c_l_hf16_f32, v8f32, v16f16>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_U_HF16_F32 : UnpackOpSc_<"vunpack.c.u.hf16.f32",
+ int_tpu_unpack_c_u_hf16_f32, v8f32, v16f16>,
+ Sched<[WriteCPackingInst]>;
+
+def VUNPACK_C_L_S16_S32 : UnpackOpSc_<"vunpack.c.l.s16.s32",
+ int_tpu_unpack_c_l_s16_s32, v8i32, v16i16>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_U_S16_S32 : UnpackOpSc_<"vunpack.c.u.s16.s32",
+ int_tpu_unpack_c_u_s16_s32, v8i32, v16i16>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_0_BF8_F32 : UnpackOpSc_<"vunpack.c.0.bf8.f32",
+ int_tpu_unpack_c_0_bf8_f32, v8f32, v32i8>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_1_BF8_F32 : UnpackOpSc_<"vunpack.c.1.bf8.f32",
+ int_tpu_unpack_c_1_bf8_f32, v8f32, v32i8>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_2_BF8_F32 : UnpackOpSc_<"vunpack.c.2.bf8.f32",
+ int_tpu_unpack_c_2_bf8_f32, v8f32, v32i8>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_3_BF8_F32 : UnpackOpSc_<"vunpack.c.3.bf8.f32",
+ int_tpu_unpack_c_3_bf8_f32, v8f32, v32i8>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_0_S8_S32 : UnpackOpSc_<"vunpack.c.0.s8.s32",
+ int_tpu_unpack_c_0_s8_s32, v8i32, v32i8>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_1_S8_S32 : UnpackOpSc_<"vunpack.c.1.s8.s32",
+ int_tpu_unpack_c_1_s8_s32, v8i32, v32i8>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_2_S8_S32 : UnpackOpSc_<"vunpack.c.2.s8.s32",
+ int_tpu_unpack_c_2_s8_s32, v8i32, v32i8>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_3_S8_S32 : UnpackOpSc_<"vunpack.c.3.s8.s32",
+ int_tpu_unpack_c_3_s8_s32, v8i32, v32i8>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_L_S4_S8 : UnpackOpSc_<"vunpack.c.l.s4.s8",
+ int_tpu_unpack_c_l_s4_s8, v32i8, v64i4>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_U_S4_S8 : UnpackOpSc_<"vunpack.c.u.s4.s8",
+ int_tpu_unpack_c_u_s4_s8, v32i8, v64i4>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_L_S2_S4 : UnpackOpSc_<"vunpack.c.l.s2.s4",
+ int_tpu_unpack_c_l_s2_s4, v64i4, v128i2>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_U_S2_S4 : UnpackOpSc_<"vunpack.c.u.s2.s4",
+ int_tpu_unpack_c_u_s2_s4, v64i4, v128i2>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_L_S1_S2 : UnpackOpSc_<"vunpack.c.l.s1.s2",
+ int_tpu_unpack_c_l_s1_s2, v128i2, v256i1>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_U_S1_S2 : UnpackOpSc_<"vunpack.c.u.s1.s2",
+ int_tpu_unpack_c_u_s1_s2, v128i2, v256i1>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_0_IF8_F32 : UnpackOpSc_<"vunpack.c.0.if8.f32",
+ int_tpu_unpack_c_0_if8_f32, v8f32, v32i8>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_1_IF8_F32 : UnpackOpSc_<"vunpack.c.1.if8.f32",
+ int_tpu_unpack_c_1_if8_f32, v8f32, v32i8>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_2_IF8_F32 : UnpackOpSc_<"vunpack.c.2.if8.f32",
+ int_tpu_unpack_c_2_if8_f32, v8f32, v32i8>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_3_IF8_F32 : UnpackOpSc_<"vunpack.c.3.if8.f32",
+ int_tpu_unpack_c_3_if8_f32, v8f32, v32i8>,
+ Sched<[WriteCPackingInst]>;
+
+} // Predicates = [IsSC, HasLPVF]
+
+let Predicates = [IsSC, HasLPGL] in {
+
+def VUNPACK_C_L_S8_BF16 : UnpackOpSc_<"vunpack.c.l.s8.bf16",
+ int_tpu_unpack_c_l_s8_bf16, v16bf16, v32i8>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_U_S8_BF16 : UnpackOpSc_<"vunpack.c.u.s8.bf16",
+ int_tpu_unpack_c_u_s8_bf16, v16bf16, v32i8>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_L_U8_BF16 : UnpackOpSc_<"vunpack.c.l.u8.bf16",
+ int_tpu_unpack_c_l_u8_bf16, v16bf16, v32i8>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_U_U8_BF16 : UnpackOpSc_<"vunpack.c.u.u8.bf16",
+ int_tpu_unpack_c_u_u8_bf16, v16bf16, v32i8>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_L_S8_S16 : UnpackOpSc_<"vunpack.c.l.s8.s16",
+ int_tpu_unpack_c_l_s8_s16, v16i16, v32i8>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_U_S8_S16 : UnpackOpSc_<"vunpack.c.u.s8.s16",
+ int_tpu_unpack_c_u_s8_s16, v16i16, v32i8>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_L_U8_U16 : UnpackOpSc_<"vunpack.c.l.u8.u16",
+ int_tpu_unpack_c_l_u8_u16, v16i16, v32i8>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_C_U_U8_U16 : UnpackOpSc_<"vunpack.c.u.u8.u16",
+ int_tpu_unpack_c_u_u8_u16, v16i16, v32i8>,
+ Sched<[WriteCPackingInst]>;
+def VUNPACK_I_L_S8_S16 : UnpackOpSc_<"vunpack.i.l.s8.s16",
+ int_tpu_unpack_i_l_s8_s16, v16i16, v32i8>,
+ Sched<[WriteIPackingInst]>;
+def VUNPACK_I_U_S8_S16 : UnpackOpSc_<"vunpack.i.u.s8.s16",
+ int_tpu_unpack_i_u_s8_s16, v16i16, v32i8>,
+ Sched<[WriteIPackingInst]>;
+def VUNPACK_I_L_U8_U16 : UnpackOpSc_<"vunpack.i.l.u8.u16",
+ int_tpu_unpack_i_l_u8_u16, v16i16, v32i8>,
+ Sched<[WriteIPackingInst]>;
+def VUNPACK_I_U_U8_U16 : UnpackOpSc_<"vunpack.i.u.u8.u16",
+ int_tpu_unpack_i_u_u8_u16, v16i16, v32i8>,
+ Sched<[WriteIPackingInst]>;
+def VUNPACK_I_L_S8_BF16 : UnpackOpSc_<"vunpack.i.l.s8.bf16",
+ int_tpu_unpack_i_l_s8_bf16, v16bf16, v32i8>,
+ Sched<[WriteIPackingInst]>;
+def VUNPACK_I_U_S8_BF16 : UnpackOpSc_<"vunpack.i.u.s8.bf16",
+ int_tpu_unpack_i_u_s8_bf16, v16bf16, v32i8>,
+ Sched<[WriteIPackingInst]>;
+def VUNPACK_I_L_U8_BF16 : UnpackOpSc_<"vunpack.i.l.u8.bf16",
+ int_tpu_unpack_i_l_u8_bf16, v16bf16, v32i8>,
+ Sched<[WriteIPackingInst]>;
+def VUNPACK_I_U_U8_BF16 : UnpackOpSc_<"vunpack.i.u.u8.bf16",
+ int_tpu_unpack_i_u_u8_bf16, v16bf16, v32i8>,
+ Sched<[WriteIPackingInst]>;
+def VUNPACK_IC_L_S8_S16 : UnpackOpSc_<"vunpack.ic.l.s8.s16",
+ int_tpu_unpack_ic_l_s8_s16, v16i16, v32i8>,
+ Sched<[WriteIPackingInst]>;
+def VUNPACK_IC_U_S8_S16 : UnpackOpSc_<"vunpack.ic.u.s8.s16",
+ int_tpu_unpack_ic_u_s8_s16, v16i16, v32i8>,
+ Sched<[WriteIPackingInst]>;
+def VUNPACK_IC_L_U8_U16 : UnpackOpSc_<"vunpack.ic.l.u8.u16",
+ int_tpu_unpack_ic_l_u8_u16, v16i16, v32i8>,
+ Sched<[WriteIPackingInst]>;
+def VUNPACK_IC_U_U8_U16 : UnpackOpSc_<"vunpack.ic.u.u8.u16",
+ int_tpu_unpack_ic_u_u8_u16, v16i16, v32i8>,
+ Sched<[WriteIPackingInst]>;
+def VUNPACK_IC_L_S8_BF16 : UnpackOpSc_<"vunpack.ic.l.s8.bf16",
+ int_tpu_unpack_ic_l_s8_bf16, v16bf16, v32i8>,
+ Sched<[WriteIPackingInst]>;
+def VUNPACK_IC_U_S8_BF16 : UnpackOpSc_<"vunpack.ic.u.s8.bf16",
+ int_tpu_unpack_ic_u_s8_bf16, v16bf16, v32i8>,
+ Sched<[WriteIPackingInst]>;
+def VUNPACK_IC_L_U8_BF16 : UnpackOpSc_<"vunpack.ic.l.u8.bf16",
+ int_tpu_unpack_ic_l_u8_bf16, v16bf16, v32i8>,
+ Sched<[WriteIPackingInst]>;
+def VUNPACK_IC_U_U8_BF16 : UnpackOpSc_<"vunpack.ic.u.u8.bf16",
+ int_tpu_unpack_ic_u_u8_bf16, v16bf16, v32i8>,
+ Sched<[WriteIPackingInst]>;
+
+} // Predicates = [IsSC, HasLPGL]
+
+class CvtOpXSc_<string Name, Intrinsic Intr,
+ ValueType DstType, ValueType SrcType> :
+ TPUInstP<(outs VPR:$Vd), (ins VPR:$x),
+ "$Vd =\t"#Name#"${pred} $x",
+ [(set (DstType VPR:$Vd), (Intr (SrcType VPR:$x)))]>,
+ Bundle<B_VCVT>, Sched<[WriteFPConvert]>, IsVectorInstruction;
+
+class CvtOpYXSc_<string Name, Intrinsic Intr,
+ ValueType DstType, ValueType SrcYType, ValueType SrcXType> :
+ TPUInstP<(outs VPR:$Vd), (ins VPR:$y, VPR:$x),
+ "$Vd =\t"#Name#"${pred} $y, $x",
+ [(set (DstType VPR:$Vd), (Intr (SrcYType VPR:$y), (SrcXType VPR:$x)))]>,
+ Bundle<B_VCVT>, Sched<[WriteFPConvert]>, IsVectorInstruction;
+
+let Predicates = [IsSC, HasLPVF] in {
+
+def VCVT_S32_F32 : CvtOpXSc_<"vcvt.s32.f32",
+ int_tpu_vcvt_s32_f32, v8f32, v8i32>;
+def VCVT_F32_S32 : CvtOpXSc_<"vcvt.f32.s32",
+ int_tpu_vcvt_f32_s32, v8i32, v8f32>;
+def VCVT_F32_BF8 : CvtOpXSc_<"vcvt.f32.bf8",
+ int_tpu_vcvt_f32_bf8, v32i8, v8f32>;
+def VCVT_F32_IF8 : CvtOpXSc_<"vcvt.f32.if8",
+ int_tpu_vcvt_f32_if8, v32i8, v8f32>;
+def VCVT_F32_BF16 : CvtOpXSc_<"vcvt.f32.bf16",
+ int_tpu_vcvt_f32_bf16, v16bf16, v8f32>;
+def VCVT_F32_HF16 : CvtOpXSc_<"vcvt.f32.hf16",
+ int_tpu_vcvt_f32_hf16, v16f16, v8f32>;
+def VCVT_SR_F32_BF8 : CvtOpYXSc_<"vcvt.sr.f32.bf8",
+ int_tpu_vcvt_sr_f32_bf8, v32i8, v8i32, v8f32>;
+def VCVT_SR_F32_IF8 : CvtOpYXSc_<"vcvt.sr.f32.if8",
+ int_tpu_vcvt_sr_f32_if8, v32i8, v8i32, v8f32>;
+def VCVT_SR_F32_BF16 : CvtOpYXSc_<"vcvt.sr.f32.bf16",
+ int_tpu_vcvt_sr_f32_bf16, v16bf16, v8i32, v8f32>;
+def VCVT_SR_F32_HF16 : CvtOpYXSc_<"vcvt.sr.f32.hf16",
+ int_tpu_vcvt_sr_f32_hf16, v16f16, v8i32, v8f32>;
+
+} // Predicates = [IsSC, HasLPVF]
+
+let Predicates = [IsSC, HasLPGL] in {
+
+def VCVT_BF16_S8 : CvtOpXSc_<"vcvt.bf16.s8",
+ int_tpu_vcvt_bf16_s8, v32i8, v16bf16>;
+def VCVT_BF16_U8 : CvtOpXSc_<"vcvt.bf16.u8",
+ int_tpu_vcvt_bf16_u8, v32i8, v16bf16>;
+def VCVT_BF16_S4 : CvtOpXSc_<"vcvt.bf16.s4",
+ int_tpu_vcvt_bf16_s4, v64i4, v16bf16>;
+def VCVT_BF16_U4 : CvtOpXSc_<"vcvt.bf16.u4",
+ int_tpu_vcvt_bf16_u4, v64i4, v16bf16>;
+def VCVT_S8_BF16 : CvtOpXSc_<"vcvt.s8.bf16",
+ int_tpu_vcvt_s8_bf16, v16bf16, v32i8>;
+def VCVT_U8_BF16 : CvtOpXSc_<"vcvt.u8.bf16",
+ int_tpu_vcvt_u8_bf16, v16bf16, v32i8>;
+def VCVT_S4_BF16 : CvtOpXSc_<"vcvt.s4.bf16",
+ int_tpu_vcvt_s4_bf16, v16bf16, v64i4>;
+def VCVT_U4_BF16 : CvtOpXSc_<"vcvt.u4.bf16",
+ int_tpu_vcvt_u4_bf16, v16bf16, v64i4>;
+
+} // Predicates = [IsSC, HasLPGL]
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUInstrTensorCore.td b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUInstrTensorCore.td
new file mode 100644
index 0000000..711186a
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUInstrTensorCore.td
@@ -0,0 +1,1002 @@
+//===-- TPUInstrTensorCore.td - Target Description for TPU Target -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the TPU instructions specific to TensorCore
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Flag instructions
+//===----------------------------------------------------------------------===//
+let Predicates = [HasVectorSflags] in {
+
+multiclass SyncInstTc_<int isyieldable> {
+let hasSideEffects = 1 in {
+defm EQ : WaitInst_<!if(isyieldable, "vwait.eq.yieldable", "vwait.eq"),
+ !if(isyieldable, int_tpu_waiteq_yieldable, int_tpu_waiteq)>;
+defm NE : WaitInst_<!if(isyieldable, "vwait.ne.yieldable", "vwait.ne"),
+ !if(isyieldable, int_tpu_waitne_yieldable, int_tpu_waitne)>;
+defm GT : WaitInst_<!if(isyieldable, "vwait.gt.yieldable", "vwait.gt"),
+ !if(isyieldable, int_tpu_waitgt_yieldable, int_tpu_waitgt)>;
+defm GE : WaitInst_<!if(isyieldable, "vwait.ge.yieldable", "vwait.ge"),
+ !if(isyieldable, int_tpu_waitge_yieldable, int_tpu_waitge)>;
+defm LT : WaitInst_<!if(isyieldable, "vwait.lt.yieldable", "vwait.lt"),
+ !if(isyieldable, int_tpu_waitlt_yieldable, int_tpu_waitlt)>;
+defm LE : WaitInst_<!if(isyieldable, "vwait.le.yieldable", "vwait.le"),
+ !if(isyieldable, int_tpu_waitle_yieldable, int_tpu_waitle)>;
+defm DONE : WaitDoneInst_<!if(isyieldable, "vwait.done.yieldable", "vwait.done"),
+ !if(isyieldable, int_tpu_waitdone_yieldable, int_tpu_waitdone)>;
+defm NOTDONE : WaitDoneInst_<!if(isyieldable, "vwait.notdone.yieldable", "vwait.notdone"),
+ !if(isyieldable, int_tpu_waitnotdone_yieldable, int_tpu_waitnotdone)>;
+}
+}
+
+multiclass SetAddSyncFlagTc_ {
+let mayStore = 1 in {
+defm SET : SflagStoreInst_<"vsyncset.s32">;
+defm SET_DONE : SflagIntrinsicInst_<"vsyncset.done.s32", int_tpu_syncset_done>;
+defm SET_NOTDONE : SflagIntrinsicInst_<"vsyncset.notdone.s32", int_tpu_syncset_notdone>;
+defm SET_REMOTE : SflagIntrinsicInstRemote_<"vsyncset.remote.s32", int_tpu_syncset_remote>;
+defm SET_REMOTE_DONE : SflagIntrinsicInstRemoteDone_<"vsyncset.remote.done.s32", int_tpu_syncset_remote_done>;
+defm SET_REMOTE_DONEINV : SflagIntrinsicInstRemoteDone_<"vsyncset.remote.doneinv.s32", int_tpu_syncset_remote_doneinv>;
+}
+
+let mayLoad = 1, mayStore = 1 in {
+defm ADD : SflagIntrinsicInst_<"vsyncadd.s32", int_tpu_syncadd>;
+defm ADD_DONE : SflagIntrinsicInst_<"vsyncadd.done.s32", int_tpu_syncadd_done>;
+defm ADD_NOTDONE : SflagIntrinsicInst_<"vsyncadd.notdone.s32", int_tpu_syncadd_notdone>;
+defm ADD_REMOTE : SflagIntrinsicInstRemote_<"vsyncadd.remote.s32", int_tpu_syncadd_remote>;
+defm ADD_REMOTE_DONE : SflagIntrinsicInstRemoteDone_<"vsyncadd.remote.done.s32", int_tpu_syncadd_remote_done>;
+defm ADD_REMOTE_DONEINV : SflagIntrinsicInstRemoteDone_<"vsyncadd.remote.doneinv.s32", int_tpu_syncadd_remote_doneinv>;
+}
+}
+
+// Mark all WAIT/SYNC instructions as using BundleVs. Not all the derived
+// instructions use the Vs slot, but the bundle tracker only reserves the Vs
+// slot for scalar registers, so it is okay to be conservative and this
+// simplifies sharing the declaration with SparseCore.
+defm tcWAIT : SyncInstTc_<0 /* !isyieldable */>, Bundle<B_SM>, IsVectorInstruction,
+ SubUnits<[SU_vwait]>;
+defm tcSYNC : SetAddSyncFlagTc_, Bundle<B_SM>, IsVectorInstruction,
+ SubUnits<[SU_set_sync]>;
+
+let isPseudo = 1, usesCustomInserter = 1 in {
+def VFREADi : TPUInstP<(outs GPR:$d), (ins tsyncimmsi:$imm),
+ "$d =\t#SFLAGREAD${pred} [sflag:$imm]",
+ [(set GPR:$d, (i32 (load_sflag (Wrapper tglobaladdr:$imm))))]>,
+ Bundle<B_SM>;
+def VFREADr : TPUInstP<(outs GPR:$d), (ins GPR:$r),
+ "$d =\t#SFLAGREAD${pred} [sflag:$r]",
+ [(set GPR:$d, (i32 (load_sflag GPR:$r)))]>,
+ Bundle<B_SM>;
+def VFREADDONEi : TPUInstP<(outs GPR:$d), (ins tsyncimmsi:$imm),
+ "$d =\t#SFLAGREAD.done${pred} [sflag:$imm]",
+ [(set GPR:$d, (i32 (int_tpu_syncdonemov (Wrapper tglobaladdr:$imm))))]>,
+ Bundle<B_SM>;
+def VFREADDONEr : TPUInstP<(outs GPR:$d), (ins GPR:$r),
+ "$d =\t#SFLAGREAD.done${pred} [sflag:$r]",
+ [(set GPR:$d, (i32 (int_tpu_syncdonemov GPR:$r)))]>,
+ Bundle<B_SM>;
+}
+
+defm VSYNCMOVE : MoveSyncFlag_<(outs V2SFPR:$v2s), "$v2s">,
+ SubUnits<[SU_read_sync]>;
+
+} // Predicates = [HasVectorSflags]
+
+//===----------------------------------------------------------------------===//
+// Local DMA Operations
+//===----------------------------------------------------------------------===//
+
+let mayLoad = 1, mayStore = 1, isDMA = 1 in {
+let SubUnits = [SU_local_dma] in {
+multiclass DMA<string srcmem, string dstmem, Intrinsic intr, string mod = ""> {
+ def rrrr : TPUInstP<(outs), (ins GPR:$dst, GPR:$sflag, GPR:$src, GPR:$len),
+ "["#dstmem#":${dst}], [sflag:${sflag}] =\tdma.local"#mod#"${pred} ["#srcmem#":${src}], ${len}",
+ [(intr GPR:$sflag, GPR:$src, GPR:$dst, (i32 GPR:$len))]>,
+ Bundle<B_Sboth>, Sched<[WriteDmaLocal]>;
+ def rirr : TPUInstP<(outs), (ins GPR:$dst, i32imm:$sflag, GPR:$src, GPR:$len),
+ "["#dstmem#":${dst}], [sflag:${sflag}] =\tdma.local"#mod#"${pred} ["#srcmem#":${src}], ${len}",
+ [(intr (Wrapper tglobaladdr:$sflag), GPR:$src, GPR:$dst, (i32 GPR:$len))]>,
+ Bundle<B_Sboth>, BundleImmSy, Sched<[WriteDmaLocal]>;
+ def riri : TPUInstP<(outs), (ins GPR:$dst, i32imm:$sflag, GPR:$src, i32imm:$len),
+ "["#dstmem#":${dst}], [sflag:${sflag}] =\tdma.local"#mod#"${pred} ["#srcmem#":${src}], ${len}",
+ [(intr (Wrapper tglobaladdr:$sflag), GPR:$src, GPR:$dst, (i32 imm:$len))]>,
+ Bundle<B_Sboth>, BundleImmSy<[IMM_OP_0, IMM_OP_1]>, Sched<[WriteDmaLocal]>;
+ def rrri : TPUInstP<(outs), (ins GPR:$dst, GPR:$sflag, GPR:$src, i32imm:$len),
+ "["#dstmem#":${dst}], [sflag:${sflag}] =\tdma.local"#mod#"${pred} ["#srcmem#":${src}], ${len}",
+ [(intr GPR:$sflag, GPR:$src, GPR:$dst, (i32 imm:$len))]>,
+ Bundle<B_Sboth>, BundleImmSy, Sched<[WriteDmaLocal]>;
+}
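+
+// The rrrr/rirr/riri/rrri suffixes mirror the (dst, sflag, src, len) operand
+// order: 'r' means that operand is a register, 'i' an immediate. For example,
+// the generated DMA_HBM_TO_SMEMriri below takes an immediate sflag and an
+// immediate length.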
+
+defm DMA_HBM_TO_SMEM : DMA<"hbm", "smem", int_tpu_dma_hbm_to_smem>;
+defm DMA_HBM_TO_TIMEM : DMA<"hbm", "timem", int_tpu_dma_hbm_to_timem>;
+defm DMA_HBM_TO_VMEM : DMA<"hbm", "vmem", int_tpu_dma_hbm_to_vmem>;
+defm DMA_HBM_TO_HBM : DMA<"hbm", "hbm", int_tpu_dma_hbm_to_hbm>;
+defm DMA_SMEM_TO_HBM : DMA<"smem", "hbm", int_tpu_dma_smem_to_hbm>;
+defm DMA_VMEM_TO_HBM : DMA<"vmem", "hbm", int_tpu_dma_vmem_to_hbm>;
+defm DMA_TIMEM_TO_HBM : DMA<"timem", "hbm", int_tpu_dma_timem_to_hbm>;
+defm DMA_HBM_TO_VMEM_HIB_UPDATE : DMA<"hbm", "vmem",
+ int_tpu_dma_hbm_to_vmem_hib_update, ".hibupdate">;
+} // SubUnits = [SU_local_dma]
+
+// DMA to HIB is a bit different as it doesn't have a destination pointer.
+let SubUnits = [SU_local_dma] in {
+def DMA_HBM_TO_HIBrrrr : TPUInstP<(outs), (ins GPR:$sflag, GPR:$src, GPR:$len),
+ "[hib], [sflag:${sflag}] =\tdma.local${pred} [hbm:${src}], ${len}",
+ [(int_tpu_dma_hbm_to_hib GPR:$sflag, GPR:$src, (i32 GPR:$len))]>,
+ Bundle<B_Sboth>, Sched<[WriteDmaLocal]>;
+def DMA_HBM_TO_HIBrirr : TPUInstP<(outs), (ins i32imm:$sflag, GPR:$src, GPR:$len),
+ "[hib], [sflag:${sflag}] =\tdma.local${pred} [hbm:${src}], ${len}",
+ [(int_tpu_dma_hbm_to_hib (Wrapper tglobaladdr:$sflag), GPR:$src, (i32 GPR:$len))]>,
+ Bundle<B_Sboth>, BundleImmSy, Sched<[WriteDmaLocal]>;
+def DMA_HBM_TO_HIBriri : TPUInstP<(outs), (ins i32imm:$sflag, GPR:$src, i32imm:$len),
+ "[hib], [sflag:${sflag}] =\tdma.local${pred} [hbm:${src}], ${len}",
+ [(int_tpu_dma_hbm_to_hib (Wrapper tglobaladdr:$sflag), GPR:$src, (i32 imm:$len))]>,
+ Bundle<B_Sboth>, BundleImmSy<[IMM_OP_0, IMM_OP_1]>, Sched<[WriteDmaLocal]>;
+def DMA_HBM_TO_HIBrrri : TPUInstP<(outs), (ins GPR:$sflag, GPR:$src, i32imm:$len),
+ "[hib], [sflag:${sflag}] =\tdma.local${pred} [hbm:${src}], ${len}",
+ [(int_tpu_dma_hbm_to_hib GPR:$sflag, GPR:$src, (i32 imm:$len))]>,
+ Bundle<B_Sboth>, BundleImmSy, Sched<[WriteDmaLocal]>;
+} // SubUnits = [SU_local_dma]
+} // mayLoad = 1, mayStore = 1, isDMA = 1
+
+class LdStInfo<string Inst> {
+ Instruction Opcode = !cast<Instruction>(Inst);
+ bit HasAddress = 0;
+ bit HasMask = 0;
+ bit HasStride = 0;
+ bit HasShuffle = 0;
+ bit HasVMask = 0;
+ bit HasLdReplicateEvenOdd = 0;
+ bit HasVsEvenOdd = 0;
+ bit HasIndex = 0;
+}
+
+def LdSTMemAccessTable : GenericTable {
+ let FilterClass = "LdStInfo";
+ let CppTypeName = "LdStInfoTy";
+ let Fields = ["Opcode", "HasAddress", "HasMask", "HasStride", "HasShuffle",
+ "HasVMask", "HasLdReplicateEvenOdd", "HasVsEvenOdd", "HasIndex"];
+ let PrimaryKey = ["Opcode"];
+ let PrimaryKeyName = "LdStInfo";
+}
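+
+// The SearchableTables backend turns the def above into a constant table plus
+// a lookup function named after PrimaryKeyName. A rough usage sketch (guard
+// and signature follow the emitter's usual conventions, not verified here):
+//
+//   if (const LdStInfoTy *Info = LdStInfo(MI.getOpcode()))
+//     if (Info->HasStride)
+//       ...  // strided vld/vst handling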
+
+//===----------------------------------------------------------------------===//
+// Load
+//===----------------------------------------------------------------------===//
+let mayLoad = 1, isVMemLoadInstr = 1 in {
+class Vld<int immcount, dag iops, string ldtype, string address, string sublanespec,
+ string selector> : TPUInstP<(outs VPR:$Vd), iops,
+ "$Vd =\tvld"#ldtype#"${pred} ["#address#""#sublanespec#"]"#selector,[]>,
+ Bundle<B_VLD>,
+ BundleImmVy<!cond(!eq(immcount, 1): [IMM_OP_0],
+ !eq(immcount, 2): [IMM_OP_0, IMM_OP_1],
+ !eq(immcount, 3): [IMM_OP_0, IMM_OP_1, IMM_OP_2]),
+ IMM_2_to_5>, Sched<[WriteVLD]>, LdStInfo<NAME>, IsVectorInstruction,
+ SubUnits<[SU_vector_load]>;
+} // let mayLoad = 1, isVMemLoadInstr = 1
+
+// Two addressing modes.
+multiclass VldAddr<int immcount, dag iops, string ldtype, string sublanespec,
+ string selector> {
+ def i : Vld<!add(immcount, 1), !con((ins tmoffimmsi:$imm), iops),
+ ldtype, "vmem:$imm", sublanespec,
+ selector>;
+ def ri : Vld<!add(immcount, 1), !con((ins GPR:$Ss, tmoffimmsi:$imm), iops),
+ ldtype, "vmem:${Ss}+$imm", sublanespec,
+ selector>
+ { let HasAddress = 1; }
+}
+
+// May or may not have a mask.
+multiclass VLdMask<int immcount, dag iops, string ldtype, string sublanespec,
+ string selector> {
+let HasMask = 1 in {
+ defm _MaskR : VldAddr<immcount, !con((ins GPR:$mask), iops),
+ ldtype, sublanespec#" sm:$mask", selector>;
+ defm _MaskI : VldAddr<!add(immcount, 1), !con((ins tsmskimmsi:$mask), iops),
+ ldtype, sublanespec#" sm:$mask", selector>;
+}
+ defm "" : VldAddr<immcount, iops, ldtype, sublanespec, selector>;
+}
+
+// May or may not have a stride.
+multiclass VLdStride<dag iops, string ldtype> {
+let HasStride = 1 in {
+ // stride.
+ defm _StrideR : VLdMask<0, !con((ins GPR:$stride), iops), ldtype, " ss:$stride", "">;
+ defm _StrideI : VLdMask<1, !con((ins tstrdimmsi:$stride), iops), ldtype, " ss:$stride", "">;
+}
+ // No stride
+ defm "" : VLdMask<0, iops, ldtype, "", "">;
+}
+
+multiclass tcVld_ {
+ // Strided ld.
+ defm "" : VLdStride<(ins), "">;
+let HasShuffle = 1 in {
+ // Shuffle.
+ defm _ShuffleR : VLdMask<0, (ins GPR:$selector), ".sshfl", "", ", $selector">;
+ defm _ShuffleI : VLdMask<1, (ins tvldimmi:$selector),
+ ".sshfl", "", ", $selector">;
+}
+ // Use custom inserter to attach the right memory operands.
+let usesCustomInserter = 1, isIndexedLoadStore = 1, HasIndex = 1 in {
+ // Indexed ld.
+ defm _IAR0 : VLdStride<(ins IARPR0:$iar), ".iar0">;
+ defm _IAR1 : VLdStride<(ins IARPR1:$iar), ".iar1">;
+}
+}
+
+defm tcVLV : tcVld_;
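+
+// The nested multiclasses above expand into one vld per combination of
+// addressing mode, sublane mask, stride, shuffle and IAR operands. A few of
+// the generated names (non-exhaustive): tcVLVi, tcVLVri, tcVLV_MaskRri,
+// tcVLV_StrideI_MaskIi, tcVLV_IAR0_StrideR_MaskRri.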
+
+// Class to map a pseudo IAR instruction to the opcode we lower it to after
+// bundle packing.
+class PseudoIAR {
+ Instruction PseudoOp = !cast<Instruction>(NAME);
+ Instruction NativeOp =
+ !cast<Instruction>(!subst("tcVSV_ODDEVEN", "tcVSV_IAR0",
+ !subst("tcVLV_REPLICATE_EVENODD", "tcVLV_IAR1", NAME)));
+}
+
+def PseudoIARInst : GenericTable {
+ let FilterClass = "PseudoIAR";
+ let CppTypeName = "PseudoIARTy";
+ let Fields = ["PseudoOp", "NativeOp"];
+ let PrimaryKey = ["PseudoOp"];
+ let PrimaryKeyName = "getPseudoIAROpcode";
+}
+
+let isPseudo = 1, HasLdReplicateEvenOdd = 1 in {
+defm tcVLV_REPLICATE_EVENODD : VLdStride<(ins IARPR1:$iar), "PSEUDO_IAR1">,
+ PseudoIAR;
+}
+
+multiclass Vld_pat<string Name, dag in_ops, dag out_ops>{
+ // We build the output dag children incrementally. To be able to merge all
+ // the nodes they need to have the same root. As we don't know the final
+ // instruction when we create the arguments we use outs and substitute it
+ // with the instruction when creating the final pattern.
+ defm : MultiVTypePat<in_ops,
+ !foreach(v, out_ops, !subst(outs, !cast<Instruction>(Name), v))>;
+}
+
+multiclass LdAddressing<SDPatternOperator Intr, string Name, dag in_ops, dag out_ops> {
+ defm : Vld_pat<Name#"ri", !con((Intr (add GPR:$Ss, tmoffimmsi:$imm)), in_ops),
+ !con((outs GPR:$Ss, imm:$imm), out_ops)>;
+ defm : Vld_pat<Name#"ri", !con((Intr (or AddLikeOrOp:$Ss, tmoffimmsi:$imm)), in_ops),
+ !con((outs GPR:$Ss, imm:$imm), out_ops)>;
+ defm : Vld_pat<Name#"ri", !con((Intr GPR:$Ss), in_ops),
+ !con((outs GPR:$Ss, PatLeaf<(i32 0)>), out_ops)>;
+ defm : Vld_pat<Name#i, !con((Intr tmoffimmsi:$imm), in_ops),
+ !con((outs imm:$imm), out_ops)>;
+}
+
+multiclass LdPatMask<SDPatternOperator Intr, string Name, dag in_ops, dag out_ops> {
+ // No Mask
+ defm : LdAddressing<Intr, Name, !con((Intr (i32 255)), in_ops), out_ops>;
+ // Mask imm or register.
+ defm : LdAddressing<Intr, Name#"_MaskR", !con((Intr GPR:$mask), in_ops),
+ !con((outs GPR:$mask), out_ops)>;
+ defm : LdAddressing<Intr, Name#"_MaskI", !con((Intr tsmskimmsi:$mask), in_ops),
+ !con((outs imm:$mask), out_ops)>;
+}
+
+multiclass LdPatStride<SDPatternOperator Intr, string Name, dag in_ops, dag out_ops> {
+// No stride.
+defm : LdPatMask<Intr, Name,
+ !con((Intr (i32 1)), in_ops), out_ops>;
+// Stride imm or register.
+defm : LdPatMask<Intr, Name#"_StrideR",
+ !con((Intr GPR:$stride), in_ops), !con((outs GPR:$stride), out_ops)>;
+defm : LdPatMask<Intr, Name#"_StrideI",
+ !con((Intr tstrdimmsi:$stride), in_ops), !con((outs imm:$stride), out_ops)>;
+}
+
+// Strided ld
+defm : LdPatStride<int_tpu_vld_strided, "tcVLV", (int_tpu_vld_strided), (outs)>;
+
+// Indexed load
+defm : LdPatStride<int_tpu_vld_indexed, "tcVLV_IAR0",
+ (int_tpu_vld_indexed IARPR0:$iar, (i32 0)), (outs IARPR0:$iar)>;
+defm : LdPatStride<int_tpu_vld_indexed, "tcVLV_IAR1",
+ (int_tpu_vld_indexed IARPR1:$iar, (i32 1)), (outs IARPR1:$iar)>;
+
+// Shuffle imm or register.
+defm : LdPatMask<int_tpu_vld_shuffle, "tcVLV_ShuffleR",
+ (int_tpu_vld_shuffle GPR:$shuffle), (outs GPR:$shuffle)>;
+defm : LdPatMask<int_tpu_vld_shuffle, "tcVLV_ShuffleI",
+ (int_tpu_vld_shuffle imm:$shuffle), (outs imm:$shuffle)>;
+
+// Pattern for native load instruction.
+defm : MultiVTypePat<(load_vmem (Wrapper tglobaladdr:$imm)), (tcVLVi imm:$imm)>;
+defm : MultiVTypePat<(load_vmem (tmoffimmsi:$imm)), (tcVLVi imm:$imm)>;
+defm : MultiVTypePat<(load_vmem (add GPR:$Ss, tmoffimmsi:$imm)), (tcVLVri GPR:$Ss, imm:$imm)>;
+defm : MultiVTypePat<(load_vmem (or AddLikeOrOp:$Ss, tmoffimmsi:$imm)), (tcVLVri GPR:$Ss, imm:$imm)>;
+defm : MultiVTypePat<(load_vmem GPR:$Ss), (tcVLVri GPR:$Ss, (i32 0))>;
+
+defm : LdPatStride<int_tpu_vld_replicate_evenodd_sublanes,
+ "tcVLV_REPLICATE_EVENODD",
+ (int_tpu_vld_replicate_evenodd_sublanes IARPR1:$iar),
+ (outs IARPR1:$iar)>;
+//===----------------------------------------------------------------------===//
+// Store
+//===----------------------------------------------------------------------===//
+let mayStore = 1, isVMemStoreInstr = 1 in {
+class Vst<int immcount, dag iops, string sttype, string address, string sublanespec,
+ string mask> : TPUInstP<(outs), !con((ins VPR:$Vs), iops),
+ "["#address#""#sublanespec#"] =\tvst"#sttype#"${pred}"#mask#" $Vs", []>,
+ Bundle<B_VST>, BundleImmVy<!cond(!eq(immcount, 1): [IMM_OP_0],
+ !eq(immcount, 2): [IMM_OP_0, IMM_OP_1],
+ !eq(immcount, 3): [IMM_OP_0, IMM_OP_1, IMM_OP_2]),
+ IMM_2_to_5>, LdStInfo<NAME>, IsVectorInstruction,
+ SubUnits<[SU_vector_store]>;
+} // let mayStore = 1, isVMemStoreInstr = 1
+
+// Two addressing modes.
+multiclass VstAddr<int immcount, dag iops, string sttype, string sublanespec,
+ string mask> {
+ def i : Vst<!add(1, immcount), !con((ins tmoffimmsi:$imm), iops), sttype, "vmem:$imm", sublanespec, mask>;
+ def ri : Vst<!add(1, immcount), !con((ins GPR:$Ss, tmoffimmsi:$imm), iops), sttype, "vmem:${Ss}+$imm", sublanespec, mask>
+ { let HasAddress = 1; }
+}
+
+// May or may not have a mask.
+multiclass VstMask<int immcount, dag iops, string sttype, string sublanespec, string vmask> {
+let HasMask = 1 in {
+ defm _MaskR : VstAddr<immcount, !con((ins GPR:$mask), iops), sttype, sublanespec#" sm:$mask",
+ vmask>;
+ defm _MaskI : VstAddr<!add(1, immcount), !con((ins tsmskimmsi:$mask), iops), sttype, sublanespec#" sm:$mask",
+ vmask>;
+}
+ // No mask
+ defm "" : VstAddr<immcount, iops, sttype, sublanespec, vmask>;
+}
+
+// May or may not have a stride.
+multiclass VstStride<dag iops, string sttype, string vmask> {
+let HasStride = 1 in {
+ defm _StrideR : VstMask<0, !con((ins GPR:$stride), iops), sttype, " ss:$stride", vmask>;
+ defm _StrideI : VstMask<1, !con((ins tstrdimmsi:$stride), iops), sttype, " ss:$stride", vmask>;
+}
+ // No stride
+ defm "" : VstMask<0, iops, sttype, "", vmask>;
+}
+
+// May or may not have a vmask.
+multiclass VstVMask<dag iops, string sttype, string source> {
+let HasVMask = 1 in {
+ defm _VMask : VstStride<!con((ins MPR:$vmask), iops), sttype#".msk", source#" $vmask,">;
+}
+ // No VMask
+ defm "" : VstStride<iops, sttype, source>;
+}
+
+// May or may not have an index.
+multiclass VstIndexed {
+// Use custom inserter to attach the right memory operands.
+let usesCustomInserter = 1, isIndexedLoadStore = 1, HasIndex = 1 in {
+ defm _IAR0 : VstVMask<(ins IARPR0:$iar), ".iar", " $iar,">;
+ defm _IAR1 : VstVMask<(ins IARPR1:$iar), ".iar", " $iar,">;
+}
+ // No iar
+ defm "" : VstVMask<(ins), "", "">;
+}
+
+defm tcVSV : VstIndexed;
+
+let isPseudo = 1, HasVsEvenOdd = 1 in {
+defm tcVSV_ODDEVEN : VstVMask<(ins IARPR0:$iar), ".PSEUDO_IAR0", " $iar,">, PseudoIAR;
+}
+
+// Match to the right store.
+multiclass MultiTypeStore<SDPatternOperator Intr, string Name, dag in_ops,
+ dag out_ops> {
+ def : Pat<!con((Intr (vNf32 VPR:$Vs)), in_ops),
+ !con((!cast<Instruction>(Name) VPR:$Vs), out_ops)>;
+ def : Pat<!con((Intr (vNi32 VPR:$Vs)), in_ops),
+ !con((!cast<Instruction>(Name) VPR:$Vs), out_ops)>;
+}
+
+multiclass Vst_pat<SDPatternOperator Intr, string Name, dag in_ops, dag out_ops> {
+ defm : MultiTypeStore<Intr, Name, in_ops,
+ !foreach(v, out_ops, !subst(outs, !cast<Instruction>(Name), v))>;
+}
+
+multiclass StAddressing<SDPatternOperator Intr, string Name, dag in_ops, dag out_ops> {
+ defm : Vst_pat<Intr, Name#"ri", !con((Intr (add GPR:$Ss, tmoffimmsi:$imm)), in_ops),
+ !con((outs GPR:$Ss, imm:$imm), out_ops)>;
+ defm : Vst_pat<Intr, Name#"ri", !con((Intr (or AddLikeOrOp:$Ss, tmoffimmsi:$imm)), in_ops),
+ !con((outs GPR:$Ss, imm:$imm), out_ops)>;
+ defm : Vst_pat<Intr, Name#"ri", !con((Intr GPR:$Ss), in_ops),
+ !con((outs GPR:$Ss, PatLeaf<(i32 0)>), out_ops)>;
+ defm : Vst_pat<Intr, Name#i, !con((Intr tmoffimmsi:$imm), in_ops),
+ !con((outs imm:$imm), out_ops)>;
+}
+
+multiclass StPatMask<SDPatternOperator Intr, string Name, dag in_ops, dag out_ops> {
+ // No Mask.
+ defm : StAddressing<Intr, Name, !con((Intr (i32 255)), in_ops), out_ops>;
+ // Mask Register or Imm.
+ defm : StAddressing<Intr, Name#"_MaskR", !con((Intr GPR:$mask), in_ops),
+ !con((outs GPR:$mask), out_ops)>;
+ defm : StAddressing<Intr, Name#"_MaskI", !con((Intr tsmskimmsi:$mask), in_ops),
+ !con((outs imm:$mask), out_ops)>;
+}
+
+multiclass StPatStride<SDPatternOperator Intr, string Name, dag in_ops, dag out_ops> {
+ // No stride.
+ defm : StPatMask<Intr, Name, !con((Intr (i32 1)), in_ops), out_ops>;
+ // Stride Register or Imm.
+ defm : StPatMask<Intr, Name#"_StrideR", !con((Intr GPR:$stride), in_ops),
+ !con((outs GPR:$stride), out_ops)>;
+ defm : StPatMask<Intr, Name#"_StrideI", !con((Intr tstrdimmsi:$stride), in_ops),
+ !con((outs imm:$stride), out_ops)>;
+}
+
+multiclass StPatVMask<SDPatternOperator Intr, string Name, dag in_ops, dag out_ops> {
+// No vector Mask.
+defm : StPatStride<Intr, Name, !con((Intr (vNi1 (Splat -1))), in_ops), out_ops>;
+// Vector register mask.
+defm : StPatStride<Intr, Name#"_VMask", !con((Intr (vNi1 MPR:$vmask)), in_ops),
+ !con((outs MPR:$vmask), out_ops)>;
+}
+
+// No IAR
+defm : StPatVMask<int_tpu_vst_strided, "tcVSV", (int_tpu_vst_strided), (outs)>;
+// IAR 0 and 1
+defm : StPatVMask<int_tpu_vst_indexed, "tcVSV_IAR0",
+ (int_tpu_vst_indexed IARPR0:$iar, (i32 0)), (outs IARPR0:$iar)>;
+defm : StPatVMask<int_tpu_vst_indexed, "tcVSV_IAR1",
+ (int_tpu_vst_indexed IARPR1:$iar, (i32 1)), (outs IARPR1:$iar)>;
+
+
+// Native store matching.
+multiclass MatchTcStoreType<ValueTypeByHwMode VType> {
+ def : Pat<(store_vmem (VType VPR:$Vs), (Wrapper tglobaladdr:$imm)), (tcVSVi VPR:$Vs, imm:$imm)>;
+ def : Pat<(store_vmem (VType VPR:$Vs), (tmoffimmsi:$imm)), (tcVSVi VPR:$Vs, imm:$imm)>;
+ def : Pat<(store_vmem (VType VPR:$Vs), (i32 GPR:$Ss)), (tcVSVri VPR:$Vs, GPR:$Ss, (i32 0))>;
+ def : Pat<(store_vmem (VType VPR:$Vs), (add (i32 GPR:$Ss), tmoffimmsi:$imm)),
+ (tcVSVri VPR:$Vs, GPR:$Ss, imm:$imm)>;
+ def : Pat<(store_vmem (VType VPR:$Vs), (or (i32 AddLikeOrOp:$Ss), tmoffimmsi:$imm)),
+ (tcVSVri VPR:$Vs, GPR:$Ss, imm:$imm)>;
+}
+defm : MatchTcStoreType<vNf32>;
+defm : MatchTcStoreType<vNi32>;
+
+defm : StPatVMask<int_tpu_vst_evenodd_sublanes, "tcVSV_ODDEVEN",
+ (int_tpu_vst_evenodd_sublanes IARPR0:$iar), (outs IARPR0:$iar)>;
+
+//===----------------------------------------------------------------------===//
+// Set IAR Intrinsics
+//===----------------------------------------------------------------------===//
+multiclass SetIAR<string postFix, SDPatternOperator OpNode, int iarIndex,
+ DAGOperand iar = !cast<DAGOperand>("IARPR"#iarIndex),
+ SchedWrite Sch = !cast<SchedWrite>("WriteIar"#iarIndex)> {
+// Use custom inserter to attach the right memory operands.
+let usesCustomInserter = 1 in {
+ def "" : TPUInstP<(outs iar:$iar), (ins VPR:$vsrc),
+ "$iar =\tvsetiar."#postFix#"${pred} $vsrc",
+ [(set iar:$iar, (OpNode (vNi32 VPR:$vsrc), (i32 iarIndex)))]>,
+ Bundle<B_VST>, Sched<[Sch]>, IsVectorInstruction, SubUnits<[SU_set_iar]>;
+}
+}
+
+multiclass SetIARMode<int iarIndex> {
+ defm _SET_LANE : SetIAR<"lane", int_tpu_set_lane_indexed, iarIndex>;
+ defm _SET_SUBLANE : SetIAR<"sublane", int_tpu_set_sublane_indexed, iarIndex>;
+ defm _SET_RAW : SetIAR<"raw", int_tpu_set_iar_raw, iarIndex>;
+}
+
+defm IAR0 : SetIARMode<0>;
+defm IAR1 : SetIARMode<1>;
+
+multiclass VectorTraceInst<Intrinsic intrinsic> {
+  // Note that vtrace takes two args, but LLO actually implements a simplified version which takes
+  // either an sreg or an immediate. The following implementation takes the same simplified
+  // approach, which may be revised later.
+ def r : TPUInstP<(outs), (ins GPR:$op),
+ "_ =\tvtrace${pred} $op",
+ [(intrinsic GPR:$op)]>,
+ Bundle<B_SM>;
+ def i : TPUInstP<(outs), (ins i32imm:$imm),
+ "_ =\tvtrace${pred} $imm",
+ [(intrinsic imm:$imm)]>,
+ Bundle<B_SM>;
+}
+multiclass VectorSetTracemark<Intrinsic intrinsic> {
+ def r : TPUInstP<(outs), (ins GPR:$op),
+ "(tm) =\tvsettm${pred} $op",
+ [(intrinsic GPR:$op)]>,
+ Bundle<B_SM>;
+ def i : TPUInstP<(outs), (ins i32imm:$imm),
+ "(tm) =\tvsettm${pred} $imm",
+ [(intrinsic imm16:$imm)]>,
+ Bundle<B_SM>;
+}
+let Predicates = [HasVectorSflags] in {
+let hasSideEffects = 1, SubUnits = [SU_vmisc] in {
+defm tcVTRACE : VectorTraceInst<int_tpu_vtrace>, IsVectorInstruction;
+defm tcVSETTM : VectorSetTracemark<int_tpu_vsettm>, IsVectorInstruction;
+} // hasSideEffects = 1, SubUnits = [SU_vmisc]
+
+} // Predicates = [HasVectorSflags]
+
+//===----------------------------------------------------------------------===//
+// MXU operations
+//===----------------------------------------------------------------------===//
+multiclass MatPush<int i, string IntrName, string OpName, string FifoName,
+ DAGOperand fiforeg = !cast<DAGOperand>(FifoName#i)> :
+ MatOpMasked<i, OpName, !cast<Intrinsic>(IntrName#"_f32"), (ins fiforeg:$srcgsf),
+ (outs fiforeg:$dstgsf), "$dstgsf", ".f32",
+ !cast<SchedWrite>("WriteMatPush"#i), [HasMXU,NotVFTC]>;
+
+// MatPush may or may not transpose (for JF/DF/PF).
+multiclass MatPushXPos<int i, string IntrName, string OpName> {
+ defm "" : MatPush<i, IntrName, OpName, "GSFNPR">,
+ SubUnits<[SU_matpush_pfc]>;
+let OtherPredicates = [UseGsftForXpose] in {
+ defm _XPOS : MatPush<i, IntrName#"_xpose", OpName#".xpose", "GSFTPR">,
+ SubUnits<[SU_matpush_pfc]>;
+}
+// JF/DF don't have a special latch for transpose matpush.
+let OtherPredicates = [UseGsfnForXpose] in {
+ defm _XPOS_JF : MatPush<i, IntrName#"_xpose", OpName#".xpose", "GSFNPR">,
+ SubUnits<[SU_matpush_jfc_xp]>;
+}
+}
+
+multiclass MatPushMode<int i, string IntrName, string OpName> {
+ defm "" : MatPushXPos<i, IntrName, OpName>;
+ defm _LOW : MatPushXPos<i, IntrName#"_low", OpName#".low">;
+ defm _HI : MatPushXPos<i, IntrName#"_hi", OpName#".hi">;
+ defm _PACKED : MatPushXPos<i, IntrName#"_packed", OpName#".packed">;
+}
+
+multiclass MatMul<int i, string IntrName, string OpName, string ScheduleName,
+ DAGOperand fiforegsrc = !cast<DAGOperand>("GMRPR"#i),
+ DAGOperand fiforegdst = !cast<DAGOperand>("MRFPR"#i)> :
+ MatOpMasked<i, OpName, !cast<Intrinsic>(IntrName#"_f32"), (ins fiforegsrc:$srcgmr),
+ (outs fiforegdst:$dstmrf), "$dstmrf", ".f32",
+ !cast<SchedWrite>(ScheduleName#i), [HasMXU,NotVFTC]>;
+
+multiclass MatMulDwgGsfnOrGsftMask<int i, Intrinsic Intr, string Name,
+ SchedWrite Schedule, DAGOperand fiforeg_gsf,
+ DAGOperand fiforeg_mrf = !cast<DAGOperand>("MRFPR"#i),
+ DAGOperand fiforeg_gmr = !cast<DAGOperand>("GMRPR"#i)> {
+ def "" : TPUInstP<(outs fiforeg_mrf:$dstmrf, fiforeg_gmr:$dstgmr, fiforeg_gsf:$dstgsf),
+ (ins VPR:$Vs, fiforeg_gsf:$srcgsf),
+ "($dstgmr, $dstgsf, $dstmrf) =\t"#Name#"${pred} $Vs",
+ [(set fiforeg_mrf:$dstmrf, fiforeg_gmr:$dstgmr, fiforeg_gsf:$dstgsf,
+ (Intr (vNf32 VPR:$Vs), (vNi1 (Splat -1)), (i32 i), (i32 fiforeg_gsf:$srcgsf)))]>,
+ Sched<[Schedule]>;
+
+ def m : TPUInstP<(outs fiforeg_mrf:$dstmrf, fiforeg_gmr:$dstgmr, fiforeg_gsf:$dstgsf),
+ (ins VPR:$Vs, MPR:$m, fiforeg_gsf:$srcgsf),
+ "($dstgmr, $dstgsf, $dstmrf) =\t"#Name#".msk${pred} $m, $Vs",
+ [(set fiforeg_mrf:$dstmrf, fiforeg_gmr:$dstgmr, fiforeg_gsf:$dstgsf,
+ (Intr (vNf32 VPR:$Vs), (vNi1 MPR:$m), (i32 i), (i32 fiforeg_gsf:$srcgsf)))]>,
+ Sched<[Schedule]>;
+}
+
+multiclass MatMulDwgGsfnOrGsft<int i, string IntrName, string OpName,
+ string ScheduleName, string SubUnitNameHint> {
+ defm _GSFN : MatMulDwgGsfnOrGsftMask<i, !cast<Intrinsic>(IntrName#"_gsfn"), OpName,
+ !cast<SchedWrite>(ScheduleName#i), !cast<DAGOperand>("GSFNPR"#i)>,
+ SubUnits<[!cast<SubUnitEncoding>("SU_matmul_dwgn_pfc"#SubUnitNameHint)]>;
+ defm _GSFT : MatMulDwgGsfnOrGsftMask<i, !cast<Intrinsic>(IntrName#"_gsft"), OpName,
+ !cast<SchedWrite>(ScheduleName#i), !cast<DAGOperand>("GSFTPR"#i)>,
+ SubUnits<[!cast<SubUnitEncoding>("SU_matmul_dwgt_pfc"#SubUnitNameHint)]>;
+}
+
+multiclass MatMulDwg<int i, string IntrName, string OpName, string ScheduleName,
+ string SubUnitNameHint> {
+ defm "" : MatMul<i, IntrName, OpName, ScheduleName>,
+ SubUnits<[!cast<SubUnitEncoding>("SU_matmul_pfc"#SubUnitNameHint)]>;
+ defm _DWG : MatMulDwgGsfnOrGsft<i, IntrName#"_f32_dwg", OpName#".dwg",
+ ScheduleName, SubUnitNameHint>;
+}
+
+multiclass MatMulMode<int i, string IntrName, string OpName> {
+ defm "" : MatMulDwg<i, IntrName, OpName, "WriteMatMulMxu", "">;
+ defm _LOW : MatMulDwg<i, IntrName#"_low", OpName#".low",
+ "WriteMatMulMxu", "">;
+ defm _HI : MatMulDwg<i, IntrName#"_hi", OpName#".hi",
+ "WriteMatMulMxu", "">;
+let isPackedMatMul = 1 in {
+ defm _PACKED : MatMulDwg<i, IntrName#"_packed", OpName#".packed",
+ "WriteMatMulMxuPacked", "_packed">;
+}
+}
+
+multiclass Dwg<int i,
+ DAGOperand gmr = !cast<DAGOperand>("GMRPR"#i),
+ DAGOperand gsfn = !cast<DAGOperand>("GSFNPR"#i),
+ DAGOperand gsft = !cast<DAGOperand>("GSFTPR"#i)> {
+ def N : TPUInstP<(outs gmr:$gmr), (ins gsfn:$gsfn),
+ "$gmr =\tvdwg.f16${pred} $gsfn",
+ [(set gmr:$gmr, (int_tpu_vdwg (i32 i), (i32 gsfn:$gsfn)))]>,
+ ExtraPredicates<[NotVFTC]>, SubUnits<[SU_matdwg_jfc]>;
+ def T : TPUInstP<(outs gmr:$gmr), (ins gsft:$gsft),
+ "$gmr =\tvdwg.f16${pred} $gsft",
+ [(set gmr:$gmr, (int_tpu_vdwg_xpose (i32 i), (i32 gsft:$gsft)))]>,
+ ExtraPredicates<[NotVFTC]>, SubUnits<[SU_matdwg_jfc]>;
+}
+
+multiclass MXU<int i, DAGOperand mrf = !cast<DAGOperand>("MRFPR"#i)> {
+let mayLoad = 1, mayStore = 1, usesCustomInserter = 1 in {
+let Itinerary = IIC_MXU_PUSH in {
+ defm MATPUSH : MatPushMode<i, "int_tpu_vmatpush", "vmatpush">;
+}
+let Itinerary = IIC_MXU_MUL, isPush = 1 in {
+ defm MATMUL : MatMulMode<i, "int_tpu_vmatmul", "vmatmul">;
+}
+let isDwg = 1 in {
+ defm DWG : Dwg<i>, Bundle<B_VEX>;
+} // isDwg = 1
+def MATPOP : TPUInstP<(outs VPR:$Vd), (ins mrf:$mrf),
+ "$Vd =\tvmatres.8x128.f32${pred} $mrf",
+ [(set VPR:$Vd, (int_tpu_vmatres_f32 i, (i32 mrf:$mrf)))]>,
+ Sched<[!cast<SchedWrite>("WriteMatRes"#i)]>, ExtraPredicates<[NotVFTC]>,
+ SubUnits<[SU_mxu_result]>, Bundle<B_VResAny> { let isPop = 1; }
+} // mayLoad = 1, mayStore = 1, usesCustomInserter = 1
+}
+
+// Define 4 MXUs for all platforms; we assume the user won't try to use more
+// MXUs than are available on the platform. We can add finer-grained predicates
+// later to be able to report user errors.
+foreach Index = 0-3 in {
+defm tcMXU#Index : MXU<Index>, IsVectorInstruction, IsMXUInst;
+}
+
+//===----------------------------------------------------------------------===//
+// XLU operations
+//===----------------------------------------------------------------------===//
+class PseudoInstMapping<string PseudoInst, string Inst> {
+ Instruction Pseudo = !cast<Instruction>(PseudoInst);
+ Instruction Lowered = !cast<Instruction>(Inst);
+}
+
+def PseudoInstTable : GenericTable {
+ let FilterClass = "PseudoInstMapping";
+ let CppTypeName = "PseudoInstMappingTy";
+ let Fields = ["Pseudo", "Lowered"];
+ let PrimaryKey = ["Pseudo"];
+ let PrimaryKeyName = "PseudoInstMapping";
+}
+
+multiclass Transpose<string Name, string PostFix, string Sch, string IntrName,
+ int busIndex, DAGOperand trf, string SubUnitNameHint> {
+// Immediate width support only for now. Having a non-constant width makes it
+// really hard to match Pop instructions associated with a transpose.
+// Height is an argument. Even though hardware doesn't need it, we force the
+// user to pass it to be able to compute an accurate latency.
+def "" : TPUInstP<(outs trf:$trf), (ins VPR:$vsrc, i32imm:$width, i32imm:$height, trf:$trfsrc),
+ "$trf =\t"#Name#"."#busIndex#PostFix#"${pred} $vsrc, $width",
+ [(set trf:$trf, (!cast<Intrinsic>(IntrName) (vNi32 VPR:$vsrc),
+ (timm:$width), (timm:$height), (i32 busIndex), (i32 trf:$trfsrc)))]>,
+ Bundle<B_VEX>, Sched<[!cast<SchedWrite>(Sch#busIndex)]>,
+ SubUnits<[!cast<SubUnitEncoding>("SU_"#SubUnitNameHint)]>;
+
+// Packed transpose needs to be broken down into two instructions going in the
+// same bundle. We emit a pseudo instruction with both sources and expand it
+// post bundle packing into a packed instruction and a vsupp instruction.
+let isPacked = 1 in {
+def _PACKED : TPUInstP<(outs trf:$trf), (ins VPR:$vsrc, i32imm:$width, i32imm:$height, trf:$trfsrc),
+ "$trf =\t"#Name#"."#busIndex#".packed"#PostFix#"${pred} $vsrc, $width",
+ []>, Bundle<B_VEX1>, SubUnits<[!cast<SubUnitEncoding>("SU_packed_"#SubUnitNameHint)]>;
+let isPseudo = 1 in {
+def _PACKED_PSEUDO : TPUInstP<(outs trf:$trf), (ins VPR:$vsrclow, VPR:$vsrchigh,
+ i32imm:$width, i32imm:$height, trf:$trfsrc),
+ "$trf =\t"#Name#"PACKED"#busIndex#PostFix#"${pred} $vsrclow, $width, $vsrchigh",
+ [(set trf:$trf, (!cast<Intrinsic>(IntrName#"_packed") (vNi32 VPR:$vsrclow),
+ (vNi32 VPR:$vsrchigh), (timm:$width),
+ (timm:$height), (i32 busIndex), (i32 trf:$trfsrc)))]>,
+ Bundle<B_VEXBoth>, Sched<[!cast<SchedWrite>(Sch#"Packed"#busIndex)]>,
+ PseudoInstMapping<NAME#"_PACKED_PSEUDO", NAME#"_PACKED">,
+ SubUnits<[!cast<SubUnitEncoding>("SU_packed_"#SubUnitNameHint)]>;
+} // isPseudo = 1
+} // isPacked = 1
+}
+
+// Transpose can be segmented or not.
+multiclass TransposeStartSegmented<string PostFix, string Sch, string IntrName,
+ int busIndex, DAGOperand trf, string SubUnitNameHint> {
+ defm "" : Transpose<"vxpose", PostFix, Sch, IntrName, busIndex, trf,
+ SubUnitNameHint>;
+let isSegmented = 1 in {
+ defm _SEGMENTED : Transpose<"vsxpose", PostFix, Sch,
+ IntrName#"_segmented", busIndex, trf, SubUnitNameHint>;
+}
+}
+
+multiclass TransposeEndSegmented<string PostFix, string Sch, string IntrName,
+ int busIndex, DAGOperand trf, string SubUnitNameHint> {
+ defm "" : Transpose<"vxpose", PostFix, Sch, IntrName, busIndex, trf,
+ SubUnitNameHint>;
+let isSegmented = 1 in {
+ defm _SEGMENTED : Transpose<"vsxpose", PostFix, Sch,
+ IntrName#"_segmented", busIndex, trf,
+ "segmented_"#SubUnitNameHint>;
+}
+}
+
+// One transpose and one transpose end
+multiclass TransposeEnd<string Sch, string IntrName, int busIndex, DAGOperand trf> {
+ defm "" : TransposeStartSegmented<"", Sch, IntrName, busIndex, trf, "transpose_start">;
+let isTransposeEnd = 1, isPush = 1 in {
+ defm _END : TransposeEndSegmented<".end", Sch#"End", IntrName#"_end", busIndex, trf, "transpose_end">;
+}
+}
+
+// Pattern for the float case.
+multiclass TransposeFloatPat<string Name, string IntrName, int busIndex,
+ DAGOperand trf> {
+ def : Pat<(!cast<Intrinsic>(IntrName) (vNf32 VPR:$vsrc), (timm:$width), (timm:$height),
+ (i32 busIndex), (i32 trf:$trfsrc)),
+ (!cast<Instruction>(Name) VPR:$vsrc, i32imm:$width,
+ i32imm:$height, trf:$trfsrc)>;
+ // Packed case.
+ def : Pat<(!cast<Intrinsic>(IntrName#"_packed") (vNf32 VPR:$vsrclow),
+ (vNf32 VPR:$vsrchigh), (timm:$width), (timm:$height),
+ (i32 busIndex), (i32 trf:$trfsrc)),
+ (!cast<Instruction>(Name#"_PACKED_PSEUDO") VPR:$vsrclow, VPR:$vsrchigh,
+ i32imm:$width, i32imm:$height, trf:$trfsrc)>;
+}
+
+// Pattern for segmented and normal case.
+multiclass TransposeSegmentedFloatPat<string Name, string Intrinsic, int busIndex,
+ DAGOperand trf> {
+ defm : TransposeFloatPat<Name, Intrinsic, busIndex, trf>;
+ defm : TransposeFloatPat<Name#"_SEGMENTED", Intrinsic#"_segmented", busIndex, trf>;
+}
+
+// Pattern for transpose and transpose_end intrinsics.
+multiclass TransposeEndFloatPat<string Name, string Intrinsic, int busIndex,
+ DAGOperand trf> {
+ defm : TransposeSegmentedFloatPat<Name, Intrinsic, busIndex, trf>;
+ defm : TransposeSegmentedFloatPat<Name#"_END", Intrinsic#"_end", busIndex, trf>;
+}
+
+multiclass RotateSource<int busIndex, DAGOperand trf, DAGOperand SrcT, DAGOperand PatType,
+ ImmSlotRequirement slots, list<ImmOperRequirement> operands> {
+def "" : TPUInstP<(outs trf:$trf), (ins VPR:$vsrc, SrcT:$amount),
+ "$trf =\tvrot."#busIndex#"${pred} $vsrc, $amount",
+ [(set trf:$trf, (int_tpu_vrotate (vNi32 VPR:$vsrc),
+ (i32 PatType:$amount), (i32 busIndex)))]>,
+ Bundle<B_VEX>, Sched<[!cast<SchedWrite>("WritePermute"#busIndex)]>, BundleImm<slots, operands>,
+ SubUnits<[SU_permute]>;
+let isPacked = 1 in {
+ def _PACKED : TPUInstP<(outs trf:$trf), (ins VPR:$vsrc, SrcT:$amount),
+ "$trf =\tvrot."#busIndex#".packed${pred} $vsrc, $amount",
+ []>, Bundle<B_VEX1>, BundleImm<slots, operands>,
+ SubUnits<[SU_packed_permute]>;
+let isPseudo = 1 in {
+def _PACKED_PSEUDO : TPUInstP<(outs trf:$trf), (ins VPR:$vsrclow, VPR:$vsrchigh, SrcT:$amount),
+ "$trf =\t#VROTPACKED_PSEUDO."#busIndex#".packed${pred} $vsrclow, $vsrchigh, $amount",
+ [(set trf:$trf, (int_tpu_vrotate_packed (vNi32 VPR:$vsrclow),
+ (vNi32 VPR:$vsrchigh), (i32 PatType:$amount), (i32 busIndex)))]>,
+ Bundle<B_VEXBoth>, Sched<[!cast<SchedWrite>("WritePermutePacked"#busIndex)]>,
+ SubUnits<[SU_packed_permute]>,
+ BundleImm<slots, operands>,
+ PseudoInstMapping<NAME#"_PACKED_PSEUDO", NAME#"_PACKED">;
+} // isPseudo = 1
+} // isPacked = 1
+}
+
+multiclass Rotate<int busIndex, DAGOperand trf> {
+ defm r : RotateSource<busIndex, trf, GPR, GPR, IMM_NONE, []>;
+ // immediate amount case.
+ defm i : RotateSource<busIndex, trf, timmsi, timmsi, IMM_2_to_5, [IMM_OP_0]>;
+}
+
+// Pattern for the float case.
+multiclass RotateFloatPat<string Name, int busIndex> {
+ def : Pat<(int_tpu_vrotate (vNf32 VPR:$vsrc), (i32 GPR:$amount), (i32 busIndex)),
+ (!cast<Instruction>(Name#r) VPR:$vsrc, GPR:$amount)>;
+ def : Pat<(int_tpu_vrotate (vNf32 VPR:$vsrc), (i32 imm:$amount), (i32 busIndex)),
+ (!cast<Instruction>(Name#i) VPR:$vsrc, imm:$amount)>;
+ // Packed pattern
+ def : Pat<(int_tpu_vrotate_packed (vNf32 VPR:$vsrclow), (vNf32 VPR:$vsrchigh),
+ (i32 GPR:$amount), (i32 busIndex)),
+ (!cast<Instruction>(Name#"r_PACKED_PSEUDO") VPR:$vsrclow, VPR:$vsrchigh, GPR:$amount)>;
+ def : Pat<(int_tpu_vrotate_packed (vNf32 VPR:$vsrclow), (vNf32 VPR:$vsrchigh),
+ (i32 imm:$amount), (i32 busIndex)),
+ (!cast<Instruction>(Name#"i_PACKED_PSEUDO") VPR:$vsrclow, VPR:$vsrchigh, imm:$amount)>;
+}
+
+let Predicates = [HasV1024] in {
+def : Pat<(extractelt (vNi32 VPR:$v), (i32 0)), (VREAD (vNi32 VPR:$v))>;
+def : Pat<(extractelt (vNi1 MPR:$m), (i32 0)),
+ (CMPNEri (VREAD (VSELir MPR:$m, (i32 1), (VIMMI 0))), (i32 0))>;
+}
+
+multiclass XLaneInst<string Name, string IntrName, int busIndex, DAGOperand trf, DAGOperand spr,
+ SchedWrite Sch = !cast<SchedWrite>("WriteXLane"#busIndex)> {
+def "" : TPUInstP<(outs trf:$trf), (ins VPR:$vsrc),
+ "$trf =\t"#Name#".xlane."#busIndex#"${pred} $vsrc",
+ [(set trf:$trf, (!cast<Intrinsic>("int_tpu_xlane_"#IntrName)
+ (vNf32 VPR:$vsrc), (i32 busIndex)))]>,
+ Bundle<B_VEX>, Sched<[Sch]>, SubUnits<[SU_reduce]>;
+def _SEGMENTED : TPUInstP<(outs trf:$trf), (ins VPR:$vsrc, spr:$spr),
+ "$trf =\t"#Name#".xlane."#busIndex#"${pred}.seg.perm $vsrc",
+ [(set trf:$trf, (!cast<Intrinsic>("int_tpu_xlane_segmented_"#IntrName)
+ (vNf32 VPR:$vsrc), (i32 spr:$spr), (i32 busIndex)))]>,
+ Bundle<B_VEX>, Sched<[Sch]>, SubUnits<[SU_segmented_reduce]>;
+}
+
+multiclass SetPatternReg<string postFix, SDPatternOperator OpNode, int busIndex,
+ DAGOperand pcr, SchedWrite Sch> {
+ def "" : TPUInstP<(outs pcr:$pcr), (ins VPR:$vsrc),
+ "$pcr =\tvsetperm."#busIndex#"."#postFix#"${pred} $vsrc",
+ [(set pcr:$pcr, (OpNode (vNi32 VPR:$vsrc), (i32 busIndex)))]>,
+ Bundle<B_VEX>, Sched<[Sch]>;
+}
+
+multiclass SetPermute<string postFix, SDPatternOperator OpNode, int busIndex,
+ DAGOperand pcr, SchedWrite Sch> {
+ defm "" : SetPatternReg<postFix, OpNode, busIndex, pcr, Sch>;
+}
+
+multiclass Permute<int busIndex, DAGOperand trf, DAGOperand pcr> {
+ def "" : TPUInstP<(outs trf:$trf), (ins VPR:$vsrc, pcr:$pcr),
+ "$trf =\tvperm."#busIndex#"${pred} $vsrc",
+ [(set trf:$trf, (int_tpu_permute (vNi32 VPR:$vsrc),
+ (i32 pcr:$pcr), (i32 busIndex)))]>,
+ Bundle<B_VEX>, Sched<[!cast<SchedWrite>("WritePermute"#busIndex)]>,
+ SubUnits<[SU_permute]>;
+let isPacked = 1 in {
+ def _PACKED : TPUInstP<(outs trf:$trf), (ins VPR:$vsrc, pcr:$pcr),
+ "$trf =\tvperm."#busIndex#".packed${pred} $vsrc",
+ []>, Bundle<B_VEX1>,
+ SubUnits<[SU_packed_permute]>;
+let isPseudo = 1 in {
+ // Packed instruction also generates a vsupp instruction for the high bits.
+ // It gets expanded post bundle packing.
+ def _PACKED_PSEUDO : TPUInstP<(outs trf:$trf), (ins VPR:$vsrclow, VPR:$vsrchigh, pcr:$pcr),
+ "$trf =\t#VPERMPACKED."#busIndex#"${pred} $vsrclow, $vsrchigh",
+ [(set trf:$trf, (int_tpu_permute_packed (vNi32 VPR:$vsrclow),
+ (vNi32 VPR:$vsrchigh), (i32 pcr:$pcr), (i32 busIndex)))]>,
+ Bundle<B_VEXBoth>, Sched<[!cast<SchedWrite>("WritePermutePacked"#busIndex)]>,
+ PseudoInstMapping<NAME#"_PACKED_PSEUDO", NAME#"_PACKED">,
+ SubUnits<[SU_packed_permute]>;
+} // isPseudo = 1
+} // isPacked = 1
+}
+
+// Instruction for supplemental packed source.
+def XLUSUPP_PACKED : TPUInstP<(outs), (ins VPR:$vsrc),
+ "_ =\tvsupp${pred} $vsrc", []>, Bundle<B_VEX0>;
+
+multiclass PermuteFloatPat<string Name, int busIndex, DAGOperand pcr> {
+ def : Pat<(int_tpu_permute (vNf32 VPR:$vsrc), (i32 pcr:$pcr), (i32 busIndex)),
+ (!cast<Instruction>(Name) VPR:$vsrc, pcr:$pcr)>;
+ def : Pat<(int_tpu_permute_packed (vNf32 VPR:$vsrclow), (vNf32 VPR:$vsrchigh), (i32 pcr:$pcr), (i32 busIndex)),
+ (!cast<Instruction>(Name#"_PACKED_PSEUDO") VPR:$vsrclow, VPR:$vsrchigh, pcr:$pcr)>;
+}
+
+multiclass XLUBus<int busIndex, DAGOperand trf, DAGOperand pcr, DAGOperand spr> {
+// Use custom inserter to attach the right memory operands.
+let usesCustomInserter = 1 in {
+// Transpose is not marked as push. We only model the transpose_end
+// instructions as pushing into the FIFO. That allows us to model transpose as
+// a normal FIFO. Transpose_end pushes a variable number of items based on its
+// width.
+let isTranspose = 1 in {
+defm TRANSPOSE : TransposeEnd<"WriteTranspose", "int_tpu_tc_transpose", busIndex, trf>;
+}
+let isPush = 1 in {
+let isPermute = 1 in {
+defm ROTATE : Rotate<busIndex, trf>;
+defm PERMUTE : Permute<busIndex, trf, pcr>;
+} // isPermute = 1
+let isReduce = 1 in {
+defm XLANE_ADD : XLaneInst<"vadd", "add", busIndex, trf, spr>;
+defm XLANE_MAX : XLaneInst<"vmax", "max",busIndex, trf, spr>;
+defm XLANE_MIN : XLaneInst<"vmin", "min", busIndex, trf, spr>;
+defm XLANE_MAXINDEX : XLaneInst<"vmax.index", "maxindex", busIndex, trf, spr>;
+defm XLANE_MININDEX : XLaneInst<"vmin.index", "minindex", busIndex, trf, spr>;
+} // isReduce = 1
+} // isPush = 1
+
+// SetPermute instructions are not FIFO.
+defm SETPERMUTE_U8 :
+ SetPermute<"u8", int_tpu_set_permute, busIndex, pcr, WriteSetPermute>,
+ SubUnits<[SU_set_pattern]>;
+defm SETSPR :
+ SetPatternReg<"u1", int_tpu_set_spr, busIndex, spr, WriteSetPermute>,
+ SubUnits<[SU_set_pattern_jfc, SU_set_pattern_pfc]>;
+defm SETPERMUTE_SUBLANE :
+ SetPermute<"all.u8", int_tpu_set_permute_sublane, busIndex, pcr,
+ !cast<SchedWrite>("WriteSetPermuteAll"#busIndex)>,
+ SubUnits<[SU_set_pattern_all]>;
+defm SETPERMUTE_BYTES :
+ SetPermute<"all.bytes.u32", int_tpu_set_permute_bytes, busIndex, pcr,
+ !cast<SchedWrite>("WriteSetPermuteAll"#busIndex)>,
+ SubUnits<[SU_set_pattern_all]>;
+} // let usesCustomInserter = 1
+
+defm : TransposeEndFloatPat<NAME#"TRANSPOSE", "int_tpu_tc_transpose", busIndex, trf>;
+defm : RotateFloatPat<NAME#"ROTATE", busIndex>;
+defm : PermuteFloatPat<NAME#"PERMUTE", busIndex, pcr>;
+}
+
+multiclass XLUPop<int XLUIndex,
+ DAGOperand trf = !cast<DAGOperand>("TRFPR"#XLUIndex)> {
+let isPop = 1, usesCustomInserter = 1 in {
+ def Pop : TPUInstP<(outs VPR:$Vd), (ins trf:$trf),
+ "$Vd =\tvpop${pred} $trf",
+ [(set (vNi32 VPR:$Vd),
+ (int_tpu_tc_vtrfpop (i32 XLUIndex), (i32 trf:$trf)))]>,
+ Bundle<B_VResAny>, Sched<[!cast<SchedWrite>("WriteTrf"#XLUIndex#"Pop0")]>,
+ SubUnits<[SU_xlu_result]>;
+}
+def : Pat<(vNf32 (int_tpu_tc_vtrfpop (i32 XLUIndex), (i32 trf:$trf))),
+ (!cast<Instruction>(NAME#Pop) trf:$trf)>;
+}
+
+multiclass TransposeUnit<int XluIndex,
+ DAGOperand trf = !cast<DAGOperand>("TRFPR"#XluIndex),
+ DAGOperand pcr = !cast<DAGOperand>("PCRPR"#XluIndex),
+ DAGOperand spr = !cast<DAGOperand>("SPRPR"#XluIndex)> {
+ defm B0 : XLUBus<XluIndex, trf, pcr, spr>;
+ defm B1 : XLUBus<!add(XluIndex, 2), trf, pcr, spr>;
+ defm "" : XLUPop<XluIndex>;
+}
+
+let Predicates = [NotVFTC] in {
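+// Each unit serves two XLU buses: tcXLU0 gets buses 0 and 2, tcXLU1 gets
+// buses 1 and 3 (B1 instantiates XLUBus with the index offset by 2 above).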
+defm tcXLU0 : TransposeUnit<0>, IsVectorInstruction, IsXLUInst;
+defm tcXLU1 : TransposeUnit<1>, IsVectorInstruction, IsXLUInst;
+}
+
+// Host interrupt
+let hasSideEffects = 1, SubUnits = [SU_vmisc] in {
+def VINTr : TPUInstP<(outs), (ins GPR:$src),
+ "_ = vint${pred} $src",
+ [(int_tpu_tc_vint (i32 GPR:$src))]>,
+ Bundle<B_SM>, IsVectorInstruction;
+def VINTi : TPUInstP<(outs), (ins i32imm:$src),
+ "_ = vint${pred} $src",
+ [(int_tpu_tc_vint (i32 imm:$src))]>,
+ Bundle<B_SM>, BundleImm<IMM_0_to_3>, IsVectorInstruction;
+} // hasSideEffects = 1, SubUnits = [SU_vmisc]
+
+// Pseudo random number generation
+let hasSideEffects = 1 in {
+def SetRngSeed : TPUInstP<(outs), (ins VPR:$Vx),
+ "_ = setrngseed${pred} $Vx",
+ [(int_tpu_tc_setrngseed (vNi32 VPR:$Vx))]>,
+ Bundle<B_Vany>, Sched<[WriteSetRngSeed]>, IsVectorInstruction,
+ SubUnits<[SU_png_set]>;
+def GetRngSeed : TPUInstP<(outs VPR:$Vdst), (ins),
+ "$Vdst = getrngseed${pred}",
+ [(set VPR:$Vdst, (int_tpu_tc_getrngseed))]>,
+ Bundle<B_Vany>, Sched<[WriteGetRngState]>,
+ IsVectorInstruction, SubUnits<[SU_png_read]>;
+def VRng : TPUInstP<(outs VPR:$Vdst), (ins),
+ "$Vdst = vrng.8x128.u32${pred}",
+ [(set VPR:$Vdst, (int_tpu_tc_vrng))]>,
+ Bundle<B_Vany>, Sched<[WriteRng]>, IsVectorInstruction,
+ SubUnits<[SU_png_gen]>;
+} // let hasSideEffects = 1
+
+let Predicates = [HasV1024], SubUnits = [SU_vector_cmp] in {
+def VLANEMASK : TPUInstP<(outs MPR:$Md), (ins VPR:$Vs),
+ "$Md =\tvlmask${pred} $Vs",
+ [(set MPR:$Md,
+ (int_tpu_lane_mask (vNi32 VPR:$Vs)))]>,
+ Bundle<B_Vany>, IsVectorInstruction;
+def VSUBLANE_MASK : TPUInstP<(outs MPR:$Md), (ins VPR:$Vs),
+ "$Md =\tvsmask${pred} $Vs",
+ [(set MPR:$Md,
+ (int_tpu_sublane_mask (vNi32 VPR:$Vs)))]>,
+ Bundle<B_Vany>, IsVectorInstruction;
+}
+
+include "TPUInstrTensorCoreVF.td"
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUInstrTensorCoreVF.td b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUInstrTensorCoreVF.td
new file mode 100644
index 0000000..fedb67f
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUInstrTensorCoreVF.td
@@ -0,0 +1,630 @@
+//===-- TPUInstrTensorCoreVF.td - Target Description for TPU Target -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the TPU instructions specific to ViperFish
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Vector ALU ops
+//===----------------------------------------------------------------------===//
+let Predicates = [IsVFTC, HasPxcVPU] in {
+defm tcvfVROT_SLANE : VIntALUOpXY<23, "vrot.slane", int_tpu_vrot_sublane, vNi32, vNi32>,
+ SubUnits<[SU_vector_rotate]>;
+def : Pat<(vNf32 (int_tpu_vrot_sublane (vNf32 VPR:$x), (vNf32 VPR:$y))),
+ (tcvfVROT_SLANErr (vNf32 VPR:$x), (vNf32 VPR:$y))>;
+def : Pat<(vNf32 (int_tpu_vrot_sublane (vNf32 VPR:$x), (vNf32 (Splat fpimm:$y)))),
+ (tcvfVROT_SLANEri (vNf32 VPR:$x), (ftoi $y))>;
+def : Pat<(vNf32 (int_tpu_vrot_sublane (vNf32 VPR:$x), (vNf32 (Splat GPR:$y)))),
+ (tcvfVROT_SLANErs (vNf32 VPR:$x), GPR:$y)>;
+
+defm tcvfVPERM_SLANE : VIntALUOpXY<24, "vperm.slane", int_tpu_vperm_sublane, vNi32, vNi32>,
+ SubUnits<[SU_vector_op]>;
+def : Pat<(vNf32 (int_tpu_vperm_sublane (vNf32 VPR:$x), (vNf32 VPR:$y))),
+ (tcvfVPERM_SLANErr (vNf32 VPR:$x), (vNf32 VPR:$y))>;
+def : Pat<(vNf32 (int_tpu_vperm_sublane (vNf32 VPR:$x), (vNf32 (Splat fpimm:$y)))),
+ (tcvfVPERM_SLANEri (vNf32 VPR:$x), (ftoi $y))>;
+def : Pat<(vNf32 (int_tpu_vperm_sublane (vNf32 VPR:$x), (vNf32 (Splat GPR:$y)))),
+ (tcvfVPERM_SLANErs (vNf32 VPR:$x), GPR:$y)>;
+} // let Predicates = [IsVFTC, HasPxcVPU]
+
+//===----------------------------------------------------------------------===//
+// Flag instructions
+//===----------------------------------------------------------------------===//
+let Predicates = [HasVectorSflags, IsVFTC] in {
+defm tcvfWAIT : SyncInstTc_<1 /* isyieldable */>, Bundle<B_V2>,
+ IsVectorInstruction, SubUnits<[SU_vwait]>;
+
+let isPseudo = 1, usesCustomInserter = 1 in {
+def VFREADPAi : TPUInstP<(outs GPR:$d), (ins tsyncimmsi:$imm),
+ "$d =\t#SFLAGREAD.pa${pred} [sflag:$imm]",
+ [(set GPR:$d, (i32 (int_tpu_syncpamov (Wrapper tglobaladdr:$imm))))]>,
+ Bundle<B_SM>;
+def VFREADPAr : TPUInstP<(outs GPR:$d), (ins GPR:$r),
+ "$d =\t#SFLAGREAD.pa${pred} [sflag:$r]",
+ [(set GPR:$d, (i32 (int_tpu_syncpamov GPR:$r)))]>,
+ Bundle<B_SM>;
+}
+
+let mayLoad = 1, mayStore = 1, isPush = 1 in {
+ def tcvfVSYNCMOVEPAi : TPUInstP<(outs SFRFPR:$sfrf), (ins tsyncimmsi:$imm),
+ "$sfrf =\tvsyncpamov${pred} [sflag:$imm]", []>,
+ Bundle<B_SM>, Sched<[WriteSFlagV2SF]>, IsVectorInstruction,
+ SubUnits<[SU_read_sync]>;
+ def tcvfVSYNCMOVEPAr : TPUInstP<(outs SFRFPR:$sfrf), (ins GPR:$r),
+ "$sfrf =\tvsyncpamov${pred} [sflag:$r]", []>,
+ Bundle<B_SM>, Sched<[WriteSFlagV2SF]>, IsVectorInstruction,
+ SubUnits<[SU_read_sync]>;
+} // mayLoad = 1, mayStore = 1, isPush = 1
+
+defm tcvfVSYNCMOVE : MoveSyncFlag_<(outs SFRFPR:$sfrf), "$sfrf">,
+ SubUnits<[SU_read_sync]>;
+
+} // Predicates = [HasVectorSflags, IsVFTC]
+
+let Predicates = [IsVFTC], mayLoad = 1, mayStore = 1, isPop = 1 in {
+ def SPOP_SFRF : TPUInstP<(outs GPR:$sdst), (ins SFRFPR:$sfrf),
+ "$sdst =\tspop${pred} $sfrf", []>,
+ Bundle<B_Sany>, Sched<[WriteV2SFPop]>, SubUnits<[SU_pop]>;
+} // Predicates = [IsVFTC], mayLoad = 1, mayStore = 1, isPop = 1
+
+//===----------------------------------------------------------------------===//
+// MXU operations
+//===----------------------------------------------------------------------===//
+multiclass MatPushVF<int i, string IntrName, string OpName, string FifoName,
+ DAGOperand fiforeg = !cast<DAGOperand>(FifoName#i)> :
+ MatOpMasked<i, OpName, !cast<Intrinsic>(IntrName), (ins fiforeg:$srcmsr),
+ (outs fiforeg:$dstmsr), "$dstmsr", "", !cast<SchedWrite>("WriteMatPush"#i),
+ [HasMXU,IsVFTC]>;
+
+// MatPush may transpose or not
+// VF has two options for push latches, msra & msrb
+multiclass MatPushXPosVF<int i, string IntrName, string OpName, string SubUnitNameHint> {
+ defm _MSRA : MatPushVF<i, IntrName#"_msra", OpName, "MSRAPR">,
+ SubUnits<[!cast<SubUnitEncoding>("SU_"#SubUnitNameHint)]>;
+ defm _MSRA_XPOS : MatPushVF<i, IntrName#"_msra_xpose", OpName#".xpose", "MSRAPR">,
+ SubUnits<[!cast<SubUnitEncoding>("SU_"#SubUnitNameHint#"x")]>;
+ defm _MSRB : MatPushVF<i, IntrName#"_msrb", OpName, "MSRBPR">,
+ SubUnits<[!cast<SubUnitEncoding>("SU_"#SubUnitNameHint)]>;
+ defm _MSRB_XPOS : MatPushVF<i, IntrName#"_msrb_xpose", OpName#".xpose", "MSRBPR">,
+ SubUnits<[!cast<SubUnitEncoding>("SU_"#SubUnitNameHint#"x")]>;
+}
+
+// VF MatPush may have different interpretation of Vreg
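+// Each mode variant below is further expanded by MatPushXPosVF over the
+// {msra, msrb} push latches and the {plain, xpose} forms, so one
+// MatPushModeVF instantiation covers the full element-type x latch x
+// transpose cross product.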
+multiclass MatPushModeVF<int i, string IntrName, string OpName,
+ string SubUnitNameHint> {
+ defm "" : MatPushXPosVF<i, IntrName, OpName,
+ SubUnitNameHint#"_f32">;
+ defm _IF8_BF16 : MatPushXPosVF<i, IntrName#"_if8_bf16", OpName#".if8.bf16",
+ SubUnitNameHint#"_f8">;
+ defm _BF16 : MatPushXPosVF<i, IntrName#"_bf16", OpName#".bf16",
+ SubUnitNameHint#"_f16">;
+ defm _BF8_BF16 : MatPushXPosVF<i, IntrName#"_bf8_bf16", OpName#".bf8.bf16",
+ SubUnitNameHint#"_f8">;
+ defm _U8 : MatPushXPosVF<i, IntrName#"_u8", OpName#".u8",
+ SubUnitNameHint#"_f8">;
+ defm _S8 : MatPushXPosVF<i, IntrName#"_s8", OpName#".s8",
+ SubUnitNameHint#"_f8">;
+ defm _U4 : MatPushXPosVF<i, IntrName#"_u4", OpName#".u4",
+ SubUnitNameHint#"_f8">;
+ defm _S4 : MatPushXPosVF<i, IntrName#"_s4", OpName#".s4",
+ SubUnitNameHint#"_f8">;
+}
+
+multiclass MatMulVF<int i, string IntrName, string OpName, string ScheduleName,
+ DAGOperand fiforegsrc = !cast<DAGOperand>("GMRPR"#i),
+ DAGOperand fiforegdst = !cast<DAGOperand>("MRFPR"#i)> :
+ MatOpMasked<i, OpName, !cast<Intrinsic>(IntrName), (ins fiforegsrc:$srcgmr),
+ (outs fiforegdst:$dstmrf), "$dstmrf", "", !cast<SchedWrite>(ScheduleName#i),
+ [HasMXU,IsVFTC]>;
+
+// vmatmul.lmr versions: BEGIN
+multiclass MatMulLmrMsraOrbVF<int i, string IntrName, string Name, SchedWrite Schedule,
+ DAGOperand fiforeg_mrf, DAGOperand fiforeg_lmr, DAGOperand fiforeg_gmr,
+ DAGOperand fiforeg_msra = !cast<DAGOperand>("MSRAPR"#i),
+ DAGOperand fiforeg_msrb = !cast<DAGOperand>("MSRBPR"#i),
+ Intrinsic Intr_msra = !cast<Intrinsic>(IntrName#"_msra"),
+ Intrinsic Intr_msrb = !cast<Intrinsic>(IntrName#"_msrb")> {
+
+ def _MSRA : TPUInstP<
+ (outs fiforeg_mrf:$dstmrf, fiforeg_lmr:$dstlmr,
+ fiforeg_gmr:$dstgmr, fiforeg_msra:$dstmsr),
+ (ins i32imm:$width, fiforeg_msra:$srcmsr),
+ "($dstmrf, $dstlmr, $dstgmr, $dstmsr) =\t"
+ #Name#".${width:s}"#".lgmr${pred}",
+ [(set fiforeg_mrf:$dstmrf, fiforeg_lmr:$dstlmr,
+ fiforeg_gmr:$dstgmr, fiforeg_msra:$dstmsr,
+ (Intr_msra (i32 i), (i32 imm:$width), (i32 fiforeg_msra:$srcmsr)))]>,
+ Sched<[Schedule]>, ExtraPredicates<[HasMXU,IsVFTC]>;
+
+ def _MSRB : TPUInstP<
+ (outs fiforeg_mrf:$dstmrf, fiforeg_lmr:$dstlmr,
+ fiforeg_gmr:$dstgmr, fiforeg_msrb:$dstmsr),
+ (ins i32imm:$width, fiforeg_msrb:$srcmsr),
+ "($dstmrf, $dstlmr, $dstgmr, $dstmsr) =\t"
+ #Name#".${width:s}"#".lgmr${pred}",
+ [(set fiforeg_mrf:$dstmrf, fiforeg_lmr:$dstlmr,
+ fiforeg_gmr:$dstgmr, fiforeg_msrb:$dstmsr,
+ (Intr_msrb (i32 i), (i32 imm:$width), (i32 fiforeg_msrb:$srcmsr)))]>,
+ Sched<[Schedule]>, ExtraPredicates<[HasMXU,IsVFTC]>;
+}
+
+multiclass MatMulLmrVF<int i, string IntrName, string Name, SchedWrite Schedule,
+ DAGOperand fiforeg_mrf = !cast<DAGOperand>("MRFPR"#i),
+ DAGOperand fiforeg_lmr = !cast<DAGOperand>("LMRPR"#i),
+ DAGOperand fiforeg_gmr = !cast<DAGOperand>("GMRPR"#i),
+ Intrinsic Intr = !cast<Intrinsic>(IntrName)> {
+ let isCodeGenOnly = 1 in {
+ def "" : TPUInstP<(outs fiforeg_mrf:$dstmrf, fiforeg_lmr:$dstlmr),
+ (ins i32imm:$width),
+ "($dstmrf, $dstlmr) =\t"#Name#".${width:s}${pred}",
+ [(set fiforeg_mrf:$dstmrf, fiforeg_lmr:$dstlmr,
+ (Intr (i32 i), (i32 imm:$width)))]>,
+ Sched<[Schedule]>, ExtraPredicates<[HasMXU,IsVFTC]>;
+
+ defm _LGMR : MatMulLmrMsraOrbVF<i, IntrName#"_lgmr", Name,
+ Schedule, fiforeg_mrf, fiforeg_lmr, fiforeg_gmr>;
+ } // let isCodeGenOnly = 1
+}
+// vmatmul.lmr versions: END
+
+// For fused-load and non fused-load versions: BEGIN
+multiclass MatMulLgmrMsraOrbMaskVF<int i, string IntrName, string Name,
+ SchedWrite Schedule, DAGOperand fiforeg_msr,
+ DAGOperand fiforeg_mrf = !cast<DAGOperand>("MRFPR"#i),
+ DAGOperand fiforeg_gmr = !cast<DAGOperand>("GMRPR"#i),
+ Intrinsic Intr = !cast<Intrinsic>(IntrName)> {
+ def "" : TPUInstP<(outs fiforeg_mrf:$dstmrf, fiforeg_gmr:$dstgmr, fiforeg_msr:$dstmsr),
+ (ins VPR:$Vs, fiforeg_msr:$srcmsr),
+ "($dstmrf, $dstgmr, $dstmsr) =\t"#Name#"${pred} $Vs",
+ [(set fiforeg_mrf:$dstmrf, fiforeg_gmr:$dstgmr, fiforeg_msr:$dstmsr,
+ (Intr (vNf32 VPR:$Vs), (vNi1 (Splat -1)), (i32 i), (i32 fiforeg_msr:$srcmsr)))]>,
+ Sched<[Schedule]>, ExtraPredicates<[HasMXU,IsVFTC]>;
+
+ def m : TPUInstP<(outs fiforeg_mrf:$dstmrf, fiforeg_gmr:$dstgmr, fiforeg_msr:$dstmsr),
+ (ins VPR:$Vs, MPR:$m, fiforeg_msr:$srcmsr),
+ "($dstmrf, $dstgmr, $dstmsr) =\t"#Name#".msk${pred} $m, $Vs",
+ [(set fiforeg_mrf:$dstmrf, fiforeg_gmr:$dstgmr, fiforeg_msr:$dstmsr,
+ (Intr (vNf32 VPR:$Vs), (vNi1 MPR:$m), (i32 i), (i32 fiforeg_msr:$srcmsr)))]>,
+ Sched<[Schedule]>, ExtraPredicates<[HasMXU,IsVFTC]>;
+}
+
+multiclass MatMulLgmrMsraOrbVF<int i, string IntrName, string OpName, string ScheduleName> {
+ defm _MSRA : MatMulLgmrMsraOrbMaskVF<i, IntrName#"_msra", OpName, !cast<SchedWrite>(ScheduleName#i),
+ !cast<DAGOperand>("MSRAPR"#i)>;
+ defm _MSRB : MatMulLgmrMsraOrbMaskVF<i, IntrName#"_msrb", OpName, !cast<SchedWrite>(ScheduleName#i),
+ !cast<DAGOperand>("MSRBPR"#i)>;
+}
+// For fused-load and non fused-load versions: END
+
+multiclass MatMulLgmrVF<int i, string IntrName, string OpName, string ScheduleName> {
+ defm "" : MatMulVF<i, IntrName, OpName, ScheduleName>;
+ defm _LGMR : MatMulLgmrMsraOrbVF<i, IntrName#"_lgmr", OpName#".lgmr", ScheduleName>;
+}
+
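+// MatMulModeVF expands one matmul mnemonic over the supported element types
+// (f32, if8/bf16, bf16, bf8/bf16, u8, s8, u4, s4) and, via MatMulLgmrVF, over
+// the plain and fused-load (lgmr) forms for both the msra and msrb latches,
+// plus the vmatmul.lmr variants.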
+multiclass MatMulModeVF<int i, string IntrName, string OpName, string SubUnitNameHint> {
+ defm "" : MatMulLgmrVF<i, IntrName#"_f32", OpName, "WriteMatMulMxu">,
+ SubUnits<[!cast<SubUnitEncoding>("SU_"#SubUnitNameHint#"_f32")]>;
+ defm _IF8_BF16 : MatMulLgmrVF<i, IntrName#"_if8_bf16", OpName#".if8.bf16", "WriteMatMulMxu">,
+ SubUnits<[!cast<SubUnitEncoding>("SU_"#SubUnitNameHint#"_f8")]>;
+ defm _BF16 : MatMulLgmrVF<i, IntrName#"_bf16", OpName#".bf16", "WriteMatMulMxu">,
+ SubUnits<[!cast<SubUnitEncoding>("SU_"#SubUnitNameHint#"_f16")]>;
+ defm _BF8_BF16 : MatMulLgmrVF<i, IntrName#"_bf8_bf16", OpName#".bf8.bf16", "WriteMatMulMxu">,
+ SubUnits<[!cast<SubUnitEncoding>("SU_"#SubUnitNameHint#"_f8")]>;
+ defm _U8 : MatMulLgmrVF<i, IntrName#"_u8", OpName#".u8", "WriteMatMulMxuInt">,
+ SubUnits<[!cast<SubUnitEncoding>("SU_"#SubUnitNameHint#"_u8")]>;
+ defm _S8 : MatMulLgmrVF<i, IntrName#"_s8", OpName#".s8", "WriteMatMulMxuInt">,
+ SubUnits<[!cast<SubUnitEncoding>("SU_"#SubUnitNameHint#"_s8")]>;
+ defm _U4 : MatMulLgmrVF<i, IntrName#"_u4", OpName#".u4", "WriteMatMulMxuInt">,
+ SubUnits<[!cast<SubUnitEncoding>("SU_"#SubUnitNameHint#"_u8")]>;
+ defm _S4 : MatMulLgmrVF<i, IntrName#"_s4", OpName#".s4", "WriteMatMulMxuInt">,
+ SubUnits<[!cast<SubUnitEncoding>("SU_"#SubUnitNameHint#"_s8")]>;
+
+ // vmatmul.lmr versions
+ defm _LMR : MatMulLmrVF<i, IntrName#"_lmr", OpName#".lmr",
+ !cast<SchedWrite>("WriteMatMulMxu"#i)>, SubUnits<[SU_matmul_lmr_8]>;
+ defm _BF16_FROM_LMR : MatMulLmrVF<i, IntrName#"_bf16_lmr", OpName#".lmr",
+ !cast<SchedWrite>("WriteMatMulMxu"#i)>, SubUnits<[SU_matmul_lmr_16]>;
+}
+
+multiclass Lmr<int i,
+ DAGOperand gmr = !cast<DAGOperand>("GMRPR"#i),
+ DAGOperand lmr = !cast<DAGOperand>("LMRPR"#i),
+ DAGOperand msra = !cast<DAGOperand>("MSRAPR"#i),
+ DAGOperand msrb = !cast<DAGOperand>("MSRBPR"#i)> {
+ def _GMR_FROM_MSRA : TPUInstP<(outs gmr:$dstgmr), (ins msra:$srcmsra),
+ "$dstgmr =\tvlgmr${pred} $srcmsra",
+ [(set gmr:$dstgmr, (int_tpu_vlgmr_msra (i32 i), (i32 msra:$srcmsra)))]>,
+ ExtraPredicates<[IsVFTC]>, SubUnits<[SU_matlmr]>;
+
+ def _GMR_FROM_MSRB : TPUInstP<(outs gmr:$dstgmr), (ins msrb:$srcmsrb),
+ "$dstgmr =\tvlgmr${pred} $srcmsrb",
+ [(set gmr:$dstgmr, (int_tpu_vlgmr_msrb (i32 i), (i32 msrb:$srcmsrb)))]>,
+ ExtraPredicates<[IsVFTC]>, SubUnits<[SU_matlmr]>;
+
+ def _LMR_FROM_MSRA : TPUInstP<(outs lmr:$dstlmr), (ins msra:$srcmsra),
+ "$dstlmr =\tvllmr${pred} $srcmsra",
+ [(set lmr:$dstlmr, (int_tpu_vllmr_msra (i32 i), (i32 msra:$srcmsra)))]>,
+ ExtraPredicates<[IsVFTC]>, SubUnits<[SU_matlmr]>;
+
+ def _LMR_FROM_MSRB : TPUInstP<(outs lmr:$dstlmr), (ins msrb:$srcmsrb),
+ "$dstlmr =\tvllmr${pred} $srcmsrb",
+ [(set lmr:$dstlmr, (int_tpu_vllmr_msrb (i32 i), (i32 msrb:$srcmsrb)))]>,
+ ExtraPredicates<[IsVFTC]>, SubUnits<[SU_matlmr]>;
+}
+
+multiclass MXUVF<int i, DAGOperand mrf = !cast<DAGOperand>("MRFPR"#i)> {
+let mayLoad = 1, mayStore = 1, usesCustomInserter = 1 in {
+let Itinerary = IIC_MXU_PUSH in {
+ defm MATPUSH : MatPushModeVF<i, "int_tpu_vmatpush", "vmatpush", "matpush">;
+}
+let Itinerary = IIC_MXU_MUL, isPush = 1 in {
+ defm MATMUL : MatMulModeVF<i, "int_tpu_vmatmul", "vmatmul", "matmul">;
+}
+defm LOADMATREG : Lmr<i>, Bundle<B_VEX>;
+def MATPOP : TPUInstP<(outs VPR:$Vd), (ins mrf:$srcmrf),
+ "$Vd =\tvmatres.8x128${pred} $srcmrf",
+ [(set VPR:$Vd, (int_tpu_vmatres_f32 i, (i32 mrf:$srcmrf)))]>,
+ Sched<[!cast<SchedWrite>("WriteMatRes"#i)]>, ExtraPredicates<[IsVFTC]>,
+ SubUnits<[SU_mxu_result]>, Bundle<B_VResAny> { let isPop = 1; }
+}
+}
+
+// Define 4 MXUs for all platforms; we assume the user won't try to use more
+// MXUs than are available on the platform. We can add finer-grained
+// predicates later to be able to report user errors.
+foreach Index = 0-3 in {
+defm tcvfMXU#Index : MXUVF<Index>, IsVectorInstruction, IsMXUInst;
+}
+
+//===----------------------------------------------------------------------===//
+// XLU operations
+//===----------------------------------------------------------------------===//
+// The following mapping table is used to assign cross-unit latencies
+// (in TPUSubtarget::UpdateCrossUnitLatency) which are dependent on the XLU unit
+// number.
+class XLUUnitInfo<string Inst, int XluIndex> {
+ Instruction Opcode = !cast<Instruction>(Inst);
+ bits<3> UnitId = XluIndex;
+}
+
+def XLUUnitInfoTable : GenericTable {
+ let FilterClass = "XLUUnitInfo";
+ let CppTypeName = "XLUUnitInfoTy";
+ let Fields = ["Opcode", "UnitId"];
+ let PrimaryKey = ["Opcode"];
+ let PrimaryKeyName = "XLUUnitInfo";
+}
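+// The searchable-table backend emits a C++ lookup helper for this table keyed
+// on Opcode (named after PrimaryKeyName); the exact generated signature is
+// backend-defined, so this note is descriptive rather than normative.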
+
+// Set Pattern Registers
+multiclass SetPatternRegVF<string postFix, SDPatternOperator OpNode, int XluIndex,
+ DAGOperand pcr, SchedWrite Sch> {
+ def "" : TPUInstP<(outs pcr:$pcr), (ins VPR:$vsrc),
+ "$pcr =\tvsetperm."#postFix#"${pred} $vsrc",
+ [(set pcr:$pcr, (OpNode (vNi32 VPR:$vsrc), (i32 XluIndex)))]>,
+ Bundle<B_VEX>, Sched<[Sch]>, XLUUnitInfo<NAME, XluIndex>;
+}
+
+multiclass SetPermuteVF<string postFix, SDPatternOperator OpNode, int XluIndex,
+ DAGOperand pcr, SchedWrite Sch> {
+ defm "" : SetPatternRegVF<postFix, OpNode, XluIndex, pcr, Sch>;
+}
+
+// Permute instructions
+multiclass PermuteVF<int XluIndex, DAGOperand trf, DAGOperand pcr> {
+ def "" : TPUInstP<(outs trf:$trf), (ins VPR:$vsrc, pcr:$pcr),
+ "$trf =\tvperm.lane${pred} $vsrc",
+ [(set trf:$trf, (int_tpu_permute (vNi32 VPR:$vsrc),
+ (i32 pcr:$pcr), (i32 XluIndex)))]>,
+ Bundle<B_VEX>, Sched<[!cast<SchedWrite>("WritePermute"#XluIndex)]>,
+ SubUnits<[SU_permute]>;
+let isPacked = 1 in {
+ def _PACKED : TPUInstP<(outs trf:$trf), (ins VPR:$vsrc, pcr:$pcr),
+ "$trf =\tvperm.lane.packed${pred} $vsrc",
+ []>, Bundle<B_VEX1>, SubUnits<[SU_packed_permute]>;
+let isPseudo = 1 in {
+ // Packed instruction also generates a vsupp instruction for the high bits.
+ // It gets expanded post bundle packing.
+ def _PACKED_PSEUDO : TPUInstP<(outs trf:$trf), (ins VPR:$vsrclow, VPR:$vsrchigh, pcr:$pcr),
+ "$trf =\t#VPERMPACKED."#XluIndex#"${pred} $vsrclow, $vsrchigh",
+ [(set trf:$trf, (int_tpu_permute_packed (vNi32 VPR:$vsrclow),
+ (vNi32 VPR:$vsrchigh), (i32 pcr:$pcr), (i32 XluIndex)))]>,
+ Bundle<B_VEXBoth>, Sched<[!cast<SchedWrite>("WritePermutePacked"#XluIndex)]>,
+ PseudoInstMapping<NAME#"_PACKED_PSEUDO", NAME#"_PACKED">,
+ SubUnits<[SU_packed_permute]>;
+} // isPseudo = 1
+} // isPacked = 1
+}
+
+multiclass PermuteFloatPatVF<string Name, int XluIndex, DAGOperand pcr> {
+ def : Pat<(int_tpu_permute (vNf32 VPR:$vsrc), (i32 pcr:$pcr), (i32 XluIndex)),
+ (!cast<Instruction>(Name) VPR:$vsrc, pcr:$pcr)>;
+ def : Pat<(int_tpu_permute_packed (vNf32 VPR:$vsrclow), (vNf32 VPR:$vsrchigh), (i32 pcr:$pcr), (i32 XluIndex)),
+ (!cast<Instruction>(Name#"_PACKED_PSEUDO") VPR:$vsrclow, VPR:$vsrchigh, pcr:$pcr)>;
+}
+
+
+// Lane broadcast instructions
+multiclass BroadcastSourceVF<int XluIndex, DAGOperand trf, DAGOperand SrcT, DAGOperand PatType,
+ ImmSlotRequirement slots, list<ImmOperRequirement> operands> {
+def "" : TPUInstP<(outs trf:$trf), (ins VPR:$vsrc, SrcT:$slane),
+ "$trf =\tvbcast.lane${pred} $vsrc, $slane",
+ [(set trf:$trf, (int_tpu_bcast (vNi32 VPR:$vsrc),
+ (i32 PatType:$slane), (i32 XluIndex)))]>,
+ Bundle<B_VEX>, Sched<[!cast<SchedWrite>("WritePermute"#XluIndex)]>, BundleImm<slots, operands>,
+ XLUUnitInfo<NAME, XluIndex>, SubUnits<[SU_permute]>;
+let isPacked = 1 in {
+ def _PACKED : TPUInstP<(outs trf:$trf), (ins VPR:$vsrc, SrcT:$slane),
+ "$trf =\tvbcast.lane.packed${pred} $vsrc, $slane",
+ []>, Bundle<B_VEX1>, BundleImm<slots, operands>,
+ XLUUnitInfo<NAME, XluIndex>, SubUnits<[SU_packed_permute]>;
+let isPseudo = 1 in {
+def _PACKED_PSEUDO : TPUInstP<(outs trf:$trf), (ins VPR:$vsrclow, VPR:$vsrchigh, SrcT:$slane),
+ "$trf =\t#VBCASTPACKED_PSEUDO.packed${pred} $vsrclow, $vsrchigh, $slane",
+ [(set trf:$trf, (int_tpu_bcast_packed (vNi32 VPR:$vsrclow),
+ (vNi32 VPR:$vsrchigh), (i32 PatType:$slane), (i32 XluIndex)))]>,
+ Bundle<B_VEXBoth>, Sched<[!cast<SchedWrite>("WritePermutePacked"#XluIndex)]>,
+ BundleImm<slots, operands>,
+ PseudoInstMapping<NAME#"_PACKED_PSEUDO", NAME#"_PACKED">,
+ XLUUnitInfo<NAME#"_PACKED_PSEUDO", XluIndex>, SubUnits<[SU_packed_permute]>;
+} // isPseudo = 1
+} // isPacked = 1
+}
+
+multiclass BroadcastVF<int XluIndex, DAGOperand trf> {
+ defm r : BroadcastSourceVF<XluIndex, trf, GPR, GPR, IMM_NONE, []>;
+ defm i : BroadcastSourceVF<XluIndex, trf, timmsi, timmsi, IMM_2_to_5, [IMM_OP_0]>;
+}
+
+// Rotate instructions
+multiclass RotateSourceVF<int XluIndex, DAGOperand trf, DAGOperand SrcT, DAGOperand PatType,
+ ImmSlotRequirement slots, list<ImmOperRequirement> operands> {
+def "" : TPUInstP<(outs trf:$trf), (ins VPR:$vsrc, SrcT:$amount),
+ "$trf =\tvrot.lane${pred} $vsrc, $amount",
+ [(set trf:$trf, (int_tpu_vrotate (vNi32 VPR:$vsrc),
+ (i32 PatType:$amount), (i32 XluIndex)))]>,
+ Bundle<B_VEX>, Sched<[!cast<SchedWrite>("WritePermute"#XluIndex)]>, BundleImm<slots, operands>,
+ XLUUnitInfo<NAME, XluIndex>, SubUnits<[SU_permute]>;
+let isPacked = 1 in {
+ def _PACKED : TPUInstP<(outs trf:$trf), (ins VPR:$vsrc, SrcT:$amount),
+ "$trf =\tvrot.lane.packed${pred} $vsrc, $amount",
+ []>, Bundle<B_VEX1>, BundleImm<slots, operands>,
+ XLUUnitInfo<NAME, XluIndex>, SubUnits<[SU_packed_permute]>;
+let isPseudo = 1 in {
+def _PACKED_PSEUDO : TPUInstP<(outs trf:$trf), (ins VPR:$vsrclow, VPR:$vsrchigh, SrcT:$amount),
+ "$trf =\t#VROTPACKED_PSEUDO.packed${pred} $vsrclow, $vsrchigh, $amount",
+ [(set trf:$trf, (int_tpu_vrotate_packed (vNi32 VPR:$vsrclow),
+ (vNi32 VPR:$vsrchigh), (i32 PatType:$amount), (i32 XluIndex)))]>,
+ Bundle<B_VEXBoth>, Sched<[!cast<SchedWrite>("WritePermutePacked"#XluIndex)]>,
+ BundleImm<slots, operands>,
+ PseudoInstMapping<NAME#"_PACKED_PSEUDO", NAME#"_PACKED">,
+ XLUUnitInfo<NAME#"_PACKED_PSEUDO", XluIndex>, SubUnits<[SU_packed_permute]>;
+} // isPseudo = 1
+} // isPacked = 1
+}
+
+multiclass RotateVF<int XluIndex, DAGOperand trf> {
+ defm r : RotateSourceVF<XluIndex, trf, GPR, GPR, IMM_NONE, []>;
+ // immediate amount case.
+ defm i : RotateSourceVF<XluIndex, trf, timmsi, timmsi, IMM_2_to_5, [IMM_OP_0]>;
+}
+
+// Pattern for the float case.
+multiclass RotateFloatPatVF<string Name, int XluIndex> {
+ def : Pat<(int_tpu_vrotate (vNf32 VPR:$vsrc), (i32 GPR:$amount), (i32 XluIndex)),
+ (!cast<Instruction>(Name#r) VPR:$vsrc, GPR:$amount)>;
+ def : Pat<(int_tpu_vrotate (vNf32 VPR:$vsrc), (i32 imm:$amount), (i32 XluIndex)),
+ (!cast<Instruction>(Name#i) VPR:$vsrc, imm:$amount)>;
+ // Packed pattern
+ def : Pat<(int_tpu_vrotate_packed (vNf32 VPR:$vsrclow), (vNf32 VPR:$vsrchigh),
+ (i32 GPR:$amount), (i32 XluIndex)),
+ (!cast<Instruction>(Name#"r_PACKED_PSEUDO") VPR:$vsrclow, VPR:$vsrchigh, GPR:$amount)>;
+ def : Pat<(int_tpu_vrotate_packed (vNf32 VPR:$vsrclow), (vNf32 VPR:$vsrchigh),
+ (i32 imm:$amount), (i32 XluIndex)),
+ (!cast<Instruction>(Name#"i_PACKED_PSEUDO") VPR:$vsrclow, VPR:$vsrchigh, imm:$amount)>;
+}
+
+// Cross Lane reduction instructions
+multiclass XLaneInstVF<string Name, string IntrName, int XLUIndex, DAGOperand trf,
+ SchedWrite Sch = !cast<SchedWrite>("WriteXLane"#XLUIndex)> {
+def "" : TPUInstP<(outs trf:$trf), (ins VPR:$vsrc),
+ "$trf =\t"#Name#".xlane${pred} $vsrc",
+ [(set trf:$trf, (!cast<Intrinsic>("int_tpu_xlane_"#IntrName)
+ (vNf32 VPR:$vsrc), (i32 XLUIndex)))]>,
+ Bundle<B_VEX>, Sched<[Sch]>,
+ XLUUnitInfo<NAME, XLUIndex>;
+}
+
+// Transpose result instruction
+multiclass XLUPopVF<int XLUIndex,
+ DAGOperand trf = !cast<DAGOperand>("TRFPR"#XLUIndex)> {
+let isPop = 1, usesCustomInserter = 1 in {
+ def Pop : TPUInstP<(outs VPR:$Vd), (ins trf:$trf),
+ "$Vd =\tvpop${pred} $trf",
+ [(set (vNi32 VPR:$Vd),
+ (int_tpu_tc_vtrfpop (i32 XLUIndex), (i32 trf:$trf)))]>,
+ Bundle<B_VResAny>, Sched<[!cast<SchedWrite>("WriteTrf"#XLUIndex#"Pop0")]>,
+ SubUnits<[SU_xlu_result]>;
+}
+def : Pat<(vNf32 (int_tpu_tc_vtrfpop (i32 XLUIndex), (i32 trf:$trf))),
+ (!cast<Instruction>(NAME#Pop) trf:$trf)>;
+}
+
+// Transpose instructions
+multiclass TransposeVF<string Name, string PostFix, string Sch, string IntrName,
+ int XluIndex, DAGOperand trf, string SubUnitNameHint> {
+// Immediate width support only for now. Having a non-constant width makes it
+// very hard to match Pop instructions associated with a transpose.
+// Height is an argument. Even though the hardware doesn't need it, we force
+// the user to pass it so that we can compute an accurate latency.
+def "" : TPUInstP<(outs trf:$trf), (ins VPR:$vsrc, i32imm:$width, i32imm:$height, trf:$trfsrc),
+ "$trf =\t"#Name#PostFix#"${pred} $vsrc, $width",
+ [(set trf:$trf, (!cast<Intrinsic>(IntrName) (vNi32 VPR:$vsrc),
+ (timm:$width), (timm:$height), (i32 XluIndex), (i32 trf:$trfsrc)))]>,
+ Bundle<B_VEX>, Sched<[!cast<SchedWrite>(Sch#XluIndex)]>,
+ XLUUnitInfo<NAME, XluIndex>,
+ SubUnits<[!cast<SubUnitEncoding>("SU_"#SubUnitNameHint)]>;
+
+// Packed transpose needs to be broken down into two instructions going in the
+// same bundle. We emit a pseudo instruction with both sources and expand it
+// post bundle packing into a packed instruction and a vsupp instruction.
+let isPacked = 1 in {
+def _PACKED : TPUInstP<(outs trf:$trf), (ins VPR:$vsrc, i32imm:$width, i32imm:$height, trf:$trfsrc),
+ "$trf =\t"#Name#".packed"#PostFix#"${pred} $vsrc, $width",
+ []>, Bundle<B_VEX1>, XLUUnitInfo<NAME, XluIndex>,
+ SubUnits<[!cast<SubUnitEncoding>("SU_packed_"#SubUnitNameHint)]>;
+let isPseudo = 1 in {
+def _PACKED_PSEUDO : TPUInstP<(outs trf:$trf), (ins VPR:$vsrclow, VPR:$vsrchigh,
+ i32imm:$width, i32imm:$height, trf:$trfsrc),
+ "$trf =\t"#Name#"PACKED"#PostFix#"${pred} $vsrclow, $width, $vsrchigh",
+ [(set trf:$trf, (!cast<Intrinsic>(IntrName#"_packed") (vNi32 VPR:$vsrclow),
+ (vNi32 VPR:$vsrchigh), (timm:$width),
+ (timm:$height), (i32 XluIndex), (i32 trf:$trfsrc)))]>,
+ Bundle<B_VEXBoth>, Sched<[!cast<SchedWrite>(Sch#"Packed"#XluIndex)]>,
+ PseudoInstMapping<NAME#"_PACKED_PSEUDO", NAME#"_PACKED">,
+ XLUUnitInfo<NAME#"_PACKED_PSEUDO", XluIndex>,
+ SubUnits<[!cast<SubUnitEncoding>("SU_packed_"#SubUnitNameHint)]>;
+} // isPseudo = 1
+} // isPacked = 1
+}
+
+// Transpose can be segmented or not.
+multiclass TransposeStartSegmentedVF<string PostFix, string Sch, string IntrName,
+ int XluIndex, DAGOperand trf, string SubUnitNameHint> {
+ defm "" : TransposeVF<"vxpose", PostFix, Sch, IntrName, XluIndex, trf,
+ SubUnitNameHint>;
+let isSegmented = 1 in {
+ defm _SEGMENTED : TransposeVF<"vsxpose", PostFix, Sch,
+ IntrName#"_segmented", XluIndex, trf, SubUnitNameHint>;
+}
+}
+
+multiclass TransposeEndSegmentedVF<string PostFix, string Sch, string IntrName,
+ int XluIndex, DAGOperand trf, string SubUnitNameHint> {
+ defm "" : TransposeVF<"vxpose", PostFix, Sch, IntrName, XluIndex, trf,
+ SubUnitNameHint>;
+let isSegmented = 1 in {
+ defm _SEGMENTED : TransposeVF<"vsxpose", PostFix, Sch,
+ IntrName#"_segmented", XluIndex, trf,
+ "segmented_"#SubUnitNameHint>;
+}
+}
+
+
+multiclass TransposeStartEndVF<string Sch, string IntrName, int XluIndex, DAGOperand trf> {
+// Transpose instruction start/end prefixes
+ defm "" : TransposeStartSegmentedVF<".start", Sch, IntrName#"_start",
+ XluIndex, trf, "transpose_start">;
+let isTransposeEnd = 1, isPush = 1 in {
+ defm _START_END : TransposeEndSegmentedVF<".start.end", Sch#"End",
+ IntrName#"_start_end", XluIndex, trf, "transpose_end">;
+
+ def _END : TPUInstP<(outs trf:$trf), (ins VPR:$vsrc, i32imm:$width, i32imm:$height, trf:$trfsrc),
+ "$trf =\tvxpose.end${pred} $vsrc, $width",
+ [(set trf:$trf, (!cast<Intrinsic>(IntrName#"_end") (vNi32 VPR:$vsrc),
+ (timm:$width), (timm:$height), (i32 XluIndex), (i32 trf:$trfsrc)))]>,
+ Bundle<B_VEX>, Sched<[!cast<SchedWrite>(Sch#"End"#XluIndex)]>,
+ SubUnits<[SU_transpose_end]>;
+ def _CONT : TPUInstP<(outs trf:$trf), (ins VPR:$vsrc, i32imm:$width, i32imm:$height, trf:$trfsrc),
+ "$trf =\tvxpose.cont${pred} $vsrc, $width",
+ [(set trf:$trf, (!cast<Intrinsic>(IntrName#"_cont") (vNi32 VPR:$vsrc),
+ (timm:$width), (timm:$height), (i32 XluIndex), (i32 trf:$trfsrc)))]>,
+ Bundle<B_VEX>, Sched<[!cast<SchedWrite>(Sch#XluIndex)]>,
+ SubUnits<[SU_transpose_continue]>;
+}
+}
+
+// Pattern for the float case.
+multiclass TransposeFloatPatVF<string Name, string IntrName, int XluIndex,
+ DAGOperand trf> {
+ def : Pat<(!cast<Intrinsic>(IntrName) (vNf32 VPR:$vsrc), (timm:$width), (timm:$height),
+ (i32 XluIndex), (i32 trf:$trfsrc)),
+ (!cast<Instruction>(Name) VPR:$vsrc, i32imm:$width,
+ i32imm:$height, trf:$trfsrc)>;
+ // Packed case.
+ def : Pat<(!cast<Intrinsic>(IntrName#"_packed") (vNf32 VPR:$vsrclow),
+ (vNf32 VPR:$vsrchigh), (timm:$width), (timm:$height),
+ (i32 XluIndex), (i32 trf:$trfsrc)),
+ (!cast<Instruction>(Name#"_PACKED_PSEUDO") VPR:$vsrclow, VPR:$vsrchigh,
+ i32imm:$width, i32imm:$height, trf:$trfsrc)>;
+}
+
+// Pattern for segmented and normal case.
+multiclass TransposeSegmentedFloatPatVF<string Name, string Intrinsic, int XluIndex,
+ DAGOperand trf> {
+ defm : TransposeFloatPatVF<Name, Intrinsic, XluIndex, trf>;
+ defm : TransposeFloatPatVF<Name#"_SEGMENTED", Intrinsic#"_segmented", XluIndex, trf>;
+}
+
+// Pattern for transpose and transpose_end intrinsics.
+multiclass TransposeEndFloatPatVF<string Name, string Intrinsic, int XluIndex,
+ DAGOperand trf> {
+ defm : TransposeSegmentedFloatPatVF<Name, Intrinsic#"_start", XluIndex, trf>;
+ defm : TransposeSegmentedFloatPatVF<Name#"_START_END", Intrinsic#"_start_end", XluIndex, trf>;
+
+ def : Pat<(!cast<Intrinsic>(Intrinsic#"_end") (vNf32 VPR:$vsrc), (timm:$width), (timm:$height),
+ (i32 XluIndex), (i32 trf:$trfsrc)),
+ (!cast<Instruction>(Name#"_END") VPR:$vsrc, i32imm:$width,
+ i32imm:$height, trf:$trfsrc)>;
+}
+
+multiclass TransposeUnitVF<int XluIndex,
+ DAGOperand trf = !cast<DAGOperand>("TRFPR"#XluIndex),
+ DAGOperand pcr = !cast<DAGOperand>("PCRPR"#XluIndex)> {
+
+// Use custom inserter to attach the right memory operands.
+let usesCustomInserter = 1 in {
+// Transpose is not marked as push. We only model the transpose_end
+// instructions as pushing into the FIFO. That allows us to model transpose as
+// a normal FIFO. Transpose_end pushes a variable number of items based on its
+// width.
+let isTranspose = 1 in {
+defm TRANSPOSE : TransposeStartEndVF<"WriteTranspose", "int_tpu_tc_transpose", XluIndex, trf>;
+}
+let isPush = 1 in {
+let isPermute = 1 in {
+defm ROTATE : RotateVF<XluIndex, trf>;
+defm BROADCAST : BroadcastVF<XluIndex, trf>;
+defm PERMUTE : PermuteVF<XluIndex, trf, pcr>;
+} // isPermute = 1
+let isReduce = 1, SubUnits = [SU_reduce] in {
+defm XLANE_ADD : XLaneInstVF<"vadd", "add", XluIndex, trf>;
+defm XLANE_MAX : XLaneInstVF<"vmax", "max", XluIndex, trf>;
+defm XLANE_MIN : XLaneInstVF<"vmin", "min", XluIndex, trf>;
+defm XLANE_MAXINDEX : XLaneInstVF<"vmax.index", "maxindex", XluIndex, trf>;
+defm XLANE_MININDEX : XLaneInstVF<"vmin.index", "minindex", XluIndex, trf>;
+} // isReduce = 1, SubUnits = [SU_reduce]
+} // isPush = 1
+
+// SetPermute instructions are not FIFO.
+defm SETPERMUTE_U8 :
+ SetPermuteVF<"u8", int_tpu_set_permute, XluIndex, pcr, WriteSetPermute>,
+ SubUnits<[SU_set_pattern]>;
+defm SETPERMUTE_SUBLANE :
+ SetPermuteVF<"all.u8", int_tpu_set_permute_sublane, XluIndex, pcr,
+ !cast<SchedWrite>("WriteSetPermuteAll"#XluIndex)>,
+ SubUnits<[SU_set_pattern_all]>;
+defm SETPERMUTE_BYTES :
+ SetPermuteVF<"all.bytes.u32", int_tpu_set_permute_bytes, XluIndex, pcr,
+ !cast<SchedWrite>("WriteSetPermuteAll"#XluIndex)>,
+ SubUnits<[SU_set_pattern_all]>;
+} // let usesCustomInserter = 1
+
+defm : TransposeEndFloatPatVF<NAME#"TRANSPOSE", "int_tpu_tc_transpose", XluIndex, trf>;
+defm : RotateFloatPatVF<NAME#"ROTATE", XluIndex>;
+defm : PermuteFloatPatVF<NAME#"PERMUTE", XluIndex, pcr>;
+
+// Pop
+defm "" : XLUPopVF<XluIndex>;
+}
+
+let Predicates = [IsVFTC] in {
+defm tcvfXLU0 : TransposeUnitVF<0>, IsVectorInstruction, IsXLUInst;
+defm tcvfXLU1 : TransposeUnitVF<1>, IsVectorInstruction, IsXLUInst;
+defm tcvfXLU2 : TransposeUnitVF<2>, IsVectorInstruction, IsXLUInst;
+}
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUMemAllocation.cpp b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUMemAllocation.cpp
new file mode 100644
index 0000000..4950c75
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUMemAllocation.cpp
@@ -0,0 +1,845 @@
+//=--------- TPUMemAllocation.cpp - Convert Alloca intrinsics --------------=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass converts the TPU-specific allocation intrinsics into immediate
+// addresses plus alias metadata that preserves the information that the
+// allocations are independent. This pass may be used for unallocated or
+// allocated buffers. If the buffers are unallocated, this pass will just do a
+// bump pointer allocation.
+// Allocations alias if and only if their physical address ranges overlap.
+// This means allocations from different scopes may not alias if their ranges
+// are disjoint.
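+//
+// Illustrative sketch only (the intrinsic name, operand order, and address
+// space number below are assumptions, not part of this change): an
+// allocation such as
+//   %buf = call ptr addrspace(201) @llvm.tpu.allocate.smem(i32 256, i32 64)
+// is replaced by its literal address wrapped in an llvm.tpu.inttoptr call,
+// and memory accesses rooted at it receive alias.scope/noalias metadata so
+// that allocations with disjoint address ranges are known not to alias.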
+//
+//===----------------------------------------------------------------------===//
+#include "TPU.h"
+#include "TPUIRUtils.h"
+#include "TPUSubtarget.h"
+#include "TPUTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsTPU.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cstdint>
+#include <tuple>
+
+#define DEBUG_TYPE "tpu-mem-alloc"
+using namespace llvm;
+
+static cl::opt<bool>
+ TPUFatalMemAllocError("tpu-fatal-mem-alloc-error", cl::Hidden,
+ cl::init(true),
+ cl::desc("Make TPU mem-alloc errors fatal."));
+static cl::opt<bool>
+ TPUEnableSanityChecks("tpu-enable-sanity-checks", cl::Hidden,
+ cl::init(true),
+ cl::desc("Enables spill limit sanity checks."));
+
+namespace {
+// Helper class for all alloca intrinsics.
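+// It wraps both the scoped tpu_allocate_* intrinsics (with either a fixed or
+// a bump-allocated offset) and the stack-style tpu_alloca_* intrinsics,
+// exposing their size, optional fixed offset, and address space.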
+class AllocaIntrinsic : public IntrinsicInst {
+public:
+ unsigned getOffset() const {
+ assert(hasOffset());
+ return cast<ConstantInt>(getOperand(1))->getZExtValue();
+ }
+ bool hasOffset() const {
+ return !isDynamic() &&
+ (getIntrinsicID() == Intrinsic::tpu_allocate_hbm ||
+ getIntrinsicID() == Intrinsic::tpu_allocate_hbm_any ||
+ getIntrinsicID() == Intrinsic::tpu_allocate_dyn_hbm ||
+ getIntrinsicID() == Intrinsic::tpu_allocate_smem ||
+ getIntrinsicID() == Intrinsic::tpu_allocate_smem_any ||
+ getIntrinsicID() == Intrinsic::tpu_allocate_tilespmem ||
+ getIntrinsicID() == Intrinsic::tpu_allocate_timem ||
+ getIntrinsicID() == Intrinsic::tpu_allocate_spmem ||
+ getIntrinsicID() == Intrinsic::tpu_allocate_vmem ||
+ getIntrinsicID() == Intrinsic::tpu_allocate_dreg ||
+ getIntrinsicID() == Intrinsic::tpu_allocate_sflag ||
+ getIntrinsicID() == Intrinsic::tpu_allocate_sflag_any ||
+ getIntrinsicID() == Intrinsic::tpu_allocate_sflag_other ||
+ getIntrinsicID() == Intrinsic::tpu_allocate_dyn_iova ||
+ getIntrinsicID() == Intrinsic::tpu_allocate_iova);
+ }
+ // Dynamic allocations don't have a static offset and size. Treat
+ // tpu_allocate_dyn as tpu_allocate if the offset is constant.
+ bool isDynamic() const {
+ Intrinsic::ID id = getIntrinsicID();
+ return (id == Intrinsic::tpu_allocate_dyn_hbm ||
+ id == Intrinsic::tpu_allocate_dyn_iova) &&
+ !isa<ConstantInt>(getOperand(1));
+ }
+ unsigned getSize() const {
+ return cast<ConstantInt>(getOperand(0))->getZExtValue();
+ }
+
+ unsigned getAddressSpace() const {
+ return getType()->getPointerAddressSpace();
+ }
+ static bool classof(const IntrinsicInst *I) {
+ switch (I->getIntrinsicID()) {
+ case Intrinsic::tpu_alloca_hbm:
+ case Intrinsic::tpu_alloca_smem:
+ case Intrinsic::tpu_alloca_tilespmem:
+ case Intrinsic::tpu_alloca_spmem:
+ case Intrinsic::tpu_alloca_dreg:
+ case Intrinsic::tpu_alloca_sflag:
+ case Intrinsic::tpu_allocate_hbm:
+ case Intrinsic::tpu_allocate_hbm_any:
+ case Intrinsic::tpu_allocate_smem:
+ case Intrinsic::tpu_allocate_smem_any:
+ case Intrinsic::tpu_allocate_tilespmem:
+ case Intrinsic::tpu_allocate_timem:
+ case Intrinsic::tpu_allocate_spmem:
+ case Intrinsic::tpu_allocate_vmem:
+ case Intrinsic::tpu_allocate_dreg:
+ case Intrinsic::tpu_allocate_sflag:
+ case Intrinsic::tpu_allocate_sflag_any:
+ case Intrinsic::tpu_allocate_sflag_other:
+ case Intrinsic::tpu_allocate_dyn_hbm:
+ case Intrinsic::tpu_allocate_iova:
+ case Intrinsic::tpu_allocate_dyn_iova:
+ return true;
+ default:
+ return false;
+ }
+ }
+ static bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+};
+
+static void report_fatal_error_or_test(const Twine &reason) {
+ if (TPUFatalMemAllocError)
+ report_fatal_error(reason);
+ else
+ dbgs() << reason << "\n";
+}
+
+class TPUMemAlloc : public ModulePass {
+public:
+ static char ID;
+ TPUMemAlloc() : ModulePass(ID), TM(nullptr) {}
+ TPUMemAlloc(TPUTargetMachine *TM) : ModulePass(ID), TM(TM) {}
+
+ bool runOnModule(Module &M) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ }
+
+ StringRef getPassName() const override { return "TPU memory allocation"; }
+
+private:
+ TPUTargetMachine *TM;
+};
+
+class TPUMemAllocImpl {
+public:
+ TPUMemAllocImpl(Module &M, TPUTargetMachine *TM) : TM(TM) {
+    // TODO(hgreving): Support for old-style metadata. We should convert the
+    // LLO side to producing only new-style metadata.
+ LegacySMemSpillLimitsExist =
+ M.getNamedMetadata("smem.spill.start") != nullptr &&
+ M.getNamedMetadata("smem.spill.limit") != nullptr;
+ LegacyTileSpmemSpillLimitsExist =
+ M.getNamedMetadata("tilespmem.spill.start") != nullptr &&
+ M.getNamedMetadata("tilespmem.spill.limit") != nullptr;
+ LegacyVMemSpillLimitsExist =
+ M.getNamedMetadata("vmem.spill.start") != nullptr &&
+ M.getNamedMetadata("vmem.spill.limit") != nullptr;
+ // New style per function metadata.
+ if (M.getNamedMetadata("smem.funcs.spill") == nullptr) {
+ SMemSpillLimitsExist = false;
+ assert(M.getNamedMetadata("smem.ranges.spill.start") == nullptr);
+ assert(M.getNamedMetadata("smem.ranges.spill.limit") == nullptr);
+ } else {
+ SMemSpillLimitsExist = true;
+ assert(M.getNamedMetadata("smem.ranges.spill.start") != nullptr);
+ assert(M.getNamedMetadata("smem.ranges.spill.limit") != nullptr);
+ }
+ if (M.getNamedMetadata("tilespmem.funcs.spill") == nullptr) {
+ TileSpmemSpillLimitsExist = false;
+ assert(M.getNamedMetadata("tilespmem.ranges.spill.start") == nullptr);
+ assert(M.getNamedMetadata("tilespmem.ranges.spill.limit") == nullptr);
+ } else {
+ TileSpmemSpillLimitsExist = true;
+ assert(M.getNamedMetadata("tilespmem.ranges.spill.start") != nullptr);
+ assert(M.getNamedMetadata("tilespmem.ranges.spill.limit") != nullptr);
+ }
+ if (M.getNamedMetadata("vmem.funcs.spill") == nullptr) {
+ VMemSpillLimitsExist = false;
+ assert(M.getNamedMetadata("vmem.ranges.spill.start") == nullptr);
+ assert(M.getNamedMetadata("vmem.ranges.spill.limit") == nullptr);
+ } else {
+ VMemSpillLimitsExist = true;
+ assert(M.getNamedMetadata("vmem.ranges.spill.start") != nullptr);
+ assert(M.getNamedMetadata("vmem.ranges.spill.limit") != nullptr);
+ }
+ }
+ bool runOnFunction(Function &F);
+
+ // If function call ABI is enabled, we convert all tpu.alloca.* intrinsics
+ // into real allocas.
+ bool convertAllocasForStack(Function &F);
+
+  // Flags to reflect the current convention. As soon as metadata is present,
+  // we assume that LLO took care of the spill limits, and we ignore the
+  // existing allocation limits (excluding bump alloca intrinsics).
+ bool LegacySMemSpillLimitsExist;
+ bool LegacyTileSpmemSpillLimitsExist;
+ bool LegacyVMemSpillLimitsExist;
+
+private:
+ void SetAllocationMD(AllocaIntrinsic *Allocation, unsigned Offset);
+  // Mark memory accesses with metadata so that they are treated as accessing
+  // independent memory in the rest of the compiler.
+ void SetAliasingMetadata(const std::vector<Instruction *> &MemoryAccesses);
+ // Updates and sets the spill limits after allocation.
+ void SetSpillLimitsMetadata();
+ // Auxiliary function, retrieve function based metadata based on CurFunc.
+ int GetOrInsertFuncMetadata(const Module *M, NamedMDNode *NMDNFuncs);
+ // Associate a physical address to each allocation.
+ void AllocateRegion(const std::vector<AllocaIntrinsic *> &Allocations);
+ // Reset bump pointer base offsets for each memory type based on metadata.
+ void InitBumpPointerBase(Module *M);
+ // Replace allocation intrinsics with literal addresses.
+ void ReplaceAllocationIntrinsics();
+ // Run sanity checking of allocations and spill limit metadata. Currently
+ // checks Smem and TileSpmem allocations on SparseCore.
+ void
+ SanityCheckSpillLimits(const std::vector<AllocaIntrinsic *> &Allocations);
+ // Clean up cast instructions and tpu intrinsic pointer cast before tracking
+ // the pointer dependencies.
+ void simplifyPtrCast(Function &F);
+ MDNode *Domain = nullptr;
+ // Map between allocation physical offset.
+ DenseMap<const AllocaIntrinsic *, unsigned> AllocationOffsets;
+ // Map between allocation and dynamic offset.
+ DenseMap<const AllocaIntrinsic *, Value *> DynamicAllocationOffsets;
+ // Auxiliary track and check functions.
+ DenseMap<const AllocaIntrinsic *, MDNode *> AliasMD;
+ std::pair<unsigned *, unsigned *> TrackAbsCurOffset(unsigned MemAddressSpace);
+ bool CheckAbsLimits(AllocaIntrinsic *Allocation, unsigned Offset,
+ unsigned MemAddressSpace, bool SetAbsOffset);
+ bool CheckFixedAllocationOffset(AllocaIntrinsic *Allocation, unsigned Offset,
+ unsigned MemAddressSpace);
+
+ // Per-function metadata.
+ bool SMemSpillLimitsExist;
+ bool TileSpmemSpillLimitsExist;
+ bool VMemSpillLimitsExist;
+  // Current maximum memory offset of the bump allocator for TileSpmem, Spmem,
+  // Dreg, and Sflag; current absolute module-wide memory offset for Smem and
+  // Vmem.
+ SmallDenseMap<unsigned, uint64_t, 16> CurMemOffset;
+ // Hash map containing base bump pointers per address space.
+ SmallDenseMap<unsigned, uint64_t, 16> BumpPointerBase;
+ TPUTargetMachine *TM;
+ // The currently processed function.
+ Function *CurFun = nullptr;
+ // Structure to describe allocations.
+ struct AllocationDesc : std::tuple<unsigned, unsigned, unsigned> {
+ AllocationDesc(unsigned AddressSpace, unsigned Offset, unsigned Size)
+ : std::tuple<unsigned, unsigned, unsigned>(AddressSpace, Offset, Size) {
+ }
+ unsigned getAddressSpace() const { return std::get<0>(*this); }
+ unsigned getOffset() const { return std::get<1>(*this); }
+ unsigned getSize() const { return std::get<2>(*this); }
+ };
+ // Map of de-duplicated allocation to the Metadata associated.
+ std::map<AllocationDesc, MDNode *> AllocMDCache;
+};
+char TPUMemAlloc::ID = 0;
+} // namespace
+
+Pass *llvm::createTPUMemAllocPass(TPUTargetMachine *TM) {
+ return new TPUMemAlloc(TM);
+}
+
+static bool runImpl(Module &M, TPUTargetMachine *TM) {
+ bool Changed = false;
+ TPUMemAllocImpl TPUMA(M, TM);
+ for (auto &&F : M.functions()) {
+ if (!F.isDeclaration())
+ Changed |= TPUMA.runOnFunction(F);
+ }
+ if (TPUMA.LegacySMemSpillLimitsExist) {
+ M.eraseNamedMetadata(M.getNamedMetadata("smem.spill.start"));
+ M.eraseNamedMetadata(M.getNamedMetadata("smem.spill.limit"));
+ }
+ if (TPUMA.LegacyTileSpmemSpillLimitsExist) {
+ M.eraseNamedMetadata(M.getNamedMetadata("tilespmem.spill.start"));
+ M.eraseNamedMetadata(M.getNamedMetadata("tilespmem.spill.limit"));
+ }
+ if (TPUMA.LegacyVMemSpillLimitsExist) {
+ M.eraseNamedMetadata(M.getNamedMetadata("vmem.spill.start"));
+ M.eraseNamedMetadata(M.getNamedMetadata("vmem.spill.limit"));
+ }
+ return Changed;
+}
+
+bool TPUMemAlloc::runOnModule(Module &M) { return runImpl(M, TM); }
+
+void TPUMemAllocImpl::simplifyPtrCast(Function &F) {
+ for (auto &BB : F)
+ for (auto I = BB.begin(); I != BB.end(); ++I) {
+ if (isa<IntrinsicInst>(*I) &&
+ cast<IntrinsicInst>(*I).getIntrinsicID() == Intrinsic::tpu_inttoptr) {
+ if (isa<IntrinsicInst>(*I->getOperand(0)) &&
+ cast<IntrinsicInst>(*I->getOperand(0)).getIntrinsicID() ==
+ Intrinsic::tpu_ptrtoint) {
+ IRBuilder<> B(&*I);
+ Value *V = B.CreateBitCast(
+ cast<IntrinsicInst>(*I->getOperand(0)).getOperand(0),
+ I->getType());
+ I->replaceAllUsesWith(V);
+ }
+ } else if (isa<CastInst>(*I)) {
+ SimplifyQuery Q(F.getParent()->getDataLayout(), &(*I));
+ if (Value *V = simplifyCastInst(I->getOpcode(), I->getOperand(0),
+ I->getType(), Q))
+ I->replaceAllUsesWith(V);
+ }
+ }
+}
+
+bool TPUMemAllocImpl::convertAllocasForStack(Function &F) {
+ auto &ST = TM->getSubtarget<TPUSubtarget>(F);
+ if (!ST.isTPUABIEnabled())
+ return false;
+ bool Changed = false;
+ std::vector<AllocaIntrinsic *> Allocations;
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+ for (BasicBlock *BB : RPOT) {
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
+ if (AllocaIntrinsic *Allocation = dyn_cast<AllocaIntrinsic>(&I)) {
+ if (Allocation->isDynamic() || Allocation->hasOffset())
+ continue;
+ Allocations.push_back(Allocation);
+ }
+ }
+ }
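+  // Rewrite each collected intrinsic into a real alloca in the same address
+  // space; dynamic and fixed-offset allocations were skipped above and keep
+  // going through the regular scoped-allocation path in runOnFunction.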
+ for (AllocaIntrinsic *Allocation : Allocations) {
+ Changed = true;
+ IRBuilder<> B(Allocation);
+ AllocaInst *Alloca =
+ B.CreateAlloca(Allocation->getType(), Allocation->getAddressSpace(),
+ Allocation->getOperand(0));
+ Allocation->replaceAllUsesWith(Alloca);
+ Allocation->eraseFromParent();
+ }
+ return Changed;
+}
+
+bool TPUMemAllocImpl::runOnFunction(Function &F) {
+ bool Changed = false;
+ simplifyPtrCast(F);
+ Changed |= convertAllocasForStack(F);
+ std::vector<AllocaIntrinsic *> Allocations;
+ std::vector<Instruction *> Barriers;
+ std::vector<Instruction *> MemoryInstructions;
+ CurFun = &F;
+ AllocationOffsets.clear();
+ AllocMDCache.clear();
+ DynamicAllocationOffsets.clear();
+ CurMemOffset[TPUAS_SmemAny] = 0;
+ CurMemOffset[TPUAS_Hbm] = 0;
+ CurMemOffset[TPUAS_HbmAny] = 0;
+ CurMemOffset[TPUAS_TileSpmem] = 0;
+ CurMemOffset[TPUAS_Timem] = 0;
+ CurMemOffset[TPUAS_Spmem] = 0;
+ CurMemOffset[TPUAS_Dreg] = 0;
+ CurMemOffset[TPUAS_Sflag] = 0;
+ CurMemOffset[TPUAS_SflagOther] = 0;
+ CurMemOffset[TPUAS_SflagAny] = 0;
+ InitBumpPointerBase(CurFun->getParent());
+ // Create a new scope domain for this function.
+ MDBuilder MDB(F.getContext());
+ Domain = MDB.createAnonymousAliasScopeDomain(F.getName());
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+ for (BasicBlock *BB : RPOT) {
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
+ if (AllocaIntrinsic *Allocation = dyn_cast<AllocaIntrinsic>(&I)) {
+ Changed = true;
+ Allocations.push_back(Allocation);
+ } else if (isa<LoadInst>(I) || isa<StoreInst>(I) ||
+ (isa<CallBase>(I) &&
+ !cast<CallBase>(I).doesNotAccessMemory())) {
+ MemoryInstructions.push_back(&I);
+ }
+ if (isa<IntrinsicInst>(I) && cast<IntrinsicInst>(I).getIntrinsicID() ==
+ Intrinsic::tpu_end_allocation_scope) {
+ Changed = true;
+ // Assign physical address to each allocation.
+ AllocateRegion(Allocations);
+ // After the end of scope reset all the allocations.
+ Allocations.clear();
+ // Clear tracking Hbm, TileSpmem, SPMem, Sflag, DReg, as well as Other
+ // and Any types.
+ CurMemOffset[TPUAS_SmemAny] = 0;
+ CurMemOffset[TPUAS_Hbm] = 0;
+ CurMemOffset[TPUAS_HbmAny] = 0;
+ CurMemOffset[TPUAS_TileSpmem] = 0;
+ CurMemOffset[TPUAS_Timem] = 0;
+ CurMemOffset[TPUAS_Spmem] = 0;
+ CurMemOffset[TPUAS_Dreg] = 0;
+ CurMemOffset[TPUAS_Sflag] = 0;
+ CurMemOffset[TPUAS_SflagOther] = 0;
+ CurMemOffset[TPUAS_SflagAny] = 0;
+ Barriers.push_back(&I);
+ }
+ }
+ }
+ AllocateRegion(Allocations);
+ SetAliasingMetadata(MemoryInstructions);
+ SetSpillLimitsMetadata();
+ SanityCheckSpillLimits(Allocations);
+ // Replace allocation intrinsic with their physical addresses.
+ ReplaceAllocationIntrinsics();
+ // Delete the end of scope intrinsics as we don't need them anymore.
+ for (auto *I : Barriers)
+ I->eraseFromParent();
+ CurMemOffset[TPUAS_Smem] = 0;
+ CurMemOffset[TPUAS_Vmem] = 0;
+ return Changed;
+}
+
+void TPUMemAllocImpl::ReplaceAllocationIntrinsics() {
+ auto ReplaceAllocation = [this](AllocaIntrinsic *Allocation,
+ Value *Replacement) -> void {
+ Function *IntToPtrFn = Intrinsic::getDeclaration(
+ CurFun->getParent(), llvm::Intrinsic::tpu_inttoptr,
+ Allocation->getType());
+ IRBuilder<> Builder(Allocation);
+ Value *IntTopPtr = Builder.CreateCall(IntToPtrFn, Replacement);
+ Allocation->replaceAllUsesWith(IntTopPtr);
+ Allocation->eraseFromParent();
+ };
+
+ for (auto It : AllocationOffsets) {
+ AllocaIntrinsic *Allocation = (AllocaIntrinsic *)It.first;
+ unsigned Offset = It.second;
+ IRBuilder<> Builder(Allocation);
+ ReplaceAllocation(Allocation, Builder.getInt32(Offset));
+ }
+ for (auto It : DynamicAllocationOffsets) {
+ AllocaIntrinsic *Allocation = (AllocaIntrinsic *)It.first;
+ Value *Replacement = It.second;
+ ReplaceAllocation(Allocation, Replacement);
+ }
+}
+
+void TPUMemAllocImpl::InitBumpPointerBase(Module *M) {
+ // TODO(b/187962134): Prevent stack allocation in the sflag_other address
+ // space using information from the metadata instead (will we actually need
+ // this or leave as is?).
+ auto &ST = TM->getSubtarget<TPUSubtarget>(*CurFun);
+ BumpPointerBase[TPUAS_SflagOther] = ST.getMemSize(TPUAS_SflagOther);
+ BumpPointerBase[TPUAS_SflagAny] = ST.getMemSize(TPUAS_SflagAny);
+ BumpPointerBase[TPUAS_HbmAny] = ST.getMemSize(TPUAS_HbmAny);
+ BumpPointerBase[TPUAS_SmemAny] = ST.getMemSize(TPUAS_SmemAny);
+ BumpPointerBase[TPUAS_Iova] = ST.getMemSize(TPUAS_Iova);
+ auto Init = [&](const unsigned AddrSpace, const Twine &FuncsName,
+ const Twine &Name) -> void {
+ BumpPointerBase[AddrSpace] = 0;
+ NamedMDNode *NMDNFuncs = M->getNamedMetadata(FuncsName);
+ if (!NMDNFuncs)
+ return;
+ NamedMDNode *NMDN = M->getNamedMetadata(Name);
+ if (NMDN == nullptr) {
+ report_fatal_error_or_test("Missing expected metadata: " + Name);
+ }
+ int Idx = GetOrInsertFuncMetadata(M, NMDNFuncs);
+ assert(Idx < NMDN->getNumOperands());
+ MDNode *MDN = cast<MDNode>(NMDN->getOperand(Idx));
+ ConstantAsMetadata *CAM = cast<ConstantAsMetadata>(MDN->getOperand(0));
+ ConstantInt *CI = cast<ConstantInt>(CAM->getValue());
+ unsigned MemSize = ST.getMemSize(AddrSpace);
+ if (MemSize > 0 && CI->getSExtValue() > MemSize) {
+      // We assume that if the memory size is 0, the memory doesn't exist on
+      // the current subtarget, so we skip this check, because metadata may be
+      // shared across the module.
+ report_fatal_error_or_test("Bump pointer base metadata " + Name +
+ " too large: " + Twine(CI->getSExtValue()) +
+ " > " + Twine(MemSize));
+ }
+ BumpPointerBase[AddrSpace] = CI->getSExtValue();
+ };
+ Init(TPUAS_Hbm, "hbm.funcs.alloca", "hbm.start.alloca");
+ Init(TPUAS_HbmAny, "hbmany.funcs.alloca", "hbmany.start.alloca");
+ Init(TPUAS_Smem, "smemany.funcs.alloca", "smemany.start.alloca");
+ Init(TPUAS_SmemAny, "smem.funcs.alloca", "smem.start.alloca");
+ Init(TPUAS_Vmem, "vmem.funcs.alloca", "vmem.start.alloca");
+ Init(TPUAS_Sflag, "sflag.funcs.alloca", "sflag.start.alloca");
+ Init(TPUAS_SflagOther, "sflagother.funcs.alloca", "sflagother.start.alloca");
+ Init(TPUAS_SflagAny, "sflagany.funcs.alloca", "sflagany.start.alloca");
+ Init(TPUAS_Dreg, "dreg.funcs.alloca", "dreg.start.alloca");
+ Init(TPUAS_TileSpmem, "tilespmem.funcs.alloca", "tilespmem.start.alloca");
+ Init(TPUAS_Spmem, "spmem.funcs.alloca", "spmem.start.alloca");
+ Init(TPUAS_Timem, "timem.funcs.alloca", "timem.start.alloca");
+ // No TPUAS_Iova alloca.
+}
+
+bool TPUMemAllocImpl::CheckAbsLimits(AllocaIntrinsic *Allocation,
+ unsigned Offset, unsigned MemAddressSpace,
+ bool SetAbsOffset) {
+ auto &ST = TM->getSubtarget<TPUSubtarget>(*CurFun);
+ if (SetAbsOffset) {
+ CurMemOffset[MemAddressSpace] =
+ std::max((unsigned)CurMemOffset[MemAddressSpace],
+ Offset + Allocation->getSize());
+ }
+ return Offset + Allocation->getSize() > ST.getMemSize(MemAddressSpace);
+}
+
+bool TPUMemAllocImpl::CheckFixedAllocationOffset(AllocaIntrinsic *Allocation,
+ unsigned Offset,
+ unsigned MemAddressSpace) {
+ assert(BumpPointerBase.find(MemAddressSpace) != BumpPointerBase.end());
+ if (BumpPointerBase[MemAddressSpace] == 0)
+ return true;
+ if (Offset + Allocation->getSize() >= BumpPointerBase[MemAddressSpace])
+ return false;
+ return true;
+}
+
+void TPUMemAllocImpl::AllocateRegion(
+ const std::vector<AllocaIntrinsic *> &Allocations) {
+ // Hash map tracking allocations per address space.
+ DenseMap<unsigned, uint64_t> BumpPointer;
+ for (AllocaIntrinsic *Allocation : Allocations) {
+ // Don't check bounds or bump allocation for dynamic allocations.
+ if (Allocation->isDynamic()) {
+ DynamicAllocationOffsets[Allocation] = Allocation->getOperand(1);
+ continue;
+ }
+ unsigned MemAddressSpace = Allocation->getType()->getPointerAddressSpace();
+ unsigned Offset = 0;
+ bool HasOffset = Allocation->hasOffset();
+ if (HasOffset) {
+ Offset = Allocation->getOffset();
+ } else {
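+      // Bump allocation: no fixed offset was requested, so place the buffer
+      // after the previously bump-allocated buffers in this address space.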
+ auto It = BumpPointer.find(MemAddressSpace);
+ Offset = It != BumpPointer.end() ? It->second : 0;
+ Offset += BumpPointerBase[MemAddressSpace];
+ BumpPointer[MemAddressSpace] = Offset + Allocation->getSize();
+ }
+ if (HasOffset) {
+ if (!CheckFixedAllocationOffset(Allocation, Offset, MemAddressSpace))
+ report_fatal_error_or_test(
+ "Fixed allocation is addressing bump allocation space.");
+ }
+ bool IsOutOfAllocationSpace = false;
+ auto &ST = TM->getSubtarget<TPUSubtarget>(*CurFun);
+ switch (MemAddressSpace) {
+ default:
+ // This includes TPUAS_Hbm, where no allocation intrinsics
+ // currently exist.
+ llvm_unreachable("Unknown address space for pointer.");
+ case TPUAS_Smem:
+ IsOutOfAllocationSpace =
+ CheckAbsLimits(Allocation, Offset, MemAddressSpace,
+ !LegacySMemSpillLimitsExist || !HasOffset);
+ break;
+ case TPUAS_Vmem: {
+ // As pointed out before, we ignore checks and updates for existing
+ // allocations as determined by LLO.
+ IsOutOfAllocationSpace =
+ CheckAbsLimits(Allocation, Offset, MemAddressSpace,
+ !LegacyVMemSpillLimitsExist || !HasOffset);
+ break;
+ }
+ case TPUAS_SmemAny:
+ case TPUAS_Sflag:
+ case TPUAS_SflagAny:
+ case TPUAS_SflagOther:
+ case TPUAS_Dreg:
+ case TPUAS_Hbm:
+ case TPUAS_HbmAny:
+ case TPUAS_TileSpmem:
+ case TPUAS_Timem:
+ case TPUAS_Spmem:
+ case TPUAS_Iova: {
+ CurMemOffset[MemAddressSpace] = Offset + Allocation->getSize();
+ IsOutOfAllocationSpace |=
+ CurMemOffset[MemAddressSpace] > ST.getMemSize(MemAddressSpace);
+ IsOutOfAllocationSpace |=
+ ST.getNumTiles() * CurMemOffset[TPUAS_TileSpmem] >
+ ST.getMemSize(TPUAS_Spmem);
+ break;
+ }
+ }
+ if (IsOutOfAllocationSpace)
+ report_fatal_error_or_test("Scoped allocation overflow.");
+ AllocationOffsets[Allocation] = Offset;
+ SetAllocationMD(Allocation, Offset);
+ }
+}
+
+void TPUMemAllocImpl::SetAllocationMD(AllocaIntrinsic *Allocation,
+ unsigned Offset) {
+ auto key = AllocationDesc(Allocation->getAddressSpace(), Offset,
+ Allocation->getSize());
+ auto It = AllocMDCache.find(key);
+ if (It != AllocMDCache.end()) {
+ AliasMD[Allocation] = It->second;
+ return;
+ }
+ MDBuilder MDB(Allocation->getContext());
+ auto *MD = MDB.createAnonymousAliasScope(Domain, "alloc");
+ AliasMD[Allocation] = MD;
+ AllocMDCache[key] = MD;
+}
+
+int TPUMemAllocImpl::GetOrInsertFuncMetadata(const Module *M,
+ NamedMDNode *NMDNFuncs) {
+ int i = 0;
+ for (i = 0; i < NMDNFuncs->getNumOperands(); i++) {
+ MDNode *MDN = cast<MDNode>(NMDNFuncs->getOperand(i));
+ if (cast<ValueAsMetadata>(MDN->getOperand(0))->getValue() == CurFun)
+ break;
+ }
+ if (i == NMDNFuncs->getNumOperands()) {
+ NMDNFuncs->addOperand(
+ MDNode::get(M->getContext(), ValueAsMetadata::get(CurFun)));
+ }
+ return i;
+}
+
+void TPUMemAllocImpl::SetSpillLimitsMetadata() {
+ auto &ST = TM->getSubtarget<TPUSubtarget>(*CurFun);
+ Module *M = CurFun->getParent();
+ unsigned SMemStart = 0;
+ unsigned SMemLimit = UINT_MAX;
+ unsigned TileSpmemStart = 0;
+ unsigned TileSpmemLimit = UINT_MAX;
+ unsigned VMemStart = 0;
+ unsigned VMemLimit = UINT_MAX;
+ if (LegacySMemSpillLimitsExist) {
+ std::pair<int, int> SMemRange =
+ TPU::GetSpillRange(M, "smem.spill.start", "smem.spill.limit");
+ SMemStart = SMemRange.first;
+ SMemLimit = SMemRange.second;
+ }
+ if (LegacyTileSpmemSpillLimitsExist) {
+ std::pair<int, int> TileSpmemRange =
+ TPU::GetSpillRange(M, "tilespmem.spill.start", "tilespmem.spill.limit");
+ TileSpmemStart = TileSpmemRange.first;
+ TileSpmemLimit = TileSpmemRange.second;
+ }
+ if (LegacyVMemSpillLimitsExist) {
+ std::pair<int, int> VMemRange =
+ TPU::GetSpillRange(M, "vmem.spill.start", "vmem.spill.limit");
+ VMemStart = VMemRange.first;
+ VMemLimit = VMemRange.second;
+ }
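+  // Helper: make sure operand Idx exists in the named metadata node, then
+  // overwrite it with the 32-bit constant Val.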
+ auto SetSpillRangeMetadata = [M](StringRef Name, int Idx,
+ unsigned Val) {
+ NamedMDNode *NMDN = M->getOrInsertNamedMetadata(Name);
+ assert(Idx <= NMDN->getNumOperands());
+ if (Idx == NMDN->getNumOperands()) {
+ NMDN->addOperand(
+ MDNode::get(M->getContext(),
+ ConstantAsMetadata::get(Constant::getIntegerValue(
+ Type::getInt32Ty(M->getContext()), APInt(32, Val)))));
+ }
+ NMDN->setOperand(
+ Idx,
+ MDNode::get(M->getContext(),
+ ConstantAsMetadata::get(Constant::getIntegerValue(
+ Type::getInt32Ty(M->getContext()), APInt(32, Val)))));
+ };
+ //
+  // We only set the spill limits if they are not already present.
+ //
+ if (!SMemSpillLimitsExist) {
+ NamedMDNode *NMDNSMemFuncs =
+ M->getOrInsertNamedMetadata("smem.funcs.spill");
+ int Idx = GetOrInsertFuncMetadata(M, NMDNSMemFuncs);
+ SetSpillRangeMetadata("smem.ranges.spill.start", Idx,
+ SMemStart + CurMemOffset[TPUAS_Smem]);
+ SetSpillRangeMetadata("smem.ranges.spill.limit", Idx,
+ std::min(ST.getMemSize(TPUAS_Smem), SMemLimit));
+ }
+ if (!TileSpmemSpillLimitsExist) {
+ NamedMDNode *NMDNTileSpmemFuncs =
+ M->getOrInsertNamedMetadata("tilespmem.funcs.spill");
+ int Idx = GetOrInsertFuncMetadata(M, NMDNTileSpmemFuncs);
+ SetSpillRangeMetadata("tilespmem.ranges.spill.start", Idx,
+ TileSpmemStart + CurMemOffset[TPUAS_TileSpmem]);
+ SetSpillRangeMetadata(
+ "tilespmem.ranges.spill.limit", Idx,
+ std::min(ST.getMemSize(TPUAS_TileSpmem), TileSpmemLimit));
+ }
+ if (!VMemSpillLimitsExist) {
+ NamedMDNode *NMDNVMemFuncs =
+ M->getOrInsertNamedMetadata("vmem.funcs.spill");
+ int Idx = GetOrInsertFuncMetadata(M, NMDNVMemFuncs);
+ SetSpillRangeMetadata("vmem.ranges.spill.start", Idx,
+ VMemStart + CurMemOffset[TPUAS_Vmem]);
+ SetSpillRangeMetadata("vmem.ranges.spill.limit", Idx,
+ std::min(ST.getMemSize(TPUAS_Vmem), VMemLimit));
+ }
+}
+
+void TPUMemAllocImpl::SanityCheckSpillLimits(
+ const std::vector<AllocaIntrinsic *> &Allocations) {
+ if (!TPUEnableSanityChecks)
+ return;
+ auto &ST = TM->getSubtarget<TPUSubtarget>(*CurFun);
+ if (!ST.isSparseCore())
+ return;
+ Module *M = CurFun->getParent();
+ auto GetSpillRange = [this, &M](StringRef MemType) {
+ // LINT.IfChange
+ return TPU::GetSpillRange(M, CurFun, MemType + Twine(".funcs.spill"),
+ MemType + Twine(".ranges.spill.start"),
+ MemType + Twine(".ranges.spill.limit"));
+ // LINT.ThenChange(//depot/google3/platforms/xla/sparse_core/mlo/convert_to_llvm_ir.cc)
+ };
+ for (auto *AI : Allocations) {
+ if (!AI->hasOffset())
+ continue;
+ std::pair<int, int> SR;
+ switch (AI->getAddressSpace()) {
+ case TPUAS_Smem:
+ if (!SMemSpillLimitsExist)
+ continue;
+ SR = GetSpillRange("smem");
+ break;
+ case TPUAS_TileSpmem:
+ if (!TileSpmemSpillLimitsExist)
+ continue;
+ SR = GetSpillRange("tilespmem");
+ break;
+ default:
+ break;
+ }
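+    // Error out if the allocation's start or end falls inside the function's
+    // reserved spill range.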
+ if ((AI->getOffset() >= SR.first && AI->getOffset() < SR.second) ||
+ (AI->getOffset() + AI->getSize() > SR.first &&
+ AI->getOffset() + AI->getSize() < SR.second)) {
+ errs() << "Sanity check failed:\n";
+#ifndef NDEBUG
+ AI->dump();
+#endif
+ report_fatal_error_or_test("Allocation within spill limits.");
+ }
+ }
+}
+
+void TPUMemAllocImpl::SetAliasingMetadata(
+ const std::vector<Instruction *> &MemoryAccesses) {
+ DenseMap<const AllocaIntrinsic *, MDNode *> ScopeMD;
+ for (auto *MemOp : MemoryAccesses) {
+ SmallVector<const Value *, 2> PtrArgs;
+ if (const LoadInst *LI = dyn_cast<LoadInst>(MemOp))
+ PtrArgs.push_back(LI->getPointerOperand());
+ else if (const StoreInst *SI = dyn_cast<StoreInst>(MemOp))
+ PtrArgs.push_back(SI->getPointerOperand());
+ else if (const auto *Call = dyn_cast<CallBase>(MemOp)) {
+ if (!Call->onlyAccessesArgMemory())
+ continue;
+ for (Value *Arg : Call->args()) {
+ if (!Arg->getType()->isPointerTy())
+ continue;
+ PtrArgs.push_back(Arg);
+ }
+ }
+ SmallPtrSet<const Value *, 4> ObjSet;
+ for (const Value *V : PtrArgs) {
+ SmallVector<const Value *, 4> Objects;
+ getUnderlyingObjects(V, Objects, /*LI =*/nullptr, /*MaxLookup =*/20);
+ for (const Value *O : Objects)
+ ObjSet.insert(O);
+ }
+ bool UnknownAliasing = false;
+ for (const Value *V : ObjSet) {
+ // If the value is constant it cannot affect any allocation.
+ if (isa<Constant>(V))
+ continue;
+ if (!isa<AllocaIntrinsic>(V) ||
+ AliasMD.count(cast<AllocaIntrinsic>(V)) == 0) {
+ UnknownAliasing = true;
+ break;
+ }
+ }
+ // If we couldn't prove that the memory access only depends on allocations,
+ // skip setting any metadata.
+ if (UnknownAliasing)
+ continue;
+ SmallVector<Metadata *, 4> Scopes, NoAliases;
+ // Any allocation that doesn't overlap with any of the pointers this access
+ // depends on is marked as noalias.
+ for (auto It : AllocMDCache) {
+ bool AllocaAlias = false;
+ unsigned AddressSpace = It.first.getAddressSpace();
+ unsigned Offset = It.first.getOffset();
+ unsigned EndOffset = Offset + It.first.getSize();
+ for (const Value *V : ObjSet) {
+ const AllocaIntrinsic *DepAlloc = dyn_cast<AllocaIntrinsic>(V);
+ if (DepAlloc == nullptr)
+ continue;
+ // There is no aliasing if the address space is not the same.
+ if (AddressSpace != DepAlloc->getType()->getPointerAddressSpace())
+ continue;
+ unsigned OffsetDep = AllocationOffsets[DepAlloc];
+ unsigned EndOffsetDep = OffsetDep + DepAlloc->getSize();
+ // Check if the interval [Offset, EndOffset) intersects with
+ // [OffsetDep, EndOffsetDep).
+ if ((Offset < OffsetDep && EndOffset > OffsetDep) ||
+ (Offset >= OffsetDep && Offset < EndOffsetDep)) {
+ AllocaAlias = true;
+ break;
+ }
+ }
+ if (!AllocaAlias)
+ NoAliases.push_back(It.second);
+ }
+ if (!NoAliases.empty())
+ MemOp->setMetadata(
+ LLVMContext::MD_noalias,
+ MDNode::concatenate(MemOp->getMetadata(LLVMContext::MD_noalias),
+ MDNode::get(CurFun->getContext(), NoAliases)));
+
+ for (const Value *A : ObjSet) {
+ if (const AllocaIntrinsic *Allocation = dyn_cast<AllocaIntrinsic>(A))
+ Scopes.push_back(AliasMD[Allocation]);
+ }
+ if (!Scopes.empty())
+ MemOp->setMetadata(
+ LLVMContext::MD_alias_scope,
+ MDNode::concatenate(MemOp->getMetadata(LLVMContext::MD_alias_scope),
+ MDNode::get(CurFun->getContext(), Scopes)));
+ }
+}
+
+PreservedAnalyses TPUMemAllocPass::run(Module &M, ModuleAnalysisManager &AM) {
+ if (runImpl(M, &TM)) {
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+ }
+ return PreservedAnalyses::all();
+}
+
+INITIALIZE_PASS(TPUMemAlloc, DEBUG_TYPE, "TPU memory allocation", false, false)
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUNopCoalescing.cpp b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUNopCoalescing.cpp
new file mode 100644
index 0000000..8be0b9e
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUNopCoalescing.cpp
@@ -0,0 +1,257 @@
+//===- TPUNopCoalescing.cpp - Rewrite NOP sleds to SDELAY -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TPU.h"
+#include "TPUBundleTracker.h"
+#include "TPUInstrInfo.h"
+#include "TPUSchedule.h"
+#include "TPUSubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+
+#define DEBUG_TYPE "tpu-nop-coalescing"
+
+using namespace llvm;
+
+extern cl::opt<bool> EnableLoopAnalysis;
+
+namespace {
+class TPUNopCoalescing : public MachineFunctionPass {
+public:
+ static char ID;
+ TPUNopCoalescing() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ StringRef getPassName() const override {
+ return "TPU {S|V}NOP -> {S|V}DELAY coalescing";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ int MaxShortDelay = 8;
+ bool runOnMachineFunction(MachineFunction &MF, unsigned Nop,
+ unsigned ShortDelay, unsigned LongDelay);
+ // A (very) late cleanup of empty blocks. Those are artifacts most
+ // likely due to pseudo instructions like FIFO pseudo copies that were needed
+ // during bundle packing, and removed post-bundle packing.
+ bool cleanUpEmptyBlocks(MachineFunction &MF);
+ // Auxiliary function, returns true if the block only contains an empty
+ // bundle, or no bundle.
+ bool blockHasNoRealInstruction(const MachineBasicBlock &MBB);
+};
+
+char TPUNopCoalescing::ID = 0;
+
+} // namespace
+
+INITIALIZE_PASS(TPUNopCoalescing, DEBUG_TYPE, "TPU NOP coalescing", false,
+ false)
+
+bool TPUNopCoalescing::runOnMachineFunction(MachineFunction &MF) {
+ bool Changed = false;
+ Changed |= runOnMachineFunction(MF, TPU::SNOP, TPU::SDELAY, TPU::SDELAY);
+ Changed |= runOnMachineFunction(MF, TPU::VNOP, TPU::VDELAY, TPU::VDELAY_LONG);
+
+ if (EnableLoopAnalysis) {
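+ // Report per-function and per-loop bundle counts for loop analysis.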
+ int BundleCount = std::accumulate(
+ MF.begin(), MF.end(), 0, [&](int MFC, MachineBasicBlock &MBB) {
+ return MFC +
+ std::count_if(MBB.begin(), MBB.end(), [&](MachineInstr &MI) {
+ return MI.getOpcode() == TPU::BUNDLE;
+ });
+ });
+ dbgs() << "Bundles in function " << MF.getName() << ": " << BundleCount
+ << "\n";
+ auto &MLI = getAnalysis<MachineLoopInfo>();
+ for (auto &L : MLI.getBase().getLoopsInPreorder()) {
+ int BundleCount = std::accumulate(
+ L->blocks().begin(), L->blocks().end(), 0,
+ [&](int MFC, MachineBasicBlock *MBB) {
+ return MFC + std::count_if(MBB->begin(), MBB->end(),
+ [&](MachineInstr &MI) {
+ return MI.getOpcode() == TPU::BUNDLE;
+ });
+ });
+ dbgs() << "Bundles in loop with header block bb."
+ << L->getHeader()->getNumber() << ": " << BundleCount
+ << "\nFull loop:\n"
+ << *L;
+ }
+ }
+
+ return Changed;
+}
+
+bool TPUNopCoalescing::runOnMachineFunction(MachineFunction &MF, unsigned Nop,
+ unsigned ShortDelay,
+ unsigned LongDelay) {
+ auto &ST = MF.getSubtarget<TPUSubtarget>();
+ auto *TII = ST.getInstrInfo();
+ BundleTracker BT(ST);
+
+ if (ST.hasVfcTensorCore())
+ MaxShortDelay = 16;
+
+ for (MachineBasicBlock &MBB : MF) {
+ if (blockHasNoRealInstruction(MBB))
+ continue;
+ auto DelayInstPt = MBB.begin();
+ bool IsTopLevelNop = true;
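+ // DelayInstPt tracks the last non-NOP bundle seen; a run of NOP bundles
+ // following it may be folded into that bundle as a single delay.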
+ for (auto I = MBB.begin(); I != MBB.end();) {
+ auto BundleMayBeBranchDelay = [](MachineBasicBlock::instr_iterator II,
+ MachineBasicBlock::instr_iterator EE) {
+ while (II != EE && II->isBundledWithPred()) {
+ if (II->isTerminator() || II->isCall())
+ return true;
+ II++;
+ }
+ return false;
+ };
+ assert(I->getOpcode() == TPU::BUNDLE);
+ if (BundleMayBeBranchDelay(std::next(I.getInstrIterator()),
+ MBB.instr_end())) {
+ // We're not coalescing once a terminator has been encountered. This has
+ // no effect if the number of delay slots is == 1, but matters if > 1.
+ break;
+ }
+ if (!TPUInstrInfo::isNopBundle(&*I, Nop)) {
+ DelayInstPt = I;
+ IsTopLevelNop = false;
+ ++I;
+ continue;
+ }
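+ // Count the run of consecutive NOP bundles starting at I.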
+ auto E = std::next(I);
+ unsigned N = 1;
+ while (E != MBB.end() && TPUInstrInfo::isNopBundle(&*E, Nop))
+ ++E, ++N;
+ MachineInstr *MI = AddDefaultPred(
+ BuildMI(MF, DelayInstPt->getDebugLoc(),
+ TII->get(N > MaxShortDelay ? LongDelay : ShortDelay))
+ .addImm(N));
+ MachineBasicBlock::instr_iterator It;
+ bool CanInsertInPrevBundle = false;
+ if (!IsTopLevelNop) {
+ BT.clear();
+ BT.addBundle(&(*DelayInstPt));
+ It = std::next(DelayInstPt.getInstrIterator());
+ // We're making sure MI has a parent when calling the bundle tracker.
+ MBB.insert(MBB.begin(), MI);
+ CanInsertInPrevBundle = BT.canAddMI(*MI);
+ MI->removeFromParent();
+ } else {
+ // If the Nops are at the top there is no previous bundle. Insert
+ // above the first bundle.
+ It = MBB.begin().getInstrIterator();
+ }
+ if (CanInsertInPrevBundle) {
+ // There is room in the previous bundle. Merge the VDelay in it.
+ MIBundleBuilder Builder(&(*DelayInstPt));
+ Builder.append(MI);
+ } else if (N == 1) {
+ // It is a single NOP that cannot be folded in the previous bundle so
+ // we leave it alone.
+ MBB.insert(MBB.begin(), MI);
+ MI->eraseFromParent();
+ I = E;
+ continue;
+ } else {
+ while (It->isBundledWithPred())
+ It++;
+ // We couldn't insert the delay in the previous bundle so create a new
+ // bundle with a delay of N - 1. As the delay will be alone in its
+ // bundle it doesn't matter if we use short or long delay.
+ MI->getOperand(0).setImm(N - 1);
+ MIBundleBuilder Builder(MBB, It);
+ Builder.append(MI);
+ llvm::finalizeBundle(MBB, MI->getIterator(), It);
+ }
+ MBB.erase(I, E);
+ I = E;
+ }
+ }
+ // The following is not strictly necessary if we encode empty bundles as nops.
+ auto InsertNop = [&MF, &TII](auto I) {
+ const TPUSubtarget &ST = MF.getSubtarget<TPUSubtarget>();
+ MachineInstr *Nop = AddDefaultPred(
+ BuildMI(MF, I->getDebugLoc(), TII->get(ST.getDefaultNop())));
+ MIBundleBuilder Builder(&*I);
+ Builder.append(Nop);
+ };
+ auto AllInstructionsPseudo = [](auto I) {
+ for (auto It = std::next(I->getIterator());; ++It) {
+ if (!It->isPseudo() && It->getOpcode() != TPU::IMPLICIT_DEF)
+ return false;
+ if (!It->isBundledWithSucc())
+ break;
+ }
+ return true;
+ };
+ for (MachineBasicBlock &MBB : MF) {
+ if (blockHasNoRealInstruction(MBB)) {
+ // We're not filling in empty blocks with nops. These are artifacts of
+ // pseudo instructions, most likely fifo pseudo copies that were removed
+ // post-bundle packing.
+ MBB.clear();
+ continue;
+ }
+ for (auto I = MBB.begin(); I != MBB.end(); I++) {
+ assert(I->getOpcode() == TPU::BUNDLE);
+ if (I->getBundleSize() == 0) {
+ InsertNop(I);
+ continue;
+ }
+ if (!AllInstructionsPseudo(I))
+ continue;
+ InsertNop(I);
+ }
+ }
+
+ cleanUpEmptyBlocks(MF);
+
+ return true;
+}
+
+bool TPUNopCoalescing::blockHasNoRealInstruction(const MachineBasicBlock &MBB) {
+ if (MBB.empty())
+ return true;
+ if (MBB.size() == 1) {
+ assert(MBB.instr_begin()->isBundle());
+ return true;
+ }
+ return false;
+}
+
+bool TPUNopCoalescing::cleanUpEmptyBlocks(MachineFunction &MF) {
+ SmallVector<MachineBasicBlock *> EmptyBlocks;
+ for (MachineBasicBlock &MBB : MF) {
+ if (!blockHasNoRealInstruction(MBB))
+ continue;
+ EmptyBlocks.push_back(&MBB);
+ }
+ for (MachineBasicBlock *MBB : EmptyBlocks) {
+ assert(MBB->succ_size() == 1);
+ MachineBasicBlock *SuccMBB = *MBB->succ_begin();
+ for (MachineBasicBlock *PredMBB : MBB->predecessors()) {
+ PredMBB->replaceSuccessor(MBB, SuccMBB);
+ TPUInstrInfo::updateTerminator(*PredMBB, MBB, SuccMBB);
+ }
+ MBB->eraseFromParent();
+ }
+ return true;
+}
+
+Pass *llvm::createTPUNopCoalescingPass() { return new TPUNopCoalescing(); }
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUPostBundleLowerPseudos.cpp b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUPostBundleLowerPseudos.cpp
new file mode 100644
index 0000000..32b20c6
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUPostBundleLowerPseudos.cpp
@@ -0,0 +1,710 @@
+//===- TPUPostBundleLowerPseudos.cpp - Lower pseudos post-bundling -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers pseudo instructions after bundle packing.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/TPUMCTargetDesc.h"
+#include "TPU.h"
+#include "TPUBundleTracker.h"
+#include "TPUInstrInfo.h"
+#include "TPUSchedule.h"
+#include "TPUSubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+
+#define DEBUG_TYPE "tpu-post-bundle-lower"
+#define DEBUG_TYPE_LOOP_ANALYSIS "tpu-loop-analysis"
+#define LLVM_DEBUG_LOOP_ANALYSIS(X) DEBUG_WITH_TYPE(DEBUG_TYPE_LOOP_ANALYSIS, X)
+
+using namespace llvm;
+
+namespace {
+class PostBundleLowerPseudos : public MachineFunctionPass {
+public:
+ static char ID;
+ PostBundleLowerPseudos() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ StringRef getPassName() const override {
+ return "TPU late pseudo expansion pass";
+ }
+
+ void AssertBundleValid(MachineBasicBlock::iterator Bundle);
+
+private:
+ void handleBcLoopEnd(MachineInstr *MI);
+ void handleBcPipelinedInst(MachineInstr *MI, int &PipelineDepth);
+ void handleBcSubregCopyInst(MachineInstr *MI);
+ void handleXLUPacked(MachineInstr *MI);
+ void handleTrap(MachineInstr *MI);
+ void handlePseudoIAR(MachineInstr *MI);
+ void handlePseudoPMOV(MachineInstr *MI);
+ void handleFifoPseudoCopies(MachineInstr *MI);
+ void handleScBundleLimiter(MachineInstr *MI);
+ void handleReadLocalCycleCountOrGlobalTimeCounter(MachineInstr *MI);
+ void handlePseudoTrap(MachineInstr *MI);
+ void handlePseudoSfenceImem(MachineInstr *MI);
+ // Returns true if the instruction was a spill pseudo instruction and was handled.
+ bool handleSpillOpcodes(MachineInstr *MI);
+ bool handleBranchBundle(MachineInstr *MI);
+ bool handleCallBundle(MachineInstr *MI);
+ // On Sparsecore, we find the vmovs that need to be converted into vmovc.
+ void lowerVmovc(MachineFunction &MF);
+
+ const TPUSubtarget *ST;
+ const TPUInstrInfo *TII;
+ // Late-initialized bundle tracker.
+ std::optional<BundleTracker> BT;
+#ifndef NDEBUG
+ SmallDenseSet<MachineBasicBlock *> TrackFillBlocks;
+ SmallDenseSet<MachineBasicBlock *> TrackSpillBlocks;
+#endif
+};
+
+char PostBundleLowerPseudos::ID = 0;
+
+struct PseudoIARTy {
+ unsigned PseudoOp;
+ unsigned NativeOp;
+};
+
+using namespace TPU;
+#define GET_PseudoIARInst_IMPL
+#include "TPUGenSearchableTables.inc"
+#undef GET_PseudoIARInst_IMPL
+} // namespace
+
+INITIALIZE_PASS(PostBundleLowerPseudos, DEBUG_TYPE,
+ "TPU post bundle lower pseudos", false, false)
+
+bool PostBundleLowerPseudos::runOnMachineFunction(MachineFunction &MF) {
+ ST = &MF.getSubtarget<TPUSubtarget>();
+ TII = ST->getInstrInfo();
+#ifndef NDEBUG
+ TrackFillBlocks.clear();
+ TrackSpillBlocks.clear();
+#endif
+ BT.emplace(*ST);
+
+ int BcPipelineDepth = -1;
+ for (auto &MBB : MF) {
+ if (ST->isPxcBarnaCore()) {
+ // These Barnacore post pseudos rely on traversing in forward order.
+ for (auto BI = MBB.instr_begin(); BI != MBB.instr_end();) {
+ MachineBasicBlock::instr_iterator MI = BI++;
+ unsigned Opcode = MI->getOpcode();
+ if (Opcode == TPU::bcLOOP_END) {
+ handleBcLoopEnd(&*MI);
+ continue;
+ }
+ if (!MI->isBundle() && TII->isVectorInstruction(*MI)) {
+ handleBcPipelinedInst(&*MI, BcPipelineDepth);
+ continue;
+ }
+ if (Opcode == TPU::bcIMPLICIT_SUBREG_COPY) {
+ handleBcSubregCopyInst(&*MI);
+ continue;
+ }
+ }
+ continue;
+ }
+ assert(!ST->isPxcBarnaCore());
+ for (auto BI = MBB.instr_rbegin(); BI != MBB.instr_rend();) {
+ // Some of the post pseudos rely on traversing in reverse order.
+ MachineBasicBlock::reverse_instr_iterator MI = BI++;
+ unsigned Opcode = MI->getOpcode();
+ if (MI->getOpcode() == TPU::BUNDLE)
+ continue;
+ if (TPUInstrInfo::isPacked(MI->getDesc())) {
+ handleXLUPacked(&*MI);
+ continue;
+ }
+ if (Opcode == TPU::TRAP) {
+ handleTrap(&*MI);
+ continue;
+ }
+ if (getPseudoIAROpcode(Opcode)) {
+ handlePseudoIAR(&*MI);
+ continue;
+ }
+ if (TPUInstrInfo::isFifoPseudoCopy(*MI)) {
+ handleFifoPseudoCopies(&*MI);
+ continue;
+ }
+ if (TPUInstrInfo::isScBundleLimiter(*MI)) {
+ handleScBundleLimiter(&*MI);
+ continue;
+ }
+ if (Opcode == TPU::LCC_READ || Opcode == TPU::GTC_READ) {
+ handleReadLocalCycleCountOrGlobalTimeCounter(&*MI);
+ continue;
+ }
+ if (Opcode == TPU::PMOV) {
+ handlePseudoPMOV(&*MI);
+ continue;
+ }
+ if (Opcode == TPU::scPSEUDO_TRAPi || Opcode == TPU::scPSEUDO_TRAPr) {
+ handlePseudoTrap(&*MI);
+ continue;
+ }
+ if (Opcode == TPU::SFENCE_IMEM) {
+ handlePseudoSfenceImem(&*MI);
+ continue;
+ }
+ if (handleSpillOpcodes(&*MI))
+ continue;
+ if (handleBranchBundle(&*MI))
+ continue;
+ if (handleCallBundle(&*MI))
+ continue;
+ }
+ }
+ if (ST->isSparseCore()) {
+ // This could probably be done within the loop above, but we simplify and do
+ // it separately.
+ lowerVmovc(MF);
+ }
+
+ LLVM_DEBUG_LOOP_ANALYSIS({
+ for (MachineBasicBlock &MBB : MF) {
+ std::string s;
+ if (TrackSpillBlocks.count(&MBB)) {
+ s = "spill";
+ }
+ if (TrackFillBlocks.count(&MBB)) {
+ if (!s.empty())
+ s += " + ";
+ s += "fill";
+ }
+ if (!s.empty())
+ dbgs() << "bb." << MBB.getNumber() << " has " << s << "\n";
+ }
+ });
+
+ return true;
+}
+
+void PostBundleLowerPseudos::lowerVmovc(MachineFunction &MF) {
+ // This relies on the fact that TPU::VMOVC is defined using only one
+ // SLOT_VRES0 resource. The second constraint of sharing another read port is
+ // modeled by checking the number of read ports in the bundle tracker.
+ unsigned VMOVCResources = 0;
+ auto InstrItins = MF.getSubtarget().getInstrItineraryData();
+ unsigned SchedClassIdx = TII->get(TPU::scVMOVC).SchedClass;
+ for (const InstrStage &IS : make_range(InstrItins->beginStage(SchedClassIdx),
+ InstrItins->endStage(SchedClassIdx)))
+ VMOVCResources |= IS.getUnits();
+ auto ProcessBundle = [this,
+ VMOVCResources](MachineBasicBlock::instr_iterator MIB,
+ MachineBasicBlock::instr_iterator MIE) {
+ if (MIB == MIE)
+ return;
+ SmallVector<std::pair<int, MachineInstr *>, 4> VMOVrs;
+ // The Idx is used to query the instruction's resource in the bundle.
+ int Idx = 0;
+ BT->clear();
+ // Traverse in reverse order to match the bundle packer.
+ // TODO(b/178145223, b/181709129): We can do better.
+ for (auto MII = std::prev(MIE); MII != std::prev(MIB); MII--) {
+ if (!MII->getDesc().getSchedClass() || MII->isBranch())
+ continue;
+ assert(BT->canAddMI(*MII));
+ BT->addMI(*MII);
+ if (MII->getOpcode() == TPU::VMOVr)
+ VMOVrs.push_back(std::make_pair(Idx, &*MII));
+ Idx++;
+ }
+ for (auto &I : VMOVrs) {
+ unsigned Resources = BT->getSlotsUsed(I.first);
+ // See above. This would not work if shared resources were added to
+ // the vmovc instruction's itinerary.
+ if (Resources == VMOVCResources)
+ I.second->setDesc(TII->get(TPU::scVMOVC));
+ }
+ assert(std::count_if(VMOVrs.begin(), VMOVrs.end(), [](auto &I) {
+ return I.second->getOpcode() == TPU::scVMOVC;
+ }) <= 1);
+ };
+ for (auto &MBB : MF) {
+ for (auto BundleIt =
+ MachineBasicBlock::iterator::getAtBundleBegin(MBB.instr_begin());
+ BundleIt != MBB.instr_end(); BundleIt++) {
+ auto It = BundleIt.getInstrIterator();
+ ProcessBundle(getBundleStart(It), getBundleEnd(It));
+ }
+ }
+}
+
+MachineInstr *findLoopSetup(MachineBasicBlock *LoopBB) {
+ assert(LoopBB->pred_size() == 2);
+ MachineBasicBlock *LoopPreheader = *LoopBB->pred_begin();
+ if (LoopPreheader == LoopBB)
+ LoopPreheader = *std::next(LoopBB->pred_begin());
+
+ auto LoopStart = find_if(LoopPreheader->instrs(), [](const MachineInstr &I) {
+ return I.getOpcode() == TPU::bcLOOP_SETUP;
+ });
+ assert(LoopStart != LoopPreheader->instr_end() &&
+ "Couldn't find bcLOOP_START!");
+ return &*LoopStart;
+}
+
+void PostBundleLowerPseudos::handleBcLoopEnd(MachineInstr *MI) {
+ MachineFunction &MF = *MI->getMF();
+ const TargetSubtargetInfo *STI = &MF.getSubtarget();
+ const TargetInstrInfo *TII = STI->getInstrInfo();
+
+ // Find the bcLOOP_SETUP in the preheader block.
+ MachineBasicBlock *LoopBB = MI->getParent();
+ MachineInstr *LoopStart = findLoopSetup(LoopBB);
+ MachineBasicBlock *LoopPreheader = LoopStart->getParent();
+
+ // Reverse to the start of the LOOP_END bundle so we can use std::distance
+ // to work out the instruction count.
+ auto It = MI->getIterator();
+ while (It->isBundledWithPred() && !It->isBundle()) {
+ --It;
+ }
+ unsigned InstrCount =
+ std::distance(LoopBB->begin(), MachineBasicBlock::iterator(It));
+ if (InstrCount == 0) {
+ // Special case: BarnaCore does not allow 1-cycle loops. As a workaround,
+ // we're adding a nop to the loop.
+ MachineBasicBlock::instr_iterator Before = LoopBB->instr_begin();
+ MachineInstr *MI = AddDefaultPred(
+ BuildMI(*LoopBB, Before, LoopBB->instr_front().getDebugLoc(),
+ TII->get(TPU::bcNOP)));
+ llvm::finalizeBundle(*MI->getParent(), MI->getIterator(), Before);
+ InstrCount++;
+ }
+ unsigned PipelineDepth = LoopStart->getOperand(0).getImm();
+ BuildMI(*LoopPreheader, *LoopStart, LoopStart->getDebugLoc(),
+ TII->get(TPU::bcLOOP_START))
+ .addImm(InstrCount)
+ .addImm(PipelineDepth);
+
+ LoopStart->eraseFromBundle();
+}
+
+void PostBundleLowerPseudos::handleBcPipelinedInst(MachineInstr *MI,
+ int &PipelineDepth) {
+ if (TPUPredicate(MI).getBarnaCorePipelineStage() == 0)
+ // Nothing to do.
+ return;
+ // Any use or def of VPRs within a pipelined instruction need to be adjusted
+ // for the pipeline stage.
+ //
+ // Up to this point, for a VPR base vN, pipelined instructions in stage $ps
+ // use register vN+$ps. This allows the scheduler and bundle packer to treat
+ // defs and use across different stages as disjoint, and models accurately
+ // the actual register that the hardware will use (modulo the modulo :) ).
+ if (PipelineDepth == -1) {
+ MachineInstr *LoopStart = findLoopSetup(MI->getParent());
+ PipelineDepth = LoopStart->getOperand(0).getImm() + 1;
+ }
+ assert(PipelineDepth > 0 && PipelineDepth < 5);
+ DenseSet<unsigned> ValidRegs;
+ for (unsigned I = 0; I < 32; I += PipelineDepth)
+ ValidRegs.insert(TPU::V0 + I);
+ for (unsigned I = 0; I < 8; I += PipelineDepth)
+ ValidRegs.insert(TPU::M0 + I);
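+ // Snap each VPR/MPR operand down to the base register of its pipeline
+ // stage group, i.e. the nearest valid register at or below it.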
+ for (MachineOperand &MO : MI->operands()) {
+ if (MO.isReg() && ((MO.getReg() >= TPU::V0 && MO.getReg() <= TPU::V31) ||
+ (MO.getReg() >= TPU::M0 && MO.getReg() <= TPU::M7))) {
+ unsigned R = MO.getReg();
+ while (ValidRegs.count(R) == 0)
+ --R;
+ MO.setReg(R);
+ }
+ }
+
+ // Note that this doesn't apply to VAGG registers as we represent
+ // VAGG.sub_ps1 subregs as explicitly different physical registers, so
+ // they're printed correctly as-is. This isn't feasible for VPRs as every
+ // VPR is part of multiple register tuples at different pipeline stages, so
+ // we would end up with an explosion of physregs.
+}
+
+void PostBundleLowerPseudos::handleBcSubregCopyInst(MachineInstr *MI) {
+ auto Bundle =
+ MachineBasicBlock::iterator::getAtBundleBegin(MI->getIterator());
+ (void)Bundle;
+ MI->eraseFromBundle();
+ // Final sanity check.
+ LLVM_DEBUG(AssertBundleValid(Bundle));
+}
+
+void PostBundleLowerPseudos::AssertBundleValid(
+ MachineBasicBlock::iterator Bundle) {
+ BT->clear();
+ BT->addBundle(&(*Bundle));
+}
+
+//===----------------------------------------------------------------------===//
+// Pseudo instruction mapping.
+//===----------------------------------------------------------------------===//
+namespace {
+struct PseudoInstMappingTy {
+ unsigned Pseudo;
+ unsigned Lowered;
+};
+
+using namespace TPU;
+#define GET_PseudoInstTable_IMPL
+#include "TPUGenSearchableTables.inc"
+#undef GET_PseudoInstTable_IMPL
+} // namespace
+
+void PostBundleLowerPseudos::handleXLUPacked(MachineInstr *MI) {
+ MachineBasicBlock *MBB = &*MI->getParent();
+ const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
+ auto Bundle =
+ MachineBasicBlock::iterator::getAtBundleBegin(MI->getIterator());
+ (void)Bundle;
+ const PseudoInstMappingTy *PseudoOpcodeMap =
+ PseudoInstMapping(MI->getOpcode());
+ assert(PseudoOpcodeMap && "Missing pseudo opcode mapping.");
+ const MachineInstrBuilder &MB = BuildMI(*MBB, &*MI, MI->getDebugLoc(),
+ TII->get(PseudoOpcodeMap->Lowered));
+ for (unsigned I = 0; I < MI->getNumOperands(); I++) {
+ // Copy all operands except for second vector source.
+ if (I == 2)
+ continue;
+ MB.add(MI->getOperand(I));
+ }
+ BuildMI(*MBB, &*MI, MI->getDebugLoc(),
+ TII->get(TPU::XLUSUPP_PACKED))
+ .add(MI->getOperand(2)) // second source
+ .add(MI->getOperand(MI->getNumOperands() - 2)) // Predicate
+ .add(MI->getOperand(MI->getNumOperands() - 1));
+ MI->eraseFromBundle();
+ // Final sanity check.
+ LLVM_DEBUG(AssertBundleValid(Bundle));
+}
+
+void PostBundleLowerPseudos::handleTrap(MachineInstr *MI) {
+ MachineBasicBlock *MBB = &*MI->getParent();
+ const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
+ auto Bundle =
+ MachineBasicBlock::iterator::getAtBundleBegin(MI->getIterator());
+ (void)Bundle;
+ BuildMI(*MBB, &*MI, MI->getDebugLoc(), TII->get(TPU::HALT))
+ .add(MI->getOperand(0))
+ .addImm(0);
+ MI->eraseFromBundle();
+ // Final sanity check.
+ LLVM_DEBUG(AssertBundleValid(Bundle));
+}
+
+void PostBundleLowerPseudos::handlePseudoIAR(MachineInstr *MI) {
+ MachineBasicBlock *MBB = &*MI->getParent();
+ const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
+ auto Bundle =
+ MachineBasicBlock::iterator::getAtBundleBegin(MI->getIterator());
+ (void)Bundle;
+ const PseudoIARTy *PseudoOpcodeMap = getPseudoIAROpcode(MI->getOpcode());
+ assert(PseudoOpcodeMap && "Missing pseudo opcode mapping.");
+ const MachineInstrBuilder &MB = BuildMI(*MBB, &*MI, MI->getDebugLoc(),
+ TII->get(PseudoOpcodeMap->NativeOp));
+ // Copy all operands.
+ for (unsigned I = 0; I < MI->getNumOperands(); I++)
+ MB.add(MI->getOperand(I));
+ MI->eraseFromBundle();
+ // Final sanity check.
+ LLVM_DEBUG(AssertBundleValid(Bundle));
+}
+
+void PostBundleLowerPseudos::handlePseudoPMOV(MachineInstr *MI) {
+ MachineBasicBlock *MBB = &*MI->getParent();
+ const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
+ auto Bundle =
+ MachineBasicBlock::iterator::getAtBundleBegin(MI->getIterator());
+ (void)Bundle;
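+ // Lower the PMOV pseudo to a POR of the source predicate with itself.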
+ BuildMI(*MBB, &*MI, MI->getDebugLoc(), TII->get(TPU::POR))
+ .add(MI->getOperand(0))
+ .add(MI->getOperand(1))
+ .addImm(0)
+ .add(MI->getOperand(1))
+ .addImm(0)
+ .add(MI->getOperand(2))
+ .add(MI->getOperand(3));
+ MI->eraseFromBundle();
+ // Final sanity check.
+ LLVM_DEBUG(AssertBundleValid(Bundle));
+}
+
+void PostBundleLowerPseudos::handleFifoPseudoCopies(MachineInstr *MI) {
+ auto Bundle =
+ MachineBasicBlock::iterator::getAtBundleBegin(MI->getIterator());
+ (void)Bundle;
+ MI->eraseFromBundle();
+ // Final sanity check.
+ LLVM_DEBUG(AssertBundleValid(Bundle));
+}
+
+void PostBundleLowerPseudos::handleScBundleLimiter(MachineInstr *MI) {
+ auto Bundle =
+ MachineBasicBlock::iterator::getAtBundleBegin(MI->getIterator());
+ (void)Bundle;
+ MI->eraseFromBundle();
+ // Final sanity check.
+ LLVM_DEBUG(AssertBundleValid(Bundle));
+}
+
+void PostBundleLowerPseudos::handlePseudoTrap(MachineInstr *MI) {
+ MachineBasicBlock *MBB = &*MI->getParent();
+ const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
+ assert(MI->getOpcode() == TPU::scPSEUDO_TRAPi ||
+ MI->getOpcode() == TPU::scPSEUDO_TRAPr);
+
+ bool IsImmediateVersion = MI->getOpcode() == TPU::scPSEUDO_TRAPi;
+ // This adds a bundle. This potentially makes the branch move one bundle down,
+ // which is ok, because the swait bundle we're creating here has enough space.
+ auto SintMIB =
+ BuildMI(*MBB, &*MI, MI->getDebugLoc(),
+ TII->get(IsImmediateVersion ? TPU::scTRAPi : TPU::scTRAPr));
+ SintMIB.addGlobalAddress(MI->getOperand(0).getGlobal());
+ if (IsImmediateVersion)
+ SintMIB.addImm(MI->getOperand(2).getImm());
+ else
+ SintMIB.addReg(MI->getOperand(2).getReg());
+
+ TPUPredicate Pred(&*MI);
+ Pred.addTo(&SintMIB);
+
+ auto TrapBundle =
+ MachineBasicBlock::iterator::getAtBundleBegin(MI->getIterator());
+ assert(TrapBundle->isBundle());
+
+ auto PotentialWaitBundle = std::next(TrapBundle);
+ bool PotentialWaitBundleExists = PotentialWaitBundle != MBB->end();
+ MachineInstr *OnlyNop = nullptr;
+ if (PotentialWaitBundleExists) {
+ BT->clear();
+ MachineBasicBlock::instr_iterator MIB =
+ getBundleStart(PotentialWaitBundle.getInstrIterator());
+ MachineBasicBlock::instr_iterator MIE =
+ getBundleEnd(PotentialWaitBundle.getInstrIterator());
+ // Traverse in reverse order to match the bundle packer.
+ // TODO(b/178145223, b/181709129): We can do better.
+ for (auto MII = std::prev(MIE); MII != std::prev(MIB); MII--) {
+ if (MII->isBundle()) {
+ // FIXME: This is contrary to the LLVM documentation
+ // https://llvm.org/doxygen/namespacellvm.html#a0b1a8d3b98bc35fd5cb5b04843beeea5
+ // which mentions getBundleStart points to the first instruction in the
+ // bundle, not the bundle itself.
+ break;
+ }
+ if (!MII->getDesc().getSchedClass()) {
+ OnlyNop = MII->getOpcode() == TPU::SNOP ? &*MII : nullptr;
+ continue;
+ }
+ assert(BT->canAddMI(*MII));
+ BT->addMI(*MII);
+ }
+ }
+ MachineInstrBuilder WaitMIB =
+ BuildMI(*MBB, MBB->instr_end(), MI->getDebugLoc(),
+ TII->get(TPU::scWAITEQri))
+ .addReg(MI->getOperand(1).getReg())
+ .addImm(1);
+ Pred.addTo(&WaitMIB);
+ // We rely on the BRrel having already been transformed, since we iterate
+ // the block in reverse order.
+ if (PotentialWaitBundleExists && BT->canAddMI(*WaitMIB)) {
+ WaitMIB->moveBefore(&*getBundleEnd(PotentialWaitBundle.getInstrIterator()));
+ if (OnlyNop)
+ // Small optimization: remove it from the bundle if it was a single nop,
+ // which is commonly the case if the trap was shifted up from the BDS.
+ OnlyNop->eraseFromBundle();
+ } else {
+ auto WaitBundle = BuildMI(*MBB, std::next(TrapBundle), MI->getDebugLoc(),
+ TII->get(TPU::BUNDLE));
+ WaitMIB->moveBefore(&*std::next(WaitBundle->getIterator()));
+ }
+ WaitMIB->bundleWithPred();
+ MI->eraseFromBundle();
+}
+
+void PostBundleLowerPseudos::handlePseudoSfenceImem(MachineInstr *MI) {
+ assert(MI->getOpcode() == TPU::SFENCE_IMEM);
+ MI->setDesc(TII->get(TPU::SFENCE));
+}
+
+void PostBundleLowerPseudos::handleReadLocalCycleCountOrGlobalTimeCounter(
+ MachineInstr *MI) {
+ MachineBasicBlock *MBB = &*MI->getParent();
+ const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
+ auto Bundle =
+ MachineBasicBlock::iterator::getAtBundleBegin(MI->getIterator());
+ (void)Bundle;
+ unsigned LowCounterOpcode = TPU::SRDREG_LCCLO;
+ unsigned HighCounterOpcode = TPU::SRDREG_LCCHI;
+ if (MI->getOpcode() == TPU::GTC_READ) {
+ LowCounterOpcode = TPU::SRDREG_GTCLO;
+ HighCounterOpcode = TPU::SRDREG_GTCHI;
+ }
+ if (!ST->isSparseCore()) {
+ // The pseudo should consume both scalar slots, so we arbitrarily expand
+ // LO to S0 and HI to S1.
+ if (MI->getOpcode() == TPU::LCC_READ) {
+ LowCounterOpcode = TPU::SRDREG_LCCLO_S0;
+ HighCounterOpcode = TPU::SRDREG_LCCHI_S1;
+ } else {
+ LowCounterOpcode = TPU::SRDREG_GTCLO_S0;
+ HighCounterOpcode = TPU::SRDREG_GTCHI_S1;
+ }
+ }
+ AddDefaultPred(BuildMI(*MBB, &*MI, MI->getDebugLoc(),
+ TII->get(LowCounterOpcode),
+ MI->getOperand(0).getReg()));
+ AddDefaultPred(BuildMI(*MBB, &*MI, MI->getDebugLoc(),
+ TII->get(HighCounterOpcode),
+ MI->getOperand(1).getReg()));
+ MI->eraseFromBundle();
+ // Final sanity check.
+ LLVM_DEBUG(AssertBundleValid(Bundle));
+}
+
+bool PostBundleLowerPseudos::handleSpillOpcodes(MachineInstr *MI) {
+ if (MI->getOpcode() == TPU::SPILL_GPR_Ps ||
+ MI->getOpcode() == TPU::SPILL_GPRs) {
+ // Special case, we're dropping the frame index immediate.
+ auto NewMI = BuildMI(*MI->getParent(), MI->getIterator(), MI->getDebugLoc(),
+ TII->get(TPU::SSTr), MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg());
+ TPUPredicate Pred(MI);
+ Pred.addTo(&NewMI);
+ auto Bundle =
+ MachineBasicBlock::iterator::getAtBundleBegin(MI->getIterator());
+ (void)Bundle;
+ MI->eraseFromBundle();
+ // Final sanity check.
+ LLVM_DEBUG(AssertBundleValid(Bundle));
+ return true;
+ }
+#ifndef NDEBUG
+ if (TPUInstrInfo::isSpill(MI))
+ TrackSpillBlocks.insert(MI->getParent());
+ if (TPUInstrInfo::isRestore(MI))
+ TrackFillBlocks.insert(MI->getParent());
+#endif
+ return TII->eliminateSpillOpcode(*MI);
+}
+
+bool PostBundleLowerPseudos::handleBranchBundle(MachineInstr *MI) {
+ bool isReturn = MI->getOpcode() == TPU::BRret;
+ if (!TPUInstrInfo::isBR(MI) && !isReturn)
+ return false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ auto BranchBundle =
+ MachineBasicBlock::iterator::getAtBundleBegin(MI->getIterator());
+
+ for (int i = 0; i < ST->getNumDelaySlots(); i++) {
+ if (BranchBundle == MBB.begin()) {
+ // Insert a new empty bundle to receive the BR, or pad with an empty
+ // bundle if the subtarget has more than 1 delay slot.
+ BuildMI(MBB, *BranchBundle, BranchBundle->getDebugLoc(),
+ TII->get(TPU::BUNDLE));
+ }
+ BranchBundle = std::prev(BranchBundle);
+ }
+ if (isReturn) {
+ // This is the bundle that the branch *must* be scheduled in.
+ auto BRind = BuildMI(MBB, std::next(BranchBundle), MI->getDebugLoc(),
+ TII->get(TPU::BRind))
+ .addReg(TPU::LR); // Target
+ TPUPredicate Pred(&*MI);
+ Pred.addTo(&BRind);
+ BRind->bundleWithPred();
+ } else {
+ bool IsConditionalBR = false;
+ unsigned Opcode;
+ if (MI->getOpcode() == TPU::BRcond) {
+ // Conditional branch, expands to either BRrel or BRabs.
+ IsConditionalBR = true;
+ if (MI->getOperand(0).isMBB())
+ Opcode = BRrel;
+ else
+ Opcode = BRabs;
+ } else if (MI->getOpcode() == TPU::BRcondClr) {
+ IsConditionalBR = true;
+ // There's only an absolute branch for the .clr version.
+ Opcode = BRabsClr;
+ } else if (MI->getOpcode() == TPU::BRClr) {
+ // There's only an absolute branch for the .clr version.
+ Opcode = BRabsClr;
+ } else {
+ // Unconditional branch, expands to either BRrel or BRabs.
+ assert(MI->getOpcode() == TPU::BR);
+ if (MI->getOperand(0).isMBB())
+ Opcode = BRrel;
+ else
+ Opcode = BRabs;
+ }
+
+ // This is the bundle that the branch *must* be scheduled in.
+ MachineInstr *BRrel =
+ BuildMI(MBB, std::next(BranchBundle), MI->getDebugLoc(),
+ TII->get(Opcode))
+ .add(MI->getOperand(0)) // Target
+ .addReg(IsConditionalBR ? MI->getOperand(1).getReg()
+ : TPU::Palways) // PredReg
+ .addImm(IsConditionalBR ? MI->getOperand(2).getImm()
+ : 0); // PredInvert
+ BRrel->bundleWithPred();
+ }
+
+ // Final sanity check.
+ LLVM_DEBUG(AssertBundleValid(BranchBundle));
+ return true;
+}
+
+bool PostBundleLowerPseudos::handleCallBundle(MachineInstr *MI) {
+ if (!ST->isTPUABIEnabled())
+ return false;
+ if (MI->getOpcode() != TPU::CALL)
+ return false;
+ MachineBasicBlock &MBB = *MI->getParent();
+ auto BranchBundle =
+ MachineBasicBlock::iterator::getAtBundleBegin(MI->getIterator());
+
+ for (int i = 0; i < ST->getNumDelaySlots(); i++) {
+ if (BranchBundle == MBB.begin()) {
+ // Insert a new empty bundle to receive the BR, or pad with an empty
+ // bundle if the subtarget has more than 1 delay slot.
+ BuildMI(MBB, *BranchBundle, BranchBundle->getDebugLoc(),
+ TII->get(TPU::BUNDLE));
+ }
+ BranchBundle = std::prev(BranchBundle);
+ }
+ // This is the bundle that the call will be scheduled in. We currently only
+ // support relative targets.
+ MachineInstr *Callabs =
+ BuildMI(MBB, std::next(BranchBundle), MI->getDebugLoc(),
+ TII->get(TPU::CALLabs))
+ .add(MI->getOperand(0)) // Target
+ .addReg(MI->getOperand(1).getReg()) // PredReg
+ .addImm(MI->getOperand(2).getImm()); // PredInvert
+ Callabs->bundleWithPred();
+ MI->eraseFromBundle();
+
+ // Final sanity check.
+ LLVM_DEBUG(AssertBundleValid(BranchBundle));
+ return true;
+}
+
+Pass *llvm::createTPUPostBundleLowerPseudosPass() {
+ return new PostBundleLowerPseudos();
+}
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUPredication.cpp b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUPredication.cpp
new file mode 100644
index 0000000..cd3eeb5
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUPredication.cpp
@@ -0,0 +1,175 @@
+//===- TPUPredication.cpp - Over- and under-predication passes --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the OverPredicatePass and UnderPredicatePass.
+//
+// OverPredicatePass adds branch predicates to unpredicated instructions to
+// allow scheduling flexibility.
+// UnderPredicatePass removes unnecessary predicates from instructions after
+// scheduling.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TPU.h"
+#include "TPUSchedule.h"
+#include "TPUSubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+using namespace llvm;
+
+extern cl::opt<bool> BrCondForTest;
+
+namespace {
+class OverPredicate : public MachineFunctionPass {
+public:
+ static char ID;
+ OverPredicate() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ StringRef getPassName() const override {
+ return "TPU overpredication";
+ }
+
+private:
+ const TargetSubtargetInfo *ST;
+ const TargetInstrInfo *TII;
+};
+
+char OverPredicate::ID = 0;
+
+class UnderPredicate : public MachineFunctionPass {
+public:
+ static char ID;
+ UnderPredicate() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ StringRef getPassName() const override {
+ return "TPU underpredication";
+ }
+
+private:
+ const TargetSubtargetInfo *ST;
+};
+
+char UnderPredicate::ID = 0;
+} // namespace
+
+INITIALIZE_PASS(OverPredicate, "tpu-over-predicate",
+ "TPU overpredication", false, false)
+INITIALIZE_PASS(UnderPredicate, "tpu-under-predicate",
+ "TPU underpredication", false, false)
+
+bool OverPredicate::runOnMachineFunction(MachineFunction &MF) {
+ ST = &MF.getSubtarget();
+ TII = ST->getInstrInfo();
+ if (BrCondForTest) {
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ if (MI.getOpcode() == TPU::BRcondT)
+ MI.setDesc(TII->get(TPU::BRcond));
+ }
+ }
+ }
+ bool IsTensorCore = MF.getSubtarget<TPUSubtarget>().hasV1024();
+ // Because the branch opcode depends on whether the branch is conditional,
+ // we need a helper function that changes the branch type when we want to
+ // apply a predicate to a branch.
+ auto ApplyPredicateToInstr = [this](MachineInstr &I, const TPUPredicate &P) {
+ if (!I.isBranch()) {
+ // Non-branch instructions can carry the predicate directly.
+ P.applyTo(&I);
+ return;
+ }
+ const unsigned Opcode = I.getOpcode();
+ // If it is a straight BR then it has already the correct predicate.
+ if (P.isAlways() && Opcode == TPU::BR)
+ return;
+ MachineBasicBlock &MBB = *I.getParent();
+ // Converting an unconditional branch to conditional.
+ if (Opcode == TPU::BR) {
+ auto NewBR = BuildMI(MBB, I, I.getDebugLoc(), TII->get(TPU::BRcond))
+ .add(I.getOperand(0));
+ P.addTo(&NewBR);
+ I.eraseFromParent();
+ return;
+ }
+ // Converting a conditional branch to unconditional.
+ if (P.isAlways()) {
+ BuildMI(MBB, I, I.getDebugLoc(), TII->get(TPU::BR)).add(I.getOperand(0));
+ I.eraseFromParent();
+ return;
+ }
+ // Applying new conditional predicate.
+ P.applyTo(&I);
+ };
+
+ for (auto &MBB : MF) {
+ std::optional<TPUPredicate> InverseBranchPredicate;
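+ // Track the inverse of the most recent branch predicate; unpredicated
+ // instructions following the branch are given that predicate to gain
+ // scheduling flexibility (see the file header).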
+ for (auto I = MBB.begin(), E = MBB.end(); I != E;) {
+ auto &MI = (*I++);
+ if (InverseBranchPredicate.has_value()) {
+ // FIXME: We have to disallow over-predicating branch instruction
+ // because it may lead to placing it in a delay slot of another branch
+ // which is not supported by some components in existing TensorCore
+ // infrastructure, namely the overlayer and DFC power verifier. We will
+ // have to either fix or rewrite these components, but have to disable
+ // this on TensorCore sub-tartgets until then. See b/141012999 for
+ // details and discussion.
+ if (MI.isPredicable() && TPUPredicate(MI).isAlways() &&
+ !(IsTensorCore && MI.isBranch())) {
+ ApplyPredicateToInstr(MI, InverseBranchPredicate.value());
+ }
+
+ if (MI.definesRegister(InverseBranchPredicate->getReg()))
+ // Predicate register has been clobbered.
+ InverseBranchPredicate.reset();
+ }
+
+ if (MI.getOpcode() == TPU::BR || MI.getOpcode() == TPU::BRcond)
+ InverseBranchPredicate = TPUPredicate(&MI).toggleInvert();
+ }
+ }
+ return true;
+}
+
+bool UnderPredicate::runOnMachineFunction(MachineFunction &MF) {
+ ST = &MF.getSubtarget();
+ auto TII = ST->getInstrInfo();
+ if (BrCondForTest) {
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ if (MI.getOpcode() == TPU::BRcondT)
+ MI.setDesc(TII->get(TPU::BRcond));
+ }
+ }
+ }
+ for (auto &MBB : MF) {
+ std::optional<TPUPredicate> InverseBranchPredicate;
+ for (auto &MI : MBB) {
+ if (MI.getOpcode() == TPU::BR || MI.getOpcode() == TPU::BRcond) {
+ InverseBranchPredicate = TPUPredicate(&MI).toggleInvert();
+ continue;
+ }
+ if (InverseBranchPredicate.has_value()) {
+ if (TPUPredicate(&MI) == *InverseBranchPredicate) {
+ // This instruction has the inverse predicate of the preceding branch.
+ // Switch it to always because the predicate is unnecessary.
+ TPUPredicate().applyTo(&MI);
+ }
+
+ if (MI.definesRegister(InverseBranchPredicate->getReg()))
+ // Predicate register has been clobbered.
+ InverseBranchPredicate.reset();
+ }
+ }
+ }
+ return true;
+}
+
+Pass *llvm::createTPUOverPredicatePass() { return new OverPredicate(); }
+Pass *llvm::createTPUUnderPredicatePass() { return new UnderPredicate(); }
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPURawHazard.cpp b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPURawHazard.cpp
new file mode 100644
index 0000000..292595e
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPURawHazard.cpp
@@ -0,0 +1,197 @@
+//===- TPURawHazard.cpp - Handle raw hazards by adding NOPs -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that handles RAW hazards on the jellyfish and
+// dragonfish platforms. It does a conservative analysis and inserts NOPs if
+// needed to ensure that no vmem read is fewer than 5 cycles away from an
+// overlapping vmem store.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/TPUMCTargetDesc.h"
+#include "TPU.h"
+#include "TPUBundleTracker.h"
+#include "TPUSchedule.h"
+#include "TPUSubtarget.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "tpu-raw-hazard"
+
+namespace {
+class TPURawHazard : public MachineFunctionPass {
+public:
+ static char ID;
+ TPURawHazard() : MachineFunctionPass(ID) {}
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ StringRef getPassName() const override {
+ return "Handle raw hazards conservatively";
+ }
+
+private:
+ AliasAnalysis *AA = nullptr;
+ const TPUInstrInfo *TII = nullptr;
+ const TPUSubtarget *ST = nullptr;
+ std::optional<BundleTracker> BT;
+ // For a given bundle, analyze RAW hazards and potentially insert NOPs before it.
+ void HandleBundle(MachineInstr *Bundle);
+ // Checks for a RAW hazard within the given number of cycles. The value
+ // returned indicates how many cycles we need to wait to avoid the hazard.
+ unsigned FindHazard(MachineInstr *MI, MachineInstr *Bundle, unsigned Cycles,
+ unsigned VIFDepth);
+};
+
+char TPURawHazard::ID = 0;
+
+// Returns true if any instruction in the bundle writes to vm0.
+bool UseVM0(MachineInstr *Bundle) {
+ auto InstIter = std::next(Bundle->getIterator());
+ MachineBasicBlock &MBB = *Bundle->getParent();
+ auto ItEnd = MBB.instr_end();
+ while (InstIter != ItEnd && InstIter->isBundledWithPred()) {
+ MachineInstr &MI = (*InstIter++);
+ for (auto D : MI.defs()) {
+ if (D.isReg() && D.getReg() == TPU::M0)
+ return true;
+ }
+ }
+ return false;
+}
+} // namespace
+
+INITIALIZE_PASS_BEGIN(TPURawHazard, DEBUG_TYPE, "TPU handle raw hazards", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(TPURawHazard, DEBUG_TYPE, "TPU handle raw hazards", false,
+ false)
+
+// Return how many stall cycles are needed to avoid a RAW hazard.
+unsigned TPURawHazard::FindHazard(MachineInstr *MI, MachineInstr *Bundle,
+ unsigned Cycles, unsigned VIFDepth) {
+ if (Bundle == nullptr)
+ return Cycles;
+ auto MBB = Bundle->getParent();
+ auto RE = MBB->rend();
+ bool FoundVPUInst = false;
+ for (auto It = Bundle->getReverseIterator(); It != RE; It++) {
+ MachineInstr &OtherMI = *It;
+ if (Cycles == 0)
+ return 0;
+ if (OtherMI.getOpcode() == TPU::BUNDLE) {
+ if (VIFDepth == 0) {
+ Cycles--;
+ continue;
+ }
+ // Will burn one cycle in the VIF.
+ if (FoundVPUInst) {
+ Cycles--;
+ VIFDepth--;
+ FoundVPUInst = false;
+ continue;
+ }
+ BT->clear();
+ BT->addBundle(&OtherMI);
+ // Since VNOP gets transformed into a dummy vmov from vm0 to vm0, it cannot
+ // be bundled with another instruction using vm0.
+ MachineInstr *NopMI =
+ AddDefaultPred(BuildMI(*Bundle->getParent(), OtherMI.getNextNode(),
+ MI->getDebugLoc(), TII->get(TPU::VNOP)));
+ if (BT->canAddMI(*NopMI) && !UseVM0(&OtherMI)) {
+ Cycles--;
+ VIFDepth--;
+ continue;
+ } else {
+ NopMI->getParent()->erase_instr(NopMI);
+ }
+ // There may not be an instruction pushed to the VIF in this cycle. We
+ // can be sure that the worst case VIF size is reduced by one. Once we
+ // know the VIF is empty even bundles without vector instructions would
+ // consume a cycle.
+ VIFDepth--;
+ continue;
+ }
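+ // An overlapping vmem store this close means the caller still has to
+ // cover the remaining cycles with delays.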
+ if (TII->isVMemStoreInstr(&OtherMI) && MI->mayAlias(AA, OtherMI, true))
+ return Cycles;
+ if (OtherMI.isPredicable() && TPUPredicate(&OtherMI).isAlways() &&
+ TPUInstrInfo::isVectorInstruction(OtherMI.getDesc())) {
+ FoundVPUInst = true;
+ }
+ }
+ // TODO(thomasraoux) Could potentially be improved by looking at predecessor
+ // blocks.
+ return Cycles;
+}
+
+static constexpr int VIFDepth = 16;
+
+void TPURawHazard::HandleBundle(MachineInstr *Bundle) {
+ if (Bundle->getOpcode() != TPU::BUNDLE)
+ return;
+ if (!Bundle->isBundledWithSucc())
+ return;
+ auto E = Bundle->getParent()->instr_end();
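+ // Check every vmem load in this bundle against preceding bundles for an
+ // aliasing vmem store within the hazard window.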
+ for (auto I = std::next(Bundle->getIterator()); I != E && I->isInsideBundle();
+ I++) {
+ if (TII->isVMemLoadInstr(&(*I))) {
+ unsigned Latency =
+ FindHazard(&(*I), Bundle->getPrevNode(),
+ ST->getVMemHazardLatency() - 1, VIFDepth - 1);
+ if (Latency > 0) {
+ LLVM_DEBUG({
+ dbgs() << "Potential raw hazard adding " << Latency
+ << " vnop bundles due to:\n";
+ I->dump();
+ });
+ // Insert a delay
+ auto InsertPt = Bundle->getIterator();
+ for (unsigned int Nop = 0; Nop < Latency; Nop++) {
+ MachineInstr *MI = AddDefaultPred(
+ BuildMI(*Bundle->getParent(), InsertPt, InsertPt->getDebugLoc(),
+ TII->get(TPU::VNOP)));
+ llvm::finalizeBundle(*Bundle->getParent(), MI->getIterator(),
+ Bundle->getIterator());
+ }
+ }
+ }
+ }
+}
+
+bool TPURawHazard::runOnMachineFunction(MachineFunction &MF) {
+ ST = &MF.getSubtarget<TPUSubtarget>();
+ BT.emplace(*ST);
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ TII = ST->getInstrInfo();
+ if (!ST->hasFatalRawHazard())
+ return false;
+
+ for (auto &MBB : MF) {
+ for (auto I = MBB.begin(); I != MBB.end();) {
+ auto Bundle = I++;
+ HandleBundle(&(*Bundle));
+ }
+ }
+ return true;
+}
+
+Pass *llvm::createRawHazardPass() { return new TPURawHazard(); }
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPURegisterInfo.cpp b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPURegisterInfo.cpp
new file mode 100644
index 0000000..faf7cef
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPURegisterInfo.cpp
@@ -0,0 +1,744 @@
+//===---- TPURegisterInfo.cpp - TPU Register Information --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the TPU implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TPURegisterInfo.h"
+#include "MCTargetDesc/TPUMCTargetDesc.h"
+#include "TPU.h"
+#include "TPUIRUtils.h"
+#include "TPUMachineFunctionInfo.h"
+#include "TPURegisterInfo.h"
+#include "TPUSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+
+#define GET_REGINFO_TARGET_DESC
+#include "TPUGenRegisterInfo.inc"
+
+using namespace llvm;
+
+extern cl::opt<bool> NeedOverlayerPasses;
+extern cl::opt<bool> TPUReportUsedSpillSlots;
+
+static cl::opt<std::string> FixedVRegs("tpu-fixed-vregs", cl::Hidden,
+ cl::desc("Test this"), cl::init(""));
+
+static cl::opt<std::string> FixedMaskRegs("tpu-fixed-maskregs", cl::Hidden,
+ cl::desc("Test this"), cl::init(""));
+
+static cl::opt<bool>
+ EnableRoundRobin("tpu-round-robin-registers", cl::Hidden,
+ cl::desc("Enables hinting registers round-robin in order "
+ "to avoid anti-dependencies"),
+ cl::init(false));
+
+#define DEBUG_TYPE "tpu-register-info"
+
+TPURegisterInfo::TPURegisterInfo(unsigned HwMode)
+ : TPUGenRegisterInfo(TPU::LR, 0, 0, 0, HwMode) {}
+
+const uint16_t *
+TPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ const TPUSubtarget &ST = MF->getSubtarget<TPUSubtarget>();
+ if (ST.isTPUABIEnabled())
+ return CSR_TPU32_SaveList;
+ return CSR_NoRegs_SaveList;
+}
+
+BitVector
+TPURegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+ auto SetReservedAlias = [&Reserved, this](Register R) {
+ for (MCRegAliasIterator Alias(R, this, true); Alias.isValid(); ++Alias)
+ Reserved.set(*Alias);
+ };
+ SetReservedAlias(TPU::LR);
+ const TPUSubtarget &ST = MF.getSubtarget<TPUSubtarget>();
+ if (ST.isTPUABIEnabled()) {
+ SetReservedAlias(TPU::SPS);
+ if (ST.hasVPU())
+ SetReservedAlias(TPU::SPV);
+ // If function calls are enabled, we reserve the stack pointer register.
+ const TPUFrameLowering *TFI = getFrameLowering(MF);
+ if (TFI->hasFPS(MF))
+ SetReservedAlias(TPU::FPS);
+ else
+ Reserved.set(TPU::FPS);
+ if (TFI->hasFPV(MF)) {
+ assert(ST.hasVPU());
+ SetReservedAlias(TPU::FPV);
+ } else {
+ Reserved.set(TPU::FPV);
+ }
+ } else {
+ // If function calls are disabled, we only reserve the stack pointer alias.
+ Reserved.set(TPU::SPS);
+ Reserved.set(TPU::FPS);
+ Reserved.set(TPU::SPV);
+ Reserved.set(TPU::FPV);
+ }
+ Reserved.set(TPU::Palways);
+ if (MF.getSubtarget<TPUSubtarget>().isPxcBarnaCore()) {
+ const TPUMachineFunctionInfo *MFI = MF.getInfo<TPUMachineFunctionInfo>();
+ // By default all aggregate regs are unallocatable.
+ for (unsigned I = TPU::VAGG0; I <= TPU::VAGG63; ++I)
+ Reserved.set(I);
+ for (const std::pair<unsigned, unsigned> &Regs :
+ MFI->getBarnaCoreAggregateRegSequences()) {
+ // Set all aggregate registers except the base as reserved. The base
+ // register is used during register allocation, and the rest are
+ // implicitly written outside of the compiler's knowledge.
+ for (unsigned I = Regs.first; I < Regs.second; ++I) {
+ for (MCRegAliasIterator Alias(TPU::V0 + I, this, true); Alias.isValid();
+ ++Alias) {
+ // Set all shadow VPRs, VAGGs, and VPR tuples including the base as
+ // reserved.
+ Reserved.set(*Alias);
+ }
+ }
+ // Un-reserve the base.
+ Reserved.reset(TPU::VAGG0 + Regs.first);
+ }
+ }
+ if (MF.getSubtarget<TPUSubtarget>().isSparseCore()) {
+ Reserved.set(TPU::P14);
+ // See comment in TPURegisterInfo.td about pseudo fifo register class sizes.
+ for (unsigned I = TPU::ERF_16; I <= TPU::ERF_32; ++I)
+ Reserved.set(I);
+ for (unsigned I = TPU::V2SF_16; I <= TPU::V2SF_128; ++I)
+ Reserved.set(I);
+ // XRF0's and XRF1's sizes are already suitable (not shared with TC).
+ } else {
+ for (unsigned I = TPU::M8; I <= TPU::M15; ++I)
+ Reserved.set(I);
+ for (unsigned I = TPU::V32; I <= TPU::V63; ++I)
+ Reserved.set(I);
+ for (unsigned I = TPU::VAGG32; I <= TPU::VAGG63; ++I)
+ Reserved.set(I);
+ for (unsigned I = TPU::VAGGps0_32; I <= TPU::VAGGps0_63; ++I)
+ Reserved.set(I);
+ for (unsigned I = TPU::VAGGps1_32; I <= TPU::VAGGps1_63; ++I)
+ Reserved.set(I);
+ for (unsigned I = TPU::VAGGps2_32; I <= TPU::VAGGps2_63; ++I)
+ Reserved.set(I);
+ for (unsigned I = TPU::VAGGps3_32; I <= TPU::VAGGps3_63; ++I)
+ Reserved.set(I);
+ for (unsigned I = TPU::V32_V33; I <= TPU::V62_V63; ++I)
+ Reserved.set(I);
+ for (unsigned I = TPU::V30_V31_V32; I <= TPU::V60_V61_V62; ++I)
+ Reserved.set(I);
+ for (unsigned I = TPU::V32_V33_V34_V35; I <= TPU::V60_V61_V62_V63; ++I)
+ Reserved.set(I);
+ for (unsigned I = TPU::M8_M9; I <= TPU::M14_M15; ++I)
+ Reserved.set(I);
+ for (unsigned I = TPU::M6_M7_M8; I <= TPU::M12_M13_M14; ++I)
+ Reserved.set(I);
+ for (unsigned I = TPU::M8_M9_M10_M11; I <= TPU::M12_M13_M14_M15; ++I)
+ Reserved.set(I);
+ }
+ // These mask registers are either reserved or used for special HW masks.
+ // Please note that besides excluding them from RA, this also means we do not
+ // need implicit_def, and that they are avoided during copy propagation.
+ for (unsigned I = TPU::M16; I <= TPU::M31; ++I)
+ Reserved.set(I);
+ if (NeedOverlayerPasses) {
+ // The overlayer assumes S31 to be a reserved register.
+ Reserved.set(TPU::S31);
+ }
+ Reserved.set(TPU::Void);
+ // The pseudo FIFO physical registers and the real ones are in the same
+ // register classes. The way we prevent the register allocator from
+ // allocating the real ones is to make them globally reserved.
+ Reserved.set(TPU::ERF);
+ Reserved.set(TPU::V2SF);
+ Reserved.set(TPU::SFRF);
+ Reserved.set(TPU::TRF0);
+ Reserved.set(TPU::TRF1);
+ Reserved.set(TPU::TRF2);
+ Reserved.set(TPU::XRF0);
+ Reserved.set(TPU::XRF1);
+ Reserved.set(TPU::DRF);
+ Reserved.set(TPU::MRF0);
+ Reserved.set(TPU::MRF1);
+ Reserved.set(TPU::MRF2);
+ Reserved.set(TPU::MRF3);
+
+ auto FixedUserRegs = [&Reserved](StringRef Sopt, unsigned StartReg) {
+ if (Sopt.empty())
+ return;
+ std::pair<StringRef, StringRef> FromToS = Sopt.split('-');
+ unsigned From, To;
+ if (FromToS.first.empty() || FromToS.second.empty() ||
+ FromToS.first.getAsInteger(10, From) ||
+ FromToS.second.getAsInteger(10, To))
+ llvm_unreachable(
+ "Needs to be of format -tpu-fixed-vregs=n-m, with n, m integers.");
+ for (unsigned I = StartReg + From; I <= StartReg + To; ++I)
+ Reserved.set(I);
+ };
+
+ FixedUserRegs(FixedVRegs, TPU::V0);
+ FixedUserRegs(FixedMaskRegs, TPU::M0);
+
+ return Reserved;
+}
+
+bool TPURegisterInfo::requiresRegisterScavenging(
+ const MachineFunction & /*MF*/) const {
+ return true;
+}
+
+bool TPURegisterInfo::trackLivenessAfterRegAlloc(
+ const MachineFunction & /*MF*/) const {
+ return true;
+}
+
+static std::pair<int, int>
+ReturnAndCheckMemoryAvailability(const MachineFunction &MF, int FI,
+ StringRef MemType, int SlotIdx,
+ int Multiplier) {
+ // LINT.IfChange
+ auto SR = TPU::GetSpillRange(MF.getMMI().getModule(), &MF.getFunction(),
+ MemType + Twine(".funcs.spill"),
+ MemType + Twine(".ranges.spill.start"),
+ MemType + Twine(".ranges.spill.limit"));
+ // LINT.ThenChange(//depot/google3/platforms/xla/sparse_core/mlo/convert_to_llvm_ir.cc)
+
+ if (SR.second - SR.first < (SlotIdx + 1) * Multiplier) {
+ if (!TPUReportUsedSpillSlots) {
+ report_fatal_error("Not enough space for " + Twine(MemType.upper()) +
+ " spill: FrameIndex = " + Twine(FI) +
+ ", mapped into spill offset " + Twine(SlotIdx) +
+ ", but the spill range is [" + Twine(SR.first) + ", " +
+ Twine(SR.second) + ")");
+ } else {
+#ifndef NDEBUG
+ dbgs() << ("Not enough space for " + Twine(MemType.upper()) +
+ " spill: FrameIndex = " + Twine(FI) +
+ ", mapped into spill offset " + Twine(SlotIdx) +
+ ", but the spill range is [" + Twine(SR.first) + ", " +
+ Twine(SR.second) + ")\n");
+#endif
+ }
+ }
+ return SR;
+}
+
+bool TPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const {
+ assert(SPAdj == 0 && "Unexpected");
+
+ MachineInstr &MI = *II;
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ const TPUSubtarget &ST = MF.getSubtarget<TPUSubtarget>();
+ const TargetInstrInfo *TII = ST.getInstrInfo();
+ const TargetFrameLowering *TFI = ST.getFrameLowering();
+ bool HasFP = TFI->hasFP(MF);
+ DebugLoc DL = MI.getDebugLoc();
+ (void)HasFP;
+ int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ bool isFixed = MFI.isFixedObjectIndex(FrameIndex);
+ assert(!isFixed || ST.isTPUABIEnabled());
+ int Offset = MFI.getObjectOffset(FrameIndex);
+ unsigned Opcode = II->getOpcode();
+
+ switch (Opcode) {
+ case TPU::SLDi:
+ case TPU::SSTi:
+ assert(!ST.isSparseCore());
+ LLVM_FALLTHROUGH;
+ case TPU::RESTORE_GPR:
+ case TPU::SPILL_GPR:
+ case TPU::RESTORE_GPR_P:
+ case TPU::SPILL_GPR_P:
+ case TPU::RESTORE_GPRs:
+ case TPU::RESTORE_GPR_Ps:
+ case TPU::SPILL_GPRs:
+ case TPU::SPILL_GPR_Ps: {
+ if (ST.isTPUABIEnabled() || isFixed) {
+ MI.getOperand(FIOperandNum).ChangeToImmediate(Offset);
+ } else {
+ auto SlotIndex =
+ MF.getInfo<TPUMachineFunctionInfo>()->getSmemSpillSlotIndex(
+ FrameIndex);
+ auto SR = ReturnAndCheckMemoryAvailability(MF, FrameIndex, "smem",
+ SlotIndex, 1);
+ MI.getOperand(FIOperandNum).ChangeToImmediate(SR.second - SlotIndex - 1);
+ }
+ break;
+ }
+ case TPU::tcVLVi:
+ case TPU::tcVSVi: {
+ constexpr int SublaneCount = 8;
+ auto SlotIndex =
+ MF.getInfo<TPUMachineFunctionInfo>()->getVectorSpillSlotIndex(
+ FrameIndex);
+ auto SR = ReturnAndCheckMemoryAvailability(MF, FrameIndex, "vmem",
+ SlotIndex, SublaneCount);
+ MI.getOperand(FIOperandNum)
+ .ChangeToImmediate(SR.second - (SlotIndex + 1) * SublaneCount);
+ break;
+ }
+ case TPU::scVLDi:
+ case TPU::scVSTi:
+ assert(!ST.isSparseCore());
+ LLVM_FALLTHROUGH;
+ case TPU::RESTORE_VPR:
+ case TPU::SPILL_VPR:
+ case TPU::RESTORE_VPR_P:
+ case TPU::SPILL_VPR_P:
+ case TPU::RESTORE_VPRs:
+ case TPU::SPILL_VPRs:
+ case TPU::RESTORE_VPR_Ps:
+ case TPU::SPILL_VPR_Ps: {
+ assert(ST.hasVPU());
+ if (ST.isTPUABIEnabled() || isFixed) {
+ MI.getOperand(FIOperandNum).ChangeToImmediate(Offset);
+ } else {
+ const int VectorSize = ST.vectorSizeInElements(/*ElementSizeInBytes=*/4);
+ auto SlotIndex =
+ MF.getInfo<TPUMachineFunctionInfo>()->getVectorSpillSlotIndex(
+ FrameIndex);
+ auto SR = ReturnAndCheckMemoryAvailability(MF, FrameIndex, "tilespmem",
+ SlotIndex, VectorSize);
+ MI.getOperand(FIOperandNum)
+ .ChangeToImmediate(SR.second - (SlotIndex + 1) * VectorSize);
+ }
+ break;
+ }
+ case TPU::SPILL_MPR_P:
+ case TPU::SPILL_MPR_Ps:
+ llvm_unreachable("Opcode currently not expected.");
+ assert(ST.hasVPU());
+ LLVM_FALLTHROUGH;
+ case TPU::SPILL_MPR:
+ case TPU::SPILL_MPRs: {
+ assert(ST.hasVPU());
+ Register MaskReg = MI.getOperand(0).getReg();
+ bool MaskIsKill = MI.getOperand(0).isKill();
+ if (!ST.isTPUABIEnabled() && !isFixed) {
+ auto SlotIndex =
+ MF.getInfo<TPUMachineFunctionInfo>()->getVectorSpillSlotIndex(
+ FrameIndex);
+ const int ElementsPerSlot =
+ ST.isSparseCore() ? ST.vectorSizeInElements(/*ElementSizeInBytes=*/4)
+ : 8;
+ ReturnAndCheckMemoryAvailability(MF, FrameIndex,
+ ST.hasV1024() ? "vmem" : "tilespmem",
+ SlotIndex, ElementsPerSlot);
+ }
+ // We scavenge a register and allow spilling if necessary.
+ Register TmpVReg = RS->scavengeRegister(&TPU::VPRRegClass, MI, 0, true);
+ AddDefaultPred(
+ BuildMI(MBB, II, DL, TII->get(TPU::VIMMI), TmpVReg).addImm(0));
+ AddDefaultPred(BuildMI(MBB, II, DL, TII->get(TPU::VSELir), TmpVReg)
+ .addReg(MaskReg, getKillRegState(MaskIsKill))
+ .addImm(0xFFFFFFFF)
+ .addReg(TmpVReg, getKillRegState(true)));
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FrameIndex),
+ MachineMemOperand::MOStore, MFI.getObjectSize(FrameIndex),
+ MFI.getObjectAlign(FrameIndex));
+ if (II->getOpcode() == TPU::SPILL_MPRs) {
+ AddDefaultPred(
+ BuildMI(MBB, II, II->getDebugLoc(), TII->get(TPU::SPILL_VPRs))
+ .addReg(TmpVReg, getKillRegState(true))
+ .addReg(II->getOperand(1).getReg())
+ .addFrameIndex(FrameIndex)
+ .addMemOperand(MMO));
+ } else {
+ assert(II->getOpcode() == TPU::SPILL_MPR);
+ AddDefaultPred(
+ BuildMI(MBB, II, II->getDebugLoc(),
+ TII->get(ST.hasV1024() ? TPU::tcVSVi : TPU::SPILL_VPR))
+ .addReg(TmpVReg, getKillRegState(true))
+ .addFrameIndex(FrameIndex)
+ .addMemOperand(MMO));
+ }
+ MI.eraseFromParent();
+ break;
+ }
+ case TPU::RESTORE_MPR_P:
+ case TPU::RESTORE_MPR_Ps:
+ llvm_unreachable("Opcode currently not expected.");
+ assert(ST.hasVPU());
+ LLVM_FALLTHROUGH;
+ case TPU::RESTORE_MPR:
+ case TPU::RESTORE_MPRs: {
+ assert(ST.hasVPU());
+ bool HasTwoBitsPerVmregLaneAndSublane =
+ MF.getSubtarget<TPUSubtarget>().hasPfcTensorCore();
+ Register MaskReg = MI.getOperand(0).getReg();
+ if (!ST.isTPUABIEnabled() && !isFixed) {
+ auto SlotIndex =
+ MF.getInfo<TPUMachineFunctionInfo>()->getVectorSpillSlotIndex(
+ FrameIndex);
+ const int ElementsPerSlot =
+ ST.isSparseCore() ? ST.vectorSizeInElements(/*ElementSizeInBytes=*/4)
+ : 8;
+ ReturnAndCheckMemoryAvailability(MF, FrameIndex,
+ ST.hasV1024() ? "vmem" : "tilespmem",
+ SlotIndex, ElementsPerSlot);
+ }
+ // We scavenge a register and allow spilling if necessary.
+ Register TmpVReg = RS->scavengeRegister(&TPU::VPRRegClass, MI, 0, true);
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FrameIndex),
+ MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIndex),
+ MFI.getObjectAlign(FrameIndex));
+ if (II->getOpcode() == TPU::RESTORE_MPRs) {
+ AddDefaultPred(BuildMI(MBB, II, DL, TII->get(TPU::RESTORE_VPRs), TmpVReg)
+ .addReg(II->getOperand(1).getReg())
+ .addFrameIndex(FrameIndex)
+ .addMemOperand(MMO));
+ } else {
+ assert(II->getOpcode() == TPU::RESTORE_MPR);
+ AddDefaultPred(BuildMI(MBB, II, DL,
+ TII->get(MF.getSubtarget<TPUSubtarget>().hasV1024()
+ ? TPU::tcVLVi
+ : TPU::RESTORE_VPR),
+ TmpVReg)
+ .addFrameIndex(FrameIndex)
+ .addMemOperand(MMO));
+ }
+ if (HasTwoBitsPerVmregLaneAndSublane) {
+ AddDefaultPred(BuildMI(MBB, II, DL, TII->get(TPU::VSUBLANE_MASK), MaskReg)
+ .addReg(TmpVReg, getKillRegState(true)));
+ } else {
+ AddDefaultPred(BuildMI(MBB, II, DL, TII->get(TPU::VCMPNEri), MaskReg)
+ .addReg(TmpVReg, getKillRegState(true))
+ .addImm(0));
+ }
+ MI.eraseFromParent();
+ break;
+ }
+ case TPU::SPILL_PPR_P:
+ case TPU::SPILL_PPR_Ps:
+ llvm_unreachable("Opcode currently not expected.");
+ LLVM_FALLTHROUGH;
+ case TPU::SPILL_PPR:
+ case TPU::SPILL_PPRs: {
+ Register PredReg = MI.getOperand(0).getReg();
+ auto SlotIndex =
+ MF.getInfo<TPUMachineFunctionInfo>()->getSmemSpillSlotIndex(FrameIndex);
+ constexpr int ElementsPerSlot = 1;
+ ReturnAndCheckMemoryAvailability(MF, FrameIndex, "smem", SlotIndex,
+ ElementsPerSlot);
+ Register TmpSReg = RS->scavengeRegister(&TPU::GPRRegClass, MI, 0, true);
+ if (TmpSReg == 0)
+ report_fatal_error("Register scavenger failed. Fix spilling.");
+ BuildMI(MBB, II, DL, TII->get(TPU::IMM), TmpSReg)
+ .addImm(0)
+ .addReg(PredReg)
+ .addImm(1);
+ BuildMI(MBB, II, DL, TII->get(TPU::IMM), TmpSReg)
+ .addImm(1)
+ .addReg(PredReg)
+ .addImm(0);
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FrameIndex),
+ MachineMemOperand::MOStore, MFI.getObjectSize(FrameIndex),
+ MFI.getObjectAlign(FrameIndex));
+ if (II->getOpcode() == TPU::SPILL_PPRs) {
+ AddDefaultPred(
+ BuildMI(MBB, II, II->getDebugLoc(), TII->get(TPU::SPILL_GPRs))
+ .addReg(TmpSReg, getKillRegState(true))
+ .addReg(II->getOperand(1).getReg())
+ .addFrameIndex(FrameIndex)
+ .addMemOperand(MMO));
+ } else {
+ assert(II->getOpcode() == TPU::SPILL_PPR);
+ AddDefaultPred(BuildMI(MBB, II, II->getDebugLoc(),
+ TII->get(MF.getSubtarget<TPUSubtarget>().hasV1024()
+ ? TPU::SSTi
+ : TPU::SPILL_GPR))
+ .addReg(TmpSReg, getKillRegState(true))
+ .addFrameIndex(FrameIndex)
+ .addMemOperand(MMO));
+ }
+ MI.eraseFromParent();
+ break;
+ }
+ case TPU::RESTORE_PPR_P:
+ case TPU::RESTORE_PPR_Ps:
+ llvm_unreachable("Opcode currently not expected.");
+ LLVM_FALLTHROUGH;
+ case TPU::RESTORE_PPR:
+ case TPU::RESTORE_PPRs: {
+ Register PredReg = MI.getOperand(0).getReg();
+ auto SlotIndex =
+ MF.getInfo<TPUMachineFunctionInfo>()->getSmemSpillSlotIndex(FrameIndex);
+ constexpr int ElementsPerSlot = 1;
+ ReturnAndCheckMemoryAvailability(MF, FrameIndex, "smem", SlotIndex,
+ ElementsPerSlot);
+ Register TmpSReg = RS->scavengeRegister(&TPU::GPRRegClass, MI, 0, true);
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FrameIndex),
+ MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIndex),
+ MFI.getObjectAlign(FrameIndex));
+ if (TmpSReg == 0)
+ report_fatal_error("Register scavenger failed. Fix spilling.");
+ if (II->getOpcode() == TPU::RESTORE_PPRs) {
+ AddDefaultPred(BuildMI(MBB, II, DL, TII->get(TPU::RESTORE_GPRs), TmpSReg)
+ .addReg(II->getOperand(1).getReg())
+ .addFrameIndex(FrameIndex)
+ .addMemOperand(MMO));
+ } else {
+ assert(II->getOpcode() == TPU::RESTORE_PPR);
+ AddDefaultPred(BuildMI(MBB, II, DL,
+ TII->get(MF.getSubtarget<TPUSubtarget>().hasV1024()
+ ? TPU::SLDi
+ : TPU::RESTORE_GPR),
+ TmpSReg)
+ .addFrameIndex(FrameIndex)
+ .addMemOperand(MMO));
+ }
+ AddDefaultPred(BuildMI(MBB, II, DL, TII->get(TPU::CMPEQri), PredReg)
+ .addReg(TmpSReg, getKillRegState(true))
+ .addImm(1));
+ MI.eraseFromParent();
+ break;
+ }
+ case TPU::SPILL_GPR_ADD: {
+ // FIXME(b/237788792): (1) is this the right offset? Also (2) optimize
+ // this if offset is zero.
+ MI.getOperand(FIOperandNum).ChangeToImmediate(Offset);
+ break;
+ }
+ case TPU::IMM: {
+ if (!ST.isTPUABIEnabled())
+ llvm_unreachable("Only supported when stack is present.");
+ // We should have resolved the scalar/vector offset during the PEI pass.
+ MI.getOperand(FIOperandNum).ChangeToImmediate(Offset);
+ break;
+ }
+ default:
+ llvm_unreachable("Unimplemented frame instruction");
+ }
+ return false;
+}
+
+Register
+TPURegisterInfo::getFrameRegister(const MachineFunction & /*MF*/) const {
+ return TPU::FPS;
+}
+
+const uint32_t *
+TPURegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID /*CC*/) const {
+ const TPUSubtarget &ST = MF.getSubtarget<TPUSubtarget>();
+ if (ST.isTPUABIEnabled())
+ return CSR_TPU32_RegMask;
+ return CSR_NoRegs_RegMask;
+}
+
+bool TPURegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+ unsigned DefSubReg,
+ const TargetRegisterClass *SrcRC,
+ unsigned SrcSubReg) const {
+ if ((DefRC == &TPU::VAGGRegClass && SrcRC == &TPU::VPRRegClass) ||
+ (SrcRC == &TPU::VAGGRegClass && DefRC == &TPU::VPRRegClass)) {
+ return true;
+ }
+ return TargetRegisterInfo::shouldRewriteCopySrc(DefRC, DefSubReg, SrcRC, SrcSubReg);
+}
+
+bool TPURegisterInfo::saveScavengerRegister(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator &UseMI,
+ const TargetRegisterClass *RC,
+ Register Reg) const {
+ const TPUSubtarget &ST = MBB.getParent()->getSubtarget<TPUSubtarget>();
+ if (ST.isSparseCore()) {
+ const MachineFrameInfo &MFI = MBB.getParent()->getFrameInfo();
+ const TPUMachineFunctionInfo *MFInfo =
+ MBB.getParent()->getInfo<TPUMachineFunctionInfo>();
+ MachineFunction &MF = *MBB.getParent();
+ const TPUSubtarget &ST = MF.getSubtarget<TPUSubtarget>();
+ const TargetInstrInfo *TII = ST.getInstrInfo();
+ int Offset;
+ if (RC == &TPU::GPRRegClass) {
+ // We're spilling using real store/load instead of spill/restore + frame
+ // index, because the register scavenger may run after pseudo
+ // instruction and frame index resolution.
+ if (ST.isTPUABIEnabled()) {
+ Offset = MFI.getObjectOffset(MFInfo->getEmergencyFrameIndexS());
+ } else {
+ auto SlotIndex =
+ MF.getInfo<TPUMachineFunctionInfo>()->getSmemSpillSlotIndex(
+ MFInfo->getEmergencyFrameIndexS());
+ auto SR = ReturnAndCheckMemoryAvailability(
+ MF, MFInfo->getEmergencyFrameIndexS(), "smem", SlotIndex, 1);
+ Offset = SR.second - SlotIndex - 1;
+ }
+ AddDefaultPred(BuildMI(MBB, I, DebugLoc(), TII->get(TPU::SSTi))
+ .addReg(Reg)
+ .addImm(Offset));
+ AddDefaultPred(BuildMI(MBB, UseMI, DebugLoc(), TII->get(TPU::SLDi), Reg)
+ .addImm(Offset));
+ } else if (RC == &TPU::VPRRegClass) {
+ assert(ST.hasVPU());
+ // Same comment about store/load as above.
+ if (ST.isTPUABIEnabled()) {
+ Offset = MFI.getObjectOffset(MFInfo->getEmergencyFrameIndexV());
+ } else {
+ const int VectorSize =
+ ST.vectorSizeInElements(/*ElementSizeInBytes=*/4);
+ auto SlotIndex =
+ MF.getInfo<TPUMachineFunctionInfo>()->getVectorSpillSlotIndex(
+ MFInfo->getEmergencyFrameIndexV());
+ auto SR = ReturnAndCheckMemoryAvailability(
+ MF, MFInfo->getEmergencyFrameIndexV(), "tilespmem", SlotIndex,
+ VectorSize);
+ Offset = SR.second - (SlotIndex + 1) * VectorSize;
+ }
+ AddDefaultPred(BuildMI(MBB, I, DebugLoc(), TII->get(TPU::scVSTi))
+ .addReg(Reg)
+ .addImm(Offset));
+ AddDefaultPred(BuildMI(MBB, UseMI, DebugLoc(), TII->get(TPU::scVLDi), Reg)
+ .addImm(Offset));
+ } else {
+ llvm_unreachable("TODO/unimplemented for this register class.");
+ }
+ return true;
+ }
+ // Let it slip for TensorCore. Scavenging is only needed during frame
+ // lowering, and as long as there's a spill slot, it should be fine. We may
+ // remove support for TensorCore in the future.
+ return false;
+}
+
+bool TPURegisterInfo::isScalarMemSpilledRegister(const MachineFunction &MF,
+ Register Reg) const {
+ if (Register::isPhysicalRegister(Reg))
+ return TPU::GPRRegClass.contains(Reg) || TPU::PPRRegClass.contains(Reg);
+ assert(Register::isVirtualRegister(Reg) && "Expected a virtual register");
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterClass *RegClass = MRI.getRegClass(Reg);
+ return TPU::GPRRegClass.hasSubClassEq(RegClass) ||
+ TPU::PPRRegClass.hasSubClassEq(RegClass);
+}
+
+bool TPURegisterInfo::isVectorMemSpilledRegister(const MachineFunction &MF,
+ Register Reg) const {
+ if (Register::isPhysicalRegister(Reg))
+ return TPU::VPRRegClass.contains(Reg) || TPU::MPRRegClass.contains(Reg);
+ assert(Register::isVirtualRegister(Reg) && "Expected a virtual register");
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterClass *RegClass = MRI.getRegClass(Reg);
+ return TPU::VPR_AGGRegClass.hasSubClassEq(RegClass) ||
+ TPU::MPRRegClass.hasSubClassEq(RegClass);
+}
+
+bool TPURegisterInfo::isFifoRegister(const MachineFunction &MF,
+ Register Reg) const {
+ const TPUSubtarget &ST = MF.getSubtarget<TPUSubtarget>();
+ for (auto *Info : ST.getFifoInfos()) {
+ for (const MCPhysReg &FR : Info->getRegisterClass()->getRegisters()) {
+ if (FR == Reg)
+ return true;
+ }
+ }
+ return false;
+}
+
+bool TPURegisterInfo::isEmbeddedMask(Register Reg) const {
+ return Reg >= TPU::M16 && Reg <= TPU::M23;
+}
+
+bool TPURegisterInfo::getRegAllocationHints(Register VirtReg,
+ ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF,
+ const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const {
+ if (EnableRoundRobin && MF.getSubtarget<TPUSubtarget>().isSparseCore()) {
+ // This is an optimization that aims to avoid unnecessary anti-dependencies
+ // due to register allocation. We search and cache MF once, and record all
+ // virtual register definitions that happen in pipelined MBBs. Once the
+ // register allocator allocates them, we hint a new physical register of its
+ // class, working from the back of the allocation order 'Order'. This hint
+ // gets the allocator to choose a new register rather than aggressively
+ // reusing one. The allocator will still prefer reusing over spilling.
+ //
+ // The hint here is done experimentally for Sparsecore. Alternatively, we
+ // could improve the anti-dependency breaker and make it capable of breaking
+ // anti-dependencies where cross-block live ranges are involved.
+ //
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (!PipelinedRegisterHintsCached.count(&MF)) {
+ PipelinedRegisterHints.clear();
+ const TPUMachineFunctionInfo *MFI = MF.getInfo<TPUMachineFunctionInfo>();
+ for (auto &MBB : MF) {
+ if (!MFI->isBasicBlockPipelined(&MBB))
+ continue;
+ int VPRCount = 0;
+ int GPRCount = 0;
+ int MPRCount = 0;
+ for (auto &MI : MBB) {
+ for (auto &MO : MI.defs()) {
+ if (!MO.isReg())
+ continue;
+ if (!Register::isVirtualRegister(MO.getReg()))
+ continue;
+ if (MRI.getRegClass(MO.getReg()) == &TPU::VPRRegClass)
+ PipelinedRegisterHints[MO.getReg()] = VPRCount++;
+ else if (MRI.getRegClass(MO.getReg()) == &TPU::GPRRegClass)
+ PipelinedRegisterHints[MO.getReg()] = GPRCount++;
+ else if (MRI.getRegClass(MO.getReg()) == &TPU::MPRRegClass)
+ PipelinedRegisterHints[MO.getReg()] = MPRCount++;
+ }
+ }
+ }
+ PipelinedRegisterHintsCached.insert(&MF);
+ }
+ if (PipelinedRegisterHints.count(VirtReg)) {
+ if (MRI.getRegClass(VirtReg) == &TPU::VPRRegClass ||
+ MRI.getRegClass(VirtReg) == &TPU::GPRRegClass ||
+ MRI.getRegClass(VirtReg) == &TPU::MPRRegClass) {
+ MCPhysReg HintReg =
+ Order[Order.size() -
+ PipelinedRegisterHints[VirtReg] % Order.size() - 1];
+ LLVM_DEBUG({
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ dbgs() << "Hinting reg ";
+ dumpReg(VirtReg);
+ dbgs() << " to ";
+ dumpReg(HintReg, 0, TRI);
+ });
+ Hints.push_back(HintReg);
+ }
+ }
+ }
+ return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
+ VRM, Matrix);
+}
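The hint expression above walks the allocation order from the back, one register per pipelined definition. A tiny sketch of the index arithmetic, assuming an order of 64 entries (the size is illustrative):

#include <cstdio>
#include <vector>

int main() {
  std::vector<unsigned> Order(64);
  for (unsigned I = 0; I < Order.size(); ++I)
    Order[I] = I;  // stand-in for the MCPhysReg allocation order
  // Definition counts 0, 1, 2, ... map to Order[63], Order[62], Order[61], ...
  // and wrap around once the count reaches Order.size().
  for (unsigned Count = 0; Count < 3; ++Count) {
    unsigned Hint = Order[Order.size() - Count % Order.size() - 1];
    std::printf("pipelined def #%u -> order entry %u\n", Count, Hint);
  }
  return 0;
}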
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPURegisterInfo.h b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPURegisterInfo.h
new file mode 100644
index 0000000..33039ad
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPURegisterInfo.h
@@ -0,0 +1,89 @@
+//===---- TPURegisterInfo.h - TPU Register File -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the TPU implementation of TargetRegisterInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_TPU_TPUREGISTERINFO_H
+#define LLVM_LIB_TARGET_TPU_TPUREGISTERINFO_H
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/CodeGen/Register.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "TPUGenRegisterInfo.inc"
+
+namespace llvm {
+
+struct TPURegisterInfo : public TPUGenRegisterInfo {
+ TPURegisterInfo(unsigned HwMode);
+
+ const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID) const override;
+
+ // Code Generation virtual methods.
+ const uint16_t *
+ getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override;
+
+ BitVector getReservedRegs(const MachineFunction &MF) const override;
+
+ bool requiresRegisterScavenging(const MachineFunction &MF) const override;
+
+ bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
+
+ bool eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+ unsigned FIOperandNum,
+ RegScavenger *RS = nullptr) const override;
+
+ bool
+ getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF,
+ const VirtRegMap *VRM = nullptr,
+ const LiveRegMatrix *Matrix = nullptr) const override;
+
+ // Debug information queries.
+ Register getFrameRegister(const MachineFunction &MF) const override;
+
+ bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+ unsigned DefSubReg,
+ const TargetRegisterClass *SrcRC,
+ unsigned SrcSubReg) const override;
+
+ bool saveScavengerRegister(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator &UseMI,
+ const TargetRegisterClass *RC, Register Reg) const;
+
+ bool isScalarMemSpilledRegister(const MachineFunction &MF,
+ Register Reg) const;
+ bool isVectorMemSpilledRegister(const MachineFunction &MF,
+ Register Reg) const;
+ // Returns true if register is a fifo register.
+ bool isFifoRegister(const MachineFunction &MF, Register Reg) const;
+ // Returns true if register is a fixed embedded mask.
+ bool isEmbeddedMask(Register Reg) const;
+
+ // Clears the caches.
+ // FIXME(hgreving): const but not-const, bad design. IMO
+ // TargetRegisterInfo::getRegAllocationHints also should not be const.
+ void clear() const {
+ PipelinedRegisterHintsCached.clear();
+ PipelinedRegisterHints.clear();
+ }
+
+private:
+ mutable DenseSet<const MachineFunction *> PipelinedRegisterHintsCached;
+ mutable DenseMap<Register, int> PipelinedRegisterHints;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_TPU_TPUREGISTERINFO_H
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPURegisterInfo.td b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPURegisterInfo.td
new file mode 100644
index 0000000..7aa88c5
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPURegisterInfo.td
@@ -0,0 +1,433 @@
+//===- TPURegisterInfo.td - TPU Register defs --------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Declarations that describe the TPU register file
+//===----------------------------------------------------------------------===//
+
+class TPUReg<bits<16> num, string n, list<Register> subregs = [],
+ list<string> altNames = []> : Register<n, altNames> {
+ let Namespace = "TPU";
+ let SubRegs = subregs;
+ let HWEncoding = num;
+}
+
+let Namespace = "TPU" in {
+ def sub_32 : SubRegIndex<32>;
+ def sub_256 : SubRegIndex<256>;
+}
+
+class TupleRegInfo<int numRegs, int regSizeInBits,
+ int spillSizeInBits, int spillAlignInBits> :
+ RegInfo<!mul(numRegs, regSizeInBits), !mul(numRegs, spillSizeInBits),
+ spillAlignInBits>;
+class MaskTCRegInfo<int numRegs> : TupleRegInfo<numRegs, 1024, 32768, 32768>;
+class MaskSCRegInfo<int numRegs> : TupleRegInfo<numRegs, 8, 256, 256>;
+
+class VPRTCRegInfo<int numRegs> : TupleRegInfo<numRegs, 32768, 32768, 32768>;
+class VPRSCRegInfo<int numRegs> : TupleRegInfo<numRegs, 256, 256, 256>;
+
+class MPRRegInfos<int numRegs> : RegInfoByHwMode<
+ [SparseCoreV8HwMode, SparseCoreV16HwMode, DefaultMode],
+ [MaskSCRegInfo<numRegs>, MaskSCRegInfo<numRegs>, MaskTCRegInfo<numRegs>]>;
+class VPRRegInfos<int numRegs> : RegInfoByHwMode<
+ [SparseCoreV8HwMode, SparseCoreV16HwMode, DefaultMode],
+ [VPRSCRegInfo<numRegs>, VPRSCRegInfo<numRegs>, VPRTCRegInfo<numRegs>]>;
+
+def vNi32 : ValueTypeByHwMode<[SparseCoreV8HwMode, SparseCoreV16HwMode, DefaultMode],
+ [v8i32, v16i32, v1024i32]>;
+def vNf32 : ValueTypeByHwMode<[SparseCoreV8HwMode, SparseCoreV16HwMode, DefaultMode],
+ [v8f32, v16f32, v1024f32]>;
+def vNi1 : ValueTypeByHwMode<[SparseCoreV8HwMode, SparseCoreV16HwMode, DefaultMode],
+ [v8i1, v16i1, v1024i1]>;
+def vNbf16 : ValueTypeByHwMode<[SparseCoreV8HwMode, SparseCoreV16HwMode, DefaultMode],
+ [v16bf16, v32bf16, v16bf16]>;
+def vNf16 : ValueTypeByHwMode<[SparseCoreV8HwMode, SparseCoreV16HwMode, DefaultMode],
+ [v16f16, v32f16, v16f16]>;
+def vNb16 : ValueTypeByHwMode<[SparseCoreV8HwMode, SparseCoreV16HwMode, DefaultMode],
+ [v16i16, v32i16, v16i16]>;
+def vNbf16i1 : ValueTypeByHwMode<[SparseCoreV8HwMode, SparseCoreV16HwMode, DefaultMode],
+ [v16i1, v32i1, v16i1]>;
+def vNb16i1 : ValueTypeByHwMode<[SparseCoreV8HwMode, SparseCoreV16HwMode, DefaultMode],
+ [v16i1, v32i1, v16i1]>;
+def vNb8i1 : ValueTypeByHwMode<[SparseCoreV8HwMode, SparseCoreV16HwMode, DefaultMode],
+ [v32i1, v64i1, v32i1]>;
+def vNb8 : ValueTypeByHwMode<[SparseCoreV8HwMode, SparseCoreV16HwMode, DefaultMode],
+ [v32i8, v64i8, v32i8]>;
+def vNb4 : ValueTypeByHwMode<[SparseCoreV8HwMode, SparseCoreV16HwMode, DefaultMode],
+ [v64i4, v128i4, v64i4]>;
+def vNb2 : ValueTypeByHwMode<[SparseCoreV8HwMode, SparseCoreV16HwMode, DefaultMode],
+ [v128i2, v256i2, v128i2]>;
+def vNb1 : ValueTypeByHwMode<[SparseCoreV8HwMode, SparseCoreV16HwMode, DefaultMode],
+ [v256i1, v512i1, v256i1]>;
+
+foreach i = 0-31 in {
+ // Sregs
+ def S#i : TPUReg<i, "s"#i>, DwarfRegNum<[i]>;
+}
+// Register aliases
+let SubRegIndices = [sub_32] in {
+ def LR : TPUReg<27, "lr", [S27]>, DwarfRegAlias<S27>;
+ def FPV : TPUReg<28, "fpv", [S28]>, DwarfRegAlias<S28>;
+ def SPV : TPUReg<29, "spv", [S29]>, DwarfRegAlias<S29>;
+ def FPS : TPUReg<30, "fps", [S30]>, DwarfRegAlias<S30>;
+ def SPS : TPUReg<31, "sps", [S31]>, DwarfRegAlias<S31>;
+}
+foreach i = 0-63 in {
+ // Vregs
+ def V#i : TPUReg<i, "v"#i>, DwarfRegNum<[i]>;
+}
+foreach i = 0-15 in {
+ // Vmregs. M8-M15 are only used by Viperfish.
+ // M16-M31 are either reserved or special HW masks.
+ def M#i : TPUReg<i, "vm"#i>, DwarfRegNum<[i]>;
+}
+// M16-M31 are either reserved or special HW masks.
+def M16 : TPUReg<16, "$0xff">, DwarfRegNum<[16]>;
+def M17 : TPUReg<17, "$0x7f">, DwarfRegNum<[17]>;
+def M18 : TPUReg<18, "$0x3f">, DwarfRegNum<[18]>;
+def M19 : TPUReg<19, "$0x1f">, DwarfRegNum<[19]>;
+def M20 : TPUReg<20, "$0xf">, DwarfRegNum<[20]>;
+def M21 : TPUReg<21, "$0x7">, DwarfRegNum<[21]>;
+def M22 : TPUReg<22, "$0x3">, DwarfRegNum<[22]>;
+def M23 : TPUReg<23, "$0x1">, DwarfRegNum<[23]>;
+def M24 : TPUReg<24, "vm24_reserved">, DwarfRegNum<[24]>;
+def M25 : TPUReg<25, "vm25_reserved">, DwarfRegNum<[25]>;
+def M26 : TPUReg<26, "vm26_reserved">, DwarfRegNum<[26]>;
+def M27 : TPUReg<27, "vm27_reserved">, DwarfRegNum<[27]>;
+def M28 : TPUReg<28, "vm28_reserved">, DwarfRegNum<[28]>;
+def M29 : TPUReg<29, "vm29_reserved">, DwarfRegNum<[29]>;
+def M30 : TPUReg<30, "vm30_reserved">, DwarfRegNum<[30]>;
+def M31 : TPUReg<31, "vm31_reserved">, DwarfRegNum<[31]>;
+
+foreach i = 0-14 in {
+ // Pregs
+ def P#i : TPUReg<i, "p"#i>, DwarfRegNum<[i]>;
+}
+// Special, unallocatable always predicate.
+def Palways : TPUReg<15, "always">, DwarfRegNum<[15]>;
+
+def Void : TPUReg<0, "_">, DwarfRegNum<[0]>;
+
+foreach i = 0-15 in {
+ // Circular buffer regs
+ def CB#i : TPUReg<i, "cb"#i>, DwarfRegNum<[i]>;
+}
+
+// Vector-to-scalar fifo.
+// TODO(hgreving): The number of registers is sized to accommodate TPU. Sparsecore's
+// V2S fifo is only 8 entries deep and we should consider splitting off a separate register
+// class to accommodate this and enforce better checks.
+def V2SF : TPUReg<0, "(v2sf)">, DwarfRegNum<[0]>;
+// LINT.IfChange
+foreach i = 0-128 in {
+// LINT.ThenChange(//depot/google3/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSubtarget.cpp)
+ // Pseudo V2SF registers
+ def V2SF_#i : TPUReg<i, "(v2sf"#i#")">, DwarfRegNum<[0]>;
+}
+
+def SFRF : TPUReg<0, "(sfrf)">, DwarfRegNum<[0]>;
+// LINT.IfChange
+foreach i = 0-16 in {
+// LINT.ThenChange(//depot/google3/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSubtarget.cpp)
+ // Pseudo SFRF registers
+ def SFRF_#i : TPUReg<i, "(sfrf"#i#")">, DwarfRegNum<[0]>;
+}
+
+// Cross-lane fifo.
+def TRF0 : TPUReg<0, "(trf0)">, DwarfRegNum<[0]>;
+// TODO(hgreving): TPU, TRF1 not needed.
+def TRF1 : TPUReg<0, "(trf1)">, DwarfRegNum<[0]>;
+def TRF2 : TPUReg<0, "(trf2)">, DwarfRegNum<[0]>;
+// TODO(hgreving): TPU, size wrong.
+// LINT.IfChange
+foreach i = 0-63 in {
+// LINT.ThenChange(//depot/google3/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSubtarget.cpp)
+ // Pseudo TRF0, TRF1 registers
+ def TRF0_#i : TPUReg<i, "(trf0_"#i#")">, DwarfRegNum<[0]>;
+ def TRF1_#i : TPUReg<i, "(trf1_"#i#")">, DwarfRegNum<[0]>;
+ def TRF2_#i : TPUReg<i, "(trf2_"#i#")">, DwarfRegNum<[0]>;
+}
+
+// A note about pseudo fifo register class sizes: LLVM's greedy allocator does not always
+// split each live-range even if it technically could in order to re-use a register. I
+// haven't found a way to tune this, e.g. by making pseudo fifo copies completely free.
+// This can cause the register allocator to use more registers than the actual fifo
+// depth. For this reason, we define each fifo register class twice as big as the
+// fifo depth, plus one for the actual fifo register. Fifo overflow is avoided by the
+// software pipeliner and the overflow mutations, but is not caught by the pseudo fifo
+// register class size; most likely, the allocator will not attempt to spill because of
+// the generous 2X size.
+
+// XPU XRF0 fifo.
+def XRF0 : TPUReg<0, "(xrf0)">, DwarfRegNum<[0]>;
+// LINT.IfChange
+foreach i = 0-7 in {
+// LINT.ThenChange(//depot/google3/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSubtarget.cpp)
+ // Pseudo XRF0 registers
+ def XRF0_#i : TPUReg<i, "(xrf0_"#i#")">, DwarfRegNum<[0]>;
+}
+
+// XPU XRF1 fifo.
+def XRF1 : TPUReg<0, "(xrf1)">, DwarfRegNum<[0]>;
+// LINT.IfChange
+foreach i = 0-11 in {
+// LINT.ThenChange(//depot/google3/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSubtarget.cpp)
+ // Pseudo XRF1 registers
+ def XRF1_#i : TPUReg<i, "(xrf1_"#i#")">, DwarfRegNum<[0]>;
+}
+
+// DRF fifo.
+def DRF : TPUReg<0, "(drf)">, DwarfRegNum<[0]>;
+// LINT.IfChange
+foreach i = 0-17 in {
+// LINT.ThenChange(//depot/google3/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSubtarget.cpp)
+ // Pseudo DRF registers
+ def DRF_#i : TPUReg<i, "(drf_"#i#")">, DwarfRegNum<[0]>;
+}
+
+// Extended unary fifo.
+// TODO(hgreving): The number of registers is sized to accommodate TPU and Barnacore.
+// Sparsecore's ERF fifo is only 8 entries deep and we should consider splitting off a separate
+// register class to accommodate this and enforce better checks.
+def ERF : TPUReg<0, "(erf)">, DwarfRegNum<[0]>;
+// LINT.IfChange
+foreach i = 0-33 in {
+// LINT.ThenChange(//depot/google3/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSubtarget.cpp)
+ def ERF_#i : TPUReg<i, "(erf_"#i#")">, DwarfRegNum<[0]>;
+}
+
+// Permutation registers.
+def PCR0 : TPUReg<0, "(pcr0)">, DwarfRegNum<[0]>;
+def PCR1 : TPUReg<0, "(pcr1)">, DwarfRegNum<[0]>;
+def PCR2 : TPUReg<0, "(pcr2)">, DwarfRegNum<[0]>;
+def SPR0 : TPUReg<0, "(spr0)">, DwarfRegNum<[0]>;
+def SPR1 : TPUReg<0, "(spr1)">, DwarfRegNum<[0]>;
+
+// Indexed address registers.
+def IAR0 : TPUReg<0, "(iar0)">, DwarfRegNum<[0]>;
+def IAR1 : TPUReg<0, "(iar1)">, DwarfRegNum<[0]>;
+
+// Register classes.
+
+// GPRs are Sregs and can contain 32-bit ints or floats.
+def GPR : RegisterClass<"TPU", [i32, f32, bf16], 32,
+ (add (sequence "S%i", 0, 31), SPS, FPS, SPV, FPV, LR)>;
+
+// Predicate registers can contain i1.
+def PPR : RegisterClass<"TPU", [i1], 32,
+ (add (sequence "P%i", 0, 14), Palways)> {
+ // We're setting PPR's spill size to 32-bit.
+ let Size = 32;
+}
+
+// CBRs are circular buffer regs and can contain 3x32-bit. Only one
+// i32 is exposed as a data dependency.
+def CBR : RegisterClass<"TPU", [x86mmx], 32,
+ (sequence "CB%i", 0, 15)> {
+ let Size = 32;
+}
+
+class MaskRegisterClass<dag regList, int regNum> :
+ RegisterClass<"TPU", [vNi1, vNbf16i1, vNb16i1, vNb8i1], 32, regList> {
+ let RegInfos = MPRRegInfos<regNum>;
+}
+
+class VPRRegisterClass<dag regList, int regNum> :
+ RegisterClass<"TPU", [vNi32, vNf32, vNbf16, vNf16, vNb16,
+ vNb8, vNb4, vNb2, vNb1], 32, regList> {
+ let RegInfos = VPRRegInfos<regNum>;
+}
+
+// VPRs are Vregs. Their content type is <8 x i32/f32> in SparseCore mode and
+// <1024 x i32/f32> in TensorCore mode.
+def VPR : VPRRegisterClass<(add (sequence "V%i", 0, 63), Void), 1>;
+
+// MPRs are Vmregs; masks for Vregs. They contain <8 x i1> in SparseCore mode
+// and <1024 x i1> in TensorCore mode.
+// Registers TPU::M16 and up have a special meaning representing fixed hardware masks.
+// Must be in sync with TPURegisterInfo::isEmbeddedMask().
+// LINT.IfChange
+def MPR : MaskRegisterClass<(add (sequence "M%i", 0, 31), Void), 1>;
+// LINT.ThenChange(//depot/google3/third_party/llvm/llvm/lib/Target/GoogleTPU/TPURegisterInfo.h)
+
+// The V2SFPR regclass wraps the V2SF fifo.
+def V2SFPR : RegisterClass<"TPU", [i32, f32], 32,
+// LINT.IfChange
+ (add V2SF, (sequence "V2SF_%i", 0, 128))>;
+// LINT.ThenChange(//depot/google3/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSubtarget.cpp)
+
+// The SFRFPR regclass wraps the SFRF fifo.
+def SFRFPR : RegisterClass<"TPU", [i32, f32], 32,
+// LINT.IfChange
+ (add SFRF, (sequence "SFRF_%i", 0, 16))>;
+// LINT.ThenChange(//depot/google3/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSubtarget.cpp)
+
+// The TRFPR regclass wraps the TRF fifo.
+def TRFPR0 : RegisterClass<"TPU", [i32], 32,
+// LINT.IfChange
+ (add TRF0, (sequence "TRF0_%i", 0, 63))>;
+// LINT.ThenChange(//depot/google3/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSubtarget.cpp)
+
+def TRFPR1 : RegisterClass<"TPU", [i32], 32,
+// LINT.IfChange
+ (add TRF1, (sequence "TRF1_%i", 0, 63))>;
+// LINT.ThenChange(//depot/google3/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSubtarget.cpp)
+
+def TRFPR2 : RegisterClass<"TPU", [i32], 32,
+// LINT.IfChange
+ (add TRF2, (sequence "TRF2_%i", 0, 63))>;
+// LINT.ThenChange(//depot/google3/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSubtarget.cpp)
+
+// The XRFPR0 regclass wraps the XPU XRF0 fifo.
+// The number of pseudo registers should be fifo depth + 1.
+def XRFPR0 : RegisterClass<"TPU", [i32], 32,
+// LINT.IfChange
+ (add XRF0, (sequence "XRF0_%i", 0, 7))>;
+// LINT.ThenChange(//depot/google3/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSubtarget.cpp)
+
+// The XRFPR1 regclass wraps the XPU XRF1 fifo.
+// The number of pseudo registers should be fifo depth + 1.
+def XRFPR1 : RegisterClass<"TPU", [i32], 32,
+// LINT.IfChange
+ (add XRF1, (sequence "XRF1_%i", 0, 11))>;
+// LINT.ThenChange(//depot/google3/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSubtarget.cpp)
+
+// The ERFPR regclass wraps the ERF fifo.
+def ERFPR : RegisterClass<"TPU", [i32], 32,
+// LINT.IfChange
+ (add ERF, (sequence "ERF_%i", 0, 33))>;
+// LINT.ThenChange(//depot/google3/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSubtarget.cpp)
+
+// The DRFPR regclass wraps the DRF fifo.
+def DRFPR : RegisterClass<"TPU", [i32], 32,
+// LINT.IfChange
+ (add DRF, (sequence "DRF_%i", 0, 17))>;
+// LINT.ThenChange(//depot/google3/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSubtarget.cpp)
+
+// The PCRPR/SPRPR regclass wraps the PCR/SPR registers.
+def PCRPR0 : RegisterClass<"TPU", [i32], 32, (add PCR0)>;
+def PCRPR1 : RegisterClass<"TPU", [i32], 32, (add PCR1)>;
+def PCRPR2 : RegisterClass<"TPU", [i32], 32, (add PCR2)>;
+def SPRPR0 : RegisterClass<"TPU", [i32], 32, (add SPR0)>;
+def SPRPR1 : RegisterClass<"TPU", [i32], 32, (add SPR1)>;
+
+// The IARPR regclass wraps the IAR registers.
+def IARPR0 : RegisterClass<"TPU", [i32], 32, (add IAR0)>;
+def IARPR1 : RegisterClass<"TPU", [i32], 32, (add IAR1)>;
+
+foreach Index = 0-3 in {
+ // MXU Fifo for matmul results
+ def MRF#Index : TPUReg<0, "(mrf"#Index#")">, DwarfRegNum<[0]>;
+// LINT.IfChange
+ foreach i = 0-136 in {
+// LINT.ThenChange(//depot/google3/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSubtarget.cpp)
+ def MRF#Index#_#i : TPUReg<i, "(mrf"#Index#"_"#i#")">, DwarfRegNum<[0]>;
+ }
+
+ // GMR/LMR and GSFN/GSFT/MSRA/MSRB are not modeled as FIFOs since they are fixed
+ // registers, but each has its own register type and only one should be live at a
+ // time; if this is not true, there is a program error.
+ def GSFN#Index : TPUReg<0, "(gsfn"#Index#")">, DwarfRegNum<[0]>;
+ def GSFNPR#Index : RegisterClass<"TPU", [i32], 32,
+ (add (sequence "GSFN%i", Index, Index))>;
+ def GSFT#Index : TPUReg<0, "(gsft"#Index#")">, DwarfRegNum<[0]>;
+ def GSFTPR#Index : RegisterClass<"TPU", [i32], 32,
+ (add (sequence "GSFT%i", Index, Index))>;
+ def GMR#Index : TPUReg<0, "(gmr"#Index#")">, DwarfRegNum<[0]>;
+ def GMRPR#Index : RegisterClass<"TPU", [i32], 32,
+ (add (sequence "GMR%i", Index, Index))>;
+ def LMR#Index : TPUReg<0, "(lmr"#Index#")">, DwarfRegNum<[0]>;
+ def LMRPR#Index : RegisterClass<"TPU", [i32], 32,
+ (add (sequence "LMR%i", Index, Index))>;
+ def MSRA#Index : TPUReg<0, "(msra"#Index#")">, DwarfRegNum<[0]>;
+ def MSRAPR#Index : RegisterClass<"TPU", [i32], 32,
+ (add (sequence "MSRA%i", Index, Index))>;
+ def MSRB#Index : TPUReg<0, "(msrb"#Index#")">, DwarfRegNum<[0]>;
+ def MSRBPR#Index : RegisterClass<"TPU", [i32], 32,
+ (add (sequence "MSRB%i", Index, Index))>;
+}
+
+// LINT.IfChange
+def MRFPR0 : RegisterClass<"TPU", [i32], 32, (add MRF0, (sequence "MRF0_%i", 0, 136))>;
+def MRFPR1 : RegisterClass<"TPU", [i32], 32, (add MRF1, (sequence "MRF1_%i", 0, 136))>;
+def MRFPR2 : RegisterClass<"TPU", [i32], 32, (add MRF2, (sequence "MRF2_%i", 0, 136))>;
+def MRFPR3 : RegisterClass<"TPU", [i32], 32, (add MRF3, (sequence "MRF3_%i", 0, 136))>;
+// LINT.ThenChange(//depot/google3/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSubtarget.cpp)
+
+//===----------------------------------------------------------------------===//
+// BarnaCore
+//===----------------------------------------------------------------------===//
+
+// Register tuples for pipelining. We define register classes for 2, 3 and 4-
+// tuples.
+let Namespace = "TPU" in {
+ def sub_ps0 : SubRegIndex<256>;
+ def sub_ps1 : SubRegIndex<256>;
+ def sub_ps2 : SubRegIndex<256>;
+ def sub_ps3 : SubRegIndex<256>;
+}
+
+// BarnaCore aggregate registers. These alias VPRs. We use these registers to
+// identify arrays of VPRs that instructions can write into/read from depending
+// on the hardware (loop_index) instruction.
+foreach i = 0-63 in {
+ def VAGGps0_#i : TPUReg<i, "v"#i#".ali.ps0", []> {
+ let isArtificial = 1;
+ }
+ def VAGGps1_#i : TPUReg<i, "v"#i#".ali.ps1", []> {
+ let isArtificial = 1;
+ }
+ def VAGGps2_#i : TPUReg<i, "v"#i#".ali.ps2", []> {
+ let isArtificial = 1;
+ }
+ def VAGGps3_#i : TPUReg<i, "v"#i#".ali.ps3", []> {
+ let isArtificial = 1;
+ }
+
+ // Print VAGG1 as "v1.ali", with "ali" meaning add_loop_index.
+ def VAGG#i : TPUReg<i, "v"#i#".ali", []>, DwarfRegNum<[i]> {
+ let Aliases = [!cast<Register>("V"#i)];
+ let SubRegs = [!cast<Register>("VAGGps0_"#i),
+ !cast<Register>("VAGGps1_"#i),
+ !cast<Register>("VAGGps2_"#i),
+ !cast<Register>("VAGGps3_"#i)];
+ let SubRegIndices = [sub_ps0, sub_ps1, sub_ps2, sub_ps3];
+ }
+}
+
+// Vreg or aggregate vreg. Accesses to aggregate vregs cause alidst/alix/aliy
+// modifiers to be added to instructions.
+def VPR_AGG : VPRRegisterClass<(add (sequence "VAGG%i", 0, 63),
+ (sequence "V%i", 0, 63), Void), 1>;
+
+// Only aggregate vregs.
+def VAGG : VPRRegisterClass<(add (sequence "VAGG%i", 0, 63), Void), 1>;
+
+multiclass RegTuple<string RCName, int RegNum> {
+def Tuple_2 : RegisterTuples<[sub_ps0, sub_ps1],
+ [(sequence RCName#"%i", 0, RegNum, 2),
+ (sequence RCName#"%i", 1, RegNum, 2)]>;
+def Tuple_3 : RegisterTuples<[sub_ps0, sub_ps1, sub_ps2],
+ [(sequence RCName#"%i", 0, RegNum, 3),
+ (sequence RCName#"%i", 1, RegNum, 3),
+ (sequence RCName#"%i", 2, RegNum, 3)]>;
+def Tuple_4 : RegisterTuples<[sub_ps0, sub_ps1, sub_ps2, sub_ps3],
+ [(sequence RCName#"%i", 0, RegNum, 4),
+ (sequence RCName#"%i", 1, RegNum, 4),
+ (sequence RCName#"%i", 2, RegNum, 4),
+ (sequence RCName#"%i", 3, RegNum, 4)]>;
+}
+defm V : RegTuple<"V", 63>;
+defm M : RegTuple<"M", 15>;
+
+def VPR_2Tuple : VPRRegisterClass<(add VTuple_2), 2>;
+def VPR_3Tuple : VPRRegisterClass<(add VTuple_3), 3>;
+def VPR_4Tuple : VPRRegisterClass<(add VTuple_4), 4>;
+
+def MPR_2Tuple : MaskRegisterClass<(add MTuple_2), 2>;
+def MPR_3Tuple : MaskRegisterClass<(add MTuple_3), 3>;
+def MPR_4Tuple : MaskRegisterClass<(add MTuple_4), 4>;
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPURegisterPrepare.cpp b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPURegisterPrepare.cpp
new file mode 100644
index 0000000..2ec0dda
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPURegisterPrepare.cpp
@@ -0,0 +1,222 @@
+//===- TPURegisterPrepare.cpp - Process regs before RA ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass transforms the SSA to be more suitable for register allocation:
+//
+// * Propagates `void` to unused fifo pop results.
+// * Ties the destination and a source operand of VMXOR/VMAND/VMOR
+// instructions for JF/DF.
+// * Performs simple, early copy propagation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TPU.h"
+#include "TPUInstrInfo.h"
+#include "TPUSubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "tpu-register-prepare"
+
+namespace {
+class TPURegisterPrepare : public MachineFunctionPass {
+public:
+ static char ID;
+ TPURegisterPrepare() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF);
+ // Copies can occur right after selection DAG. Because we can't predicate
+ // them, we do early, simple copy propagation here. In SSA, if both sides of
+ // the copy are virtual registers and the register classes are the same, the
+ // uses can always use the right-hand side. Note "early", because
+ // MachineCopyProp in LLVM is post-RA, and register coalescing is too heavy
+ // and de-SSAs the code AFAIK.
+ bool earlyCopyProp(MachineInstr &MI,
+ SmallVector<MachineInstr *> &CopiesToRemove);
+ bool tie3OperandMask(MachineInstr &MI);
+ bool propagatePop3Void(MachineInstr &MI);
+ void validatePop3Void(MachineInstr &MI);
+ bool verifyHwMask(MachineInstr &MI);
+ StringRef getPassName() const override { return "TPU register prepare pass"; }
+
+private:
+ MachineRegisterInfo *MRI;
+ const TPUSubtarget *ST;
+};
+
+char TPURegisterPrepare::ID = 0;
+
+} // namespace
+
+INITIALIZE_PASS(TPURegisterPrepare, DEBUG_TYPE, "TPU register prepare", true,
+ false)
+
+bool TPURegisterPrepare::tie3OperandMask(MachineInstr &MI) {
+ bool Changed = false;
+ if (!ST->hasJfcTensorCore() && !ST->hasDfcTensorCore())
+ return Changed;
+ // RA constraint enforcement on the operands of certain modify mask
+ // operations for JF/DF sub-targets.
+ if (MI.getOpcode() == TPU::VMXOR || MI.getOpcode() == TPU::VMAND ||
+ MI.getOpcode() == TPU::VMOR) {
+ LLVM_DEBUG(dbgs() << "For instruction:: " << MI);
+ LLVM_DEBUG(dbgs() << "\ttying operand 0 (" << MI.getOperand(0) << ")\n"
+ << "\tto operand 1 (" << MI.getOperand(1) << ")\n");
+
+ MI.tieOperands(0, 1);
+ Changed = true;
+ }
+ return Changed;
+}
+
+void TPURegisterPrepare::validatePop3Void(MachineInstr &MI) {
+ assert(ST->isVfcSparseCore());
+ if (MI.getOpcode() != TPU::scVPOP3_XRF1)
+ return;
+ MachineInstr *DefMI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
+ const TPUInstrInfo *TII = ST->getInstrInfo();
+ if (!TII->needsPop3Word0Void(*ST, *DefMI))
+ return;
+ if (MI.getOperand(0).getReg() != TPU::Void)
+ llvm_unreachable("Word0 only supported on GLC and up.");
+}
+
+bool TPURegisterPrepare::propagatePop3Void(MachineInstr &MI) {
+ bool Changed = false;
+ // Propagates "Void" to unused vpop results.
+ if (!TPUInstrInfo::supportsFifoPopVoid(MI))
+ return false;
+ assert(MI.getNumExplicitDefs() == 3);
+ auto ReplaceByVoid = [this, &Changed](MachineOperand &MO) {
+ if (!MRI->use_empty(MO.getReg()))
+ return false;
+ MO.setReg(TPU::Void);
+ Changed = true;
+ return true;
+ };
+ auto &V0 = MI.getOperand(0);
+ auto &V1 = MI.getOperand(1);
+ auto &VM = MI.getOperand(2);
+ if (!ReplaceByVoid(V0))
+ ReplaceByVoid(V1);
+ ReplaceByVoid(VM);
+ return Changed;
+}
+
+bool TPURegisterPrepare::verifyHwMask(MachineInstr &MI) {
+ auto UsesHwMaskRegister = [this](const MachineInstr &MI) {
+ for (auto &MO : MI.uses()) {
+ if (!MO.isReg())
+ continue;
+ if (!ST->getRegisterInfo()->isEmbeddedMask(MO.getReg()))
+ continue;
+ assert(TPU::MPRRegClass.contains(MO.getReg()));
+ return true;
+ }
+ return false;
+ };
+ // TODO(hgreving): For instructions that don't support fixed hardware masks,
+ // we should insert a COPY or VMMOV at this point in order to legalize the
+ // instructions. A COPY should be sufficient as copy prop does not modify
+ // physical register copies. For now, we leave this as an assertion and call
+ // it "verify".
+ if (!UsesHwMaskRegister(MI))
+ return true;
+ return MI.getOpcode() == TPU::COPY || TPUInstrInfo::supportsEmbeddedMask(MI);
+}
+
+bool TPURegisterPrepare::earlyCopyProp(
+ MachineInstr &MI, SmallVector<MachineInstr *> &CopiesToRemove) {
+ if (MI.getOpcode() != TPU::COPY)
+ return false;
+ Register DestR = MI.getOperand(0).getReg();
+ Register SrcR = MI.getOperand(1).getReg();
+ if (!Register::isVirtualRegister(DestR))
+ return false;
+ const TargetRegisterClass *SrcRegClass;
+ if (ST->getRegisterInfo()->isEmbeddedMask(SrcR)) {
+ assert(TPU::MPRRegClass.contains(SrcR));
+ SrcRegClass = &TPU::MPRRegClass;
+ } else {
+ if (!Register::isVirtualRegister(SrcR))
+ return false;
+ SrcRegClass = MRI->getRegClass(SrcR);
+ }
+ if (MRI->getRegClass(DestR) != SrcRegClass)
+ return false;
+ SmallVector<MachineOperand *> UseOperands;
+ for (auto &MO : MRI->use_operands(DestR)) {
+ if (!ST->getRegisterInfo()->isEmbeddedMask(SrcR) ||
+ TPUInstrInfo::supportsEmbeddedMask(*MO.getParent()))
+ UseOperands.push_back(&MO);
+ }
+ for (auto *MO : UseOperands)
+ MO->setReg(SrcR);
+ if (ST->getRegisterInfo()->isEmbeddedMask(SrcR)) {
+ if (!MRI->use_empty(DestR)) {
+ const TPUInstrInfo *TII = ST->getInstrInfo();
+ AddDefaultPred(BuildMI(*MI.getParent(), MI.getIterator(),
+ MI.getDebugLoc(), TII->get(TPU::VMMOV), DestR)
+ .addReg(SrcR));
+ }
+ }
+ CopiesToRemove.push_back(&MI);
+ return true;
+}
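As a rough illustration of the propagation above (a toy SSA rewrite over made-up value names, not the MIR data structures; the embedded-mask and VMMOV special cases are omitted):

#include <cstdio>
#include <map>
#include <string>
#include <vector>

// Toy SSA "instructions": each use names a defining value.
struct Inst {
  std::string Def;
  std::vector<std::string> Uses;
  bool IsCopy;
};

int main() {
  std::vector<Inst> Body = {
      {"v1", {"v0"}, /*IsCopy=*/true},   // v1 = COPY v0
      {"v2", {"v1", "v1"}, false},       // v2 = VADD v1, v1
  };
  // Forward each copy's uses to its source, then drop the copy.
  std::map<std::string, std::string> Fwd;
  std::vector<Inst> Out;
  for (Inst I : Body) {
    for (auto &U : I.Uses)
      if (Fwd.count(U))
        U = Fwd[U];
    if (I.IsCopy) {
      Fwd[I.Def] = I.Uses[0];
      continue;
    }
    Out.push_back(I);
  }
  std::printf("%s = VADD %s, %s\n", Out[0].Def.c_str(),
              Out[0].Uses[0].c_str(), Out[0].Uses[1].c_str());
  // prints: v2 = VADD v0, v0
}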
+
+bool TPURegisterPrepare::runOnMachineFunction(MachineFunction &MF) {
+ bool Changed = false;
+ MRI = &MF.getRegInfo();
+ ST = &MF.getSubtarget<TPUSubtarget>();
+
+ // If we don't do this, stale cached hints might cause nondeterminism later
+ // when rewriting the regs.
+ const TPURegisterInfo *TRI = ST->getRegisterInfo();
+ TRI->clear();
+
+ SmallVector<MachineInstr *> CopiesToRemove;
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ if (earlyCopyProp(MI, CopiesToRemove)) {
+ Changed = true;
+ continue;
+ }
+ if (tie3OperandMask(MI)) {
+ Changed = true;
+ continue;
+ }
+ if (propagatePop3Void(MI)) {
+ Changed = true;
+ continue;
+ }
+ // TODO(hgreving): Bug. This check is useless. We moved this pass before
+ // register coalescing.
+ if (verifyHwMask(MI)) {
+ Changed = true;
+ continue;
+ }
+ }
+ }
+ for (auto &CopyMI : CopiesToRemove)
+ CopyMI->eraseFromParent();
+
+ if (!ST->isVfcSparseCore())
+ return Changed;
+
+ // We are reusing the same intrinsic for both current SparseCore subtargets,
+ // and as an exception we rely on the optimizer to remove any dangling
+ // invalid use.
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ validatePop3Void(MI);
+ }
+ }
+
+ return Changed;
+}
+
+Pass *llvm::createTPURegisterPreparePass() { return new TPURegisterPrepare(); }
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUResourceSolver.cpp b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUResourceSolver.cpp
new file mode 100644
index 0000000..c5b333a
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUResourceSolver.cpp
@@ -0,0 +1,776 @@
+//===-- TPUResourceSolver.cpp - Resource Solver class ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the resource solver for a single
+// cycle's set of constraints. It currently uses a brute-force solution. It can
+// be improved if it causes compile-time problems, but it runs at a similar
+// speed to the DFA packetizer on the current set of benchmarks.
+//
+// The algorithm follows this simple logic:
+// * The solver keeps an array of all allowed allocation states.
+// * Every time we add a new instruction, we try all the allowed slot
+// allocations in all the currently allowed states and create a new array of
+// allowed states.
+// * If there are any allowed states, the allocation is valid. In general it
+// doesn't matter which state we pick. When we turn on resource tracking, we
+// keep track of the slots used by each instruction. In this case we use the
+// first state found as the final state.
+//
+//===----------------------------------------------------------------------===//
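A minimal standalone sketch of the state-set idea described above: each state is a bitmask of occupied slots, and adding an instruction with a set of allowed slots grows every surviving state by every free allowed slot (resource tracking omitted; the slot masks in main are illustrative):

#include <cstdint>
#include <cstdio>
#include <set>

// Returns the set of occupied-slot bitmasks still reachable after placing an
// instruction whose allowed slots are given by AllowedSlots.
std::set<uint32_t> addInstruction(const std::set<uint32_t> &States,
                                  uint32_t AllowedSlots) {
  std::set<uint32_t> Next;
  for (uint32_t State : States)
    for (uint32_t Slots = AllowedSlots; Slots != 0; Slots &= Slots - 1) {
      uint32_t Bit = Slots & -Slots;   // lowest allowed slot still to try
      if ((State & Bit) == 0)          // slot free in this state?
        Next.insert(State | Bit);
    }
  return Next;
}

int main() {
  std::set<uint32_t> States = {0};             // empty bundle
  States = addInstruction(States, 0b0011);     // instr A: slot 0 or 1
  States = addInstruction(States, 0b0001);     // instr B: slot 0 only
  // A valid bundle exists iff any state survives; here A must take slot 1.
  std::printf("valid: %s\n", States.empty() ? "no" : "yes");
}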
+
+#include "TPUResourceSolver.h"
+#include "TPUInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+
+#define DEBUG_TYPE "tpu-bundle-tracker"
+using namespace llvm;
+
+STATISTIC(NumFailedAddVs, "Failed to add Vs slot");
+
+cl::opt<int>
+ MaxImms("tpu-bundle-num-imms", cl::init(0),
+ cl::desc("Override number of immediate fields in TPU bundles"));
+
+namespace {
+// Return a packed (16-bit Pufferfish, or 20-bit Viperfish) immediate encoding
+// for ImmVal, or none if none exists.
+std::optional<ResourceSolver::ImmediateSlot>
+getPackedImm(unsigned ImmVal, OpEnc::OpEncodings SpecEnc,
+ ImmediateEncoding &Encoding, const TPUSubtarget &ST) {
+ // IsVfc here only refers to TensorCore
+ bool IsVfc = ST.hasVfcTensorCore();
+ bool IsPfc = ST.hasPfcTensorCore();
+ bool IsSc = ST.isSparseCore();
+ bool IsBc = ST.isPxcBarnaCore();
+ switch (SpecEnc) {
+ case OpEnc::SY:
+ case OpEnc::VY: {
+ bool IsInVectorSlot = SpecEnc == OpEnc::VY;
+ // Low-16/20 zext or oneext encoding.
+ if (ImmVal >> ST.getImmediateSizeInBits() == 0) {
+ Encoding.Kind = TPUMCImmKind::VK_TPU_zext;
+ Encoding.Encoding = IsInVectorSlot ? getFirstVyZeroExtEncoding(IsVfc)
+ : getFirstSyZeroExtEncoding(IsVfc);
+ return ResourceSolver::ImmediateSlot(
+ ImmVal & (((1 << ST.getImmediateSizeInBits()) - 1)));
+ }
+ if (ImmVal >> ST.getImmediateSizeInBits() ==
+ (1 << (32 - ST.getImmediateSizeInBits())) - 1) {
+ Encoding.Kind = TPUMCImmKind::VK_TPU_oneext;
+ Encoding.Encoding = IsInVectorSlot ? getFirstVyOneExtEncoding(IsVfc)
+ : getFirstSyOneExtEncoding(IsVfc);
+ return ResourceSolver::ImmediateSlot(
+ ImmVal & (((1 << ST.getImmediateSizeInBits()) - 1)));
+ }
+ // High-16/20 encoding.
+ if ((ImmVal & ((1 << (32 - ST.getImmediateSizeInBits())) - 1)) == 0) {
+ Encoding.Kind = IsVfc || IsSc ? TPUMCImmKind::VK_TPU_shl12
+ : TPUMCImmKind::VK_TPU_shl16;
+ Encoding.Encoding = IsInVectorSlot ? getFirstVyShlEncoding(IsVfc)
+ : getFirstSyShlEncoding(IsVfc);
+ return ResourceSolver::ImmediateSlot(ImmVal >>
+ (32 - ST.getImmediateSizeInBits()));
+ }
+ break;
+ }
+ case OpEnc::MemOffset: {
+ Encoding.Kind = TPUMCImmKind::VK_TPU_zext;
+ Encoding.Encoding = getFirstMemOffsetZeroExtEncoding(IsPfc);
+
+ return ResourceSolver::ImmediateSlot(
+ ImmVal & ((1 << ST.getImmediateSizeInBits()) - 1));
+ }
+ case OpEnc::MemStride: {
+ Encoding.Kind = TPUMCImmKind::VK_TPU_zext;
+ Encoding.Encoding = getFirstMemStrideZeroExtEncoding(IsVfc, IsPfc);
+
+ return ResourceSolver::ImmediateSlot(
+ ImmVal & ((1 << ST.getImmediateSizeInBits()) - 1));
+ }
+ case OpEnc::SublaneMask: {
+ Encoding.Kind = TPUMCImmKind::VK_TPU_zext;
+ assert(!IsSc && "Sparsecore does not support single-slot immediate "
+ "values for sublane masks");
+ Encoding.Encoding = getFirstSublaneMaskZeroExtEncoding(IsVfc);
+
+ return ResourceSolver::ImmediateSlot(
+ ImmVal & ((1 << ST.getImmediateSizeInBits()) - 1));
+ }
+ case OpEnc::BcVldVstBase:
+ case OpEnc::Normal: {
+ Encoding.Kind = TPUMCImmKind::VK_TPU_zext;
+ if (IsBc) {
+ Encoding.Encoding = 0;
+ } else {
+ assert(!IsSc && "OpEnc::Normal on Sparsecore not supported");
+ // TODO(sdasgup): Currently, OpEnc::Normal is only used in some VF
+ // instructions. Since only SC and BC are using explicit encoding info, we
+ // don't need anything else here. We might add custom encodings for such
+ // operands in the future.
+ }
+
+ return ResourceSolver::ImmediateSlot(
+ ImmVal & ((1 << ST.getImmediateSizeInBits()) - 1));
+ }
+ case OpEnc::Plain:
+ llvm_unreachable("Plain immediates not expected here.");
+ break;
+ default:
+ break;
+ }
+ // No packed encoding.
+ return {};
+}
+} // namespace
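For the SY/VY case above, a rough illustration of which 32-bit values get a single-slot packed encoding when the immediate field is 16 bits wide (the constants below are made up):

#include <cassert>
#include <cstdint>

// Sketch of the three single-slot packings for a 16-bit immediate field.
enum class Pack { Zext, Oneext, Shl16, None };

Pack classify(uint32_t Imm) {
  if ((Imm >> 16) == 0)        return Pack::Zext;    // low 16, zero-extended
  if ((Imm >> 16) == 0xFFFFu)  return Pack::Oneext;  // low 16, one-extended
  if ((Imm & 0xFFFFu) == 0)    return Pack::Shl16;   // high 16, low bits zero
  return Pack::None;           // needs a wide immediate slot
}

int main() {
  assert(classify(0x0000ABCDu) == Pack::Zext);
  assert(classify(0xFFFF1234u) == Pack::Oneext);
  assert(classify(0x43210000u) == Pack::Shl16);
  assert(classify(0x12345678u) == Pack::None);
}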
+
+Solver::Solver() { clear(); }
+
+void Solver::setTrackResource(bool Track) {
+ TrackResource = Track;
+ if (TrackResource) {
+ assert(ValidCombinations.size() == 1 && *ValidCombinations.begin() == 0 &&
+ "Cannot change tracking resource while tracking");
+ }
+}
+
+void Solver::addSlot(InstrStage::FuncUnits SlotsAllowed, bool NewInst) {
+ SetVector<unsigned> NewCombinations;
+ MapVector<unsigned, SmallVector<uint32_t, 16>> NewSlotTracking;
+ bool SlotTrackingUpdated = false;
+ // Go through all valid states and test which one would be compatible with
+ // any of the slots allowed for the new resource.
+ for (int VC : ValidCombinations) {
+ unsigned Slots = SlotsAllowed;
+ // Loop until we've tried all the allowed slots for this resource.
+ while (Slots != 0) {
+ unsigned Bit = 1 << countTrailingZeros(Slots);
+ assert((Slots & Bit) != 0);
+ Slots &= ~Bit;
+ // Check if slot is already used in this combination.
+ if ((VC & Bit) != 0)
+ continue;
+ unsigned NewVC = VC | Bit;
+ // We found a valid state with space for the new resource.
+ if (NewCombinations.insert(NewVC)) {
+ // If we are tracking resource keep a mapping of instruction and slots
+ // allocated.
+ if (TrackResource) {
+ SlotTrackingUpdated = true;
+ NewSlotTracking[NewVC] = GlobalSlotTracking[VC];
+ if (NewInst) {
+ NewSlotTracking[NewVC].push_back(Bit);
+ } else {
+ NewSlotTracking[NewVC].back() |= Bit;
+ }
+ }
+ }
+ }
+ }
+ // Replace the state of the solver with the new state calculated.
+ ValidCombinations = std::move(NewCombinations);
+ if (TrackResource && SlotTrackingUpdated) {
+ GlobalSlotTracking = std::move(NewSlotTracking);
+ assert(GlobalSlotTracking.size() == ValidCombinations.size());
+ }
+}
+
+bool Solver::isValid() const { return !ValidCombinations.empty(); }
+
+unsigned Solver::getFirstCombination() const {
+ return *ValidCombinations.begin();
+}
+
+void Solver::clear() {
+ ValidCombinations.clear();
+ ValidCombinations.insert(0);
+ if (TrackResource)
+ GlobalSlotTracking.clear();
+}
+
+#ifndef NDEBUG
+int Solver::getCurrentMaxInstrIdx() const {
+ if (GlobalSlotTracking.empty())
+ return 0;
+ int Size = GlobalSlotTracking.begin()->second.size();
+ for (auto &R : GlobalSlotTracking) {
+ assert(R.second.size() == Size);
+ }
+ return Size;
+}
+#endif
+
+unsigned Solver::getSlotsUsed(unsigned Idx) const {
+ // Take the first valid solution as the one we use.
+ assert(!ValidCombinations.empty() && TrackResource);
+ assert(!GlobalSlotTracking.empty());
+ return GlobalSlotTracking.begin()->second[Idx];
+}
+
+ResourceSolver::ResourceSolver(const TPUSubtarget &st)
+ : ST(&st), InstrItins(st.getInstrItineraryData()),
+ HasPxcOrVfSyEncoding(ST->hasPfcTensorCore() || ST->isPxcBarnaCore() ||
+ ST->isSparseCore() || ST->hasVfcTensorCore()) {
+ GlobalSolvedInstrs.clear();
+ GlobalVs.resize(ST->getNumVs());
+ GlobalSolver.setTrackResource(true);
+ GlobalImms.resize(MaxImms > 0 ? MaxImms : ST->getNumberImmediateSlots());
+ // TODO(b/181709129): A more thorough solution would be to have a real solver.
+ // We also pay a small price on VF by not sharing imm4/5 immediate slots with
+ // scalar instructions if those instructions were added first.
+ ImmsAllocOrder = ST->getImmediateSlotAllocOrder();
+ if (MaxImms > 0) {
+ auto It = std::remove_if(ImmsAllocOrder.begin(), ImmsAllocOrder.end(),
+ [](int I) { return I >= MaxImms; });
+ ImmsAllocOrder.erase(It, ImmsAllocOrder.end());
+ }
+ assert(MaxImms <= ST->getNumberImmediateSlots());
+}
+
+void ResourceSolver::addMI(const MachineInstr &MI) {
+ if (!addMIToSolver(&MI.getDesc(), GlobalSolver))
+ return;
+ GlobalSolvedInstrs.push_back(&MI);
+ assert(GlobalSolver.isValid());
+ assert(GlobalSolvedInstrs.size() == GlobalSolver.getCurrentMaxInstrIdx());
+ std::fill(std::begin(GlobalVs), std::end(GlobalVs),
+ std::make_pair(TPU::NoRegister, true));
+ SparseCoreMCBundleInfo BI;
+ int Idx = 0;
+ for (; Idx < GlobalSolvedInstrs.size() - 1; Idx++) {
+ if (!canAddVs(*GlobalSolvedInstrs[Idx],
+ ST->isSparseCore() &&
+ BI.hasScalarMiscSlot(*ST, GlobalSolver.getSlotsUsed(Idx)),
+ GlobalVs, GlobalVsTracking))
+ llvm_unreachable("Must not happen, internal error.");
+ }
+ canAddVs(MI,
+ ST->isSparseCore() &&
+ BI.hasScalarMiscSlot(*ST, GlobalSolver.getSlotsUsed(Idx)),
+ GlobalVs, GlobalVsTracking);
+}
+
+bool ResourceSolver::canAddMI(const MachineInstr &MI) const {
+ Solver CopySolver = GlobalSolver;
+ if (!addMIToSolver(&MI.getDesc(), CopySolver))
+ return true;
+ if (!CopySolver.isValid())
+ return false;
+#ifndef NDEBUG
+ // Sanity check
+ unsigned Slots = 0;
+ assert(GlobalSolvedInstrs.size() + 1 == CopySolver.getCurrentMaxInstrIdx());
+ if (int MaxIdx = GlobalSolvedInstrs.size() + 1; MaxIdx > 0) {
+ for (int Idx = 0; Idx < MaxIdx; Idx++)
+ Slots |= CopySolver.getSlotsUsed(Idx);
+ assert(Slots == CopySolver.getFirstCombination());
+ }
+#endif
+ SmallVector<std::pair<Register, bool>> CopyVs;
+ CopyVs.resize(ST->getNumVs());
+ std::fill(std::begin(CopyVs), std::end(CopyVs),
+ std::make_pair(TPU::NoRegister, true));
+ VsTrackingMapT LocalVsTracking;
+ assert(GlobalSolvedInstrs.size() == GlobalSolver.getCurrentMaxInstrIdx());
+ SparseCoreMCBundleInfo BI;
+ int Idx = 0;
+ for (; Idx < GlobalSolvedInstrs.size(); Idx++) {
+ if (!canAddVs(*GlobalSolvedInstrs[Idx],
+ ST->isSparseCore() &&
+ BI.hasScalarMiscSlot(*ST, CopySolver.getSlotsUsed(Idx)),
+ CopyVs, LocalVsTracking)) {
+ // FIXME(b/181709129): This shouldn't happen, but currently can: the last
+ // previously added instruction may have changed the solution and pushed a
+ // previously added instruction into slot_sm, creating a different VS
+ // solution. This is not good at all, and we need to fix it. I don't
+ // currently know of any miscompile due to this bug. The VS solution should
+ // be part of the solver.
+ return false;
+ }
+ }
+ if (!canAddVs(MI,
+ ST->isSparseCore() &&
+ BI.hasScalarMiscSlot(*ST, CopySolver.getSlotsUsed(Idx)),
+ CopyVs, LocalVsTracking)) {
+ NumFailedAddVs++;
+ return false;
+ }
+ return true;
+}
+
+bool ResourceSolver::canAddVs(const MachineInstr &MI, bool IsScalarMisc,
+ SmallVector<std::pair<Register, bool>> &Vs,
+ VsTrackingMapT &VsTracking) const {
+ auto IsGPRReg = [&MI](Register R) {
+ if (Register::isVirtualRegister(R)) {
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ if (MRI.getRegClass(R) == &TPU::GPRRegClass)
+ return true;
+ } else if (TPU::GPRRegClass.contains(R)) {
+ return true;
+ }
+ return false;
+ };
+ // The following code handles special VS constraint register operands.
+ for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); OpIdx++) {
+ const MachineOperand &MO = MI.getOperand(OpIdx);
+ if (!MO.isReg() || MO.isImplicit())
+ continue;
+ if (!IsGPRReg(MO.getReg()))
+ continue;
+ std::optional<OpEnc::OpEncodings> SE =
+ getSpecialOpEncoding(MI.getDesc(), OpIdx);
+ int ConstrainVs = -1;
+ if (SE.has_value()) {
+ switch (*SE) {
+ case OpEnc::VS0:
+ ConstrainVs = 0;
+ break;
+ case OpEnc::VS1:
+ ConstrainVs = 1;
+ break;
+ case OpEnc::VS2:
+ ConstrainVs = 2;
+ break;
+ case OpEnc::VS3:
+ ConstrainVs = 3;
+ break;
+ case OpEnc::SM_X:
+ if (!IsScalarMisc)
+ // The sm_x, sm_y semantics only apply to scalar misc instructions.
+ continue;
+ ConstrainVs = 2;
+ break;
+ case OpEnc::SM_Y:
+ if (!IsScalarMisc)
+ // The sm_x, sm_y semantics only apply to scalar misc instructions.
+ continue;
+ ConstrainVs = 3;
+ break;
+ default:
+ break;
+ }
+ if (ConstrainVs != -1) {
+ if (!canAddVs(MI, MO.getReg(), OpIdx, ConstrainVs, Vs, VsTracking))
+ return false;
+ }
+ }
+ }
+ if (!TPUInstrInfo::isVectorInstruction(MI.getDesc()))
+ return true;
+ for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); OpIdx++) {
+ const MachineOperand &MO = MI.getOperand(OpIdx);
+ if (!MO.isReg())
+ continue;
+ if (!IsGPRReg(MO.getReg()))
+ continue;
+ if (!canAddVs(MI, MO.getReg(), OpIdx, /* No Vs slot constraint */ -1, Vs,
+ VsTracking))
+ return false;
+ }
+ return true;
+}
+
+bool ResourceSolver::findVsSlot(const MachineInstr *MI, Register Sreg,
+ unsigned OpIdx, int ConstrainVs,
+ SmallVector<std::pair<Register, bool>> &Vs,
+ VsTrackingMapT &VsTracking) const {
+ for (unsigned I = 0; I < Vs.size(); ++I) {
+ if (ConstrainVs != -1) {
+ if (ConstrainVs != I)
+ continue;
+ }
+ // We allow sharing currently even for virtual registers. Some
+ // instructions like DMA and Stream rely on being able to share their
+ // operands. We should keep this in mind in case there is something
+ // funny going on due to the register changing. There is no known case
+ // at this time.
+ if (Vs[I].first == Sreg &&
+ Vs[I].second == TPUInstrInfo::isVectorInstruction(*MI)) {
+ VsTracking[MI].push_back(
+ std::make_tuple(Sreg, OpIdx, ConstrainVs != -1, I));
+ return true;
+ }
+ }
+ for (unsigned I = 0; I < Vs.size(); ++I) {
+ if (ConstrainVs != -1) {
+ if (ConstrainVs != I)
+ continue;
+ }
+ if (Vs[I].first == TPU::NoRegister) {
+ Vs[I].first = Sreg;
+ Vs[I].second = TPUInstrInfo::isVectorInstruction(*MI);
+ VsTracking[MI].push_back(
+ std::make_tuple(Sreg, OpIdx, ConstrainVs != -1, I));
+ return true;
+ }
+ }
+ return false;
+}
+
+bool ResourceSolver::canAddVs(const MachineInstr &MI, Register Sreg,
+ unsigned OpIdx, int ConstrainVs,
+ SmallVector<std::pair<Register, bool>> &Vs,
+ VsTrackingMapT &VsTracking) const {
+ SmallVector<std::pair<Register, bool>> NewVs;
+ VsTrackingMapT NewVsTracking;
+ NewVs.resize(ST->getNumVs());
+ auto AddExistingVs = [this, &NewVsTracking, &NewVs,
+ &VsTracking](bool FilterConstraintVs) {
+ for (auto &[ExistMI, VSentries] : VsTracking) {
+ for (auto &VSentry : VSentries) {
+ auto ExistSreg = std::get<0>(VSentry);
+ assert(ExistSreg != TPU::NoRegister);
+ bool ConstraintVs = std::get<2>(VSentry);
+ if (FilterConstraintVs != ConstraintVs)
+ continue;
+ assert(ExistMI);
+ unsigned Slot = std::get<3>(VSentry);
+ if (!findVsSlot(ExistMI, ExistSreg, std::get<1>(VSentry),
+ FilterConstraintVs ? Slot : -1, NewVs, NewVsTracking))
+ return false;
+ }
+ }
+ return true;
+ };
+ if (!AddExistingVs(true))
+ llvm_unreachable("Internal error: previously VS slots don't bundle.");
+ if (ConstrainVs != -1) {
+ // We actually only need this case (may or may not constrain), but we keep
+ // the if-else to avoid having to change dozens of tests that would differ
+ // in their VS slot assignments.
+ // FIXME(hgreving): Eventually remove this if-else.
+ if (!findVsSlot(&MI, Sreg, OpIdx, ConstrainVs, NewVs, NewVsTracking))
+ return false;
+ if (!AddExistingVs(false))
+ return false;
+ } else {
+ // See comment above.
+ if (!AddExistingVs(false))
+ return false;
+ if (!findVsSlot(&MI, Sreg, OpIdx, ConstrainVs, NewVs, NewVsTracking))
+ return false;
+ }
+ Vs = NewVs;
+ VsTracking = NewVsTracking;
+ return true;
+}
+
+void ResourceSolver::addImm(const MachineInstr &MI) {
+ SmallVector<ImmediateEncoding> &IE = GlobalInstrsToImmEncoding[&MI];
+ SmallVector<std::pair<unsigned, ImmediateSlot>> ImmDiff;
+ canAddImmInternal(MI, ImmDiff, IE);
+ for (auto &I : ImmDiff) {
+ assert(!GlobalImms[I.first].has_value());
+ GlobalImms[I.first] = I.second;
+ }
+}
+
+bool ResourceSolver::canAddImmInternal(
+ const MachineInstr &MI,
+ SmallVector<std::pair<unsigned, ImmediateSlot>> &Diff,
+ SmallVector<ImmediateEncoding> &IE) {
+ uint64_t SlotMask;
+ uint64_t OperMask;
+ if (!TPUInstrInfo::requiresImmediateSlots(MI, SlotMask, OperMask))
+ return true;
+ unsigned ImmOpIdx = 0;
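+ // ImmOpIdx counts only the operands that can take an immediate slot; the
+ // bits of OperMask are indexed in this compressed operand space.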
+ for (unsigned OpIdx = 0; OpIdx < MI.getNumExplicitOperands(); OpIdx++) {
+ IE.push_back({TPUMCImmKind::VK_TPU_none, 0, 0});
+ auto OperandType = MI.getDesc().OpInfo[OpIdx].OperandType;
+ // Has to be in sync with the 'if' in requiresImmediateSlots.
+ if (!isTPUImmediate(OperandType) &&
+ (OperandType != MCOI::OPERAND_IMMEDIATE) &&
+ (OperandType != MCOI::OPERAND_UNKNOWN /*BR operand*/) &&
+ (OperandType != MCOI::OPERAND_PCREL /*BRrel operand*/)) {
+ continue;
+ }
+ // This is an operand that may need an immediate slot.
+ if (((1ULL << ImmOpIdx) & OperMask) != 0) {
+ if (!canAddImmInternal(MI, OpIdx, Diff, IE.back(), SlotMask)) {
+ return false;
+ }
+ OperMask &= ~(1ULL << ImmOpIdx);
+ }
+ ImmOpIdx++;
+ }
+ assert((OperMask == 0) &&
+        "Not all operands requiring immediate slots were processed");
+ return true;
+}
+
+bool ResourceSolver::canAddImm(
+ const MachineInstr &MI,
+ SmallVector<std::pair<unsigned, ImmediateSlot>> &Diff) {
+ SmallVector<ImmediateEncoding> IE;
+ return canAddImmInternal(MI, Diff, IE);
+}
+
+bool ResourceSolver::canAddImmInternal(
+ const MachineInstr &MI, unsigned OpIdx,
+ SmallVector<std::pair<unsigned, ImmediateSlot>> &Diff,
+ ImmediateEncoding &Encoding, uint64_t SlotMask) {
+ const MachineOperand &MO = MI.getOperand(OpIdx);
+ if (!MO.isImm() && !MO.isFPImm() && !MO.isMBB() && !MO.isGlobal())
+ return true;
+ auto IsInvalidForSlot = [SlotMask](unsigned SlotIdx) {
+ return ((1ULL << SlotIdx) & SlotMask) == 0;
+ };
+
+ std::optional<TPUOperandTypeRecord> IR = getOperandTypeRecord(
+ static_cast<TPUOp::OperandType>(MI.getDesc().OpInfo[OpIdx].OperandType));
+ std::optional<OpEnc::OpEncodings> SE;
+ if (IR.has_value()) {
+ SE = static_cast<OpEnc::OpEncodings>((*IR).OpEncoding);
+ } else {
+ SE = getSpecialOpEncoding(MI.getDesc(), OpIdx);
+ }
+ if (!SE.has_value()) {
+ // If we failed to get a special encoding from getSpecialOpEncoding(), we
+ // fall back on the old mechanism for immediate classification. Eventually
+ // we should get rid of it and use only getSpecialOpEncoding().
+ if (TPUInstrInfo::isInVectorSlot(MI.getDesc())) {
+ SE = OpEnc::VY;
+ } else if (TPUInstrInfo::isInScalarSlot(MI.getDesc())) {
+ SE = OpEnc::SY;
+ }
+ }
+
+ // Special handling of globals/block addresses. Those need a 16/20-bit slot
+ // and cannot be shared, because we don't know their values yet.
+ bool isKnownImm =
+ (MI.isBranch() || MI.isCall()) ? false : MO.isImm() || MO.isFPImm();
+ if (!isKnownImm) {
+ for (auto I : ImmsAllocOrder) {
+ if (IsInvalidForSlot(I))
+ continue;
+ if (!GlobalImms[I].has_value()) {
+ bool SlotAlreadyUsed = false;
+ // Check that this instruction hasn't already reserved the slot.
+ for (const auto &D : Diff) {
+ if (D.first == I) {
+ SlotAlreadyUsed = true;
+ break;
+ }
+ }
+ if (SlotAlreadyUsed)
+ continue;
+ ImmediateSlot Slot;
+ // On some architectures, we still need the immediate encoding, for
+ // example on SparseCore.
+ if (SE.has_value()) {
+ // We pass a fake '0' value. We assume that the unknown immediate can
+ // be encoded in a single zero-extended immediate slot.
+ Slot = *getPackedImm(0, *SE, Encoding, *ST);
+ Encoding.ImmBase = I;
+ if (*SE != OpEnc::BcVldVstBase)
+ Encoding.Encoding += I;
+ }
+ Slot.IsUnknown = true;
+ Diff.push_back({I, Slot});
+ return true;
+ }
+ }
+ return false;
+ }
+
+ if (!SE.has_value()) {
+ // We don't know how to encode this; assume the immediate will be encoded
+ // without an immediate slot.
+ return true;
+ }
+
+ unsigned ImmVal =
+ MO.isImm() ? MO.getImm()
+ : FloatToBits(MO.getFPImm()->getValueAPF().convertToFloat());
+
+ // An encodable immediate is directly bundleable without an imm slot using
+ // embedded encodings.
+ switch (*SE) {
+ case OpEnc::VY:
+ case OpEnc::SY: {
+ std::optional<uint32_t> E =
+ (*SE == OpEnc::VY) ? getVyEncodings(ST, ImmVal)
+ : getSyEncodings(HasPxcOrVfSyEncoding, ImmVal);
+ if (E.has_value()) {
+ Encoding = {TPUMCImmKind::VK_TPU_embed, 0,
+ static_cast<uint8_t>(E.value())};
+ return true;
+ }
+ break;
+ }
+ case OpEnc::MemOffset: {
+ std::optional<uint32_t> E = getMemOffsetEncodings(
+ ST->hasVfcTensorCore(), ST->isSparseCore(), ImmVal);
+
+ if (E.has_value()) {
+ Encoding = {TPUMCImmKind::VK_TPU_embed, 0,
+ static_cast<uint8_t>(E.value())};
+ return true;
+ }
+ break;
+ }
+ case OpEnc::MemStride: {
+ std::optional<uint32_t> E =
+ getMemStrideEncodings(ST->hasVfcTensorCore(), ST->isSparseCore(),
+ ST->hasPfcTensorCore(), ImmVal);
+
+ if (E.has_value()) {
+ Encoding = {TPUMCImmKind::VK_TPU_embed, 0,
+ static_cast<uint8_t>(E.value())};
+ return true;
+ }
+ break;
+ }
+ case OpEnc::SublaneMask: {
+ std::optional<uint32_t> E = getSublaneMaskEncodings(
+ ST->hasVfcTensorCore(), ST->isSparseCore(), ImmVal);
+
+ if (E.has_value()) {
+ Encoding = {TPUMCImmKind::VK_TPU_embed, 0,
+ static_cast<uint8_t>(E.value())};
+ return true;
+ }
+ break;
+ }
+ case OpEnc::Plain:
+ llvm_unreachable("Plain immediates not expected here.");
+ break;
+ default:
+ break;
+ }
+
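+ // Returns the value currently held by slot I, considering both committed
+ // global slots and slots reserved in Diff by this instruction.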
+ auto GetSlotValue = [&](unsigned I) -> std::optional<ImmediateSlot> {
+ if (GlobalImms[I].has_value()) {
+ return *GlobalImms[I];
+ }
+ for (const auto &D : Diff) {
+ if (D.first == I) {
+ return D.second;
+ }
+ }
+ return {};
+ };
+
+ // First, try to re-use an existing slot; check all slots.
+ if (auto Packed = getPackedImm(ImmVal, *SE, Encoding, *ST)) {
+ for (unsigned I = 0; I < GlobalImms.size(); ++I) {
+ if (IsInvalidForSlot(I))
+ continue;
+ std::optional<ImmediateSlot> V = GetSlotValue(I);
+ if (!V.has_value())
+ continue;
+ if (*V == *Packed) {
+ Encoding.ImmBase = I;
+ if (*SE != OpEnc::BcVldVstBase)
+ Encoding.Encoding += I;
+ return true;
+ }
+ }
+ }
+
+ // Try to allocate a slot.
+ if (auto Packed = getPackedImm(ImmVal, *SE, Encoding, *ST)) {
+ for (auto I : ImmsAllocOrder) {
+ if (IsInvalidForSlot(I))
+ continue;
+ std::optional<ImmediateSlot> V = GetSlotValue(I);
+ if (!V.has_value()) {
+ Diff.push_back({I, *Packed});
+ Encoding.ImmBase = I;
+ if (*SE != OpEnc::BcVldVstBase)
+ Encoding.Encoding += I;
+ return true;
+ }
+ }
+ return false;
+ }
+
+ // This is a 32-bit immediate spread across two 16-bit fields.
+ switch (*SE) {
+ case OpEnc::VY:
+ case OpEnc::SY:
+ case OpEnc::Normal32: {
+ // IsVfc here only refers to TensorCore
+ bool IsVfc = ST->hasVfcTensorCore();
+ Encoding.Kind = TPUMCImmKind::VK_TPU_32;
+ Encoding.Encoding = (*SE == OpEnc::VY) ? getFirstVyImm32Encoding(IsVfc)
+ : getFirstSyImm32Encoding(IsVfc);
+ ImmediateSlot Lo(ImmVal & 0xFFFF);
+ ImmediateSlot Hi(ImmVal >> 16);
+ bool SkipNextI = false;
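+ // Walk the allocation order in pairs: the 32-bit value occupies two
+ // adjacent 16-bit slots (I and I + 1), so every other entry is skipped.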
+ for (auto I : ImmsAllocOrder) {
+ if (SkipNextI) {
+ SkipNextI = false;
+ continue;
+ }
+ SkipNextI = true;
+ if (IsInvalidForSlot(I) || IsInvalidForSlot(I + 1))
+ continue;
+ std::optional<ImmediateSlot> VLo = GetSlotValue(I);
+ std::optional<ImmediateSlot> VHi = GetSlotValue(I + 1);
+ if ((!VLo.has_value() || *VLo == Lo) &&
+ (!VHi.has_value() || *VHi == Hi)) {
+ if (!VLo.has_value())
+ Diff.push_back({I, Lo});
+ if (!VHi.has_value())
+ Diff.push_back({I + 1, Hi});
+ Encoding.ImmBase = I;
+ Encoding.Encoding += I / 2;
+ return true;
+ }
+ }
+ break;
+ }
+ case OpEnc::Plain:
+ llvm_unreachable("Plain immediates not expected here.");
+ break;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+void ResourceSolver::clear() {
+ GlobalSolver.clear();
+ GlobalSolvedInstrs.clear();
+ std::fill(std::begin(GlobalVs), std::end(GlobalVs),
+ std::make_pair(TPU::NoRegister, true));
+ GlobalVsTracking.clear();
+ GlobalInstrsToImmEncoding.clear();
+ std::fill(std::begin(GlobalImms), std::end(GlobalImms),
+ std::optional<ImmediateSlot>());
+}
+
+unsigned ResourceSolver::getSlotsUsed(unsigned Idx) const {
+ return GlobalSolver.getSlotsUsed(Idx);
+}
+
+SmallVector<std::pair<unsigned, VSEncoding>>
+ResourceSolver::getVsUsed(const MachineInstr *MI) {
+ SmallVector<std::pair<unsigned, VSEncoding>> VsTracking;
+ for (auto &VSentries : GlobalVsTracking[MI]) {
+ VSEncoding Encoding = CreateVSFromOrdinal(std::get<3>(VSentries));
+ VsTracking.push_back(std::make_pair(std::get<1>(VSentries), Encoding));
+ }
+ return VsTracking;
+}
+
+bool ResourceSolver::addMIToSolver(const MCInstrDesc *MID, Solver &S) const {
+ unsigned SchedClass = MID->getSchedClass();
+ bool NewInst = true;
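+ // NewInst is true only for the first itinerary stage, telling the solver
+ // that a new instruction starts; if no stage was added it stays true and
+ // the function returns false.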
+ if (InstrItins && !InstrItins->isEmpty()) {
+ for (const InstrStage &IS : make_range(InstrItins->beginStage(SchedClass),
+ InstrItins->endStage(SchedClass))) {
+ InstrStage::FuncUnits funcUnits = IS.getUnits();
+ S.addSlot(funcUnits, NewInst);
+ NewInst = false;
+ }
+ }
+ return !NewInst;
+}
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUResourceSolver.h b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUResourceSolver.h
new file mode 100644
index 0000000..68e05c0
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUResourceSolver.h
@@ -0,0 +1,157 @@
+//===-- TPUResourceSolver.h - Solver for resource constraints ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the solver class used to allocate resource slots during
+// bundle packing. It is a replacement for the DFA, which doesn't scale with
+// our resource requirements.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_GOOGLETPU_TPURESOURCESOLVER_H
+#define LLVM_LIB_TARGET_GOOGLETPU_TPURESOURCESOLVER_H
+
+#include "MCTargetDesc/TPUMCImmExpr.h"
+#include "MCTargetDesc/TPUMCTargetDesc.h"
+#include "TPUSubtarget.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include <vector>
+
+namespace llvm {
+
+class Solver {
+ // A set of all current possible unique valid combinations of resources at
+ // this point.
+ SetVector<unsigned> ValidCombinations;
+ // For each valid combination, keep track of one of the possible solutions of
+ // the resources used by each instruction.
+ MapVector<unsigned, SmallVector<uint32_t, 16>> GlobalSlotTracking;
+ bool TrackResource = false;
+
+public:
+ Solver();
+ /// Enable resource tracking.
+ void setTrackResource(bool Track);
+ /// Add a slot given a bitfield of the allowed slots.
+ void addSlot(InstrStage::FuncUnits SlotsAllowed, bool NewInst = false);
+ /// Return true if there are any valid states after adding slots.
+ bool isValid() const;
+ unsigned getFirstCombination() const;
+ /// Clear the state of the solver.
+ void clear();
+ /// Return a bitfield of the slots used by the Idx-th instruction added.
+ unsigned getSlotsUsed(unsigned Idx) const;
+#ifndef NDEBUG
+ // Returns the number of instructions that have been added to the solver.
+ // This is only used as a sanity check to keep things coherent.
+ int getCurrentMaxInstrIdx() const;
+#endif
+};
+
+using VsTrackingMapT =
+ SmallMapVector<const MachineInstr *,
+ SmallVector<std::tuple<Register, unsigned, bool, unsigned>>,
+ 8>;
+
+class ResourceSolver {
+public:
+ // Represent a 16-bit or 20-bit immediate taking one slot. It may be a known
+ // value or a value resolved at link time.
+ struct ImmediateSlot {
+ ImmediateSlot() = default;
+ ImmediateSlot(unsigned ImmValue) : ImmValue(ImmValue) {}
+ // If either of the two immediates is unknown, always consider them
+ // different.
+ bool operator==(const ImmediateSlot &Op) const {
+ return !Op.IsUnknown && !this->IsUnknown && Op.ImmValue == this->ImmValue;
+ }
+ bool operator!=(const ImmediateSlot &Op) const {
+ return Op.IsUnknown || this->IsUnknown || Op.ImmValue != this->ImmValue;
+ }
+ bool IsUnknown = false;
+ unsigned ImmValue = 0;
+ };
+
+ ResourceSolver(const TPUSubtarget &ST);
+ /// Add an MI instruction to the bundle.
+ void addMI(const MachineInstr &MI);
+ /// Query to know if we can add a given instruction.
+ bool canAddMI(const MachineInstr &MI) const;
+ /// Clear the solver state.
+ void clear();
+ /// Returns true if no instructions have been added to the solver.
+ bool empty() { return GlobalSolvedInstrs.empty(); }
+ /// Query the resource used by a given instruction.
+ unsigned getSlotsUsed(unsigned Idx) const;
+ // Returns a vector of (operand index, VS encoding) pairs used by a given instruction.
+ SmallVector<std::pair<unsigned, VSEncoding>>
+ getVsUsed(const MachineInstr *MI);
+ // Attempt to add MI's i32imm or f32imm operands. For canAddImm, Diff is
+ // populated with the changes that would be made to the immediate slots.
+ void addImm(const MachineInstr &MI);
+ bool canAddImm(const MachineInstr &MI,
+ SmallVector<std::pair<unsigned, ImmediateSlot>> &Diff);
+ SmallVector<ImmediateEncoding> &
+ getGlobalInstrsToImmEncoding(const MachineInstr *MI) {
+ return GlobalInstrsToImmEncoding[MI];
+ }
+
+protected:
+ const TPUSubtarget *ST;
+ // Add an instruction to a given solver. The solver is allowed to be in an
+ // invalid state afterward. Returns true if the instruction had a scheduling
+ // itinerary and was added to the solver.
+ bool addMIToSolver(const MCInstrDesc *MID, Solver &S) const;
+ // Attempt to add MI's VS operand - a scalar register for a vector
+ // instruction.
+ bool canAddVs(const MachineInstr &MI, bool IsScalarMisc,
+ SmallVector<std::pair<Register, bool>> &Vs,
+ VsTrackingMapT &VsTracking) const;
+ // Helper per operand Vs solver function. May constrain possible VS slots to
+ // slot number (ConstrainVs). ConstrainVs == -1 means don't care.
+ bool canAddVs(const MachineInstr &MI, Register Sreg, unsigned OpIdx,
+ int ConstrainVs, SmallVector<std::pair<Register, bool>> &Vs,
+ VsTrackingMapT &VsTracking) const;
+ // Helper function to find suitable VS resource.
+ bool findVsSlot(const MachineInstr *MI, Register Sreg, unsigned OpIdx,
+ int ConstrainVs, SmallVector<std::pair<Register, bool>> &Vs,
+ VsTrackingMapT &VsTracking) const;
+ // Auxiliary functions for canAddImm above.
+ bool canAddImmInternal(const MachineInstr &MI,
+ SmallVector<std::pair<unsigned, ImmediateSlot>> &Diff,
+ SmallVector<ImmediateEncoding> &IE);
+ bool canAddImmInternal(const MachineInstr &MI, unsigned OpIdx,
+ SmallVector<std::pair<unsigned, ImmediateSlot>> &Diff,
+ ImmediateEncoding &Encoding, uint64_t SlotMask);
+ const InstrItineraryData *InstrItins;
+ Solver GlobalSolver;
+ // Ordered list of the instructions previously added to and solved by the solver.
+ SmallVector<const MachineInstr *> GlobalSolvedInstrs;
+ // The shared VS slots - scalar register indices for vector instructions. The
+ // bool indicates whether the using instruction was a vector instruction,
+ // false for scalar instructions.
+ SmallVector<std::pair<Register, bool>> GlobalVs;
+ // Tracking data structure supporting all possible VS slot combinations and
+ // their assignments.
+ VsTrackingMapT GlobalVsTracking;
+ // The shared immediates.
+ SmallVector<std::optional<ImmediateSlot>> GlobalImms;
+ // Preferred immediate slot allocation order.
+ SmallVector<int, 6> ImmsAllocOrder;
+ // Keeps track of the immediate encodings for each instruction currently tracked.
+ SmallDenseMap<const MachineInstr *, SmallVector<ImmediateEncoding>>
+ GlobalInstrsToImmEncoding;
+ // True if the current target uses the PXC or VF Sy encoding.
+ const bool HasPxcOrVfSyEncoding;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_GOOGLETPU_TPURESOURCESOLVER_H
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSchedule.cpp b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSchedule.cpp
new file mode 100644
index 0000000..1608d48
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSchedule.cpp
@@ -0,0 +1,1591 @@
+//===--------------- TPUSchedule.cpp - TPU scheduling -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains utilities for scheduling on TPU.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TPUSchedule.h"
+#include "MCTargetDesc/TPUMCTargetDesc.h"
+#include "TPUBundleTracker.h"
+#include "TPUFifoAnalysis.h"
+#include "TPUFifoFillAnalysis.h"
+#include "TPUInstrInfo.h"
+#include "TPUMachineFunctionInfo.h"
+#include "TPUSubtarget.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <memory>
+#include <queue>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-scheduler"
+
+static cl::opt<bool> NoRawHazardMutation(
+ "tpu-no-rawhazard-mutation", cl::Hidden, cl::init(false),
+ cl::desc("disable raw hazard mutation. This is useful to test the post "
+ "bundle packing RAW hazard pass."));
+
+static cl::opt<bool> NoPushPopReordering(
+ "tpu-no-push-pop-reordering", cl::Hidden, cl::init(false),
+ cl::desc("Forces original push/pop order in basic blocks. This option does "
+ "not affect the behavior of the software pipeliner."));
+
+namespace {
+// Return the successor BR or HALT or CALL instruction for MI. Uses a cache to
+// avoid accidental quadratic behaviour.
+MachineInstr *getSuccessorBranchOrHaltOrCall(
+ MachineInstr *MI, DenseMap<MachineInstr *, MachineInstr *> &SuccCache) {
+ if (SuccCache.count(MI))
+ return SuccCache[MI];
+ auto E = MI->getParent()->instr_end();
+ auto I = std::next(MI->getIterator());
+ for (; I != E; ++I) {
+ if (I->isTerminator() || I->isCall()) {
+ MachineInstr *Br = &*I;
+ for (auto J = MI->getIterator(); J != I; ++J)
+ SuccCache[&*J] = Br;
+ return Br;
+ }
+ }
+ // No terminator in the basic block.
+ for (auto J = MI->getIterator(); J != E; ++J)
+ SuccCache[&*J] = nullptr;
+ return nullptr;
+}
+
+// If one exists, return an SDep within Preds that is a data dependence on a
+// predicate register.
+SDep *getPredicateDep(SmallVectorImpl<SDep> &Preds) {
+ for (auto &D : Preds) {
+ if (D.getKind() == SDep::Data && TPU::PPRRegClass.contains(D.getReg()))
+ return &D;
+ }
+ return nullptr;
+}
+
+// Updates the edge to latency L in both directions.
+void updateSuccLatency(SUnit *SU, SDep &Succ, int L) {
+ Succ.setLatency(L);
+ for (SDep &D : Succ.getSUnit()->Preds) {
+ if (D.getSUnit() == SU) {
+ D.setLatency(L);
+ }
+ }
+}
+
+void updatePredLatency(SUnit *SU, SDep &Pred, int L) {
+ Pred.setLatency(L);
+ for (SDep &D : Pred.getSUnit()->Succs) {
+ if (D.getSUnit() == SU) {
+ D.setLatency(L);
+ }
+ }
+}
+} // namespace
+
+namespace {
+bool BranchPredicateIsReachingAndOpposite(
+ const MachineInstr &SuccBr,
+ const SmallVectorImpl<MachineInstr *> &PrecedingBranches) {
+ if (!TPUInstrInfo::isBR(&SuccBr) || PrecedingBranches.empty() ||
+ !TPUInstrInfo::isBR(PrecedingBranches.back()))
+ return false;
+ const MachineInstr &PrevBr = *PrecedingBranches.back();
+ TPUPredicate PrevPred(PrevBr);
+ TPUPredicate SuccPred(SuccBr);
+ if (PrevPred.getReg() != SuccPred.getReg())
+ return false;
+ if (PrevPred.getInvert() == SuccPred.getInvert())
+ return false;
+ for (auto I = PrevBr.getIterator(), E = SuccBr.getIterator(); I != E; ++I) {
+ if (I->definesRegister(PrevPred.getReg()))
+ return false;
+ }
+ return true;
+}
+
+// The BranchOrderingMutation modifies the ScheduleDAG to correctly handle
+// terminators. It ensures instructions cannot migrate unchecked across
+// branch instructions. Call instructions are treated like branches as they
+// have similar behavior; they currently don't have delay slots, but this may
+// be added in the future.
+class BranchOrderingMutation : public ScheduleDAGMutation {
+public:
+ void apply(ScheduleDAGInstrs *DAG) override {
+ assert(DAG->begin() != DAG->end() && "Empty scheduling region?");
+ DenseMap<MachineInstr *, MachineInstr *> SuccCache;
+ SmallVector<MachineInstr *, 8> PrecedingBranches;
+ SmallSet<unsigned, 8> DeffedRegistersSinceLastBranch;
+ for (MachineInstr &MI : *DAG) {
+ assert(MI.getParent() == DAG->begin()->getParent());
+ SUnit *SU = DAG->getSUnit(&MI);
+
+ // If MI is a branch instruction, treat it specially.
+ if (MI.isTerminator() || MI.isCall()) {
+ auto SU = DAG->getSUnit(&MI);
+ if (!PrecedingBranches.empty()) {
+ SDep Dep(DAG->getSUnit(PrecedingBranches.back()), SDep::Barrier);
+ // Make the default edge between terminators at least 1.
+ Dep.setLatency(std::max(1U, Dep.getLatency()));
+ if (TPUInstrInfo::isBR(&MI) &&
+ !BranchPredicateIsReachingAndOpposite(MI, PrecedingBranches)) {
+ assert(!MI.isCall());
+ // Branches with non-opposing predicates cannot be scheduled
+ // back-to-back because the second branch will be incorrectly
+ // executed.
+ const TPUSubtarget &ST = DAG->MF.getSubtarget<TPUSubtarget>();
+ Dep.setLatency(ST.getNumDelaySlots() + 1);
+ }
+ DAG->addEdge(DAG->getSUnit(&MI), Dep);
+ }
+ const TPUSubtarget &ST = DAG->MF.getSubtarget<TPUSubtarget>();
+ if (ST.isTPUABIEnabled() && MI.isCall()) {
+ for (auto &D : SU->Preds) {
+ if (D.getKind() != SDep::Data)
+ continue;
+ // Must not be 0 before calling adjustSchedDependency. We're
+ // pushing up the producers of call parameters because neither the
+ // clearance code nor any other mutation accounts for the call graph.
+ D.setLatency(1);
+ ST.adjustSchedDependency(D.getSUnit(), 0, SU, -1, D);
+ updatePredLatency(SU, D, ST.getNumDelaySlots() + D.getLatency());
+ }
+ for (auto &D : SU->Succs) {
+ // Making sure no successor is scheduled in the call slots.
+ updateSuccLatency(SU, D, std::max(1U, D.getLatency()));
+ }
+ }
+ if (TPUInstrInfo::isBR(&MI) || (ST.isTPUABIEnabled() && MI.isCall())) {
+ if (SDep *P = getPredicateDep(SU->Preds)) {
+ // A BR or a CALL must be scheduled at least delay slots + 1 cycles
+ // after its predicate operand (so the BRrel is scheduled at least
+ // one cycle after the predicate operand).
+ const TPUSubtarget &ST = DAG->MF.getSubtarget<TPUSubtarget>();
+ updatePredLatency(SU, *P, ST.getNumDelaySlots() + 1);
+ }
+ }
+ PrecedingBranches.push_back(&MI);
+ DeffedRegistersSinceLastBranch.clear();
+ continue;
+ }
+
+ // An instruction cannot be moved after its successor BR.
+ if (auto *BI = getSuccessorBranchOrHaltOrCall(&MI, SuccCache)) {
+ auto *BrSU = DAG->getSUnit(BI);
+ bool NeedEdge = true;
+ // If any of the successors is before the branch, we can skip adding
+ // the edge.
+ for (auto &Succ : SU->Succs) {
+ // We're not omitting edges if the successor is a bundle limiter on
+ // SparseCore, because we may remove the edge later. There are
+ // currently no compile time concerns on SparseCore in this regard.
+ // TODO(hgreving): It might be better to move this mutation past the
+ // BundleCycleMutation. However, when doing this I ran into a lot of
+ // trouble, including some real test failures. We should look at why
+ // this happens.
+ if (Succ.getSUnit()->getInstr() &&
+ TPUInstrInfo::isScBundleLimiter(*Succ.getSUnit()->getInstr()))
+ continue;
+ if (Succ.getSUnit()->NodeNum < BrSU->NodeNum) {
+ NeedEdge = false;
+ break;
+ }
+ }
+ if (NeedEdge) {
+ assert(DAG->canAddEdge(DAG->getSUnit(BI), SU));
+ SDep D(SU, SDep::Barrier);
+ DAG->addEdge(DAG->getSUnit(BI), D);
+ }
+ }
+
+ // An instruction cannot be moved before a predecessor BR unless it is
+ // speculatable.
+ auto BI = PrecedingBranches.rbegin();
+
+ // If an instruction has the inverse predicate of its preceding branch, it
+ // can be scheduled before that branch.
+ if (BI != PrecedingBranches.rend() &&
+ TPUPredicate(*BI).toggleInvert() == TPUPredicate(&MI) &&
+ DeffedRegistersSinceLastBranch.count(TPUPredicate(*BI).getReg()) == 0)
+ ++BI;
+
+ for (auto &MO : MI.defs())
+ DeffedRegistersSinceLastBranch.insert(MO.getReg());
+ if (BI == PrecedingBranches.rend())
+ continue;
+
+ assert(DAG->canAddEdge(SU, DAG->getSUnit(*BI)));
+ SDep D(DAG->getSUnit(*BI), SDep::Barrier);
+ const TPUSubtarget &ST = DAG->MF.getSubtarget<TPUSubtarget>();
+ if ((*BI)->isCall() && ST.isTPUABIEnabled())
+ // Making sure no successor is scheduled in the call slots.
+ D.setLatency(1);
+ DAG->addEdge(SU, D);
+ }
+ }
+};
+
+} // namespace
+
+TPUHazardRecognizer::TPUHazardRecognizer(const InstrItineraryData *II,
+ const ScheduleDAG *DAG) {
+ MaxLookAhead = 1;
+}
+
+void TPUHazardRecognizer::Reset() {
+ if (BT)
+ BT->clear();
+ BundlePredicate.reset();
+ BundlePredicateOverdefined = false;
+ assert(BranchInstrs.empty());
+ BranchInSameBundle = false;
+ TrapInSameBundle = false;
+}
+
+void TPUHazardRecognizer::AdvanceCycle() {
+ if (BT)
+ BT->clear();
+ BundlePredicate.reset();
+ BundlePredicateOverdefined = false;
+ assert(!BranchInSameBundle && BranchInstrs.empty() &&
+ "no br expected in forward pass");
+}
+
+void TPUHazardRecognizer::RecedeCycle() {
+ if (BT)
+ BT->clear();
+ BundlePredicate.reset();
+ BundlePredicateOverdefined = false;
+ BranchInSameBundle = false;
+ TrapInSameBundle = false;
+ if (!BranchInstrs.empty()) {
+ MachineInstr *BranchInst = BranchInstrs.front();
+ BranchInstrs.pop();
+ if (BranchInst != nullptr) {
+ BT->addMI(*BranchInst);
+ }
+ }
+}
+
+void TPUHazardRecognizer::EmitInstruction(SUnit *SU) {
+ MachineInstr *MI = SU->getInstr();
+ if (!MI)
+ return;
+ if (CanHandleDelaySlots) {
+ auto &ST = MI->getMF()->getSubtarget<TPUSubtarget>();
+ if (TPUInstrInfo::isBR(MI) || (ST.isTPUABIEnabled() && MI->isCall())) {
+ const unsigned NumDelaySlots = ST.getNumDelaySlots();
+ int BranchInstrSize = (int)BranchInstrs.size();
+ for (int i = 1; i < (int)NumDelaySlots - BranchInstrSize; i++)
+ BranchInstrs.push(nullptr);
+ // MI gets added to the bundle NumDelaySlots after current bundle.
+ BranchInstrs.push(MI);
+ BranchInSameBundle = true;
+ return;
+ }
+ }
+ if (TPUInstrInfo::isScTrap(*MI)) {
+ TrapInSameBundle = true;
+ }
+ BT->addMI(*MI);
+ if (!MI->isPredicable()) {
+ BundlePredicateOverdefined = true;
+ return;
+ }
+ TPUPredicate Pred(MI);
+ if (BundlePredicate.has_value() && *BundlePredicate != Pred)
+ BundlePredicateOverdefined = true;
+ else if (!BundlePredicate.has_value())
+ BundlePredicate = Pred;
+}
+
+ScheduleHazardRecognizer::HazardType
+TPUHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
+ MachineInstr *MI = SU->getInstr();
+ if (!BT.has_value())
+ BT.emplace(MI->getMF()->getSubtarget<TPUSubtarget>());
+
+ if (MI->isCall()) {
+ if (!BranchInstrs.empty())
+ return Hazard;
+ }
+
+ // Two branches cannot schedule in the same bundle.
+ if (TPUInstrInfo::isBR(MI)) {
+ if (BranchInSameBundle)
+ return Hazard;
+
+ // A branch cannot schedule in a non-empty bundle unless it has an
+ // opposite predicate to all the bundle contents.
+ if (BundlePredicateOverdefined ||
+ (BundlePredicate.has_value() &&
+ TPUPredicate(MI).toggleInvert() != *BundlePredicate))
+ return Hazard;
+ return NoHazard;
+ }
+ // We keep SC traps alone in a bundle, except with bundle limiters.
+ if (TrapInSameBundle && !TPUInstrInfo::isScBundleLimiter(*MI))
+ return Hazard;
+ if (TPUInstrInfo::isScTrap(*MI)) {
+ if (!BT->empty())
+ return Hazard;
+ }
+ // Note that we specifically allow the case where a bundle has a branch and
+ // new instructions are added. We don't need to predicate them, because they
+ // must occur before the branch in instruction order and we only branch at
+ // the *end* of the bundle.
+ if (!BT->canAddMI(*MI))
+ return Hazard;
+ return NoHazard;
+}
+
+std::unique_ptr<ScheduleDAGMutation> llvm::createTPUBranchOrderingMutation() {
+ return std::make_unique<BranchOrderingMutation>();
+}
+
+class RemoveExitSUMutation : public ScheduleDAGMutation {
+public:
+ void apply(ScheduleDAGInstrs *DAG) override {
+ while (!DAG->ExitSU.Preds.empty()) {
+ DAG->ExitSU.removePred(DAG->ExitSU.Preds.front());
+ }
+ }
+};
+
+std::unique_ptr<ScheduleDAGMutation> llvm::createTPURemoveExitSUMutation() {
+ return std::make_unique<RemoveExitSUMutation>();
+}
+
+// EventDepsMutation modifies the data dependencies of an EVENT instruction
+// to be at least 1 cycle. EVENT is variadic, and the DAG builder doesn't
+// understand this properly.
+class EventDepsMutation : public ScheduleDAGMutation {
+public:
+ void apply(ScheduleDAGInstrs *DAG) override {
+ assert(DAG->begin() != DAG->end() && "Empty scheduling region?");
+
+ for (MachineInstr &MI : *DAG) {
+ if (MI.getOpcode() != TPU::EVENT)
+ continue;
+ SUnit *SU = DAG->getSUnit(&MI);
+ for (SDep &Dep : SU->Preds) {
+ if (Dep.getKind() == SDep::Data && Dep.getLatency() == 0)
+ Dep.setLatency(1);
+ }
+ }
+ }
+};
+
+std::unique_ptr<ScheduleDAGMutation> llvm::createTPUEventDepsMutation() {
+ return std::make_unique<EventDepsMutation>();
+}
+
+// Modifies the minimum latency of barrier edges to 1. This is used for swing
+// scheduling, which relies on the depth ordering of the nodes.
+class BanZeroLatencyMutation : public ScheduleDAGMutation {
+public:
+ void apply(ScheduleDAGInstrs *DAG) override {
+ assert(DAG->begin() != DAG->end() && "Empty scheduling region?");
+ for (SUnit &SU : DAG->SUnits) {
+ for (SDep &Succ : SU.Succs) {
+ if (Succ.getLatency() != 0)
+ continue;
+ SDep P = Succ;
+ P.setSUnit(&SU);
+ Succ.setLatency(1);
+ // Update edge in the other direction.
+ SmallVectorImpl<SDep>::iterator Pred =
+ llvm::find(Succ.getSUnit()->Preds, P);
+ assert(Pred != Succ.getSUnit()->Preds.end() &&
+ "Mismatching preds / succs lists!");
+ Pred->setLatency(1);
+ }
+ }
+ }
+};
+
+std::unique_ptr<ScheduleDAGMutation> llvm::createBanZeroLatencyMutation() {
+ return std::make_unique<BanZeroLatencyMutation>();
+}
+
+// Modifies the barrier edge between a BarnaCore bcVST_concat and bcVSHIFT to
+// have zero latency, as defined in the ISA.
+class BcStoreShiftForwardingMutation : public ScheduleDAGMutation {
+public:
+ void apply(ScheduleDAGInstrs *DAG) override {
+ assert(DAG->begin() != DAG->end() && "Empty scheduling region?");
+
+ auto IsVSTConcat = [](unsigned Opcode) {
+ return Opcode == TPU::bcVST_concat || Opcode == TPU::bcVST_concat_aliaddr;
+ };
+ auto IsShift = [](unsigned Opcode) {
+ return Opcode == TPU::bcVSHIFT || Opcode == TPU::bcVSHIFT_aliaddr;
+ };
+
+ for (MachineInstr &MI : *DAG) {
+ if (!IsShift(MI.getOpcode()))
+ continue;
+ SUnit *SU = DAG->getSUnit(&MI);
+ for (SDep &Dep : SU->Preds) {
+ if (IsVSTConcat(Dep.getSUnit()->getInstr()->getOpcode()))
+ Dep.setLatency(0);
+ }
+ }
+ }
+};
+
+std::unique_ptr<ScheduleDAGMutation> llvm::createBcStoreShiftForwardingMutation() {
+ return std::make_unique<BcStoreShiftForwardingMutation>();
+}
+
+// Model scoreboard dependencies between instructions using the same unit by
+// adding an edge with the correct latency. This allows the machine scheduler to
+// consider it when calculating the critical path.
+class UnitCadenceMutation : public ScheduleDAGMutation {
+public:
+ // Latency between matpush and matmul described here:
+ // https://g3doc.corp.google.com/platforms/deepsea/logic/pfc/g3doc/isa/tensorcore.md#latency-information-31
+ static constexpr int MatPushToMatMulLatency = 7;
+
+ void apply(ScheduleDAGInstrs *DAG) override {
+ assert(DAG->begin() != DAG->end() && "Empty scheduling region?");
+ const TPUSubtarget &TI = DAG->MF.getSubtarget<TPUSubtarget>();
+ for (MachineInstr &MI : *DAG) {
+ SUnit *SU = DAG->getSUnit(&MI);
+ if (!SU)
+ continue;
+ // We're setting any pseudo fifo copy's latency to zero.
+ if (TPUInstrInfo::isFifoPseudoCopy(*SU->getInstr())) {
+ for (auto Succ : SU->Succs) {
+ if (Succ.getKind() != SDep::Data)
+ continue;
+ Succ.setLatency(0);
+ for (auto &Pred : Succ.getSUnit()->Preds) {
+ if (Pred.getSUnit() != SU || Succ.getReg() != Pred.getReg())
+ continue;
+ Pred.setLatency(0);
+ }
+ }
+ }
+ if (TI.getInstrInfo()->isDWGInst(MI)) {
+ // Special case for DWG in MXU. We need to add an edge between the
+ // last push and the first matmul.
+ // FIXME: We add edges to all matmuls even though we only need one to
+ // the first matmul. We need an extra analysis to detect the first
+ // matmul to improve compile time.
+ for (auto Succ : SU->Succs) {
+ if (Succ.getKind() == SDep::Data) {
+ for (auto Pred : SU->Preds) {
+ if (Pred.getKind() == SDep::Data) {
+ SDep Dep(Pred.getSUnit(), SDep::Data, TPU::NoRegister);
+ Dep.setLatency(MatPushToMatMulLatency);
+ Succ.getSUnit()->addPred(Dep);
+ break;
+ }
+ }
+ }
+ }
+ }
+ // Special case for Transpose as it behaves differently from other
+ // FIFOs. The first stage of the transpose doesn't behave as a FIFO;
+ // it just adds registers to a staging area without making them
+ // available to the FIFO. Therefore a Transpose instruction takes a TRF
+ // source even though it doesn't pop from the FIFO. We need to make
+ // sure this TRF source doesn't add extra latency with instructions
+ // pushing to the TRF.
+ if (TI.getInstrInfo()->isTranspose(MI.getDesc())) {
+ unsigned TRFSrcOp = TI.getInstrInfo()->isPacked(MI.getDesc()) ? 5 : 4;
+ for (auto &Dep : SU->Preds) {
+ if ((Dep.getKind() == SDep::Data || Dep.getKind() == SDep::Output) &&
+ Dep.getReg() == MI.getOperand(TRFSrcOp).getReg())
+ Dep.setLatency(1);
+ }
+ }
+ // Generic case: add an edge when a predecessor uses the same unit, as
+ // we know we will have to wait for the unit to be ready. This allows
+ // us to calculate a more accurate depth/height before scheduling. Keep
+ // track only of the closest instructions found so that we don't add
+ // transitive edges.
+ //
+ // This turns out to be useful even though we've added resource checks to
+ // the hazard recognizer. In particular the slack scheduler software
+ // pipeliner seems to be better off with the data edges as opposed to
+ // without them.
+ SmallDenseMap<unsigned, SDep> CadenceDeps;
+ auto *SchedModel = DAG->getSchedModel();
+ if (SU->hasReservedResource) {
+ const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+ for (const MCWriteProcResEntry &PE :
+ make_range(SchedModel->getWriteProcResBegin(SC),
+ SchedModel->getWriteProcResEnd(SC))) {
+ unsigned ResIdx = PE.ProcResourceIdx;
+ // Check if any predecessor use the same resource.
+ for (auto Pred : SU->Preds) {
+ if (Pred.getKind() != SDep::Order ||
+ !Pred.getSUnit()->hasReservedResource)
+ continue;
+ const MCSchedClassDesc *PredSC =
+ DAG->getSchedClass(Pred.getSUnit());
+ for (const MCWriteProcResEntry &PredPE :
+ make_range(SchedModel->getWriteProcResBegin(PredSC),
+ SchedModel->getWriteProcResEnd(PredSC))) {
+ if (ResIdx == PredPE.ProcResourceIdx) {
+ auto It = CadenceDeps.find(ResIdx);
+ if (It == CadenceDeps.end()) {
+ // Special case on SparseCore: If the cadence is due to
+ // CBREG_PATH, we only want the upd followed by cb cases.
+ bool ExcludeCb =
+ TPUInstrInfo::isCb(*Pred.getSUnit()->getInstr()) &&
+ TPUInstrInfo::isCb(*SU->getInstr());
+ ExcludeCb |=
+ TPUInstrInfo::isCb(*Pred.getSUnit()->getInstr()) &&
+ TPUInstrInfo::isCbUpd(*SU->getInstr());
+ if (!ExcludeCb) {
+ SDep Dep(Pred.getSUnit(), SDep::Data, TPU::NoRegister);
+ Dep.setLatency(PredPE.Cycles);
+ TI.UpdateCrossUnitLatency(Pred.getSUnit(), SU, Dep);
+ CadenceDeps[ResIdx] = Dep;
+ }
+ } else if (It->second.getSUnit()->NodeNum <
+ Pred.getSUnit()->NodeNum) {
+ It->second.setSUnit(Pred.getSUnit());
+ }
+ break;
+ }
+ }
+ }
+ }
+ for (auto It : CadenceDeps) {
+ SU->addPred(It.second);
+ }
+ }
+ }
+ }
+};
+
+// Model read after write hazard as a DAG mutation since the edge already
+// exists. This allows the latency to be considered in the critical path
+// calculation.
+class VMemReadAfterWriteMutation : public ScheduleDAGMutation {
+public:
+ void apply(ScheduleDAGInstrs *DAG) override {
+ if (NoRawHazardMutation)
+ return;
+ assert(DAG->begin() != DAG->end() && "Empty scheduling region?");
+ MachineFunction &MF = DAG->MF;
+ const TPUSubtarget &TI = MF.getSubtarget<TPUSubtarget>();
+ for (MachineInstr &MI : *DAG) {
+ SUnit *SU = DAG->getSUnit(&MI);
+ if (!SU)
+ continue;
+ if (!TPUInstrInfo::isVMemLoadInstr(SU->getInstr()) &&
+ !TPUInstrInfo::isIndexedLoadStore(SU->getInstr()->getDesc()))
+ continue;
+ SmallVector<SDep, 16> HazardDeps;
+ for (auto Pred : SU->Preds) {
+ if (TPUInstrInfo::isScBundleLimiter(*Pred.getSUnit()->getInstr()))
+ continue;
+ // For targets supporting stalls, we only insert the raw hazard latency
+ // if we are sure the memory accesses overlap, as we don't want to
+ // pessimistically insert delays.
+ if ((TI.hasFatalRawHazard() && Pred.isNormalMemory()) ||
+ // For IAR loads/stores, the hardware cannot figure out whether there
+ // is a RAW hazard, so it always stalls.
+ TPUInstrInfo::isIndexedLoadStore(SU->getInstr()->getDesc()) ||
+ TPUInstrInfo::isIndexedLoadStore(
+ Pred.getSUnit()->getInstr()->getDesc()) ||
+ Pred.isMustAlias()) {
+ // Add latency for a load following any potential instructions
+ // writing to VMem.
+ SDep Dep(Pred.getSUnit(), SDep::Data, TPU::NoRegister);
+ Dep.setLatency(TI.getVMemHazardLatency());
+ HazardDeps.push_back(Dep);
+ }
+ }
+ // For simplicity, add a new dependency for every existing edge without
+ // looking at which instruction was the last.
+ // FIXME: There may be a compile-time benefit in finding out which one
+ // is needed.
+ for (auto Dep : HazardDeps) {
+ SU->addPred(Dep);
+ }
+ }
+ }
+};
+
+// Adds the pop latency to the edges from and to the output registers of a
+// composed fifo node. Alternatively, we could add and maintain
+// separate composed fifo schedules, but we chose to keep the latencies
+// generic and instead run this mutation that creates a combined push and
+// pop latency.
+class ComposedFifoLatencyMutation : public ScheduleDAGMutation {
+public:
+ void apply(ScheduleDAGInstrs *DAG) override {
+ assert(DAG->begin() != DAG->end() && "Empty scheduling region?");
+ const TPUSubtarget &ST = DAG->MF.getSubtarget<TPUSubtarget>();
+ for (SUnit &SU : DAG->SUnits) {
+ if (!TPUInstrInfo::isComposedFifo(*SU.getInstr()))
+ continue;
+ unsigned Lpop = ST.getUncomposedPopLatency(SU.getInstr());
+ for (auto &Succ : SU.Succs) {
+ if (Succ.getKind() != SDep::Data)
+ continue;
+ Succ.setLatency(Succ.getLatency() + Lpop);
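+ // Keep the consumer's mirrored predecessor edge in sync with the new
+ // latency.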
+ for (auto &Pred : Succ.getSUnit()->Preds) {
+ if (Pred.getKind() != SDep::Data)
+ continue;
+ if (Pred.getSUnit() != &SU || Succ.getReg() != Pred.getReg())
+ continue;
+ Pred.setLatency(Pred.getLatency() + Lpop);
+ }
+ }
+ }
+ }
+};
+
+// Adds extra latency, on top of the producer's latency, for the address
+// calculation delay on SC before a corresponding vector load or store is
+// issued, if a mask or vreg register is used in the address calculation.
+class VRegAddressCalcMutation : public ScheduleDAGMutation {
+public:
+ void apply(ScheduleDAGInstrs *DAG) override {
+ const TPUSubtarget &ST = DAG->MF.getSubtarget<TPUSubtarget>();
+ if (!ST.isSparseCore())
+ return;
+ assert(DAG->begin() != DAG->end() && "Empty scheduling region?");
+ for (SUnit &SU : DAG->SUnits) {
+ int OpNo = ST.getInstrInfo()->getVRegAddressCalcOpNo(SU.getInstr());
+ if (OpNo != -1)
+ addPotentialDelay(ST, SU, OpNo);
+ OpNo = ST.getInstrInfo()->getVMaskAddressCalcOpNo(SU.getInstr());
+ if (OpNo != -1)
+ addPotentialDelay(ST, SU, OpNo);
+ }
+ }
+
+private:
+ void addPotentialDelay(const TPUSubtarget &ST, SUnit &SU, int OpNo) {
+ for (auto &Pred : SU.Preds) {
+ if (Pred.getKind() != SDep::Data)
+ continue;
+ MachineInstr *DefMI = Pred.getSUnit()->getInstr();
+ MachineInstr *UseMI = SU.getInstr();
+ Register R = Pred.getReg();
+ if (R != UseMI->getOperand(OpNo).getReg())
+ continue;
+ unsigned Delay = TPUInstrInfo::getAddressCalcDelay(DefMI, UseMI, R);
+ unsigned AdjustedLatency = Pred.getLatency() + Delay;
+ Pred.setLatency(AdjustedLatency);
+ // Also update the successor edge to match the predecessor edge latency.
+ for (auto &Succ : Pred.getSUnit()->Succs) {
+ if (Succ.getKind() != SDep::Data)
+ continue;
+ if (Succ.getSUnit() == &SU && Succ.getReg() == R)
+ Succ.setLatency(AdjustedLatency);
+ }
+ }
+ }
+};
+
+// Mutation for all output dependency rules on SparseCore described in
+// b/180953536.
+class VResHoldMutation : public ScheduleDAGMutation {
+public:
+ VResHoldMutation(
+ MapVector<MachineInstr *, std::tuple<MachineInstr *, Register, int>>
+ *DDPairs)
+ : ScheduleDAGMutation(), DDPairs(DDPairs) {}
+
+ void apply(ScheduleDAGInstrs *DAG) override {
+ const TPUSubtarget &ST = DAG->MF.getSubtarget<TPUSubtarget>();
+ if (!ST.hasVResHold())
+ return;
+ assert(DAG->begin() != DAG->end() && "Empty scheduling region?");
+ for (SUnit &SU : DAG->SUnits) {
+ const MachineInstr *MI = SU.getInstr();
+ if (TPUInstrInfo::isFifoPop(*MI)) {
+ Register FifoR = getFifoRegister(MI);
+ if (Register::isVirtualRegister(FifoR))
+ continue;
+ if (TPU::ERFPRRegClass.contains(FifoR)) {
+ assert(MI->getNumExplicitDefs() == 1);
+ forAllOutputPreds(SU, ST, &VResHoldMutation::vresHoldERF);
+ } else if (TPU::XRFPR0RegClass.contains(FifoR) ||
+ TPU::XRFPR1RegClass.contains(FifoR)) {
+ assert(MI->getNumExplicitDefs() == 3);
+ forAllOutputPreds(SU, ST, &VResHoldMutation::vresHoldXRF);
+ }
+ } else if (MI->getOpcode() == TPU::VMOVr /* Ideally should just be
+ scVMOVC, but we don't know yet */
+ ||
+ MI->getOpcode() ==
+ TPU::scVMOVC /* Can't actually happen, only in tests */) {
+ forAllOutputPreds(SU, ST, &VResHoldMutation::vresHoldVMOV);
+ }
+ }
+ }
+
+private:
+ // Auxiliary function, calling processHold on each potential vres hold
+ // predecessor. Presumes all registers involved are physical.
+ void forAllOutputPreds(
+ SUnit &SU, const TPUSubtarget &ST,
+ std::function<void(
+ SUnit &, SDep &, Register, const TPUSubtarget &,
+ MapVector<MachineInstr *, std::tuple<MachineInstr *, Register, int>>
+ *)>
+ processHold) {
+ for (auto &Pred : SU.Preds) {
+ if (Pred.getKind() != SDep::Output)
+ continue;
+ Register R = Pred.getReg();
+ assert(!Register::isVirtualRegister(R));
+ if (R == TPU::Void)
+ continue;
+ if (!TPU::VPRRegClass.contains(R))
+ continue;
+ if (TPUPredicate(Pred.getSUnit()->getInstr()).toggleInvert() ==
+ TPUPredicate(SU.getInstr()))
+ continue;
+ processHold(SU, Pred, Pred.getReg(), ST, DDPairs);
+ }
+ }
+
+ // Adds latency L to edge Pred, both directions.
+ static void addHoldLatency(
+ SUnit &SU, SDep &Pred, int L, Register R,
+ MapVector<MachineInstr *, std::tuple<MachineInstr *, Register, int>>
+ *DDPairs) {
+ int NewL = 1;
+ // Hold latency adds to the producer latency. This presumes there is
+ // actually a data edge, i.e. no dead code.
+ for (auto &D : Pred.getSUnit()->Succs) {
+ if (D.getKind() != SDep::Data)
+ continue;
+ NewL = std::max(NewL, (int)D.getLatency());
+ }
+ NewL += L;
+ Pred.setLatency(NewL);
+ for (SDep &S : Pred.getSUnit()->Succs) {
+ if (S.getSUnit() == &SU)
+ S.setLatency(NewL);
+ }
+ if (DDPairs) {
+ assert(DDPairs->count(Pred.getSUnit()->getInstr()) == 0);
+ DDPairs->insert(std::make_pair(Pred.getSUnit()->getInstr(),
+ std::make_tuple(SU.getInstr(), R, NewL)));
+ }
+ }
+
+ // Adds vres hold for XRF0 and XRF1.
+ static void vresHoldXRF(
+ SUnit &SU, SDep &Pred, Register R, const TPUSubtarget &ST,
+ MapVector<MachineInstr *, std::tuple<MachineInstr *, Register, int>>
+ *DDPairs) {
+ const MachineInstr *MI = SU.getInstr();
+ const MachineInstr *PredMI = Pred.getSUnit()->getInstr();
+ const TPUInstrInfo *TII = ST.getInstrInfo();
+ if (MI->getOperand(0).getReg() == R) {
+ if (TII->isVectorInstruction(PredMI->getDesc()) && !PredMI->mayLoad()) {
+ // We're conservatively considering everything that is not a
+ // vpop or a load, a vector ALU instruction.
+ addHoldLatency(SU, Pred, 2, R, DDPairs);
+ }
+ } else if (MI->getOperand(1).getReg() == R) {
+ if (TII->isVectorInstruction(PredMI->getDesc()) && !PredMI->mayLoad()) {
+ // We're conservatively considering everything that is not a
+ // vpop or a load, a vector ALU instruction.
+ addHoldLatency(SU, Pred, 3, R, DDPairs);
+ } else if (TPUInstrInfo::isFifoPop(*PredMI)) {
+ Register PredFifoR = getFifoRegister(PredMI);
+ if (TPU::ERFPRRegClass.contains(PredFifoR) ||
+ TPU::XRFPR0RegClass.contains(PredFifoR) ||
+ TPU::XRFPR1RegClass.contains(PredFifoR)) {
+ if (PredMI->getOperand(0).getReg() == R)
+ addHoldLatency(SU, Pred, 1, R, DDPairs);
+ }
+ } else if (MI->getOpcode() == TPU::VMOVr /* Same comment as above */ ||
+ MI->getOpcode() == TPU::scVMOVC /* Same comment as above */) {
+ addHoldLatency(SU, Pred, 1, R, DDPairs);
+ }
+ }
+ }
+
+ // Adds vres hold for ERF0.
+ static void vresHoldERF(
+ SUnit &SU, SDep &Pred, Register R, const TPUSubtarget &ST,
+ MapVector<MachineInstr *, std::tuple<MachineInstr *, Register, int>>
+ *DDPairs) {
+ const MachineInstr *PredMI = Pred.getSUnit()->getInstr();
+ const TPUInstrInfo *TII = ST.getInstrInfo();
+ if (TII->isVectorInstruction(PredMI->getDesc()) && !PredMI->mayLoad()) {
+ // We're conservatively considering everything that is not a vpop
+ // or a load, a vector ALU instruction.
+ assert(!TPUInstrInfo::isFifoPop(*PredMI));
+ assert(SU.getInstr()->getOperand(0).getReg() == Pred.getReg());
+ addHoldLatency(SU, Pred, 2, R, DDPairs);
+ }
+ }
+
+ // Adds vres hold for VMOV.
+ static void vresHoldVMOV(
+ SUnit &SU, SDep &Pred, Register R, const TPUSubtarget &ST,
+ MapVector<MachineInstr *, std::tuple<MachineInstr *, Register, int>>
+ *DDPairs) {
+ const MachineInstr *PredMI = Pred.getSUnit()->getInstr();
+ const TPUInstrInfo *TII = ST.getInstrInfo();
+ if (TII->isVectorInstruction(PredMI->getDesc()) && !PredMI->mayLoad()) {
+ // We're conservatively considering everything that is not a vpop or
+ // a load, a vector ALU instruction.
+ assert(!TPUInstrInfo::isFifoPop(*PredMI));
+ assert(SU.getInstr()->getOperand(0).getReg() == Pred.getReg());
+ addHoldLatency(SU, Pred, 2, R, DDPairs);
+ }
+ }
+
+ // Returns the fifo register of presumed fifo instruction MI.
+ static Register getFifoRegister(const MachineInstr *MI) {
+ const MachineFunction &MF = *MI->getMF();
+ auto &ST = MF.getSubtarget<TPUSubtarget>();
+ const TPURegisterInfo *TRI = ST.getRegisterInfo();
+ (void)TRI; // Silence the unused-variable warning in NDEBUG builds.
+ auto &MO = *(MI->operands_begin() + MI->getNumExplicitDefs());
+ Register R = MO.getReg();
+ if (Register::isVirtualRegister(R))
+ return TPU::NoRegister;
+ assert(TRI->isFifoRegister(MF, R));
+ return R;
+ }
+
+ // If non-null, the mutation collects a list of all discovered output
+ // dependencies subject to VRes hold.
+ MapVector<MachineInstr *, std::tuple<MachineInstr *, Register, int>> *DDPairs;
+};
+
+class FifoVoidMutation : public ScheduleDAGMutation {
+public:
+ void apply(ScheduleDAGInstrs *DAG) override {
+ assert(DAG->begin() != DAG->end() && "Empty scheduling region?");
+ const TPUSubtarget &ST = DAG->MF.getSubtarget<TPUSubtarget>();
+ if (!ST.isSparseCore())
+ return;
+ auto EdgeOfInterest = [](SDep &D) {
+ if (D.getKind() != SDep::Anti && D.getKind() != SDep::Output &&
+ D.getKind() != SDep::Data)
+ return false;
+ if (D.getReg() != TPU::Void)
+ return false;
+ return true;
+ };
+ for (SUnit &SU : DAG->SUnits) {
+ SmallVector<SDep, 16> EdgesToRemove;
+ for (auto &Pred : SU.Preds) {
+ if (!EdgeOfInterest(Pred))
+ continue;
+ EdgesToRemove.push_back(Pred);
+ }
+ for (auto &D : EdgesToRemove)
+ SU.removePred(D);
+ }
+ }
+};
+
+// Add edges to prevent the scheduler from overflowing Fifos. We scan the DAG
+// for push and pop instructions and add an edge between a push and the Nth
+// following push of the DAG with N = FifoDepth - InputFifoState.
+class FifoOverflowMutation : public ScheduleDAGMutation {
+public:
+ struct FifoChanges {
+ std::vector<std::pair<MachineInstr *, int>> Pushes;
+ std::vector<std::pair<MachineInstr *, int>> Pops;
+ // These values are used only during construction of the Push/Pop lists,
+ // but splitting them into a separate map would probably result in
+ // additional map searches, so we keep them in this map to avoid that.
+ int SequenceNumber = -1;
+ int TotalPushedItems = 0;
+ int TotalPoppedItems = 0;
+ };
+ std::unique_ptr<FifoAnalysis> FA;
+ DenseMap<const FifoInfo *, FifoChanges> FifoInfos;
+ MachineFunction *CurrentMF = nullptr;
+ MachineBasicBlock *MBB = nullptr;
+
+ bool isFifoInstr(ScheduleDAGInstrs *DAG, MachineInstr *MI,
+ const FifoInfo *FI) {
+ auto *MRI = &DAG->MF.getRegInfo();
+ return any_of(MI->operands(), [&](const MachineOperand &MO) {
+ if (!MO.isReg())
+ return false;
+ unsigned Reg = MO.getReg();
+ if (Register::isVirtualRegister(Reg)) {
+ assert(MRI && "Need RegInfo to handle virtual registers!");
+ return MRI->getRegClass(Reg) == FI->getRegisterClass();
+ } else {
+ return FI->getRegisterClass()->contains(MO.getReg());
+ }
+ });
+ }
+
+ // Adds pop->push edges just enough to prevent overflow, assuming serial code.
+ void applyOverflowMutationByCounting(ScheduleDAGInstrs *DAG) {
+ const TPUSubtarget &ST = DAG->MF.getSubtarget<TPUSubtarget>();
+ // This runs only once per scheduling pass per MachineFunction. The pointers
+ // are used as a tag.
+ FifoFillAnalysis FFA(&DAG->MF, DAG->getSchedModel());
+ for (auto &FI : ST.getFifoInfos()) {
+ int FillLevel =
+ FFA.getBlockInputFillLevel(*DAG->begin()->getParent(), FI);
+ LLVM_DEBUG(dbgs() << "Fill entry level for FIFO "
+ << StringRef(ST.getRegisterInfo()->getRegClassName(
+ FI->getRegisterClass()))
+ .lower()
+ << " for block bb."
+ << DAG->begin()->getParent()->getNumber() << " is "
+ << FillLevel << "\n");
+ std::queue<SUnit *> Pops;
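+ // Pop instructions seen so far that could still be reordered after a
+ // later push.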
+ for (SUnit &SU : DAG->SUnits) {
+ MachineInstr *MI = SU.getInstr();
+ if (!MI)
+ continue;
+ if (!isFifoInstr(DAG, MI, FI))
+ continue;
+ if (TPUInstrInfo::isFifoPush(*MI)) {
+ assert(FillLevel <= ST.getFifoDepth(FI));
+ if (FillLevel == ST.getFifoDepth(FI))
+ LLVM_DEBUG(
+ dbgs()
+ << "Warning, potentially unsafe fifo overflow assumption.\n");
+ FillLevel += FI->getPushNumItems(MI->getOpcode());
+ FillLevel = std::min(FillLevel, (int)ST.getFifoDepth(FI));
+ SUnit *Pop = nullptr;
+ // If the sum of the preceding pops could overflow the fifo when
+ // reordering with a push, we need to add an edge to prevent this.
+ while (!Pops.empty() &&
+ FillLevel + Pops.size() > ST.getFifoDepth(FI)) {
+ Pop = Pops.front();
+ // If this isn't true, we can't use only Pops.size() above.
+ assert(FI->getPopNumItems(Pop->getInstr()->getOpcode()) == 1);
+ Pops.pop();
+ }
+ if (Pop) {
+ SDep D(Pop, SDep::Artificial);
+ D.setLatency(0);
+ SU.addPred(D);
+ }
+ }
+ if (TPUInstrInfo::isFifoPop(*MI)) {
+ Pops.push(&SU);
+ if (FillLevel < FI->getPopNumItems(MI->getOpcode()))
+ LLVM_DEBUG(
+ dbgs()
+ << "Warning, potentially unsafe fifo overflow assumption.\n");
+ FillLevel -= FI->getPopNumItems(MI->getOpcode());
+ }
+ }
+ }
+ }
+
+ // Applies a mutation that doesn't allow push/pop re-ordering.
+ void applyOverflowMutationByRestricting(ScheduleDAGInstrs *DAG) {
+ const TPUSubtarget &ST = DAG->MF.getSubtarget<TPUSubtarget>();
+ // Analysis can't run on bundles, so we need to run it now, even though it's
+ // only actually needed in "ByCounting" analysis.
+ FifoFillAnalysis FFA(&DAG->MF, DAG->getSchedModel());
+ (void)FFA;
+ for (auto &FI : ST.getFifoInfos()) {
+ std::vector<SUnit *> Pushes;
+ std::vector<SUnit *> Pops;
+ for (SUnit &SU : DAG->SUnits) {
+ MachineInstr *MI = SU.getInstr();
+ if (!isFifoInstr(DAG, MI, FI))
+ continue;
+ if (TPUInstrInfo::isFifoPush(*MI))
+ Pushes.push_back(&SU);
+ if (TPUInstrInfo::isFifoPop(*MI))
+ Pops.push_back(&SU);
+ }
+ for (auto &PopSU : Pops) {
+ for (auto &PushSU : Pushes) {
+ if (PopSU->NodeNum > PushSU->NodeNum)
+ continue;
+ SDep D(PopSU, SDep::Artificial);
+ D.setLatency(0);
+ PushSU->addPred(D);
+ }
+ }
+ }
+ }
+
+ // Applies a mutation that allows push/pop re-ordering by analysis.
+ // FIXME(hgreving): deprecated analysis, broken for predicated code.
+ void applyOverflowMutationByAnalysis(ScheduleDAGInstrs *DAG,
+ bool AssumeLoopEmptyFifo) {
+ const TPUSubtarget &ST = DAG->MF.getSubtarget<TPUSubtarget>();
+ MachineBasicBlock *CurMBB = DAG->begin()->getParent();
+ if (CurrentMF != &DAG->MF) {
+ if (AssumeLoopEmptyFifo) {
+ SmallSet<MachineBasicBlock *, 8> AllowSetMBBs;
+ AllowSetMBBs.insert(CurMBB);
+ FA = std::make_unique<FifoAnalysis>(
+ DAG->MF, ST.getFifoInfos(), AllowSetMBBs, &DAG->MF.getRegInfo());
+ } else {
+ FA = std::make_unique<FifoAnalysis>(DAG->MF, ST.getFifoInfos(),
+ &DAG->MF.getRegInfo());
+ }
+ CurrentMF = &DAG->MF;
+ MBB = nullptr;
+ }
+ auto &MRI = CurrentMF->getRegInfo();
+ // We model transpose instructions separately from the FifoAnalysis, because
+ // the FifoAnalysis considers only "TransposeENDs" as the value-defining
+ // operations. Instead, we need to track allocations generated by every
+ // possible transpose instruction: on Jellyfish the allocation happens at the
+ // second Transpose (not the END), while on Pufferfish it happens at the
+ // first one.
+ auto GetFifoInfoForTranspose =
+ [&](const MachineInstr &MI) -> const FifoInfo * {
+ assert(TPUInstrInfo::isTranspose(MI.getDesc()) &&
+ "Should be a transpose instruction");
+ Register Reg(MI.getOperand(0).getReg());
+ if (!Reg.isValid())
+ return nullptr;
+ if (Reg.isPhysical())
+ return ST.getFifoInfo(Reg);
+ return ST.getFifoInfo(MRI.getRegClass(Reg));
+ };
+ // Cache the analysis done for a basic block across apply() executions.
+ if (MBB != CurMBB) {
+ MBB = CurMBB;
+ FifoInfos.clear();
+ for (MachineInstr &MI : *MBB) {
+ if (TPUInstrInfo::isTranspose(MI.getDesc())) {
+ const FifoInfo *FI = GetFifoInfoForTranspose(MI);
+ // Simulate the behavior of the FifoAnalysis of skipping
+ // instructions with NOREG.
+ if (FI == nullptr)
+ continue;
+ FifoChanges &F = FifoInfos[GetFifoInfoForTranspose(MI)];
+ ++F.SequenceNumber;
+ // Get how many elements of the fifo this instruction instantiates.
+ int FifoUsage =
+ TPUInstrInfo::getTransposeFifoUsage(MI, F.SequenceNumber);
+ F.TotalPushedItems += FifoUsage;
+ if (FifoUsage > 0)
+ F.Pushes.push_back(std::make_pair(&MI, F.TotalPushedItems));
+ if (TPUInstrInfo::isTransposeEnd(MI.getDesc()))
+ F.SequenceNumber = -1;
+ } else if (auto *Def = FA->getDef(MI)) {
+ FifoChanges &F = FifoInfos[Def->getFifoInfo()];
+ F.TotalPushedItems += Def->getNumPushedItems();
+ F.Pushes.push_back(std::make_pair(&MI, F.TotalPushedItems));
+ } else if (auto *Use = FA->getUse(MI)) {
+ FifoChanges &F = FifoInfos[Use->getFifoInfo()];
+ // Model pops that pop more than one element as X individual pops
+ // where X is the amount they pop.
+ for (int I = 0, E = Use->getNumPoppedItems(); I != E; ++I)
+ F.Pops.push_back(std::make_pair(&MI, F.TotalPoppedItems + I));
+ F.TotalPoppedItems += Use->getNumPoppedItems();
+ }
+ }
+ }
+ for (auto It : FifoInfos) {
+ const int InputFifoState =
+ AssumeLoopEmptyFifo ? 0 : FA->getFifoInputState(MBB, It.first);
+ // Skip to the first Push/Pop element of the current DAG.
+ auto FirstInDagPushIt =
+ std::find_if(It.second.Pushes.begin(), It.second.Pushes.end(),
+ [DAG](std::pair<MachineInstr *, int> &P) {
+ return DAG->getSUnit(P.first) != nullptr;
+ });
+ auto FirstInDagPopIt =
+ std::find_if(It.second.Pops.begin(), It.second.Pops.end(),
+ [DAG](std::pair<MachineInstr *, int> &P) {
+ return DAG->getSUnit(P.first) != nullptr;
+ });
+ // Number of push/pop instructions occurring before the current DAG in this
+ // basic block. This is not an allocation amount, just the number of
+ // push/pop instructions.
+ const int PushesBeforeDAG = (FirstInDagPushIt - It.second.Pushes.begin());
+ const int PopsBeforeDAG = (FirstInDagPopIt - It.second.Pops.begin());
+ // If there is nothing to do, continue.
+ if (PushesBeforeDAG >= It.second.Pushes.size())
+ continue;
+ int LocalPushAmountBeforeDAG = 0;
+ if (PushesBeforeDAG > 0)
+ LocalPushAmountBeforeDAG = It.second.Pushes[PushesBeforeDAG - 1].second;
+ int LocalPopAmountBeforeDAG = 0;
+ if (PopsBeforeDAG > 0)
+ LocalPopAmountBeforeDAG = It.second.Pops[PopsBeforeDAG - 1].second;
+ // Number of elements pushed by the Pushes we have visited so far, plus all
+ // the pushes that happened in previous DAGs of this block or in previous
+ // blocks and haven't been popped yet.
+ int CurrentPushAmount = InputFifoState + LocalPushAmountBeforeDAG;
+ // Number of elements popped by the Pops we have visited so far, plus all
+ // the pops that happened in previous DAGs of this block.
+ int CurrentPopAmount = LocalPopAmountBeforeDAG;
+ int CurrentPopIndex = PopsBeforeDAG;
+ // Depth of the fifo currently under consideration.
+ const int FifoDepth = ST.getFifoDepth(It.first);
+ for (int I = PushesBeforeDAG, E = It.second.Pushes.size(); I != E; ++I) {
+ auto &PushInfo = It.second.Pushes[I];
+ SUnit *PushSU = DAG->getSUnit(PushInfo.first);
+ // If this Push is not part of the current DAG there's no reason
+ // to continue as we terminated the current DAG.
+ if (PushSU == nullptr)
+ break;
+ // If the push is liveout skip adding an edge as it means the push and
+ // pop are from different loop iterations. We can terminate the
+ // execution for this Fifo.
+ if (FA->isLiveOut(PushSU->getInstr()))
+ break;
+ int PushAmount = PushInfo.second;
+ CurrentPushAmount = InputFifoState + PushAmount;
+ // If we are not going beyond the fifo depth, continue to the next push.
+ if ((CurrentPushAmount - CurrentPopAmount) <= FifoDepth)
+ continue;
+ // Compute the position of the pop we need to be connected with to avoid
+ // risking an overflow of the Fifo.
+ // The potential overflow is everything we pushed to the Fifo, minus
+ // everything we popped from it (i.e. the current Fifo occupancy), minus the
+ // FifoDepth. That is how far this instruction could overflow the Fifo, so
+ // at least that many additional pops must become predecessors of this push
+ // to guarantee no fifo exhaustion. Adding that amount to the current pop
+ // index, minus one, gives the index of the pop to connect to.
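+ // Worked example (hypothetical numbers): FifoDepth = 8, InputFifoState = 0,
+ // CurrentPopAmount = 2, CurrentPopIndex = 2. If this push raises
+ // CurrentPushAmount to 12, the potential overflow is (12 - 2) - 8 = 2, so
+ // PopPos = 2 + 2 - 1 = 3 and the push gets an artificial edge from Pops[3].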
+ int PopPos = CurrentPopIndex + (CurrentPushAmount - CurrentPopAmount) -
+ FifoDepth - 1;
+ if (PopPos >= static_cast<int>(It.second.Pops.size())) {
+ LLVM_DEBUG(dbgs()
+ << "Warning: we would like to connect a push to a pop "
+ "beyond this basic block. Something has gone wrong.\n");
+ continue;
+ }
+ // Pop position is less than the current pop index if what is currently
+ // in the fifo + what this instruction would allocate is less than the
+ // fifo depth.
+ if (PopPos < CurrentPopIndex)
+ continue;
+ SUnit *PopSU = DAG->getSUnit(It.second.Pops[PopPos].first);
+ if (PopSU == nullptr)
+ continue;
+ SDep D(PopSU, SDep::Artificial);
+ if (ST.hasEarlyVxposeAllocation() &&
+ TPUInstrInfo::isTranspose(PushSU->getInstr()->getDesc())) {
+ D.setLatency(1);
+ } else {
+ D.setLatency(0);
+ }
+ extern cl::opt<bool> UseOriginalOrderScheduler;
+ assert(UseOriginalOrderScheduler || PopSU->NodeNum < PushSU->NodeNum);
+ PushSU->addPred(D);
+ }
+ }
+ }
+
+ void apply(ScheduleDAGInstrs *DAG) override {
+ assert(DAG->begin() != DAG->end() && "Empty scheduling region?");
+ const TPUSubtarget &ST = DAG->MF.getSubtarget<TPUSubtarget>();
+ // It's ok if the using pass does not set single-basic-block information;
+ // currently only the bundle packer does.
+ bool IsPipelinedBlock =
+ DAG->MF.getInfo<TPUMachineFunctionInfo>()->isBasicBlockPipelined(
+ DAG->begin()->getParent());
+ if (!ST.isSparseCore())
+ return applyOverflowMutationByAnalysis(DAG, false);
+ // On SparseCore, we either fully restrict push/pop reordering, or we
+ // analyze its flow across a DAG and restrict pop/push just enough to
+ // prevent overflow, assuming serial code.
+ if (IsPipelinedBlock || NoPushPopReordering)
+ return applyOverflowMutationByRestricting(DAG);
+ return applyOverflowMutationByCounting(DAG);
+ }
+};
+
+class IndirectVregCbStreamMutation : public ScheduleDAGMutation {
+public:
+ void apply(ScheduleDAGInstrs *DAG) override {
+ assert(DAG->begin() != DAG->end() && "Empty scheduling region?");
+ const TPUSubtarget &ST = DAG->MF.getSubtarget<TPUSubtarget>();
+ if (!ST.hasIndirectVregCbStreamCorruption())
+ return;
+ if (DAG->MF.getInfo<TPUMachineFunctionInfo>()->isBasicBlockPipelined(
+ DAG->begin()->getParent()))
+ return;
+
+ for (SUnit &SU : DAG->SUnits) {
+ if (!TPUInstrInfo::isIndirectVregCbStream(*SU.getInstr()))
+ continue;
+
+ auto AddEdge = [](SUnit *Pred, SUnit *SU) {
+ SDep D(Pred, SDep::Artificial);
+ D.setLatency(1);
+ SU->addPred(D);
+ };
+ AddEdge(&DAG->EntrySU, &SU);
+ AddEdge(&SU, &DAG->ExitSU);
+ }
+ }
+};
+
+// Removes any Anti, Output, or Data dependences between instructions in
+// different pipeline stages within a BarnaCore software-pipelined loop.
+class BcDisjointPipelineMutation : public ScheduleDAGMutation {
+public:
+ void apply(ScheduleDAGInstrs *DAG) override {
+ assert(DAG->begin() != DAG->end() && "Empty scheduling region?");
+ if (!DAG->MF.getSubtarget<TPUSubtarget>().isPxcBarnaCore())
+ return;
+
+ const TPUSubtarget *ST = &DAG->MF.getSubtarget<TPUSubtarget>();
+ const TPUInstrInfo *TII = ST->getInstrInfo();
+
+ // Remove any Anti, Output or Data dependencies between instructions that
+ // live in different pipeline stages. Note, we don't need to deal with
+ // rerouting registers here. The only possible registers that interfere
+ // across pipeline stages are superregisters. Because we use a different
+ // subregister per pipeline stage (TPU::ps_0,ps_1 etc) the physreg deps
+ // here are always accurate.
+ for (MachineInstr &MI : *DAG) {
+ SUnit *SU = DAG->getSUnit(&MI);
+ // Make sure BarnaCore pseudo instructions don't affect the schedule.
+ if (MI.getOpcode() == TPU::bcROTATEREG ||
+ MI.getOpcode() == TPU::bcIMPLICIT_SUBREG_COPY ||
+ MI.getOpcode() == TPU::bcLOOP_END) {
+ for (SDep &P : SU->Preds) {
+ P.setLatency(0);
+ // Change the opposite edge as well.
+ for (SDep &S : P.getSUnit()->Succs) {
+ if (S.getSUnit() == SU)
+ S.setLatency(0);
+ }
+ }
+ for (SDep &S : SU->Succs) {
+ S.setLatency(0);
+ for (SDep &P : S.getSUnit()->Preds) {
+ if (P.getSUnit() == SU)
+ P.setLatency(0);
+ }
+ }
+ }
+ if (!TII->isVectorInstruction(MI))
+ continue;
+ int MIStage = TPUPredicate(&MI).getBarnaCorePipelineStage();
+ SmallVector<SDep, 4> CachedDeps = SU->Preds;
+ for (SDep &Dep : CachedDeps) {
+ if (Dep.getKind() != SDep::Anti &&
+ Dep.getKind() != SDep::Output &&
+ Dep.getKind() != SDep::Data)
+ // Don't erase memory dependencies.
+ continue;
+ MachineInstr *DepInstr = Dep.getSUnit()->getInstr();
+ if (!TII->isVectorInstruction(*DepInstr))
+ continue;
+ int DepStage = TPUPredicate(DepInstr).getBarnaCorePipelineStage();
+ if (MIStage != DepStage)
+ SU->removePred(Dep);
+ }
+ }
+ }
+};
+
+// This mutation is basically a big workaround for the fact that we're modeling
+// the cb.upd semantics of load/store instructions as memory dependencies. What
+// we really should do is add a circular buffer output register to the
+// instructions and intrinsics. In this mutation, we relax some of the ordering
+// constraints described below.
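+// In short, as implemented below (illustrative summary, not exhaustive): edges
+// between accesses through different circular buffer registers are removed;
+// two non-updating accesses to the same register also don't need the edge; an
+// updating predecessor keeps its edge (latency >= 1); and a non-updating
+// predecessor followed by an updating successor may be placed in the same
+// cycle (latency 0).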
+class CbRegMutation : public ScheduleDAGMutation {
+public:
+ void apply(ScheduleDAGInstrs *DAG) override {
+ assert(DAG->begin() != DAG->end() && "Empty scheduling region?");
+ MachineRegisterInfo &MRI = DAG->MF.getRegInfo();
+ auto IsCbReg = [&](Register R) {
+ if (Register::isVirtualRegister(R)) {
+ if (MRI.getRegClass(R) == &TPU::CBRRegClass)
+ return true;
+ } else {
+ assert(Register::isPhysicalRegister(R));
+ if (TPU::CBRRegClass.contains(R))
+ return true;
+ }
+ return false;
+ };
+ auto GetCbReg = [&](MachineInstr &MI) {
+ for (MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ Register R = MO.getReg();
+ if (IsCbReg(R))
+ return R;
+ }
+ return (Register)TPU::NoRegister;
+ };
+ SmallVector<std::pair<SUnit *, SDep>, 8> EdgesToRemove;
+ for (MachineInstr &MI : *DAG) {
+ SUnit *SU = DAG->getSUnit(&MI);
+ if (!TPUInstrInfo::isCb(MI) && !TPUInstrInfo::isCbUpd(MI))
+ continue;
+ const TPUSubtarget &ST = DAG->MF.getSubtarget<TPUSubtarget>();
+ if (ST.hasIncorrectCbregWriteBypass()) {
+ for (SDep &D : SU->Succs) {
+ MachineInstr *SuccMI = D.getSUnit()->getInstr();
+ if (!SuccMI)
+ continue;
+ if (D.getKind() != SDep::Anti)
+ continue;
+ if (!IsCbReg(D.getReg()))
+ continue;
+ updateSuccLatency(SU, D, /*L=*/1);
+ }
+ }
+ for (SDep &D : SU->Succs) {
+ MachineInstr *SuccMI = D.getSUnit()->getInstr();
+ if (!SuccMI)
+ continue;
+ if (!TPUInstrInfo::isCb(*SuccMI) && !TPUInstrInfo::isCbUpd(*SuccMI))
+ continue;
+ if (D.getKind() != SDep::Order)
+ continue;
+ if (!D.isNormalMemory())
+ continue;
+ if (TPUInstrInfo::isVectorInstruction(MI) &&
+ TPUInstrInfo::isVectorInstruction(*SuccMI))
+ // Both vector instructions, we don't know the circular buffer address
+ // in this context.
+ continue;
+ if (!TPUInstrInfo::isVectorInstruction(MI) &&
+ !TPUInstrInfo::isVectorInstruction(*SuccMI))
+ // Both scalar instructions, we don't know the circular buffer address
+ // in this context.
+ continue;
+ Register PredCb = GetCbReg(MI);
+ Register SuccCb = GetCbReg(*SuccMI);
+ assert(PredCb != TPU::NoRegister);
+ assert(SuccCb != TPU::NoRegister);
+ if (PredCb != SuccCb) {
+ // The vector/scalar load/store never needs the edge if different
+ // circular buffer register.
+ SDep P = D;
+ P.setSUnit(SU);
+ EdgesToRemove.push_back(std::make_pair(D.getSUnit(), P));
+ continue;
+ }
+ if (TPUInstrInfo::isCb(MI) && TPUInstrInfo::isCb(*SuccMI)) {
+ assert(!TPUInstrInfo::isCbUpd(MI));
+ assert(!TPUInstrInfo::isCbUpd(*SuccMI));
+ // Both vector/scalar load/store not updating the circular buffer
+ // register, edge can be removed.
+ SDep P = D;
+ P.setSUnit(SU);
+ EdgesToRemove.push_back(std::make_pair(D.getSUnit(), P));
+ continue;
+ }
+ if (TPUInstrInfo::isCbUpd(MI)) {
+ // Same circular buffer register with predecessor updating does need
+ // an edge. The latency should already have been set to at least 1.
+ assert(D.getLatency() >= 1);
+ continue;
+ }
+ if (ST.hasIncorrectCbregWriteBypass())
+ continue;
+ assert(!TPUInstrInfo::isCbUpd(MI));
+ assert(TPUInstrInfo::isCbUpd(*SuccMI));
+ // Same circular buffer register with predecessor not updating, and
+ // successor updating, the nodes can be in the same cycle.
+ updateSuccLatency(SU, D, /*L=*/0);
+ }
+ }
+ for (auto &ToRemove : EdgesToRemove)
+ ToRemove.first->removePred(ToRemove.second);
+ }
+};
+
+// Make sure we have a latency of 1 when we have a load-after-store or a
+// store-after-store.
+class OrderMemoryAccessesMutation : public ScheduleDAGMutation {
+public:
+ void apply(ScheduleDAGInstrs *DAG) override {
+ assert(DAG->begin() != DAG->end() && "Empty scheduling region?");
+ for (MachineInstr &MI : *DAG) {
+ SUnit *SU = DAG->getSUnit(&MI);
+ // Calls cannot be scheduled with any other instructions right now since
+ // we inline assembly during linking.
+ const TPUSubtarget &ST = DAG->MF.getSubtarget<TPUSubtarget>();
+ if (!ST.isTPUABIEnabled() && MI.getOpcode() == TPU::CALL) {
+ for (auto &Pred : SU->Preds) {
+ Pred.setLatency(std::max(Pred.getLatency(), unsigned(1)));
+ // Update edge in the other direction.
+ for (SDep &PI : Pred.getSUnit()->Succs) {
+ if (PI.getSUnit() != SU)
+ continue;
+ PI.setLatency(std::max(PI.getLatency(), unsigned(1)));
+ }
+ }
+ for (auto &Succ : SU->Succs) {
+ Succ.setLatency(std::max(Succ.getLatency(), unsigned(1)));
+ // Update edge in the other direction.
+ for (SDep &PI : Succ.getSUnit()->Preds) {
+ if (PI.getSUnit() != SU)
+ continue;
+ PI.setLatency(std::max(PI.getLatency(), unsigned(1)));
+ }
+ }
+ } else if (MI.getOpcode() == TPU::TRAP ||
+ MI.getOpcode() == TPU::scTRAPr ||
+ MI.getOpcode() == TPU::scTRAPi ||
+ MI.getOpcode() == TPU::scPSEUDO_TRAPr ||
+ MI.getOpcode() == TPU::scPSEUDO_TRAPi) {
+ for (auto &Succ : SU->Succs) {
+ MachineInstr *SuccMI = Succ.getSUnit()->getInstr();
+ if (SuccMI &&
+ (SuccMI->getOpcode() == TPU::HALT ||
+ SuccMI->getOpcode() == TPU::TRAP ||
+ SuccMI->getOpcode() == TPU::scTRAPr ||
+ SuccMI->getOpcode() == TPU::scTRAPi ||
+ SuccMI->getOpcode() == TPU::scPSEUDO_TRAPr ||
+ SuccMI->getOpcode() == TPU::scPSEUDO_TRAPi)) {
+ Succ.setLatency(1);
+ // Update edge in the other direction.
+ for (SDep &PI : Succ.getSUnit()->Preds) {
+ if (PI.getSUnit() != SU)
+ continue;
+ PI.setLatency(1);
+ }
+ }
+ }
+ } else if (MI.mayStore() && !TPUInstrInfo::isScBundleLimiter(MI)) {
+ for (auto &Succ : SU->Succs) {
+ if (Succ.isNormalMemory() && Succ.getLatency() == 0 &&
+ Succ.getSUnit()->getInstr()->mayLoadOrStore()) {
+ Succ.setLatency(1);
+ // Update edge in the other direction.
+ for (SDep &PI : Succ.getSUnit()->Preds) {
+ if (PI.getSUnit() != SU || !PI.isNormalMemory() ||
+ TPUInstrInfo::isScBundleLimiter(*PI.getSUnit()->getInstr()))
+ continue;
+ PI.setLatency(1);
+ }
+ }
+ }
+ }
+ }
+ }
+};
+
+class SyncFlagAndStreamOrderMutation : public ScheduleDAGMutation {
+public:
+ void apply(ScheduleDAGInstrs *DAG) override {
+ assert(DAG->begin() != DAG->end() && "Empty scheduling region?");
+ const TPUSubtarget &ST = DAG->MF.getSubtarget<TPUSubtarget>();
+ if (!ST.isSparseCore())
+ return;
+ auto *TII = ST.getInstrInfo();
+ for (MachineInstr &MI : *DAG) {
+ if (!MI.mayStore() || !TII->isVectorInstruction(MI.getDesc()))
+ continue;
+ SUnit *SU = DAG->getSUnit(&MI);
+ SmallVector<std::pair<SUnit *, SDep>, 8> EdgesToRemove;
+ for (auto &Succ : SU->Succs) {
+ auto *SuccMI = Succ.getSUnit()->getInstr();
+ if (SuccMI == nullptr)
+ continue;
+ if (!SuccMI->mayStore())
+ continue;
+ if (TPUInstrInfo::isStream(*SuccMI) || TPUInstrInfo::isDMA(*SuccMI)) {
+ for (const MachineMemOperand *MMO : SuccMI->memoperands()) {
+ if (const Value *V = MMO->getValue()) {
+ // We're currently only attaching the TileSpmem memory operand
+ // during lowering. If it's there, it means it needs to be
+ // ordered, but can be zero latency.
+ assert(V->getType()->getPointerAddressSpace() == TPUAS_TileSpmem);
+ updateSuccLatency(SU, Succ, /*L=*/0);
+ break;
+ } else {
+ // No TileSpmem pointer operand on stream means it can be
+ // reordered. We're removing the edge.
+ // TODO(hgreving): can we do this in MIR AA analysis instead?
+ for (SDep &D : Succ.getSUnit()->Preds) {
+ if (D.getSUnit() == SU)
+ EdgesToRemove.push_back(std::make_pair(Succ.getSUnit(), D));
+ }
+ }
+ }
+ continue;
+ }
+ for (const MachineMemOperand *MMO : SuccMI->memoperands()) {
+ if (const Value *V = MMO->getValue()) {
+ unsigned AS = V->getType()->getPointerAddressSpace();
+ if (AS == TPUAS_Sflag || AS == TPUAS_SflagOther) {
+ updateSuccLatency(SU, Succ, /*L=*/0);
+ }
+ }
+ }
+ }
+ for (auto &ToRemove : EdgesToRemove)
+ ToRemove.first->removePred(ToRemove.second);
+ }
+ }
+};
+
+std::unique_ptr<ScheduleDAGMutation> llvm::createOrderMemoryAccessMutation() {
+ return std::make_unique<OrderMemoryAccessesMutation>();
+}
+
+std::unique_ptr<ScheduleDAGMutation> llvm::createCbRegMutation() {
+ return std::make_unique<CbRegMutation>();
+}
+
+std::unique_ptr<ScheduleDAGMutation>
+llvm::createSyncFlagAndStreamOrderMutation() {
+ return std::make_unique<SyncFlagAndStreamOrderMutation>();
+}
+
+std::unique_ptr<ScheduleDAGMutation> llvm::createBcDisjointPipelineMutation() {
+ return std::make_unique<BcDisjointPipelineMutation>();
+}
+
+std::unique_ptr<ScheduleDAGMutation> llvm::createComposedFifoLatency() {
+ return std::make_unique<ComposedFifoLatencyMutation>();
+}
+
+std::unique_ptr<ScheduleDAGMutation> llvm::createVMemReadAfterWriteMutation() {
+ return std::make_unique<VMemReadAfterWriteMutation>();
+}
+
+std::unique_ptr<ScheduleDAGMutation> llvm::createTPUUnitCadenceMutation() {
+ return std::make_unique<UnitCadenceMutation>();
+}
+
+std::unique_ptr<ScheduleDAGMutation> llvm::createVRegAddressCalcMutation() {
+ return std::make_unique<VRegAddressCalcMutation>();
+}
+
+std::unique_ptr<ScheduleDAGMutation> llvm::createVResHoldMutation(
+ MapVector<MachineInstr *, std::tuple<MachineInstr *, Register, int>>
+ *DDPairs) {
+ return std::make_unique<VResHoldMutation>(DDPairs);
+}
+
+std::unique_ptr<ScheduleDAGMutation> llvm::createFifoOverflowMutation() {
+ return std::make_unique<FifoOverflowMutation>();
+}
+
+std::unique_ptr<ScheduleDAGMutation>
+llvm::createIndirectVregCbStreamMutation() {
+ return std::make_unique<IndirectVregCbStreamMutation>();
+}
+
+std::unique_ptr<ScheduleDAGMutation> llvm::createFifoVoidMutation() {
+ return std::make_unique<FifoVoidMutation>();
+}
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSchedule.td b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSchedule.td
new file mode 100644
index 0000000..d39e278
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSchedule.td
@@ -0,0 +1,331 @@
+//===-- TPUSchedule.td - TPU Scheduling Definitions --------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// We use Itineraries here to support the DFA packetizer. These functional units
+// correspond to bundle slots. The itinerary classes IIC_* below will be added
+// to instructions using the BundleSlot<> class in TPUInstrFormats.td.
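+// For example, an instruction tagged IIC_V0 can only occupy the V0 slot,
+// whereas IIC_Vany may be placed in either V0 or V1 (see the InstrItinData
+// entries in the per-core itinerary files included at the end of this file);
+// the DFA packetizer uses these itineraries to decide which instructions can
+// be bundled together.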
+def SLOT_S0 : FuncUnit;
+def SLOT_S1 : FuncUnit;
+def SLOT_SM : FuncUnit;
+def SLOT_V0 : FuncUnit;
+def SLOT_V1 : FuncUnit;
+def SLOT_V2 : FuncUnit;
+def SLOT_V3 : FuncUnit;
+def SLOT_SLD : FuncUnit;
+def SLOT_SST : FuncUnit;
+def SLOT_VLD : FuncUnit;
+def SLOT_VST : FuncUnit;
+def SLOT_VEX0 : FuncUnit;
+def SLOT_VEX1 : FuncUnit;
+def SLOT_VRES0 : FuncUnit;
+def SLOT_VRES1 : FuncUnit;
+def SLOT_VAUX : FuncUnit;
+
+// And these itinerary classes correspond to bundle slot requirements. They
+// mainly map 1:1 to slots apart from the Sany and Vany requirements.
+def IIC_S0 : InstrItinClass;
+def IIC_S1 : InstrItinClass;
+def IIC_SM : InstrItinClass;
+def IIC_Sany : InstrItinClass;
+def IIC_SanyMisc : InstrItinClass;
+def IIC_Sboth : InstrItinClass;
+def IIC_V0 : InstrItinClass;
+def IIC_V1 : InstrItinClass;
+def IIC_V2 : InstrItinClass;
+def IIC_V3 : InstrItinClass;
+def IIC_Vany : InstrItinClass;
+def IIC_SLD : InstrItinClass;
+def IIC_SST : InstrItinClass;
+def IIC_VLD : InstrItinClass;
+def IIC_VST : InstrItinClass;
+def IIC_VLDVST : InstrItinClass;
+def IIC_MXU_PUSH : InstrItinClass;
+def IIC_MXU_MUL : InstrItinClass;
+def IIC_VEX0 : InstrItinClass;
+def IIC_VEX1 : InstrItinClass;
+def IIC_VEX : InstrItinClass;
+def IIC_V_VEX : InstrItinClass;
+def IIC_VEXBoth : InstrItinClass;
+def IIC_VRES0 : InstrItinClass;
+def IIC_VRES1 : InstrItinClass;
+def IIC_VRES : InstrItinClass;
+
+// Instruction- and architecture-specific itineraries.
+def IIC_VARI : InstrItinClass;
+def IIC_VCLAMP : InstrItinClass;
+def IIC_EUP_OP : InstrItinClass;
+def IIC_VCVT : InstrItinClass;
+def IIC_VMPCNT : InstrItinClass;
+def IIC_VMPREFIX : InstrItinClass;
+def IIC_VM_OP : InstrItinClass;
+def IIC_VPUSH : InstrItinClass;
+def IIC_VBCAST : InstrItinClass;
+def IIC_VPERM : InstrItinClass;
+def IIC_VSHIFTI : InstrItinClass;
+def IIC_VFMUL : InstrItinClass;
+def IIC_VFADD : InstrItinClass;
+def IIC_VMOVR : InstrItinClass;
+def IIC_TASK : InstrItinClass;
+def IIC_ALUOV : InstrItinClass;
+def IIC_PACK : InstrItinClass;
+def IIC_UNPACK : InstrItinClass;
+
+// This is a workaround for instruction specialization of extended result ops,
+// which require an additional secondary slot. Note that we create all
+// itineraries, but each platform defines only part of them.
+// FIXME: come up with a better solution.
+foreach vres = ["VRES0", "VRES1"] in {
+foreach extra = ["V0", "V1", "V2", "VLD", "VAUX"] in {
+ def IIC_#vres#_#extra: InstrItinClass;
+}
+}
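+// The foreach above expands to IIC_VRES0_V0, IIC_VRES0_V1, IIC_VRES0_V2,
+// IIC_VRES0_VLD and IIC_VRES0_VAUX, plus the corresponding IIC_VRES1_*
+// classes.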
+
+class TPUSchedModel : SchedMachineModel {
+ // Cycles for loads to access the cache [default = -1]
+ let LoadLatency = 1;
+
+ // Max micro-ops that can be buffered for optimized loop dispatch/execution.
+ // [default = -1]
+ let LoopMicroOpBufferSize = 0;
+
+ // Allow scheduler to assign default model to any unrecognized opcodes.
+ // [default = 1]
+ let CompleteModel = 0;
+
+ // Max micro-ops that may be scheduled per cycle. [default = 1]
+ // Note, set this to the maximum number of instructions in a bundle. This is
+ // used as a hard cutoff to bundle size by the scheduler for efficiency
+ // reasons.
+ let IssueWidth = 16;
+
+ // Extra cycles for a mispredicted branch. [default = -1]
+ let MispredictPenalty = 0;
+
+ // Disable the post-RegAlloc scheduler pass for now. [default = 0]
+ // We used to have postRA scheduler to aggressively break anti-dependency,
+ // however this is expensive in compile time as it requires fully recreating
+ // the scheduling DAG twice. We expect that breaking the anti-dependencies
+ // should have low benefits since we aggressively schedule preRA. We will
+ // revisit this decision if it turns out to cause sub-optimal scheduling.
+ let PostRAScheduler = 0;
+
+ // Max micro-ops that can be buffered. [default = -1]
+ let MicroOpBufferSize = 0;
+}
+
+def WriteVLD : SchedWrite;
+def WriteVLDIdx : SchedWrite;
+def WriteVLDCbUpd : SchedWrite;
+def WriteVSTCbUpd : SchedWrite;
+def WriteVLDCb : SchedWrite;
+def WriteVSTCb : SchedWrite;
+def WriteCrossLane : SchedWrite;
+def WriteVecScanCrossLane : SchedWrite;
+def WriteVecCrossLane : SchedWrite;
+def WriteVecMaskCrossLane : SchedWrite;
+def WriteXrf0_0 : SchedWrite;
+def WriteXrf0_1 : SchedWrite;
+def WriteXrf0_2 : SchedWrite;
+def WriteXrf0Bf16_0 : SchedWrite;
+def WriteXrf0Bf16_1 : SchedWrite;
+def WriteXrf0Bf16_2 : SchedWrite;
+def WriteXrf1 : SchedWrite;
+def WriteMatPush0 : SchedWrite;
+def WriteMatPush1 : SchedWrite;
+def WriteMatPush2 : SchedWrite;
+def WriteMatPush3 : SchedWrite;
+def WriteMatMulMxu0 : SchedWrite;
+def WriteMatMulMxu1 : SchedWrite;
+def WriteMatMulMxu2 : SchedWrite;
+def WriteMatMulMxu3 : SchedWrite;
+def WriteMatMulMxuPacked0 : SchedWrite;
+def WriteMatMulMxuPacked1 : SchedWrite;
+def WriteMatMulMxuPacked2 : SchedWrite;
+def WriteMatMulMxuPacked3 : SchedWrite;
+def WriteMatMulMxuInt0 : SchedWrite;
+def WriteMatMulMxuInt1 : SchedWrite;
+def WriteMatMulMxuInt2 : SchedWrite;
+def WriteMatMulMxuInt3 : SchedWrite;
+def WriteMatRes0 : SchedWrite;
+def WriteMatRes1 : SchedWrite;
+def WriteMatRes2 : SchedWrite;
+def WriteMatRes3 : SchedWrite;
+def WriteDmaGeneral : SchedWrite;
+def WriteDmaLocal : SchedWrite;
+def WriteStream : SchedWrite;
+def WriteStreamCb : SchedWrite;
+def WriteStreamCbUpd : SchedWrite;
+def WriteScalarCb : SchedWrite;
+def WriteSetRngSeed : SchedWrite;
+def WriteGetRngState : SchedWrite;
+def WriteRng : SchedWrite;
+def WriteFadd : SchedWrite;
+def WriteFmul : SchedWrite;
+def WriteEup : SchedWrite;
+def WriteEupBf16 : SchedWrite;
+def WriteEupPop : SchedWrite;
+def WriteDrf : SchedWrite;
+foreach Index = 0-3 in {
+def WriteTranspose#Index : SchedWrite;
+def WriteTransposePacked#Index : SchedWrite;
+def WriteTransposeEnd#Index : SchedWrite;
+def WriteTransposeEndPacked#Index : SchedWrite;
+def WritePermute#Index : SchedWrite;
+def WritePermutePacked#Index : SchedWrite;
+def WriteXLane#Index : SchedWrite;
+def WriteSetPermuteAll#Index : SchedWrite;
+}
+def WriteSetPermute : SchedWrite;
+def WriteTrf0Pop0 : SchedWrite;
+def WriteTrf0Pop1 : SchedWrite;
+def WriteTrf0Pop2 : SchedWrite;
+def WriteTrf1Pop0 : SchedWrite;
+def WriteTrf1Pop1 : SchedWrite;
+def WriteTrf1Pop2 : SchedWrite;
+def WriteTrf2Pop0 : SchedWrite;
+def WriteTrf2Pop1 : SchedWrite;
+def WriteTrf2Pop2 : SchedWrite;
+def WriteV2SF : SchedWrite;
+def WriteSFlagV2SF : SchedWrite;
+def WriteV2SFPop : SchedWrite;
+def WriteIar0 : SchedWrite;
+def WriteIar1 : SchedWrite;
+def WriteVrshra : SchedWrite;
+def WriteFPConvert : SchedWrite;
+def WriteFloatCompose : SchedWrite;
+def WritePackingInst : SchedWrite;
+def WriteIPackingInst : SchedWrite;
+def WriteCPackingInst : SchedWrite;
+def WriteRotateSLane : SchedWrite;
+def WriteSFCmp : SchedWrite;
+def WriteSld : SchedWrite;
+def WriteSetTracemark : SchedWrite;
+def WriteTrace : SchedWrite;
+def WriteSfence : SchedWrite;
+
+// TODO(jmolloy): These paths are shared between XLU and MXU ops so must be
+// modelled in itineraries for bundle packing (in particular XLU ops can select
+// multiple paths so we need to encode that in the instruction).
+// GSFN/GSFT paths for MXUs 0-3. These busses are also shared by XLU ops.
+let BufferSize = 0 in {
+def FU_MATPUSH0_PATH : ProcResource<1>;
+def FU_MATPUSH1_PATH : ProcResource<1>;
+def FU_MATPUSH2_PATH : ProcResource<1>;
+def FU_MATPUSH3_PATH : ProcResource<1>;
+
+// GMR paths for MXUs 0-3.
+def FU_MATMUL0_PATH : ProcResource<1>;
+def FU_MATMUL1_PATH : ProcResource<1>;
+def FU_MATMUL2_PATH : ProcResource<1>;
+def FU_MATMUL3_PATH : ProcResource<1>;
+
+// Matres paths.
+def FU_MATRES0_PATH : ProcResource<1>;
+def FU_MATRES1_PATH : ProcResource<1>;
+def FU_MATRES2_PATH : ProcResource<1>;
+def FU_MATRES3_PATH : ProcResource<1>;
+
+// TRF POP path.
+def FU_TRF0_POP0_PATH : ProcResource<1>;
+def FU_TRF0_POP1_PATH : ProcResource<1>;
+def FU_TRF0_POP2_PATH : ProcResource<1>;
+def FU_TRF1_POP0_PATH : ProcResource<1>;
+def FU_TRF1_POP1_PATH : ProcResource<1>;
+def FU_TRF1_POP2_PATH : ProcResource<1>;
+def FU_TRF2_POP0_PATH : ProcResource<1>;
+def FU_TRF2_POP1_PATH : ProcResource<1>;
+def FU_TRF2_POP2_PATH : ProcResource<1>;
+
+// XLU unit.
+def FU_XLU0_PATH : ProcResource<1>;
+def FU_XLU1_PATH : ProcResource<1>;
+def FU_XLU2_PATH : ProcResource<1>;
+def FU_XLU3_PATH : ProcResource<1>;
+
+// Bus unit.
+def FU_BUS0_PATH : ProcResource<1>;
+def FU_BUS1_PATH : ProcResource<1>;
+def FU_BUS2_PATH : ProcResource<1>;
+def FU_BUS3_PATH : ProcResource<1>;
+
+// PRNG state
+def FU_PRNG_STATE : ProcResource<1>;
+
+// EUP path state.
+def FU_EUP_PATH : ProcResource<1>;
+// EUP POP path.
+def FU_EUP_POP : ProcResource<1>;
+// V2S POP path.
+def FU_V2SF_PATH : ProcResource<1>;
+// Xrf0 path.
+def FU_XRF0_0_PATH : ProcResource<1>;
+def FU_XRF0_1_PATH : ProcResource<1>;
+def FU_XRF0_2_PATH : ProcResource<1>;
+
+// IAR write path.
+def FU_IAR0_PATH : ProcResource<1>;
+def FU_IAR1_PATH : ProcResource<1>;
+
+// CBREG path for Sparsecore
+def FU_CBREG_PATH : ProcResource<1>;
+
+// Descriptor path for Sparsecore
+def FU_DESC_PATH : ProcResource<1>;
+
+// Trace Resource
+def FU_TRACE : ProcResource<1>;
+
+// Pseudo sfence resource
+def FU_SFENCE : ProcResource<1>;
+
+} // End BufferSize = 0
+
+// Defines latencies shared across all platforms. Per-platform overrides are
+// defined in the respective sub-platform schedule files.
+multiclass Latency {
+ def : WriteRes<WriteVecCrossLane, []> { let Latency = 2; }
+ def : WriteRes<WriteVecMaskCrossLane, []> { let Latency = 1; }
+ // All of the matpush, matmul and matres instructions hold their respective
+ // paths for 8 cycles.
+ let ResourceCycles = [8] in {
+ // MatPush result is read by DWG so the latency is 0. We ensure that pop and
+ // mul are 7 cycles away by adding a special edge in the scheduler.
+ def : WriteRes<WriteMatPush0, [FU_MATPUSH0_PATH]> { let Latency = 0; }
+ def : WriteRes<WriteMatPush1, [FU_MATPUSH1_PATH]> { let Latency = 0; }
+ def : WriteRes<WriteMatPush2, [FU_MATPUSH2_PATH]> { let Latency = 0; }
+ def : WriteRes<WriteMatPush3, [FU_MATPUSH3_PATH]> { let Latency = 0; }
+ def : WriteRes<WriteMatRes0, [FU_MATRES0_PATH]> { let Latency = 1; }
+ def : WriteRes<WriteMatRes1, [FU_MATRES1_PATH]> { let Latency = 1; }
+ def : WriteRes<WriteMatRes2, [FU_MATRES2_PATH]> { let Latency = 1; }
+ def : WriteRes<WriteMatRes3, [FU_MATRES3_PATH]> { let Latency = 1; }
+ } // End ResourceCycles = [8]
+
+ let ResourceCycles = [5] in {
+ def : WriteRes<WriteIar0, [FU_IAR0_PATH]> { let Latency = 5; }
+ def : WriteRes<WriteIar1, [FU_IAR1_PATH]> { let Latency = 5; }
+ }
+ // To ensure we have only one pop per bundle.
+ def : WriteRes<WriteEupPop, [FU_EUP_POP]> { let Latency = 1; let ResourceCycles = [1]; }
+ // To make sure we only get one pop per bundle.
+ def : WriteRes<WriteV2SFPop, [FU_V2SF_PATH]> { let Latency = 1; let ResourceCycles = [1]; }
+ def : WriteRes<WriteSetPermute, []> { let Latency = 1; }
+ def : WriteRes<WriteSetRngSeed, [FU_PRNG_STATE]> { let Latency = 1; let ResourceCycles = [9]; }
+ def : WriteRes<WriteGetRngState, [FU_PRNG_STATE]> { let Latency = 1; let ResourceCycles = [1]; }
+ def : WriteRes<WriteRng, [FU_PRNG_STATE]> { let Latency = 1; let ResourceCycles = [7]; }
+ def : WriteRes<WriteSFCmp, []> { let Latency = 2; }
+}
+
+include "TPUScheduleBarnaCorePF.td"
+include "TPUScheduleSparseCoreVF.td"
+include "TPUScheduleSparseCoreGL.td"
+include "TPUScheduleSparseCoreGF.td"
+include "TPUScheduleTensorCoreJF.td"
+include "TPUScheduleTensorCorePF.td"
+include "TPUScheduleTensorCoreVF.td"
+
+
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUScheduleTensorCoreJF.td b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUScheduleTensorCoreJF.td
new file mode 100644
index 0000000..c81ad30
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUScheduleTensorCoreJF.td
@@ -0,0 +1,180 @@
+//===-- TPUScheduleTensorCoreJF.td - Target Description for TPU Target -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes instruction itineraries for Jellyfish tensor core.
+//
+//===----------------------------------------------------------------------===//
+
+// Map slots to itinerary classes.
+def TensorCoreJFItineraries :
+ ProcessorItineraries<[SLOT_S0, SLOT_S1, SLOT_V0, SLOT_V1, SLOT_VLD, SLOT_VST,
+ SLOT_SM, SLOT_VEX0, SLOT_VRES0], [], [
+ InstrItinData<IIC_S0, [InstrStage<1, [SLOT_S0]>]>,
+ InstrItinData<IIC_S1, [InstrStage<1, [SLOT_S1]>]>,
+ InstrItinData<IIC_Sany, [InstrStage<1, [SLOT_S0, SLOT_S1]>]>,
+ InstrItinData<IIC_SanyMisc, [InstrStage<1, [SLOT_S0, SLOT_S1]>]>,
+ InstrItinData<IIC_Sboth, [InstrStage<1, [SLOT_S0]>, InstrStage<0, [SLOT_S1]>]>,
+ InstrItinData<IIC_V0, [InstrStage<1, [SLOT_V0]>]>,
+ InstrItinData<IIC_V1, [InstrStage<1, [SLOT_V1]>]>,
+ InstrItinData<IIC_Vany, [InstrStage<1, [SLOT_V0, SLOT_V1]>]>,
+ // On tensor core smem accesses use S1 slot.
+ InstrItinData<IIC_SLD, [InstrStage<1, [SLOT_S1]>]>,
+ InstrItinData<IIC_SST, [InstrStage<1, [SLOT_S1]>]>,
+ InstrItinData<IIC_VLD, [InstrStage<1, [SLOT_VLD]>]>,
+ InstrItinData<IIC_VST, [InstrStage<1, [SLOT_VST]>]>,
+ InstrItinData<IIC_SM, [InstrStage<1, [SLOT_SM]>]>,
+ InstrItinData<IIC_MXU_PUSH,
+ [InstrStage<1, [SLOT_V0, SLOT_V1, SLOT_VST]>,
+ InstrStage<0, [SLOT_VEX0]>]>,
+ InstrItinData<IIC_MXU_MUL,
+ [InstrStage<1, [SLOT_V0, SLOT_V1, SLOT_VST]>,
+ InstrStage<0, [SLOT_VEX0]>]>,
+ // Transpose instructions encode source register in V0/V1 or VST shared
+ // slots.
+ // TODO(b/148140127) Make these definitions more fine grained.
+ // This also applies to other definitions that define multiple slots.
+ InstrItinData<IIC_VEX, [InstrStage<1, [SLOT_VEX0]>,
+ InstrStage<0, [SLOT_V0, SLOT_V1, SLOT_VST]>]>,
+ // On JF vpop needs to steal a slot from either vld or vector alu 0 or 1.
+ InstrItinData<IIC_VRES0_V0, [InstrStage<1, [SLOT_VRES0]>, InstrStage<0, [SLOT_V0]>]>,
+ InstrItinData<IIC_VRES0_V1, [InstrStage<1, [SLOT_VRES0]>, InstrStage<0, [SLOT_V1]>]>,
+ InstrItinData<IIC_VRES0_VLD, [InstrStage<1, [SLOT_VRES0]>, InstrStage<0, [SLOT_VLD]>]>,
+ InstrItinData<IIC_VRES, [InstrStage<1, [SLOT_VRES0]>,
+ InstrStage<0, [SLOT_V0, SLOT_V1, SLOT_VLD]>]>,
+ InstrItinData<IIC_VARI, [InstrStage<1, [SLOT_V1]>]>,
+ InstrItinData<IIC_VCLAMP, [InstrStage<1, [SLOT_V0, SLOT_V1]>]>,
+ InstrItinData<IIC_EUP_OP, [InstrStage<1, [SLOT_V0, SLOT_V1]>]>,
+ InstrItinData<IIC_VCVT, [InstrStage<1, [SLOT_V0, SLOT_V1]>]>,
+ InstrItinData<IIC_VM_OP, [InstrStage<1, [SLOT_SM]>]>,
+ InstrItinData<IIC_VPUSH, [InstrStage<1, [SLOT_VST]>]>,
+ InstrItinData<IIC_VFMUL, [InstrStage<1, [SLOT_V0]>]>,
+ InstrItinData<IIC_VFADD, [InstrStage<1, [SLOT_V1]>]>,
+ InstrItinData<IIC_VMOVR, [InstrStage<1, [SLOT_V0, SLOT_V1]>]>,
+]>;
+
+def TensorCoreJFSchedModel : TPUSchedModel {
+ // Per-cycle resources tables. [default = NoItineraries]
+ let Itineraries = TensorCoreJFItineraries;
+}
+
+// DMA paths. Note that PFC doesn't have these paths (at least not documented
+// on go/pfc-isa).
+//
+// The way we model DMA paths is a little different to the ISA. The ISA defines
+// a FIFO, the "Descriptor path". Both dma.local and dma.general push work onto
+// this fifo (3 cycles worth). dma.general *can only issue* when the FIFO is
+// empty. dma.local can issue even when the fifo isn't empty.
+//
+// Modelling this as an actual FIFO is hard, so we don't bother. We model the
+// dma.local path and the descriptor path separately; dma.local only requires
+// its own path, and dma.general requires the descriptor path for 3 cycles,
+// which is an optimistic base requirement.
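+// For illustration with the values below: dma.general occupies
+// FU_DESCRIPTOR_PATH (an unbuffered resource) for 4 cycles, so roughly two
+// dma.general ops cannot be scheduled closer than that, while dma.local only
+// occupies FU_DMA_LOCAL_PATH and remains free to be scheduled in between.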
+let BufferSize = 0 in {
+def FU_DESCRIPTOR_PATH : ProcResource<1>;
+def FU_DMA_LOCAL_PATH : ProcResource<1>;
+}
+
+let SchedModel = TensorCoreJFSchedModel in {
+ defm : Latency;
+ def : WriteRes<WriteVLD, []> { let Latency = 1; }
+ def : WriteRes<WriteVLDIdx, []> { let Latency = 1; }
+ def : WriteRes<WriteVLDCbUpd, []> { let Latency = 1; }
+ def : WriteRes<WriteVSTCbUpd, []>;
+ def : WriteRes<WriteVLDCb, []> { let Latency = 1; }
+ def : WriteRes<WriteVSTCb, []>;
+ def : WriteRes<WriteXrf0_0, []>;
+ def : WriteRes<WriteXrf0_1, []>;
+ def : WriteRes<WriteXrf0_2, []>;
+ def : WriteRes<WriteXrf0Bf16_0, []>;
+ def : WriteRes<WriteXrf0Bf16_1, []>;
+ def : WriteRes<WriteXrf0Bf16_2, []>;
+ def : WriteRes<WriteXrf1, []>;
+ let ResourceCycles = [8] in {
+ def : WriteRes<WriteMatMulMxu0, [FU_MATMUL0_PATH]> { let Latency = 105; }
+ def : WriteRes<WriteMatMulMxu1, [FU_MATMUL1_PATH]> { let Latency = 105; }
+ }
+ def : WriteRes<WriteMatMulMxu2, []>;
+ def : WriteRes<WriteMatMulMxu3, []>;
+
+ def : WriteRes<WriteSfence, []>;
+
+ def : WriteRes<WriteCrossLane, []> { let Latency = 8; }
+ let ResourceCycles = [8, 8] in {
+ def : WriteRes<WriteTransposeEnd0, [FU_XLU0_PATH, FU_BUS0_PATH]> { let Latency = 1; }
+ def : WriteRes<WriteTranspose0, [FU_XLU0_PATH, FU_BUS0_PATH]> { let Latency = 0; }
+ // TODO(thomasraoux): DFC latency is different. We need a new target for DFC.
+ def : WriteRes<WritePermute0, [FU_XLU0_PATH, FU_BUS0_PATH]> { let Latency = 92; }
+ def : WriteRes<WriteXLane0, [FU_XLU0_PATH, FU_BUS0_PATH]> { let Latency = 92; }
+ def : WriteRes<WriteSetPermuteAll0, [FU_XLU0_PATH, FU_BUS0_PATH]> { let Latency = 1; }
+ }
+ let ResourceCycles = [8] in {
+ def : WriteRes<WriteTrf0Pop0, [FU_TRF0_POP0_PATH]> { let Latency = 1; }
+ def : WriteRes<WriteTrf0Pop1, [FU_TRF0_POP1_PATH]> { let Latency = 1; }
+ def : WriteRes<WriteTrf0Pop2, [FU_TRF0_POP2_PATH]> { let Latency = 1; }
+ }
+ // Only XLU 0 / Bus 0 exist on JF; XLU and Bus 1-3 are absent, so the
+ // corresponding writes in the foreach below use no resources and zero
+ // latency.
+ foreach Index = 1-3 in {
+ def : WriteRes<!cast<SchedWrite>("WriteTransposeEnd"#Index), []> { let Latency = 0; }
+ def : WriteRes<!cast<SchedWrite>("WriteTranspose"#Index), []> { let Latency = 0; }
+ def : WriteRes<!cast<SchedWrite>("WritePermute"#Index), []> { let Latency = 0; }
+ def : WriteRes<!cast<SchedWrite>("WriteXLane"#Index), []> { let Latency = 0; }
+ def : WriteRes<!cast<SchedWrite>("WriteSetPermuteAll"#Index), []> { let Latency = 0; }
+ }
+ def : WriteRes<WriteTrf1Pop0, [FU_TRF1_POP0_PATH]> { let Latency = 1; }
+ def : WriteRes<WriteTrf1Pop1, [FU_TRF1_POP1_PATH]> { let Latency = 1; }
+ def : WriteRes<WriteTrf1Pop2, [FU_TRF1_POP2_PATH]> { let Latency = 1; }
+ def : WriteRes<WriteTrf2Pop0, []>;
+ def : WriteRes<WriteTrf2Pop1, []>;
+ def : WriteRes<WriteTrf2Pop2, []>;
+ // No packed XLU.
+ foreach Index = 0-3 in {
+ def : WriteRes<!cast<SchedWrite>("WriteTransposeEndPacked"#Index), []> { let Latency = 0; }
+ def : WriteRes<!cast<SchedWrite>("WriteTransposePacked"#Index), []> { let Latency = 0; }
+ def : WriteRes<!cast<SchedWrite>("WritePermutePacked"#Index), []> { let Latency = 0; }
+ }
+ def : WriteRes<WriteDmaGeneral, [FU_DESCRIPTOR_PATH]> {
+ let ResourceCycles = [4];
+ }
+ def : WriteRes<WriteDmaLocal, [FU_DMA_LOCAL_PATH]> {
+ let ResourceCycles = [2];
+ }
+ def : WriteRes<WriteStream, []>;
+ def : WriteRes<WriteStreamCbUpd, []>;
+ def : WriteRes<WriteStreamCb, []>;
+ def : WriteRes<WriteScalarCb, []>;
+ // No packed/int MXU matmul ops
+ def : WriteRes<WriteMatMulMxuPacked0, []>;
+ def : WriteRes<WriteMatMulMxuPacked1, []>;
+ def : WriteRes<WriteMatMulMxuPacked2, []>;
+ def : WriteRes<WriteMatMulMxuPacked3, []>;
+ def : WriteRes<WriteMatMulMxuInt0, []>;
+ def : WriteRes<WriteMatMulMxuInt1, []>;
+ def : WriteRes<WriteMatMulMxuInt2, []>;
+ def : WriteRes<WriteMatMulMxuInt3, []>;
+
+ def : WriteRes<WriteEup,[FU_EUP_PATH]> { let Latency = 4; let ResourceCycles = [1]; }
+ def : WriteRes<WriteEupBf16, []>;
+ def : WriteRes<WriteV2SF, []> { let Latency = 11; }
+ def : WriteRes<WriteSFlagV2SF, []> { let Latency = 2; }
+ def : WriteRes<WriteFadd, []> { let Latency = 1; }
+ def : WriteRes<WriteFmul, []> { let Latency = 1; }
+ def : WriteRes<WriteFPConvert, []> { let Latency = 1; }
+ def : WriteRes<WriteFloatCompose, []> { let Latency = 1; }
+ def : WriteRes<WritePackingInst, []> { let Latency = 1; }
+ def : WriteRes<WriteIPackingInst, []>;
+ def : WriteRes<WriteCPackingInst, []>;
+ def : WriteRes<WriteRotateSLane, []> { let Latency = 1; }
+ def : WriteRes<WriteVrshra, []> { let Latency = 1; }
+ def : WriteRes<WriteSld, []> { let Latency = 2; }
+ def : WriteRes<WriteVecScanCrossLane, []>;
+ def : WriteRes<WriteDrf, []>;
+ def : WriteRes<WriteTrace, []>;
+ def : WriteRes<WriteSetTracemark, []>;
+}
+
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUScheduleTensorCorePF.td b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUScheduleTensorCorePF.td
new file mode 100644
index 0000000..f3fe740
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUScheduleTensorCorePF.td
@@ -0,0 +1,186 @@
+//===-- TPUScheduleTensorCorePF.td - Target Description for TPU Target -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes instruction itineraries for Pufferfish tensor core.
+//
+//===----------------------------------------------------------------------===//
+
+// Map slots to itinerary classes.
+def TensorCorePFItineraries :
+ ProcessorItineraries<[SLOT_S0, SLOT_S1, SLOT_V0, SLOT_V1, SLOT_V2, SLOT_VLD,
+ SLOT_VST, SLOT_SM, SLOT_VEX0, SLOT_VEX1, SLOT_VRES0,
+ SLOT_VRES1, SLOT_VAUX], [], [
+ InstrItinData<IIC_S0, [InstrStage<1, [SLOT_S0]>]>,
+ InstrItinData<IIC_S1, [InstrStage<1, [SLOT_S1]>]>,
+ InstrItinData<IIC_Sany, [InstrStage<1, [SLOT_S0, SLOT_S1]>]>,
+ InstrItinData<IIC_SanyMisc, [InstrStage<1, [SLOT_S0, SLOT_S1]>]>,
+ InstrItinData<IIC_Sboth, [InstrStage<1, [SLOT_S0]>, InstrStage<0, [SLOT_S1]>]>,
+ InstrItinData<IIC_V0, [InstrStage<1, [SLOT_V0]>]>,
+ InstrItinData<IIC_V1, [InstrStage<1, [SLOT_V1]>]>,
+ InstrItinData<IIC_Vany, [InstrStage<1, [SLOT_V0, SLOT_V1]>]>,
+ // On tensor core smem accesses use S1 slot.
+ InstrItinData<IIC_SLD, [InstrStage<1, [SLOT_S1]>]>,
+ InstrItinData<IIC_SST, [InstrStage<1, [SLOT_S1]>]>,
+ InstrItinData<IIC_VLD, [InstrStage<1, [SLOT_VLD]>]>,
+ InstrItinData<IIC_VST, [InstrStage<1, [SLOT_VST]>]>,
+ InstrItinData<IIC_SM, [InstrStage<1, [SLOT_SM]>]>,
+ InstrItinData<IIC_MXU_PUSH,
+ [InstrStage<1, [SLOT_V0, SLOT_V1, SLOT_VST]>,
+ InstrStage<0, [SLOT_VEX0, SLOT_VEX1]>]>,
+ InstrItinData<IIC_MXU_MUL,
+ [InstrStage<1, [SLOT_V0, SLOT_V1, SLOT_VST]>,
+ InstrStage<0, [SLOT_VEX0, SLOT_VEX1]>]>,
+ // TODO(b/148140127) Make these definitions more fine grained.
+ // This also applies to other definitions that define multiple slots.
+ InstrItinData<IIC_VEX, [InstrStage<1, [SLOT_VEX0, SLOT_VEX1]>,
+ InstrStage<0, [SLOT_V0, SLOT_V1, SLOT_VST]>]>,
+ InstrItinData<IIC_VEX0, [InstrStage<1, [SLOT_VEX0]>,
+ InstrStage<0, [SLOT_V0, SLOT_V1, SLOT_VST]>]>,
+ InstrItinData<IIC_VEX1, [InstrStage<1, [SLOT_VEX1]>,
+ InstrStage<0, [SLOT_V0, SLOT_V1, SLOT_VST]>]>,
+ InstrItinData<IIC_VEXBoth, [InstrStage<1, [SLOT_VEX0]>, InstrStage<0, [SLOT_VEX1]>,
+ InstrStage<0, [SLOT_V0, SLOT_V1, SLOT_VST]>]>,
+ InstrItinData<IIC_VRES0_V0, [InstrStage<1, [SLOT_VRES0]>, InstrStage<0, [SLOT_V0]>]>,
+ InstrItinData<IIC_VRES0_V1, [InstrStage<1, [SLOT_VRES0]>, InstrStage<0, [SLOT_V1]>]>,
+ InstrItinData<IIC_VRES0_VLD, [InstrStage<1, [SLOT_VRES0]>, InstrStage<0, [SLOT_VLD]>]>,
+ InstrItinData<IIC_VRES0_VAUX, [InstrStage<1, [SLOT_VRES0]>, InstrStage<0, [SLOT_VAUX]>]>,
+ InstrItinData<IIC_VRES1_V0, [InstrStage<1, [SLOT_VRES1]>, InstrStage<0, [SLOT_V0]>]>,
+ InstrItinData<IIC_VRES1_V1, [InstrStage<1, [SLOT_VRES1]>, InstrStage<0, [SLOT_V1]>]>,
+ InstrItinData<IIC_VRES1_VLD, [InstrStage<1, [SLOT_VRES1]>, InstrStage<0, [SLOT_VLD]>]>,
+ InstrItinData<IIC_VRES1_VAUX, [InstrStage<1, [SLOT_VRES1]>, InstrStage<0, [SLOT_VAUX]>]>,
+ InstrItinData<IIC_VRES, [InstrStage<1, [SLOT_VRES0, SLOT_VRES1]>,
+ InstrStage<0, [SLOT_V0, SLOT_V1, SLOT_VLD, SLOT_VAUX]>]>,
+ InstrItinData<IIC_VARI, [InstrStage<1, [SLOT_V1]>]>,
+ InstrItinData<IIC_VCLAMP, [InstrStage<1, [SLOT_V0, SLOT_V1]>]>,
+ InstrItinData<IIC_EUP_OP, [InstrStage<1, [SLOT_V0, SLOT_V1]>]>,
+ InstrItinData<IIC_VCVT, [InstrStage<1, [SLOT_V0, SLOT_V1]>]>,
+ InstrItinData<IIC_VM_OP, [InstrStage<1, [SLOT_SM]>]>,
+ InstrItinData<IIC_VPUSH, [InstrStage<1, [SLOT_VST]>]>,
+ InstrItinData<IIC_VFMUL, [InstrStage<1, [SLOT_V0]>]>,
+ InstrItinData<IIC_VFADD, [InstrStage<1, [SLOT_V1]>]>,
+ InstrItinData<IIC_VMOVR, [InstrStage<1, [SLOT_V0, SLOT_V1]>]>,
+]>;
+
+def TensorCorePFSchedModel : TPUSchedModel {
+ // Per-cycle resources tables. [default = NoItineraries]
+ let Itineraries = TensorCorePFItineraries;
+}
+
+let SchedModel = TensorCorePFSchedModel in {
+ defm : Latency;
+ def : WriteRes<WriteVLD, []> { let Latency = 1; }
+ def : WriteRes<WriteVLDIdx, []> { let Latency = 1; }
+ def : WriteRes<WriteVLDCbUpd, []> { let Latency = 1; }
+ def : WriteRes<WriteVSTCbUpd, []>;
+ def : WriteRes<WriteVLDCb, []> { let Latency = 1; }
+ def : WriteRes<WriteVSTCb, []>;
+ def : WriteRes<WriteXrf0_0, []>;
+ def : WriteRes<WriteXrf0_1, []>;
+ def : WriteRes<WriteXrf0_2, []>;
+ def : WriteRes<WriteXrf0Bf16_0, []>;
+ def : WriteRes<WriteXrf0Bf16_1, []>;
+ def : WriteRes<WriteXrf0Bf16_2, []>;
+ def : WriteRes<WriteXrf1, []>;
+ let ResourceCycles = [8] in {
+ def : WriteRes<WriteMatMulMxu0, [FU_MATMUL0_PATH]> { let Latency = 83; }
+ def : WriteRes<WriteMatMulMxu1, [FU_MATMUL1_PATH]> { let Latency = 83; }
+ def : WriteRes<WriteMatMulMxu2, [FU_MATMUL2_PATH]> { let Latency = 101; }
+ def : WriteRes<WriteMatMulMxu3, [FU_MATMUL3_PATH]> { let Latency = 101; }
+ }
+ let ResourceCycles = [16] in {
+ def : WriteRes<WriteMatMulMxuPacked0, [FU_MATMUL0_PATH]> { let Latency = 83; }
+ def : WriteRes<WriteMatMulMxuPacked1, [FU_MATMUL1_PATH]> { let Latency = 83; }
+ def : WriteRes<WriteMatMulMxuPacked2, [FU_MATMUL2_PATH]> { let Latency = 101; }
+ def : WriteRes<WriteMatMulMxuPacked3, [FU_MATMUL3_PATH]> { let Latency = 101; }
+ }
+
+ // No int MXU matmul ops
+ def : WriteRes<WriteMatMulMxuInt0, []>;
+ def : WriteRes<WriteMatMulMxuInt1, []>;
+ def : WriteRes<WriteMatMulMxuInt2, []>;
+ def : WriteRes<WriteMatMulMxuInt3, []>;
+
+ def : WriteRes<WriteSfence, []>;
+
+ def : WriteRes<WriteCrossLane, []> { let Latency = 8; }
+ let ResourceCycles = [8, 8] in {
+ // Latency is for a full-height transpose. For short matrices we increase
+ // the latency in a DAG mutation.
+ def : WriteRes<WriteTransposeEnd0, [FU_XLU0_PATH, FU_BUS0_PATH]> { let Latency = 5; }
+ def : WriteRes<WriteTransposeEnd1, [FU_XLU1_PATH, FU_BUS1_PATH]> { let Latency = 5; }
+ def : WriteRes<WriteTransposeEnd2, [FU_XLU0_PATH, FU_BUS2_PATH]> { let Latency = 5; }
+ def : WriteRes<WriteTransposeEnd3, [FU_XLU1_PATH, FU_BUS3_PATH]> { let Latency = 5; }
+ def : WriteRes<WriteTranspose0, [FU_XLU0_PATH, FU_BUS0_PATH]> { let Latency = 0; }
+ def : WriteRes<WriteTranspose1, [FU_XLU1_PATH, FU_BUS1_PATH]> { let Latency = 0; }
+ def : WriteRes<WriteTranspose2, [FU_XLU0_PATH, FU_BUS2_PATH]> { let Latency = 0; }
+ def : WriteRes<WriteTranspose3, [FU_XLU1_PATH, FU_BUS3_PATH]> { let Latency = 0; }
+ def : WriteRes<WritePermute0, [FU_XLU0_PATH, FU_BUS0_PATH]> { let Latency = 69; }
+ def : WriteRes<WritePermute1, [FU_XLU1_PATH, FU_BUS1_PATH]> { let Latency = 69; }
+ def : WriteRes<WritePermute2, [FU_XLU0_PATH, FU_BUS2_PATH]> { let Latency = 69; }
+ def : WriteRes<WritePermute3, [FU_XLU1_PATH, FU_BUS3_PATH]> { let Latency = 69; }
+ def : WriteRes<WriteXLane0, [FU_XLU0_PATH, FU_BUS0_PATH]> { let Latency = 79; }
+ def : WriteRes<WriteXLane1, [FU_XLU1_PATH, FU_BUS1_PATH]> { let Latency = 79; }
+ def : WriteRes<WriteXLane2, [FU_XLU0_PATH, FU_BUS2_PATH]> { let Latency = 79; }
+ def : WriteRes<WriteXLane3, [FU_XLU1_PATH, FU_BUS3_PATH]> { let Latency = 79; }
+ def : WriteRes<WriteSetPermuteAll0, [FU_XLU0_PATH, FU_BUS0_PATH]> { let Latency = 1; }
+ def : WriteRes<WriteSetPermuteAll1, [FU_XLU1_PATH, FU_BUS1_PATH]> { let Latency = 1; }
+ def : WriteRes<WriteSetPermuteAll2, [FU_XLU0_PATH, FU_BUS2_PATH]> { let Latency = 1; }
+ def : WriteRes<WriteSetPermuteAll3, [FU_XLU1_PATH, FU_BUS3_PATH]> { let Latency = 1; }
+ }
+ let ResourceCycles = [16, 16] in {
+ // Latency is for a full-height transpose. For short matrices we increase
+ // the latency in a DAG mutation. (formula is 126 - H + 23 == 21 + (128 - H))
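+ // e.g. for a full-height (H = 128) packed transpose this gives
+ // 126 - 128 + 23 = 21, matching Latency = 21 below.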
+ def : WriteRes<WriteTransposeEndPacked0, [FU_XLU0_PATH, FU_BUS0_PATH]> { let Latency = 21; }
+ def : WriteRes<WriteTransposeEndPacked1, [FU_XLU1_PATH, FU_BUS1_PATH]> { let Latency = 21; }
+ def : WriteRes<WriteTransposeEndPacked2, [FU_XLU0_PATH, FU_BUS2_PATH]> { let Latency = 21; }
+ def : WriteRes<WriteTransposeEndPacked3, [FU_XLU1_PATH, FU_BUS3_PATH]> { let Latency = 21; }
+ def : WriteRes<WriteTransposePacked0, [FU_XLU0_PATH, FU_BUS0_PATH]> { let Latency = 0; }
+ def : WriteRes<WriteTransposePacked1, [FU_XLU1_PATH, FU_BUS1_PATH]> { let Latency = 0; }
+ def : WriteRes<WriteTransposePacked2, [FU_XLU0_PATH, FU_BUS2_PATH]> { let Latency = 0; }
+ def : WriteRes<WriteTransposePacked3, [FU_XLU1_PATH, FU_BUS3_PATH]> { let Latency = 0; }
+ def : WriteRes<WritePermutePacked0, [FU_XLU0_PATH, FU_BUS0_PATH]> { let Latency = 77; }
+ def : WriteRes<WritePermutePacked1, [FU_XLU1_PATH, FU_BUS1_PATH]> { let Latency = 77; }
+ def : WriteRes<WritePermutePacked2, [FU_XLU0_PATH, FU_BUS2_PATH]> { let Latency = 77; }
+ def : WriteRes<WritePermutePacked3, [FU_XLU1_PATH, FU_BUS3_PATH]> { let Latency = 77; }
+ }
+ let ResourceCycles = [8] in {
+ def : WriteRes<WriteTrf0Pop0, [FU_TRF0_POP0_PATH]> { let Latency = 1; }
+ def : WriteRes<WriteTrf0Pop1, [FU_TRF0_POP1_PATH]> { let Latency = 1; }
+ def : WriteRes<WriteTrf0Pop2, [FU_TRF0_POP2_PATH]> { let Latency = 1; }
+ def : WriteRes<WriteTrf1Pop0, [FU_TRF1_POP0_PATH]> { let Latency = 1; }
+ def : WriteRes<WriteTrf1Pop1, [FU_TRF1_POP1_PATH]> { let Latency = 1; }
+ def : WriteRes<WriteTrf1Pop2, [FU_TRF1_POP2_PATH]> { let Latency = 1; }
+ }
+ def : WriteRes<WriteTrf2Pop0, []>;
+ def : WriteRes<WriteTrf2Pop1, []>;
+ def : WriteRes<WriteTrf2Pop2, []>;
+ def : WriteRes<WriteEup, [FU_EUP_PATH]> { let Latency = 7; let ResourceCycles = [2]; }
+ def : WriteRes<WriteEupBf16, []>;
+ def : WriteRes<WriteV2SF, []> { let Latency = 30; }
+ def : WriteRes<WriteSFlagV2SF, []> { let Latency = 3; }
+ def : WriteRes<WriteDmaLocal, []>;
+ def : WriteRes<WriteDmaGeneral, []>;
+ def : WriteRes<WriteStream, []>;
+ def : WriteRes<WriteStreamCb, []>;
+ def : WriteRes<WriteStreamCbUpd, []>;
+ def : WriteRes<WriteScalarCb, []>;
+ def : WriteRes<WriteFadd, []> { let Latency = 2; }
+ def : WriteRes<WriteFmul, []> { let Latency = 2; }
+ def : WriteRes<WriteFPConvert, []> { let Latency = 2; }
+ def : WriteRes<WriteFloatCompose, []> { let Latency = 2; }
+ def : WriteRes<WritePackingInst, []> { let Latency = 2; }
+ def : WriteRes<WriteIPackingInst, []>;
+ def : WriteRes<WriteCPackingInst, []>;
+ def : WriteRes<WriteRotateSLane, []> { let Latency = 2; }
+ def : WriteRes<WriteVrshra, []> { let Latency = 2; }
+ def : WriteRes<WriteSld, []> { let Latency = 4; }
+ def : WriteRes<WriteVecScanCrossLane, []>;
+ def : WriteRes<WriteDrf, []>;
+ def : WriteRes<WriteTrace, []>;
+ def : WriteRes<WriteSetTracemark, []>;
+}
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUScheduleTensorCoreVF.td b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUScheduleTensorCoreVF.td
new file mode 100644
index 0000000..3950687
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUScheduleTensorCoreVF.td
@@ -0,0 +1,203 @@
+//===-- TPUScheduleTensorCoreVF.td - Target Description for TPU Target -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes instruction itineraries for the VF tensor core.
+//
+//===----------------------------------------------------------------------===//
+
+// Map slots to itinerary classes.
+def TensorCoreVFItineraries :
+ ProcessorItineraries<[SLOT_S0, SLOT_S1, SLOT_V0, SLOT_V1, SLOT_V2, SLOT_V3,
+ SLOT_VLD, SLOT_VST, SLOT_SM, SLOT_VEX0, SLOT_VEX1,
+ SLOT_VRES0, SLOT_VRES1, SLOT_VAUX], [], [
+ InstrItinData<IIC_S0, [InstrStage<1, [SLOT_S0]>]>,
+ InstrItinData<IIC_S1, [InstrStage<1, [SLOT_S1]>]>,
+ InstrItinData<IIC_Sany, [InstrStage<1, [SLOT_S0, SLOT_S1]>]>,
+ InstrItinData<IIC_SanyMisc, [InstrStage<1, [SLOT_S0, SLOT_S1]>]>,
+ InstrItinData<IIC_Sboth, [InstrStage<1, [SLOT_S0]>, InstrStage<0, [SLOT_S1]>]>,
+ InstrItinData<IIC_V0, [InstrStage<1, [SLOT_V0]>]>,
+ InstrItinData<IIC_V1, [InstrStage<1, [SLOT_V1]>]>,
+ InstrItinData<IIC_V2, [InstrStage<1, [SLOT_V2]>]>,
+ InstrItinData<IIC_Vany, [InstrStage<1, [SLOT_V0, SLOT_V1]>]>,
+ // On tensor core smem accesses use S1 slot.
+ InstrItinData<IIC_SLD, [InstrStage<1, [SLOT_S1]>]>,
+ InstrItinData<IIC_SST, [InstrStage<1, [SLOT_S1]>]>,
+ InstrItinData<IIC_VLD, [InstrStage<1, [SLOT_VLD]>]>,
+ InstrItinData<IIC_VST, [InstrStage<1, [SLOT_VST]>]>,
+ InstrItinData<IIC_SM, [InstrStage<1, [SLOT_SM]>]>,
+ InstrItinData<IIC_MXU_PUSH,
+ [InstrStage<1, [SLOT_V0, SLOT_V1, SLOT_VST]>,
+ InstrStage<0, [SLOT_VEX0, SLOT_VEX1]>]>,
+ InstrItinData<IIC_MXU_MUL,
+ [InstrStage<1, [SLOT_V0, SLOT_V1, SLOT_VST]>,
+ InstrStage<0, [SLOT_VEX0, SLOT_VEX1]>]>,
+ // TODO(b/148140127) Make these definitions more fine grained.
+ // This also applies to other definitions that define multiple slots.
+ InstrItinData<IIC_VEX, [InstrStage<1, [SLOT_VEX0, SLOT_VEX1]>,
+ InstrStage<0, [SLOT_V0, SLOT_V1, SLOT_VST]>]>,
+ InstrItinData<IIC_VEX0, [InstrStage<1, [SLOT_VEX0]>,
+ InstrStage<0, [SLOT_V0, SLOT_V1, SLOT_VST]>]>,
+ InstrItinData<IIC_VEX1, [InstrStage<1, [SLOT_VEX1]>,
+ InstrStage<0, [SLOT_V0, SLOT_V1, SLOT_VST]>]>,
+ InstrItinData<IIC_VEXBoth, [InstrStage<1, [SLOT_VEX0]>, InstrStage<0, [SLOT_VEX1]>,
+ InstrStage<0, [SLOT_V0, SLOT_V1, SLOT_VST]>]>,
+ InstrItinData<IIC_VRES0_V0, [InstrStage<1, [SLOT_VRES0]>, InstrStage<0, [SLOT_V0]>]>,
+ InstrItinData<IIC_VRES0_V1, [InstrStage<1, [SLOT_VRES0]>, InstrStage<0, [SLOT_V1]>]>,
+ InstrItinData<IIC_VRES0_VLD, [InstrStage<1, [SLOT_VRES0]>, InstrStage<0, [SLOT_VLD]>]>,
+ InstrItinData<IIC_VRES0_VAUX, [InstrStage<1, [SLOT_VRES0]>, InstrStage<0, [SLOT_VAUX]>]>,
+ InstrItinData<IIC_VRES1_V0, [InstrStage<1, [SLOT_VRES1]>, InstrStage<0, [SLOT_V0]>]>,
+ InstrItinData<IIC_VRES1_V1, [InstrStage<1, [SLOT_VRES1]>, InstrStage<0, [SLOT_V1]>]>,
+ InstrItinData<IIC_VRES1_VLD, [InstrStage<1, [SLOT_VRES1]>, InstrStage<0, [SLOT_VLD]>]>,
+ InstrItinData<IIC_VRES1_VAUX, [InstrStage<1, [SLOT_VRES1]>, InstrStage<0, [SLOT_VAUX]>]>,
+ InstrItinData<IIC_VRES, [InstrStage<1, [SLOT_VRES0, SLOT_VRES1]>,
+ InstrStage<0, [SLOT_V0, SLOT_V1, SLOT_VLD, SLOT_VAUX]>]>,
+ InstrItinData<IIC_VARI, [InstrStage<1, [SLOT_V0, SLOT_V1, SLOT_V2]>]>,
+ InstrItinData<IIC_VCLAMP, [InstrStage<1, [SLOT_V0, SLOT_V1]>]>,
+ InstrItinData<IIC_EUP_OP, [InstrStage<1, [SLOT_V0, SLOT_V1]>]>,
+ InstrItinData<IIC_VCVT, [InstrStage<1, [SLOT_V0, SLOT_V1]>]>,
+ InstrItinData<IIC_VM_OP, [InstrStage<1, [SLOT_V2]>]>,
+ InstrItinData<IIC_VPUSH, [InstrStage<1, [SLOT_VST]>]>,
+ // TODO(sdasgup3): Intentionally omitting SLOT_V3. Will add it once SLOT_V3
+ // specific bundled instruction-variants are added.
+ InstrItinData<IIC_VFMUL, [InstrStage<1, [SLOT_V0, SLOT_V1, SLOT_V2]>]>,
+ InstrItinData<IIC_VFADD, [InstrStage<1, [SLOT_V0, SLOT_V1, SLOT_V2]>]>,
+ InstrItinData<IIC_VMOVR, [InstrStage<1, [SLOT_V0, SLOT_V1]>]>,
+ InstrItinData<IIC_ALUOV, [InstrStage<1, [SLOT_S0, SLOT_S1]>]>,
+]>;
+
+def TensorCoreVFSchedModel : TPUSchedModel {
+ // Per-cycle resource tables. [default = NoItineraries]
+ let Itineraries = TensorCoreVFItineraries;
+}
+
+let SchedModel = TensorCoreVFSchedModel in {
+ defm : Latency;
+ def : WriteRes<WriteVLD, []> { let Latency = 1; }
+ def : WriteRes<WriteVLDIdx, []> { let Latency = 1; }
+ def : WriteRes<WriteVLDCbUpd, []> { let Latency = 1; }
+ def : WriteRes<WriteVSTCbUpd, []>;
+ def : WriteRes<WriteVLDCb, []> { let Latency = 1; }
+ def : WriteRes<WriteVSTCb, []>;
+ def : WriteRes<WriteXrf0_0, []>;
+ def : WriteRes<WriteXrf0_1, []>;
+ def : WriteRes<WriteXrf0_2, []>;
+ def : WriteRes<WriteXrf0Bf16_0, []>;
+ def : WriteRes<WriteXrf0Bf16_1, []>;
+ def : WriteRes<WriteXrf0Bf16_2, []>;
+ def : WriteRes<WriteXrf1, []>;
+ let ResourceCycles = [8] in {
+ def : WriteRes<WriteMatMulMxu0, [FU_MATMUL0_PATH]> { let Latency = 124; }
+ def : WriteRes<WriteMatMulMxu1, [FU_MATMUL1_PATH]> { let Latency = 124; }
+ def : WriteRes<WriteMatMulMxu2, [FU_MATMUL2_PATH]> { let Latency = 130; }
+ def : WriteRes<WriteMatMulMxu3, [FU_MATMUL3_PATH]> { let Latency = 130; }
+ def : WriteRes<WriteMatMulMxuInt0, [FU_MATMUL0_PATH]> { let Latency = 113; }
+ def : WriteRes<WriteMatMulMxuInt1, [FU_MATMUL1_PATH]> { let Latency = 113; }
+ def : WriteRes<WriteMatMulMxuInt2, [FU_MATMUL2_PATH]> { let Latency = 119; }
+ def : WriteRes<WriteMatMulMxuInt3, [FU_MATMUL3_PATH]> { let Latency = 119; }
+ }
+ def : WriteRes<WriteSfence, []>;
+ // XLU
+ def : WriteRes<WriteCrossLane, []> { let Latency = 8; }
+ let ResourceCycles = [8] in {
+ // The latency specified here is for a full-height transpose.
+ // For example, the latency from the last unpacked-transpose instruction in a set to the first
+ // TRF Result Instruction is given by:
+ // XLU0: 155 - H + 7 = (128 - H) + 34
+ // XLU1: 155 - H + 7 = (128 - H) + 34
+ // XLU2: 143 - H + 7 = (128 - H) + 22
+ // For short matrices we increase the latency in the DAG mutation.
+ def : WriteRes<WriteTransposeEnd0, [FU_XLU0_PATH]> { let Latency = 34; }
+ def : WriteRes<WriteTransposeEnd1, [FU_XLU1_PATH]> { let Latency = 34; }
+ def : WriteRes<WriteTransposeEnd2, [FU_XLU2_PATH]> { let Latency = 22; }
+ def : WriteRes<WriteTranspose0, [FU_XLU0_PATH]> { let Latency = 0; }
+ def : WriteRes<WriteTranspose1, [FU_XLU1_PATH]> { let Latency = 0; }
+ def : WriteRes<WriteTranspose2, [FU_XLU2_PATH]> { let Latency = 0; }
+
+ def : WriteRes<WritePermute0, [FU_XLU0_PATH]> { let Latency = 114; }
+ def : WriteRes<WritePermute1, [FU_XLU1_PATH]> { let Latency = 114; }
+ def : WriteRes<WritePermute2, [FU_XLU2_PATH]> { let Latency = 90; }
+ def : WriteRes<WriteSetPermuteAll0, [FU_XLU0_PATH]> { let Latency = 1; }
+ def : WriteRes<WriteSetPermuteAll1, [FU_XLU1_PATH]> { let Latency = 1; }
+ def : WriteRes<WriteSetPermuteAll2, [FU_XLU2_PATH]> { let Latency = 1; }
+
+ def : WriteRes<WriteXLane0, [FU_XLU0_PATH]> { let Latency = 115; }
+ def : WriteRes<WriteXLane1, [FU_XLU1_PATH]> { let Latency = 115; }
+ def : WriteRes<WriteXLane2, [FU_XLU2_PATH]> { let Latency = 91; }
+ }
+ def : WriteRes<WriteTransposeEnd3, [FU_XLU3_PATH]>;
+ def : WriteRes<WriteTranspose3, [FU_XLU3_PATH]>;
+ def : WriteRes<WriteSetPermuteAll3, []>;
+ def : WriteRes<WritePermute3, [FU_XLU3_PATH]>;
+ def : WriteRes<WriteXLane3, [FU_XLU3_PATH]>;
+
+ let ResourceCycles = [16] in {
+ // The latency specified here is for a full-height transpose.
+ // For example, the latency from the last unpacked-transpose instruction in a set to the first
+ // TRF Result Instruction is given by:
+ // XLU0: 155 - H + 23 = (128 - H) + 50
+ // XLU1: 155 - H + 23 = (128 - H) + 50
+ // XLU2: 143 - H + 23 = (128 - H) + 38
+ // For short matrices we increase the latency in the DAG mutation.
+ def : WriteRes<WriteTransposeEndPacked0, [FU_XLU0_PATH]> { let Latency = 50; }
+ def : WriteRes<WriteTransposeEndPacked1, [FU_XLU1_PATH]> { let Latency = 50; }
+ def : WriteRes<WriteTransposeEndPacked2, [FU_XLU2_PATH]> { let Latency = 38; }
+ def : WriteRes<WriteTransposePacked0, [FU_XLU0_PATH]> { let Latency = 0; }
+ def : WriteRes<WriteTransposePacked1, [FU_XLU1_PATH]> { let Latency = 0; }
+ def : WriteRes<WriteTransposePacked2, [FU_XLU2_PATH]> { let Latency = 0; }
+
+ def : WriteRes<WritePermutePacked0, [FU_XLU0_PATH]> { let Latency = 122; }
+ def : WriteRes<WritePermutePacked1, [FU_XLU1_PATH]> { let Latency = 122; }
+ def : WriteRes<WritePermutePacked2, [FU_XLU2_PATH]> { let Latency = 98; }
+ }
+ def : WriteRes<WriteTransposeEndPacked3, [FU_XLU3_PATH]>;
+ def : WriteRes<WriteTransposePacked3, [FU_XLU3_PATH]>;
+ def : WriteRes<WritePermutePacked3, [FU_XLU3_PATH]>;
+
+ let ResourceCycles = [8] in {
+ def : WriteRes<WriteTrf0Pop0, [FU_TRF0_POP0_PATH]> { let Latency = 1; }
+ def : WriteRes<WriteTrf0Pop1, [FU_TRF0_POP1_PATH]> { let Latency = 1; }
+ def : WriteRes<WriteTrf0Pop2, [FU_TRF0_POP2_PATH]> { let Latency = 1; }
+ def : WriteRes<WriteTrf1Pop0, [FU_TRF1_POP0_PATH]> { let Latency = 1; }
+ def : WriteRes<WriteTrf1Pop1, [FU_TRF1_POP1_PATH]> { let Latency = 1; }
+ def : WriteRes<WriteTrf1Pop2, [FU_TRF1_POP2_PATH]> { let Latency = 1; }
+ def : WriteRes<WriteTrf2Pop0, [FU_TRF2_POP0_PATH]> { let Latency = 1; }
+ def : WriteRes<WriteTrf2Pop1, [FU_TRF2_POP1_PATH]> { let Latency = 1; }
+ def : WriteRes<WriteTrf2Pop2, [FU_TRF2_POP2_PATH]> { let Latency = 1; }
+ }
+
+ // No packed MXU
+ def : WriteRes<WriteMatMulMxuPacked0, []>;
+ def : WriteRes<WriteMatMulMxuPacked1, []>;
+ def : WriteRes<WriteMatMulMxuPacked2, []>;
+ def : WriteRes<WriteMatMulMxuPacked3, []>;
+
+ def : WriteRes<WriteEup, [FU_EUP_PATH]> { let Latency = 7; let ResourceCycles = [2]; }
+ def : WriteRes<WriteEupBf16, []>;
+ def : WriteRes<WriteV2SF, []> { let Latency = 30; }
+ def : WriteRes<WriteSFlagV2SF, []> { let Latency = 3; }
+ def : WriteRes<WriteDmaLocal, []>;
+ def : WriteRes<WriteDmaGeneral, []>;
+ def : WriteRes<WriteStream, []>;
+ def : WriteRes<WriteStreamCb, []>;
+ def : WriteRes<WriteStreamCbUpd, []>;
+ def : WriteRes<WriteScalarCb, []>;
+ def : WriteRes<WriteFadd, []> { let Latency = 2; }
+ def : WriteRes<WriteFmul, []> { let Latency = 2; }
+ def : WriteRes<WriteFPConvert, []> { let Latency = 2; }
+ def : WriteRes<WriteFloatCompose, []>;
+ def : WriteRes<WritePackingInst, []> { let Latency = 2; }
+ def : WriteRes<WriteIPackingInst, []>;
+ def : WriteRes<WriteCPackingInst, []>;
+ def : WriteRes<WriteRotateSLane, []> { let Latency = 2; }
+ def : WriteRes<WriteVrshra, []>;
+ def : WriteRes<WriteSld, []> { let Latency = 6; }
+ def : WriteRes<WriteVecScanCrossLane, []>;
+ def : WriteRes<WriteDrf, []> { let Latency = 9; }
+ def : WriteRes<WriteTrace, []>;
+ def : WriteRes<WriteSetTracemark, []>;
+}
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSubtarget.cpp b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSubtarget.cpp
new file mode 100644
index 0000000..e70e40c
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSubtarget.cpp
@@ -0,0 +1,689 @@
+//===-- TPUSubtarget.cpp - TPU Subtarget Information ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TPU specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TPUSubtarget.h"
+#include "MCTargetDesc/TPUMCTargetDesc.h"
+#include "TPU.h"
+#include "TPURegisterInfo.h"
+#include "TPUSchedule.h"
+#include "TPUSubtarget.h"
+#include "third_party/llvm/llvm/lib/Target/GoogleTPU/MCTargetDesc/TPUMCTargetDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/YAMLParser.h"
+#include <set>
+#include <string>
+#include <vector>
+
+#define DEBUG_TYPE "TPU-subtarget"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "TPUGenSubtargetInfo.inc"
+
+using namespace llvm;
+
+// If given, a YAML file of the format:
+// opcode_regex: latency
+//
+// where opcode_regex is a regex applied to the lowercased name of an opcode;
+// if it matches, the latency of that opcode is set to "latency".
+//
+// This is provided to quickly override TableGen for the common case of fiddling
+// with hardware latencies during codesign.
+cl::opt<std::string> LatenciesFile(
+ "tpu-latencies", cl::init(""),
+ cl::desc("YAML file describing latency overrides for codesign."));
+
+static cl::opt<bool> EnableTPUABIOnTec(
+ "tpu-enable-tpu-abi-tec", cl::Hidden, cl::init(false),
+ cl::desc(
+ "Enables support for function calls w/ TPU ABI on sparsecore-tec."));
+
+static cl::opt<bool> EnableTPUABIOnScs(
+ "tpu-enable-tpu-abi-scs", cl::Hidden, cl::init(false),
+ cl::desc(
+ "Enables support for function calls w/ TPU ABI on sparsecore-scs."));
+
+void TPUSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
+ std::string CPUName(CPU);
+ if (CPUName.empty())
+ CPUName = "generic";
+
+ ParseSubtargetFeatures(CPUName, /*TuneCPU=*/CPUName, FS);
+}
+
+TPUSubtarget &
+TPUSubtarget::initializeSubtargetDependencies(StringRef CPU,
+ StringRef FS) {
+ initSubtargetFeatures(CPU, FS);
+ return *this;
+}
+
+FifoInfo *
+TPUSubtarget::FillFifoInfo(const TargetRegisterClass *RegClass,
+ const DenseMap<unsigned, int> &ItemsPushed,
+ const DenseMap<unsigned, unsigned> &OpcToComposed) {
+ FifoInfo *Fifo = new FifoInfo(RegClass);
+ for (unsigned opcode = 0; opcode < TPU::INSTRUCTION_LIST_END; opcode++) {
+ const MCInstrDesc &MCID = InstrInfo.get(opcode);
+ for (auto Op = MCID.opInfo_begin(), OpEnd = MCID.opInfo_end(); Op != OpEnd;
+ Op++) {
+ if (Op->RegClass != RegClass->getID())
+ continue;
+ // For push we only check the destination type, as we could have several
+ // levels of Fifo.
+ auto ItItemsPushed = ItemsPushed.find(opcode);
+ int ItemPushed =
+ (ItItemsPushed == ItemsPushed.end()) ? 1 : ItItemsPushed->second;
+ auto ItOpcToComposed = OpcToComposed.find(opcode);
+ if (Op == MCID.opInfo_begin() && TPUInstrInfo::isFifoPush(MCID)) {
+ Fifo->PushOpcToItems.insert({opcode, ItemPushed});
+ if (ItOpcToComposed != OpcToComposed.end())
+ Fifo->PushOpcToComposed.insert({opcode, ItOpcToComposed->second});
+ Fifo->PushOpcodes.push_back(opcode);
+ } else if (Op != MCID.opInfo_begin() && TPUInstrInfo::isFifoPop(MCID)) {
+ assert(TPUInstrInfo::getFifoPopOperandNo(MCID) != -1);
+ // Always one item popped.
+ Fifo->PopOpcToItems.insert({opcode, 1});
+ Fifo->PopOpcodes.push_back(opcode);
+ } else if (Op == MCID.opInfo_begin() &&
+ TPUInstrInfo::isComposedFifo(MCID)) {
+ Fifo->ComposedOpcodes.push_back(opcode);
+ }
+ }
+ }
+ return Fifo;
+}
+
+TPUSubtarget::TPUSubtarget(const Triple &TargetTriple, StringRef Cpu,
+ StringRef FeatureString, const TargetMachine &TM,
+ const TargetOptions & /*Options*/,
+ CodeModel::Model /*CodeModel*/, bool TPUABIOverride,
+ CodeGenOpt::Level /*OptLevel*/)
+ : TPUGenSubtargetInfo(TargetTriple, Cpu, /*TuneCPU=*/Cpu, FeatureString),
+ /*Important to initialize before InstrInfo*/ TPUABIOverride(
+ TPUABIOverride),
+ FrameLowering(initializeSubtargetDependencies(Cpu, FeatureString)),
+ InstrInfo(this, getHwMode()), TLInfo(TM, *this), TSInfo() {
+ InstrItins = getInstrItineraryForCPU(Cpu);
+
+ // Maps each opcode to the number of items pushed or popped. Opcodes not
+ // present in the map will be assumed to push/pop one item. Some SparseCore
+ // instructions push or pop 3 values at once, logically modeled as one here.
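+ // For example, TPU::SDIVREMrr produces both a quotient and a remainder on
+ // the DRF fifo, so it is listed below as pushing two items.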
+ DenseMap<unsigned, int> ItemsPushed = {
+ {TPU::SDIVREMrr, 2},
+ {TPU::SDIVREMri, 2},
+ };
+
+ // Maps each opcode to its composed equivalent. All targets running the
+ // software pipeliner must define a mapping for all its fifo opcodes.
+ DenseMap<unsigned, unsigned> OpcToComposed = {
+ // XRF0 cross-lane composed FIFO instruction mappings.
+ {TPU::scVADDSCAN1XNI, TPU::scVADDSCAN1XNI_CF},
+ {TPU::scVADDSCAN1XNF, TPU::scVADDSCAN1XNF_CF},
+ {TPU::scVADDSEGSCAN1XNI, TPU::scVADDSEGSCAN1XNI_CF},
+ {TPU::scVADDSEGSCAN1XNF, TPU::scVADDSEGSCAN1XNF_CF},
+ {TPU::scVMAXSCAN1XNI, TPU::scVMAXSCAN1XNI_CF},
+ {TPU::scVMAXSCAN1XNF, TPU::scVMAXSCAN1XNF_CF},
+ {TPU::scVMINSCAN1XNI, TPU::scVMINSCAN1XNI_CF},
+ {TPU::scVMINSCAN1XNF, TPU::scVMINSCAN1XNF_CF},
+ {TPU::scVMINSEGSCAN1XNI, TPU::scVMINSEGSCAN1XNI_CF},
+ {TPU::scVMINSEGSCAN1XNF, TPU::scVMINSEGSCAN1XNF_CF},
+ {TPU::scVMINSEGIDXSCAN1XNI, TPU::scVMINSEGIDXSCAN1XNI_CF},
+ {TPU::scVMINSEGIDXSCAN1XNF, TPU::scVMINSEGIDXSCAN1XNF_CF},
+ {TPU::scVMAXSEGSCAN1XNI, TPU::scVMAXSEGSCAN1XNI_CF},
+ {TPU::scVMAXSEGSCAN1XNF, TPU::scVMAXSEGSCAN1XNF_CF},
+ {TPU::scVMAXSEGIDXSCAN1XNI, TPU::scVMAXSEGIDXSCAN1XNI_CF},
+ {TPU::scVMAXSEGIDXSCAN1XNF, TPU::scVMAXSEGIDXSCAN1XNF_CF},
+ {TPU::scVADDSCAN2XNHALFBF16, TPU::scVADDSCAN2XNHALFBF16_CF},
+ {TPU::scVADDSCAN2XNFULLBF16, TPU::scVADDSCAN2XNFULLBF16_CF},
+ {TPU::scVMINSCAN2XNBF16, TPU::scVMINSCAN2XNBF16_CF},
+ {TPU::scVMAXSCAN2XNBF16, TPU::scVMAXSCAN2XNBF16_CF},
+ {TPU::scVMINIDXSCAN2XNBF16, TPU::scVMINIDXSCAN2XNBF16_CF},
+ {TPU::scVMAXIDXSCAN2XNBF16, TPU::scVMAXIDXSCAN2XNBF16_CF},
+ {TPU::scVADDSEGSCAN2XNHALFBF16, TPU::scVADDSEGSCAN2XNHALFBF16_CF},
+ {TPU::scVADDSEGSCAN2XNFULLBF16, TPU::scVADDSEGSCAN2XNFULLBF16_CF},
+ {TPU::scVMINSEGSCAN2XNBF16, TPU::scVMINSEGSCAN2XNBF16_CF},
+ {TPU::scVMAXSEGSCAN2XNBF16, TPU::scVMAXSEGSCAN2XNBF16_CF},
+ {TPU::scVMINSEGIDXSCAN2XNBF16, TPU::scVMINSEGIDXSCAN2XNBF16_CF},
+ {TPU::scVMAXSEGIDXSCAN2XNBF16, TPU::scVMAXSEGIDXSCAN2XNBF16_CF},
+ // XRF1 cross-lane composed FIFO instruction mappings.
+ {TPU::scVSORTASCD, TPU::scVSORTASCD_CF},
+ {TPU::scVSORTASCDF, TPU::scVSORTASCDF_CF},
+ {TPU::scVSORTDSCD, TPU::scVSORTDSCD_CF},
+ {TPU::scVSORTDSCDF, TPU::scVSORTDSCDF_CF},
+ {TPU::scVDUPCNT, TPU::scVDUPCNT_CF},
+ {TPU::scVDUPCNTF, TPU::scVDUPCNTF_CF},
+ {TPU::scVUNIQUE, TPU::scVUNIQUE_CF},
+ {TPU::scVUNIQUEF, TPU::scVUNIQUEF_CF},
+ // FIXME(hgreving): Remove, deprecated FIFO instruction mapping.
+ {TPU::scVSEGREDUCEADDF, TPU::scVSEGREDUCEADDF_CF},
+ // ERF composed FIFO instruction mappings.
+ {TPU::VRSQRT, TPU::VRSQRT_CF},
+ {TPU::VPOW2, TPU::VPOW2_CF},
+ {TPU::VLOG2, TPU::VLOG2_CF},
+ {TPU::VTANH, TPU::VTANH_CF},
+ {TPU::VRCP, TPU::VRCP_CF},
+ {TPU::VSIGSHFT, TPU::VSIGSHFT_CF},
+ {TPU::VPUSH_EUP, TPU::VPUSH_EUP_CF},
+ // V2S composed FIFO instruction mappings.
+ {TPU::VPUSH, TPU::VPUSH_CF},
+ {TPU::scVPUSHi, TPU::scVPUSHi_CF},
+ {TPU::scVPUSHr, TPU::scVPUSHr_CF},
+ // DRF composed FIFO instruction mappings.
+ {TPU::SDIVrr, TPU::SDIV_CFrr},
+ {TPU::SDIVri, TPU::SDIV_CFri},
+ {TPU::SREMrr, TPU::SREM_CFrr},
+ {TPU::SREMri, TPU::SREM_CFri},
+ // Transcendental ERF
+ {TPU::VSIN, TPU::VSIN_CF},
+ {TPU::VCOS, TPU::VCOS_CF},
+ {TPU::VERF, TPU::VERF_CF},
+ // Low precision ERF
+ {TPU::VSINBF16, TPU::VSINBF16_CF},
+ {TPU::VCOSBF16, TPU::VCOSBF16_CF},
+ {TPU::VERFBF16, TPU::VERFBF16_CF},
+ {TPU::VRSQRTBF16, TPU::VRSQRT_CF},
+ {TPU::VPOW2BF16, TPU::VPOW2_CF},
+ {TPU::VLOG2BF16, TPU::VLOG2_CF},
+ {TPU::VTANHBF16, TPU::VTANH_CF},
+ {TPU::VRCPBF16, TPU::VRCP_CF},
+ {TPU::VSIGSHFTBF16, TPU::VSIGSHFT_CF},
+ // FIXME(hgreving): Composing of SDIVREMrr or SDIVREMri (2 results, drf
+ // fifo) is currently unsupported. Also, some TPU fifos are not supported,
+ // e.g. SFRF.
+ };
+
+ for (auto Fifo : FifoClasses) {
+ FifoInfos.push_back(FillFifoInfo(Fifo, ItemsPushed, OpcToComposed));
+ }
+
+ for (auto *Info : FifoInfos) {
+ for (const MCPhysReg &Reg : Info->getRegisterClass()->getRegisters()) {
+ assert(FifoInfoByPhysReg.count(Reg) == 0 &&
+ "Cannot have one register in multiple fifos!");
+ FifoInfoByPhysReg[Reg] = Info;
+ }
+ }
+}
+
+unsigned TPUSubtarget::getVMemHazardLatency() const { return 5; }
+
+TPUSubtarget::~TPUSubtarget() {
+ for (auto *Info : FifoInfos) {
+ delete Info;
+ }
+ FifoInfos.clear();
+}
+
+void TPUSubtarget::getSMSMutations(
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
+ Mutations.push_back(createTPURemoveExitSUMutation());
+ Mutations.push_back(createTPUUnitCadenceMutation());
+ Mutations.push_back(createFifoVoidMutation());
+ Mutations.push_back(createVRegAddressCalcMutation());
+ Mutations.push_back(createComposedFifoLatency());
+}
+
+namespace {
+// Encapsulate information about latency overrides from the command line.
+class LatenciesInfo {
+public:
+ LatenciesInfo(std::string Filename, const MCInstrInfo &II) {
+ Latencies.resize(II.getNumOpcodes(), -1);
+ if (Filename.empty())
+ return;
+ Init(Filename, II);
+ }
+
+ void Init(std::string Filename, const MCInstrInfo &II) {
+ SourceMgr SM;
+ std::string ErrorPrefix =
+ std::string("while loading TPU latencies file '") + Filename +
+ "': ";
+ auto MemBufferOr = MemoryBuffer::getFile(Filename);
+ if (MemBufferOr.getError()) {
+ SM.PrintMessage(SMLoc(), SourceMgr::DK_Error,
+ ErrorPrefix + MemBufferOr.getError().message());
+ return;
+ }
+ auto &MemBuffer = MemBufferOr.get();
+ SMLoc RootLoc = SMLoc::getFromPointer(MemBuffer->getBufferStart());
+
+ auto Error = [&](yaml::Node *N, StringRef Msg) -> void {
+ SMLoc Loc = N ? N->getSourceRange().Start : RootLoc;
+ SM.PrintMessage(Loc, SourceMgr::DK_Error, ErrorPrefix + Msg);
+ };
+
+ std::error_code Err;
+ yaml::Stream Stream(*MemBuffer, SM, /*ShowColors=*/true, &Err);
+ if (Err)
+ return Error(nullptr, "YAML parsing failed");
+
+ yaml::Document &Document = *Stream.begin();
+ yaml::MappingNode *Root = dyn_cast<yaml::MappingNode>(Document.getRoot());
+ if (!Root)
+ return Error(Document.getRoot(), "root node was not a mapping");
+
+ // Track the SMLoc and text of each parsed regex in mapping order.
+ std::vector<std::pair<SMLoc, std::string>> DebugInfo;
+ // Track the regex to match and the latency to apply in mapping order.
+ std::vector<std::pair<Regex, int>> REs;
+ for (auto &KV : *Root) {
+ if (!isa<yaml::ScalarNode>(KV.getKey()))
+ return Error(KV.getKey(), "key must be of scalar type");
+ if (!isa<yaml::ScalarNode>(KV.getValue()))
+ return Error(KV.getValue(), "value must be of scalar type");
+ std::string RE(cast<yaml::ScalarNode>(KV.getKey())->getRawValue());
+ std::string Value(cast<yaml::ScalarNode>(KV.getValue())->getRawValue());
+ int N;
+ if (!to_integer(Value, N))
+ return Error(KV.getValue(),
+ "value must be of integer type: '" + Value + "'");
+ DebugInfo.push_back({KV.getKey()->getSourceRange().Start, RE});
+ REs.push_back({Regex(RE), N});
+ }
+
+ // Track which regexes have matched at least one opcode.
+ std::set<int> MatchedRegexIndices;
+ for (unsigned Opcode = 0; Opcode < II.getNumOpcodes(); ++Opcode) {
+ StringRef Name = II.getName(Opcode);
+ // Match opcodes in priority order.
+ for (auto I = REs.begin(), E = REs.end(); I != E; ++I) {
+ if (!I->first.match(Name.lower()))
+ continue;
+ MatchedRegexIndices.insert(std::distance(REs.begin(), I));
+ Latencies[Opcode] = I->second;
+ break;
+ }
+ }
+
+ // Output helpful debugging messages if a regex didn't match anything; this
+ // is likely user error.
+ for (unsigned I = 0; I != DebugInfo.size(); ++I) {
+ if (MatchedRegexIndices.count(I) == 0)
+ SM.PrintMessage(DebugInfo[I].first, SourceMgr::DK_Note,
+ Twine("Regex '") + DebugInfo[I].second +
+ "' did not match any opcodes");
+ }
+ }
+
+ // Return the def latency for the given opcode, or -1 if not known.
+ int Latency(unsigned Opcode) const { return Latencies[Opcode]; }
+
+private:
+ std::vector<int> Latencies;
+};
+} // namespace
+
+int TPUSubtarget::getMaxSchedLatency(MachineFunction &MF,
+ unsigned Opcode) const {
+ assert(!TPUInstrInfo::isTransposeEnd(getInstrInfo()->get(Opcode)));
+ static LatenciesInfo LI(LatenciesFile, *getInstrInfo());
+ int L = LI.Latency(Opcode);
+ if (L == -1) {
+ auto *TPUDesc = getSchedModel().getSchedClassDesc(
+ getInstrInfo()->get(Opcode).getSchedClass());
+ for (int i = 0; i < TPUDesc->NumWriteLatencyEntries; i++) {
+ const MCWriteLatencyEntry *WLEntry = getWriteLatencyEntry(TPUDesc, i);
+ L = std::max(L, (int)WLEntry->Cycles);
+ }
+ }
+ return L;
+}
+
+void TPUSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
+ int UseOpIdx, SDep &Dep) const {
+ static LatenciesInfo LI(LatenciesFile, *getInstrInfo());
+ if (Dep.getKind() != SDep::Data)
+ return;
+ MachineInstr *DefMI = Def->getInstr();
+ const TargetRegisterInfo *TRI =
+ DefMI->getMF()->getSubtarget().getRegisterInfo();
+ int L = LI.Latency(DefMI->getOpcode());
+ if (L == -1) {
+ unsigned DefIdx =
+ DefMI->findRegisterDefOperandIdx(Dep.getReg(), false, true, TRI);
+ // We have both itineraries and per-operand latency information. Latency
+ // lookup defaults to itineraries, so query the per-operand model now.
+ auto *TPUDesc =
+ getSchedModel().getSchedClassDesc(DefMI->getDesc().getSchedClass());
+ if (DefIdx < TPUDesc->NumWriteLatencyEntries) {
+ const MCWriteLatencyEntry *WLEntry = getWriteLatencyEntry(TPUDesc, DefIdx);
+ L = WLEntry->Cycles;
+ }
+ }
+ if (TPUInstrInfo::isTransposeEnd(DefMI->getDesc())) {
+ assert(DefMI->getOperand(3).isImm());
+ // The scheduling information encodes the latency for a full transpose.
+ // Short transposes have a longer latency, so we adjust it here.
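+ // For example, with a nominal transpose-end latency of 34 and a transpose
+ // height of 8, the adjusted latency here would be 34 + 128 - 8 = 154.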
+ L = L + 128 - DefMI->getOperand(3).getImm();
+ }
+ if (L >= 0)
+ Dep.setLatency(L);
+}
+
+namespace {
+struct XLUUnitInfoTy {
+ unsigned Opcode;
+ unsigned UnitId;
+};
+
+using namespace TPU;
+#define GET_XLUUnitInfoTable_IMPL
+#include "TPUGenSearchableTables.inc"
+#undef GET_XLUUnitInfoTable_IMPL
+} // namespace
+
+// The XLU FIFO is shared by the permute, transpose, and XLane units. When
+// switching from one to another we need to add extra latency.
+void TPUSubtarget::UpdateCrossUnitLatency(SUnit *Def, SUnit *Use,
+ SDep &Dep) const {
+ auto &DefDesc = Def->getInstr()->getDesc();
+ auto &UseDesc = Use->getInstr()->getDesc();
+ // The calculated latency may be negative. That is okay since we only update
+ // the latency if it is greater than the original latency, which is always
+ // positive.
+ int L = 0;
+ if (hasPfcTensorCore()) {
+ // Implement the scoreboard condition described at:
+ // https://g3doc.corp.google.com/platforms/deepsea/logic/pfc/g3doc/isa/tensorcore.md#scoreboard-entries-39
+ // Latency is scoreboard time + 1.
+ if (TPUInstrInfo::isTransposeEnd(DefDesc)) {
+ const unsigned WidthOp = TPUInstrInfo::isPacked(DefDesc) ? 3 : 2;
+ const unsigned HeightOp = TPUInstrInfo::isPacked(DefDesc) ? 4 : 3;
+ const int W = Def->getInstr()->getOperand(WidthOp).getImm();
+ const int H = Def->getInstr()->getOperand(HeightOp).getImm();
+ if (TPUInstrInfo::isTranspose(UseDesc)) {
+ if (TPUInstrInfo::isPacked(DefDesc) &&
+ TPUInstrInfo::isSegmented(DefDesc))
+ L = 128 - H + 16;
+ else if (TPUInstrInfo::isSegmented(DefDesc))
+ L = 128 - H + 8;
+ else if (TPUInstrInfo::isPacked(DefDesc))
+ L = W - H + 16;
+ else // unpacked, not segmented
+ L = W - H + 8;
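+ // For example, a full-height unpacked, unsegmented 128x128 transpose
+ // feeding another transpose gets W - H + 8 = 128 - 128 + 8 = 8 cycles.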
+ }
+ if (TPUInstrInfo::isReduce(UseDesc)) {
+ if (TPUInstrInfo::isPacked(DefDesc) &&
+ TPUInstrInfo::isSegmented(DefDesc))
+ L = 214 - H + 16;
+ else if (TPUInstrInfo::isSegmented(DefDesc))
+ L = 214 - H + 8;
+ else if (TPUInstrInfo::isPacked(DefDesc))
+ L = 86 + W - H + 16;
+ else // unpacked, not segmented
+ L = 86 + W - H + 8;
+ }
+ if (TPUInstrInfo::isPermute(UseDesc)) {
+ if (TPUInstrInfo::isPacked(DefDesc) &&
+ TPUInstrInfo::isSegmented(DefDesc))
+ L = 224 - H + 16;
+ else if (TPUInstrInfo::isSegmented(DefDesc))
+ L = 224 - H + 8;
+ else if (TPUInstrInfo::isPacked(DefDesc))
+ L = 96 + W - H + 16;
+ else // unpacked, not segmented
+ L = 96 + W - H + 8;
+ }
+ } else if (TPUInstrInfo::isPermute(DefDesc)) {
+ if (TPUInstrInfo::isTranspose(UseDesc)) {
+ if (TPUInstrInfo::isPacked(DefDesc))
+ L = 55;
+ else
+ L = 47;
+ }
+ } else if (TPUInstrInfo::isReduce(DefDesc)) {
+ if (TPUInstrInfo::isTranspose(UseDesc))
+ L = 57;
+ if (TPUInstrInfo::isPermute(UseDesc))
+ L = 18;
+ }
+ } else if (hasVfcTensorCore()) {
+ if (TPUInstrInfo::isTransposeEnd(DefDesc)) {
+ const unsigned WidthOp = TPUInstrInfo::isPacked(DefDesc) ? 3 : 2;
+ const unsigned HeightOp = TPUInstrInfo::isPacked(DefDesc) ? 4 : 3;
+ const int W = Def->getInstr()->getOperand(WidthOp).getImm();
+ const int H = Def->getInstr()->getOperand(HeightOp).getImm();
+ if (TPUInstrInfo::isTranspose(UseDesc)) {
+ if (TPUInstrInfo::isPacked(DefDesc) &&
+ !TPUInstrInfo::isSegmented(DefDesc))
+ L = W - H + 16;
+ else if (!TPUInstrInfo::isPacked(DefDesc) &&
+ !TPUInstrInfo::isSegmented(DefDesc))
+ L = W - H + 8;
+ }
+
+ if (TPUInstrInfo::isReduce(UseDesc)) {
+ if (TPUInstrInfo::isPacked(DefDesc) &&
+ TPUInstrInfo::isSegmented(DefDesc))
+ L = 214 - H + 16;
+ else if (TPUInstrInfo::isSegmented(DefDesc))
+ L = 214 - H + 8;
+ else if (TPUInstrInfo::isPacked(DefDesc))
+ L = 86 + W - H + 16;
+ else // unpacked, not segmented
+ L = 86 + W - H + 8;
+ }
+
+ if (TPUInstrInfo::isPermute(UseDesc)) {
+ if (TPUInstrInfo::isPacked(DefDesc) &&
+ TPUInstrInfo::isSegmented(DefDesc))
+ L = 224 - H + 16;
+ else if (TPUInstrInfo::isSegmented(DefDesc))
+ L = 224 - H + 8;
+ else if (TPUInstrInfo::isPacked(DefDesc))
+ L = 96 + W - H + 16;
+ else // unpacked, not segmented
+ L = 96 + W - H + 8;
+ }
+ } else if (TPUInstrInfo::isPermute(DefDesc)) {
+ // The cross-unit latencies for permute (or rotate) depend on the
+ // XLU unit.
+ const XLUUnitInfoTy *XLUUnit = XLUUnitInfo(Def->getInstr()->getOpcode());
+ assert(XLUUnit && "Missing opcode - XLU unit mapping.");
+ const unsigned XLUIdx = XLUUnit->UnitId;
+ if (TPUInstrInfo::isTranspose(UseDesc)) {
+ if (TPUInstrInfo::isPacked(DefDesc)) {
+ if (XLUIdx == 0)
+ L = 19;
+ if (XLUIdx == 1)
+ L = 38;
+ if (XLUIdx == 2)
+ L = 23;
+ } else {
+ if (XLUIdx == 0)
+ L = 12;
+ if (XLUIdx == 1)
+ L = 31;
+ if (XLUIdx == 2)
+ L = 16;
+ }
+ }
+ if (TPUInstrInfo::isReduce(UseDesc)) {
+ if (TPUInstrInfo::isPacked(DefDesc)) {
+ if (XLUIdx == 0)
+ L = 49;
+ if (XLUIdx == 1)
+ L = 49;
+ if (XLUIdx == 2)
+ L = 48;
+ } else {
+ if (XLUIdx == 0)
+ L = 42;
+ if (XLUIdx == 1)
+ L = 42;
+ if (XLUIdx == 2)
+ L = 41;
+ }
+ }
+ } else if (TPUInstrInfo::isReduce(DefDesc)) {
+ // The cross-unit latencies for reduce depend on the XLU unit.
+ const XLUUnitInfoTy *XLUUnit = XLUUnitInfo(Def->getInstr()->getOpcode());
+ assert(XLUUnit && "Missing opcode - XLU unit mapping.");
+ const unsigned XLUIdx = XLUUnit->UnitId;
+ if (TPUInstrInfo::isTranspose(UseDesc)) {
+ if (XLUIdx == 0)
+ L = 1;
+ if (XLUIdx == 1)
+ L = 15;
+ if (XLUIdx == 2)
+ L = 1;
+ }
+ if (TPUInstrInfo::isPermute(UseDesc)) {
+ if (XLUIdx == 0)
+ L = 20;
+ if (XLUIdx == 1)
+ L = 25;
+ if (XLUIdx == 2)
+ L = 24;
+ }
+ }
+ } else {
+ // TODO(thomasraoux): Add DF target.
+ // Jellyfish case.
+ if (TPUInstrInfo::isTransposeEnd(DefDesc)) {
+ if (TPUInstrInfo::isReduce(UseDesc) || TPUInstrInfo::isPermute(UseDesc))
+ L = 87;
+ }
+ }
+ if (L > (int)Dep.getLatency())
+ Dep.setLatency(L);
+}
+
+unsigned TPUSubtarget::getVMaskAddressCalcLatency(MachineInstr *DefMI) const {
+ assert(isSparseCore());
+ if (DefMI && DefMI->getOpcode() == TargetOpcode::IMPLICIT_DEF)
+ // Undefined
+ return 0;
+ return 3;
+}
+
+unsigned TPUSubtarget::getVRegAddressCalcLatency(MachineInstr *DefMI) const {
+ assert(isSparseCore());
+ // For cases where the defining instruction is not available, just assume the
+ // worst-case. FIXME(b/190200357).
+ int DefaultDelay = 3;
+ if (DefMI == nullptr) {
+ return DefaultDelay;
+ }
+ switch (DefMI->getOpcode()) {
+ default:
+ // xref go/vxc-sc-isa#vector-addr-calc-delay: delay 2 for vector loads, 3
+ // for vector ALU.
+ assert(DefMI->isCopy() || TPUInstrInfo::isVectorInstruction(*DefMI));
+ return DefMI->mayLoad() ? 2 : 3;
+ case TPU::VRES_EUP:
+ case TPU::scVMOVC:
+ // xref go/vxc-sc-isa#vector-addr-calc-delay: 2 for vector result.
+ return 2;
+ case TargetOpcode::IMPLICIT_DEF:
+ // Undefined
+ return 0;
+ }
+}
+
+unsigned TPUSubtarget::getFifoDepth(const FifoInfo *Info) const {
+ // The fifo depths should be kept in sync with the number of pseudo fifo
+ // registers in TPURegisterInfo.td plus one.
+ auto *RegClass = Info->getRegisterClass();
+ if (RegClass == &TPU::MRFPR0RegClass || RegClass == &TPU::MRFPR1RegClass ||
+ RegClass == &TPU::MRFPR2RegClass || RegClass == &TPU::MRFPR3RegClass) {
+ if (hasVfcTensorCore())
+ return 48;
+ return 17;
+ }
+ if (RegClass == &TPU::TRFPR0RegClass || RegClass == &TPU::TRFPR1RegClass ||
+ RegClass == &TPU::TRFPR2RegClass) {
+ if (hasVfcTensorCore())
+ return 49;
+ return 33;
+ }
+ if (RegClass == &TPU::XRFPR0RegClass)
+ return 6;
+ if (RegClass == &TPU::XRFPR1RegClass)
+ return 10;
+ if (RegClass == &TPU::DRFPRRegClass)
+ return 16;
+ if (RegClass == &TPU::ERFPRRegClass) {
+ if (hasPfcTensorCore() || hasBarnaCoreChannelControllerIsa())
+ return 32;
+ if (hasVfcTensorCore())
+ return 16;
+ else if (isSparseCore())
+ return 8;
+ else
+ return 4;
+ }
+ if (RegClass == &TPU::V2SFPRRegClass) {
+ if (isSparseCore())
+ return 8;
+ return 128;
+ }
+ if (RegClass == &TPU::SFRFPRRegClass)
+ return 16;
+ llvm_unreachable("Unknown fifo class");
+}
+
+// For tensorcore, disable cross-block edge tracking during bundle packing to
+// save compile time on large kernels. We may re-enable it for better scheduling
+// if we can reduce the impact on compile time.
+bool TPUSubtarget::trackCrossBlockEdges() const {
+ return StringSwitch<bool>(getCPU())
+ .Cases("sparsecore-scs-vf", "sparsecore-tec-vf", "sparsecore-tac-vf",
+ true)
+ .Cases("sparsecore-scs-gl", "sparsecore-tec-gl", "sparsecore-tac-gl",
+ true)
+ .Cases("sparsecore-scs-gf", "sparsecore-tec-gf", "sparsecore-tac-gf",
+ true)
+ .Case("barnacore-cc-pf", true)
+ .Default(false);
+}
+
+unsigned TPUSubtarget::getUncomposedPopLatency(MachineInstr *MI) const {
+ assert(TPUInstrInfo::isComposedFifo(*MI));
+ if (TPUInstrInfo::isComposedErfFifo(*MI)) {
+ return getMaxSchedLatency(*MI->getMF(), TPU::VRES_EUP);
+ } else if (TPUInstrInfo::isComposedXrf0Fifo(*MI)) {
+ return getMaxSchedLatency(*MI->getMF(), TPU::scVPOP3_XRF0);
+ } else if (TPUInstrInfo::isComposedXrf1Fifo(*MI)) {
+ return getMaxSchedLatency(*MI->getMF(), TPU::scVPOP3_XRF1);
+ } else if (TPUInstrInfo::isComposedV2SFifo(*MI)) {
+ return getMaxSchedLatency(*MI->getMF(), TPU::SPOP_V2SF);
+ } else if (TPUInstrInfo::isComposedDrfFifo(*MI)) {
+ return getMaxSchedLatency(*MI->getMF(), TPU::SPOP_DRF);
+ }
+
+ llvm_unreachable("Unsupported composed fifo instruction.");
+ return 0;
+}
+
+bool TPUSubtarget::isTPUABIEnabled() const {
+ return StringSwitch<bool>(getCPU())
+ .Cases("tensorcore-jf", "tensorcore-df", false)
+ .Case("tensorcore-pf", false)
+ .Case("tensorcore-vf", false)
+ .Case("sparsecore-tec-vf", EnableTPUABIOnTec || TPUABIOverride)
+ .Case("sparsecore-scs-vf", EnableTPUABIOnScs || TPUABIOverride)
+ .Case("sparsecore-tac-vf", false)
+ .Case("sparsecore-tec-gl", EnableTPUABIOnTec || TPUABIOverride)
+ .Case("sparsecore-scs-gl", EnableTPUABIOnScs || TPUABIOverride)
+ .Case("sparsecore-tac-gl", false)
+ .Case("sparsecore-tec-gf", EnableTPUABIOnTec || TPUABIOverride)
+ .Case("sparsecore-scs-gf", EnableTPUABIOnScs || TPUABIOverride)
+ .Case("sparsecore-tac-gf", false)
+ .Case("barnacore-cc-pf", false);
+}
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSubtarget.h b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSubtarget.h
new file mode 100644
index 0000000..ed7dc69
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSubtarget.h
@@ -0,0 +1,771 @@
+//===-- TPUSubtarget.h - Define Subtarget for TPU -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the TPU specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_TPU_TPUSUBTARGET_H
+#define LLVM_LIB_TARGET_TPU_TPUSUBTARGET_H
+
+#include "TPUFrameLowering.h"
+#include "TPUISelLowering.h"
+#include "TPUInstrInfo.h"
+#include "TPUSelectionDAGInfo.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define GET_SUBTARGETINFO_HEADER
+#include "TPUGenSubtargetInfo.inc"
+
+namespace llvm {
+class TPUInstrInfo;
+
+// FifoInfo describes a first-in-first-out register file. Typically such a
+// register file will contain only one physical register, but it may contain
+// more (for example the MXU or XLU fifos).
+//
+// Each Fifo has a distinct register class and has the concept of "push" and
+// "pop" opcodes. Each push or pop may push multiple items to the Fifo, for
+// example a VSORT instruction pushes three items and VPOP3_XRF pops three
+// items.
+//
+// Push and Pop operations are identified by the IsPush and IsPop tblgen flags,
+// but are also described here.
+class FifoInfo {
+public:
+ // Return the register class corresponding to this Fifo. Each Fifo must be
+ // associated with one and only one register class.
+ const TargetRegisterClass *getRegisterClass() const { return RegClass; }
+
+ // Return a list of all "push"-like opcodes on this Fifo.
+ ArrayRef<unsigned> getPushOpcodes() const { return PushOpcodes; }
+
+ // Return a list of all "pop"-like opcodes on this Fifo.
+ ArrayRef<unsigned> getPopOpcodes() const { return PopOpcodes; }
+
+ // Return a list of all composed opcodes on this Fifo.
+ ArrayRef<unsigned> getComposedOpcodes() const { return ComposedOpcodes; }
+
+ // Given a push-like opcode, return the number of items it pushes.
+ unsigned getPushNumItems(unsigned Opc) const {
+ return PushOpcToItems.find(Opc)->second;
+ }
+ // Given a pop-like opcode, return the number of items it pops.
+ unsigned getPopNumItems(unsigned Opc) const {
+ return PopOpcToItems.find(Opc)->second;
+ }
+ // Given a push-like opcode, returns whether there is an equivalent
+ // composed opcode.
+ bool hasComposedPushOpc(unsigned Opc) const {
+ return PushOpcToComposed.find(Opc) != PushOpcToComposed.end();
+ }
+ // Given a push-like opcode, return the equivalent composed opcode.
+ unsigned getComposedPushOpc(unsigned Opc) const {
+ return PushOpcToComposed.find(Opc)->second;
+ }
+
+private:
+ friend class TPUSubtarget;
+ // The register class that contains this FIFO's registers. Usually there is
+ // one physical register.
+ FifoInfo(const TargetRegisterClass *RegClass) : RegClass(RegClass) {}
+
+ const TargetRegisterClass *RegClass;
+ // Map of push opcode to the number of items pushed.
+ DenseMap<unsigned, unsigned> PushOpcToItems;
+ // Map of push opcode to the equivalent composed opcode.
+ DenseMap<unsigned, unsigned> PushOpcToComposed;
+ // Map of pop opcode to the number of items popped.
+ DenseMap<unsigned, unsigned> PopOpcToItems;
+ // Just a list of the push opcodes.
+ SmallVector<unsigned, 4> PushOpcodes;
+ // Just a list of the pop opcodes.
+ SmallVector<unsigned, 4> PopOpcodes;
+ // Just a list of the composed opcodes.
+ SmallVector<unsigned, 4> ComposedOpcodes;
+};
+
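+// A minimal usage sketch (hypothetical variable names, assuming a TPUSubtarget
+// pointer ST is in scope; the accessors used are the ones declared above and
+// on TPUSubtarget below):
+//
+//   if (const FifoInfo *FI = ST->getFifoInfo(&TPU::ERFPRRegClass))
+//     for (unsigned Opc : FI->getPushOpcodes())
+//       if (FI->hasComposedPushOpc(Opc))
+//         (void)FI->getComposedPushOpc(Opc); // e.g. map to the composed form
+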
+// Array of all the Fifo register classes.
+static constexpr std::array<const TargetRegisterClass *, 13> FifoClasses = {
+ &TPU::MRFPR0RegClass, &TPU::MRFPR1RegClass, &TPU::MRFPR2RegClass,
+ &TPU::MRFPR3RegClass, &TPU::TRFPR0RegClass, &TPU::TRFPR1RegClass,
+ &TPU::TRFPR2RegClass, &TPU::XRFPR0RegClass, &TPU::XRFPR1RegClass,
+ &TPU::DRFPRRegClass, &TPU::V2SFPRRegClass, &TPU::ERFPRRegClass,
+ &TPU::SFRFPRRegClass};
+
+// Array of register classes for special registers that can only have a single
+// live register at any time. These registers don't behave like FIFOs but
+// require memory dependencies between instructions reading or writing them.
+// TODO(thomasraoux): Can this be modeled as physical register instead?
+static constexpr std::array<const TargetRegisterClass *, 23> SpecialStagingReg =
+ {
+ &TPU::GSFNPR0RegClass, &TPU::GSFNPR1RegClass, &TPU::GSFNPR2RegClass,
+ &TPU::GSFNPR3RegClass, &TPU::GSFTPR0RegClass, &TPU::GSFTPR1RegClass,
+ &TPU::GSFTPR2RegClass, &TPU::GSFTPR3RegClass, &TPU::PCRPR0RegClass,
+ &TPU::PCRPR1RegClass, &TPU::PCRPR2RegClass, &TPU::SPRPR0RegClass,
+ &TPU::SPRPR1RegClass, &TPU::IARPR0RegClass, &TPU::IARPR1RegClass,
+ &TPU::MSRAPR0RegClass, &TPU::MSRAPR1RegClass, &TPU::MSRAPR2RegClass,
+ &TPU::MSRAPR3RegClass, &TPU::MSRBPR0RegClass, &TPU::MSRBPR1RegClass,
+ &TPU::MSRBPR2RegClass, &TPU::MSRBPR3RegClass,
+};
+
+class TPUSubtarget : public TPUGenSubtargetInfo {
+public:
+ // This constructor initializes the data members to match those
+ // of the specified triple.
+ TPUSubtarget(const Triple &TargetTriple, StringRef Cpu,
+ StringRef FeatureString, const TargetMachine &TM,
+ const TargetOptions &Options, CodeModel::Model CodeModel,
+ bool TPUABIOverride, CodeGenOpt::Level OptLevel);
+
+ ~TPUSubtarget();
+
+ // ParseSubtargetFeatures - Parses the feature string, setting the specified
+ // subtarget options. The definition of this function is auto-generated by tblgen.
+ void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
+
+ TPUSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
+
+ void initSubtargetFeatures(StringRef CPU, StringRef FS);
+
+ bool enableMachineScheduler() const override { return true; }
+
+ bool enableSubRegLiveness() const override { return true; }
+
+ AntiDepBreakMode getAntiDepBreakMode() const override {
+ // On barnacore, don't rename registers after RA as that would break
+ // rotating registers.
+ return HasBarnacoreChannelControllerIsa ? ANTIDEP_NONE : ANTIDEP_ALL;
+ }
+
+ const TPUInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+
+ const InstrItineraryData *getInstrItineraryData() const override {
+ return &InstrItins;
+ }
+
+ const TargetFrameLowering *getFrameLowering() const override {
+ return &FrameLowering;
+ }
+
+ const TPURegisterInfo *getRegisterInfo() const override {
+ return &InstrInfo.getRegisterInfo();
+ }
+
+ const TPUTargetLowering *getTargetLowering() const override {
+ return &TLInfo;
+ }
+
+ const TPUSelectionDAGInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+
+ bool useAA() const override { return true; }
+
+ void getSMSMutations(std::vector<std::unique_ptr<ScheduleDAGMutation>>
+ &Mutations) const override;
+
+ // Returns the instruction's maximum latency considering all write latency
+ // entries in the instruction's scheduling entry, based on opcode. Use with
+ // care.
+ int getMaxSchedLatency(MachineFunction &MF, unsigned Opcode) const;
+
+ void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
+ int UseOpIdx, SDep &Dep) const override;
+
+ void UpdateCrossUnitLatency(SUnit *Def, SUnit *Use, SDep &Dep) const;
+
+ // Return the FifoInfo for the given register class, or nullptr.
+ const FifoInfo *getFifoInfo(const TargetRegisterClass *RegClass) const {
+ auto I = llvm::find_if(FifoInfos, [RegClass](const FifoInfo *FI) {
+ return FI->getRegisterClass() == RegClass;
+ });
+ return I == FifoInfos.end() ? nullptr : *I;
+ }
+
+ // Return the FifoInfo for the given physical register, or nullptr.
+ const FifoInfo *getFifoInfo(unsigned PhysReg) const {
+ auto I = FifoInfoByPhysReg.find(PhysReg);
+ return I == FifoInfoByPhysReg.end() ? nullptr : I->second;
+ }
+
+ // Return all FifoInfo structures.
+ ArrayRef<FifoInfo *> getFifoInfos() const { return FifoInfos; }
+
+ bool hasV1024() const { return HasV1024; }
+ bool hasV8() const { return HasV8; }
+ bool hasV16() const { return HasV16; }
+ bool hasVPU() const { return HasV8 || HasV16 || HasV1024; }
+ int scalarSizeInBytes() const { return 4; }
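+ // vectorSizeInElements: e.g. with 4-byte elements this yields 32 / 4 = 8
+ // lanes on a V8 target, 64 / 4 = 16 on V16, and 4096 / 4 = 1024 on V1024.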
+ int vectorSizeInElements(int ElementSizeInBytes) const {
+ if (HasV8)
+ return 32 / ElementSizeInBytes;
+ else if (HasV16)
+ return 64 / ElementSizeInBytes;
+ else if (HasV1024)
+ return 4096 / ElementSizeInBytes;
+ return 0;
+ }
+ int vectorSizeInBytes() const { return vectorSizeInElements(1); }
+ bool hasLPVF() const { return HasLPVF || HasLPGL; }
+ bool hasLPGL() const { return HasLPGL; }
+ bool hasVMinMax() const { return isSparseCore() && !isVfcSparseCore(); }
+ bool hasTranscendental() const { return HasTranscendental; }
+ bool hasVectorSflags() const { return HasVectorSflags; }
+ bool hasScalarSflags() const { return HasScalarSflags; }
+ bool hasMXU() const { return HasMXU; }
+ bool hasFatalRawHazard() const { return HasFatalRawHazard; }
+ bool hasMaskPermute() const {
+ if (!hasVPU())
+ return false;
+ return HasGlcSparsecoreIsa || HasGfcSparsecoreIsa;
+ }
+ bool hasBroadcast() const {
+ if (!hasVPU())
+ return false;
+ return isSparseCore();
+ }
+ bool hasJfcTensorCore() const { return HasJfcTensorCore; }
+ bool hasDfcTensorCore() const { return HasDfcTensorCore; }
+ bool hasPfcTensorCore() const { return HasPfcTensorCore; }
+ bool hasVfcTensorCore() const { return HasVfcTensorCore; }
+ unsigned getVMemHazardLatency() const;
+ bool hasBarnaCoreChannelControllerIsa() const {
+ return HasBarnacoreChannelControllerIsa;
+ }
+ bool isPxcBarnaCore() const { return HasBarnacoreChannelControllerIsa; }
+ bool isSparseCore() const {
+ return HasVfcSparsecoreIsa || HasGlcSparsecoreIsa || HasGfcSparsecoreIsa;
+ }
+ bool isVfcSparseCore() const { return HasVfcSparsecoreIsa; }
+ bool isGlcSparseCore() const { return HasGlcSparsecoreIsa; }
+ bool isGfcSparseCore() const { return HasGfcSparsecoreIsa; }
+ bool isSparseCoreScs() const {
+ return StringSwitch<bool>(getCPU())
+ .Cases("tensorcore-jf", "tensorcore-df", false)
+ .Case("tensorcore-pf", false)
+ .Case("tensorcore-vf", false)
+ .Cases("sparsecore-tec-vf", "sparsecore-tec-gl", "sparsecore-tec-gf",
+ false)
+ .Cases("sparsecore-tac-vf", "sparsecore-tac-gl", "sparsecore-tac-gf",
+ false)
+ .Cases("sparsecore-scs-vf", "sparsecore-scs-gl", "sparsecore-scs-gf",
+ true)
+ .Case("barnacore-cc-pf", false);
+ }
+ bool isSparseCoreTac() const {
+ return StringSwitch<bool>(getCPU())
+ .Cases("tensorcore-jf", "tensorcore-df", false)
+ .Case("tensorcore-pf", false)
+ .Case("tensorcore-vf", false)
+ .Cases("sparsecore-tec-vf", "sparsecore-tec-gl", "sparsecore-tec-gf",
+ false)
+ .Cases("sparsecore-tac-vf", "sparsecore-tac-gl", "sparsecore-tac-gf",
+ true)
+ .Cases("sparsecore-scs-vf", "sparsecore-scs-gl", "sparsecore-scs-gf",
+ false)
+ .Case("barnacore-cc-pf", false);
+ }
+ bool isSparseCoreTec() const {
+ return StringSwitch<bool>(getCPU())
+ .Cases("tensorcore-jf", "tensorcore-df", false)
+ .Case("tensorcore-pf", false)
+ .Case("tensorcore-vf", false)
+ .Cases("sparsecore-tec-vf", "sparsecore-tec-gl", "sparsecore-tec-gf",
+ true)
+ .Cases("sparsecore-tac-vf", "sparsecore-tac-gl", "sparsecore-tac-gf",
+ false)
+ .Cases("sparsecore-scs-vf", "sparsecore-scs-gl", "sparsecore-scs-gf",
+ false)
+ .Case("barnacore-cc-pf", false);
+ }
+ bool hasPxcVPU() const { return HasPxcVPU; }
+ bool hasGsft() const { return HasGsft; }
+ bool hasSMul24() const { return HasSMul24; }
+ bool hasSMul32() const { return HasSMul32; }
+ bool hasEarlyVxposeAllocation() const { return HasEarlyVxposeAllocation; }
+ bool hasNeedsCompilerThrottling() const { return HasNeedsCompilerThrottling; }
+ bool hasUnsignedScalarCompare() const {
+ return HasVfcTensorCore || isSparseCore();
+ }
+ bool hasUnsignedVectorCompare() const { return isSparseCore(); }
+ bool hasVectorPredicateSelect() const { return isSparseCore(); }
+ bool hasVResHold() const { return isSparseCore(); }
+ // FIXME(hgreving): Enable for TensorCore if needed.
+ bool hasEmbeddedMasks() const { return isSparseCore(); }
+ // FIXME(hgreving): Enable for TensorCore if needed.
+ bool hasVCMasks() const { return isSparseCore(); }
+ bool supports20bitSignedIndices() const { return isSparseCore(); }
+ bool supportsVldVstIdxAdd() const {
+ return isGlcSparseCore() || isGfcSparseCore();
+ }
+
+ bool useFastDag() const;
+ // Return true if we should track cross block edges during bundle packing.
+ bool trackCrossBlockEdges() const;
+
+ unsigned getHbmWordSizeLog2() const {
+ return StringSwitch<unsigned>(getCPU())
+ .Cases("tensorcore-jf", "tensorcore-df", 10) // 1024b
+ .Case("tensorcore-pf", 9) // 512b
+ .Case("tensorcore-vf", 5) // 32b
+ .Case("sparsecore-tec-vf", 5) // 32b
+ .Case("sparsecore-scs-vf", 5) // 32b
+ .Case("sparsecore-tac-vf", 5) // 32b
+ .Case("sparsecore-tec-gl", 5) // 32b
+ .Case("sparsecore-scs-gl", 5) // 32b
+ .Case("sparsecore-tac-gl", 5) // 32b
+ .Case("sparsecore-tec-gf", 5) // 32b
+ .Case("sparsecore-scs-gf", 5) // 32b
+ .Case("sparsecore-tac-gf", 5); // 32b
+ }
+
+ unsigned getNumTiles() const {
+ return StringSwitch<unsigned>(getCPU())
+ .Cases("tensorcore-jf", "tensorcore-df", 0) // no tiles
+ .Case("tensorcore-pf", 0) // no tiles
+ .Case("tensorcore-vf", 0) // no tiles
+ .Case("sparsecore-tec-vf", 16) // 16 tiles
+ .Case("sparsecore-scs-vf", 0) // no tiles
+ .Case("sparsecore-tac-vf", 0) // no tiles
+ .Case("sparsecore-tec-gl", 16) // 16 tiles
+ .Case("sparsecore-scs-gl", 0) // no tiles
+ .Case("sparsecore-tac-gl", 0) // no tiles
+ .Case("sparsecore-tec-gf", 16) // 16 tiles
+ .Case("sparsecore-scs-gf", 0) // no tiles
+ .Case("sparsecore-tac-gf", 0); // no tiles
+ }
+
+ unsigned getNumDelaySlots() const {
+ return StringSwitch<unsigned>(getCPU())
+ .Cases("tensorcore-jf", "tensorcore-df", 1)
+ .Case("tensorcore-pf", 1)
+ .Case("tensorcore-vf", 4)
+ .Case("sparsecore-tec-vf", 2)
+ .Case("sparsecore-scs-vf", 3)
+ .Case("sparsecore-tac-vf", 2)
+ .Case("sparsecore-tec-gl", 2)
+ .Case("sparsecore-scs-gl", 3)
+ .Case("sparsecore-tac-gl", 2)
+ .Case("sparsecore-tec-gf", 2)
+ .Case("sparsecore-scs-gf", 3)
+ .Case("sparsecore-tac-gf", 2)
+ .Case("barnacore-cc-pf", 0);
+ }
+
+ // Returns the number of VReg read ports to be used as a limit for bundle
+ // packing.
+ unsigned getVRegReadPorts() const {
+ return StringSwitch<unsigned>(getCPU())
+ .Cases("tensorcore-jf", "tensorcore-df",
+ ~0L) // Currently no constraint.
+ .Case("tensorcore-pf", ~0L) // Currently no constraint.
+ .Case("tensorcore-vf", ~0L) // Currently no constraint.
+ .Case("sparsecore-tec-vf", 7)
+ .Case("sparsecore-scs-vf", 0) // No VPU.
+ .Case("sparsecore-tac-vf", 0) // No VPU.
+ .Case("sparsecore-tec-gl", 7)
+ .Case("sparsecore-scs-gl", 0) // No VPU.
+ .Case("sparsecore-tac-gl", 0) // No VPU.
+ .Case("sparsecore-tec-gf", 7)
+ .Case("sparsecore-scs-gf", 0) // No VPU.
+ .Case("sparsecore-tac-gf", 0) // No VPU.
+ .Case("barnacore-cc-pf", ~0L); // Currently no constraint.
+ }
+
+ unsigned getMemSize(unsigned MemAddressSpace) const {
+ switch (MemAddressSpace) {
+ default:
+ llvm_unreachable("Unknown memory size or unknown address space!");
+ case TPUAS_Bmem:
+ return StringSwitch<unsigned>(getCPU()).Case(
+ "barnacore-cc-pf", 5120); // This value is unused in the compiler.
+ case TPUAS_Smem:
+ return StringSwitch<unsigned>(getCPU())
+ .Cases("tensorcore-jf", "tensorcore-df",
+ 4192) // 16kb total, word size 4 bytes
+ .Case("tensorcore-pf", 262144) // 1024kb total, word size 4 bytes
+ .Case("tensorcore-vf", 262144) // 1024kb total, word size 4 bytes
+ .Case("sparsecore-tec-vf", 2048) // 8kb total, word size 4 bytes
+ .Case("sparsecore-scs-vf", 16384) // 64kb total, word size 4 bytes
+ .Case("sparsecore-tac-vf", 2048) // 8kb total, word size 4 bytes
+ .Case("sparsecore-tec-gl",
+ 2048) // 8kb total, word size 4 bytes
+ .Case("sparsecore-scs-gl", 16384) // 64kb total, word size 4 bytes
+ .Case("sparsecore-tac-gl", 2048) // 8kb total, word size 4 bytes
+ .Case("sparsecore-tec-gf", 2048) // 8kb total, word size 4 bytes
+ .Case("sparsecore-scs-gf", 16384) // 64kb total, word size 4 bytes
+ .Case("sparsecore-tac-gf", 2048) // 8kb total, word size 4 bytes
+ .Case("barnacore-cc-pf", 8192); // 32kb total, word size 4 bytes
+ case TPUAS_SmemAny:
+ return StringSwitch<unsigned>(getCPU())
+ .Cases("tensorcore-jf", "tensorcore-df",
+ 0) // No "any" mem
+ .Case("tensorcore-pf", 0) // No "any" mem
+ .Case("tensorcore-vf", 0) // No "any" mem
+ .Case("sparsecore-tec-vf", 1024) // 4kb total, word size 4 bytes
+ .Case("sparsecore-scs-vf", 16384) // 64kb total, word size 4 bytes
+ .Case("sparsecore-tac-vf", 1024) // 4kb total, word size 4 bytes
+ .Case("sparsecore-tec-gl",
+ 1024) // 4kb total, word size 4 bytes
+ .Case("sparsecore-scs-gl", 16384) // 64kb total, word size 4 bytes
+ .Case("sparsecore-tac-gl", 1024) // 4kb total, word size 4 bytes
+ .Case("sparsecore-tec-gf", 1024) // 4kb total, word size 4 bytes
+ .Case("sparsecore-scs-gf", 16384) // 64kb total, word size 4 bytes
+ .Case("sparsecore-tac-gf", 1024) // 4kb total, word size 4 bytes
+ .Case("barnacore-cc-pf", 0); // No "any" mem
+ case TPUAS_Hbm:
+ return StringSwitch<unsigned>(getCPU())
+ .Case("tensorcore-jf", 8388608) // 8GiB total, word size 1024 bytes
+ .Case("tensorcore-df", 16777216) // 16GiB total, word size 1024 bytes
+ .Case("tensorcore-pf", 67108864) // 32GiB total, word size 512 bytes
+ .Case("tensorcore-vf", 201326592) // 96GiB total, word size 512 bytes
+ .Case("sparsecore-tec-vf",
+ 3221225472) // 96GiB total, word size 32 bytes
+ .Case("sparsecore-scs-vf",
+ 3221225472) // 96GiB total, word size 32 bytes
+ .Case("sparsecore-tac-vf",
+ 3221225472) // 96GiB total, word size 32 bytes
+ .Case("sparsecore-tec-gl",
+ 3221225472) // 96GiB total, word size 32 bytes
+ .Case("sparsecore-scs-gl",
+ 3221225472) // 96GiB total, word size 32 bytes
+ .Case("sparsecore-tac-gl",
+ 3221225472) // 96GiB total, word size 32 bytes
+ .Case("sparsecore-tec-gf",
+ 3221225472) // 96GiB total, word size 32 bytes
+ .Case("sparsecore-scs-gf",
+ 3221225472) // 96GiB total, word size 32 bytes
+ .Case("sparsecore-tac-gf",
+ 3221225472) // 96GiB total, word size 32 bytes
+ .Case("barnacore-cc-pf",
+ 67108864); // 32GiB total, word size 512 bytes
+ case TPUAS_HbmAny:
+ return StringSwitch<unsigned>(getCPU())
+ .Case("tensorcore-jf", 0) // No "any" mem
+ .Case("tensorcore-df", 0) // No "any" mem
+ .Case("tensorcore-pf", 0) // No "any" mem
+ .Case("tensorcore-vf", 0) // No "any" mem
+ .Case("sparsecore-tec-vf",
+ 3221225472) // 96GiB total, word size 32 bytes
+ .Case("sparsecore-scs-vf",
+ 3221225472) // 96GiB total, word size 32 bytes
+ .Case("sparsecore-tac-vf",
+ 3221225472) // 96GiB total, word size 32 bytes
+ .Case("sparsecore-tec-gl",
+ 3221225472) // 96GiB total, word size 32 bytes
+ .Case("sparsecore-scs-gl",
+ 3221225472) // 96GiB total, word size 32 bytes
+ .Case("sparsecore-tac-gl",
+ 3221225472) // 96GiB total, word size 32 bytes
+ .Case("sparsecore-tec-gf",
+ 3221225472) // 96GiB total, word size 32 bytes
+ .Case("sparsecore-scs-gf",
+ 3221225472) // 96GiB total, word size 32 bytes
+ .Case("sparsecore-tac-gf",
+ 3221225472) // 96GiB total, word size 32 bytes
+ .Case("barnacore-cc-pf",
+ 0); // No "any" mem
+ case TPUAS_TileSpmem:
+ return StringSwitch<unsigned>(getCPU())
+ .Cases("tensorcore-jf", "tensorcore-df", 0) // no tilespmem
+ .Case("tensorcore-pf", 0) // no tilespmem
+ .Case("tensorcore-vf", 0) // no tilespmem
+ .Case("sparsecore-tec-vf", 131072) // 8MiB/16 total, word size 4 bytes
+ .Case("sparsecore-scs-vf", 0) // no tilespmem
+ .Case("sparsecore-tac-vf", 131072) // 8MiB/16 total, word size 4 bytes
+ .Case("sparsecore-tec-gl",
+ 65536) // 4MiB/16 total, word size 4 bytes
+ .Case("sparsecore-scs-gl", 0) // no tilespmem
+ .Case("sparsecore-tac-gl", 65536) // 4MiB/16 total, word size 4 bytes
+ .Case("sparsecore-tec-gf", 131072) // 8MiB/16 total, word size 4 bytes
+ .Case("sparsecore-scs-gf", 0) // no tilespmem
+ .Case("sparsecore-tac-gf", 131072) // 8MiB/16 total, word size 4 bytes
+ .Case("barnacore-cc-pf", 0); // no tilespmem
+ case TPUAS_Simem:
+ return StringSwitch<unsigned>(getCPU())
+ .Cases("tensorcore-jf", "tensorcore-df", 0) // no simem
+ .Case("tensorcore-pf", 0) // no simem
+ .Case("tensorcore-vf", 0) // no simem
+ .Case("sparsecore-tec-vf", 0)
+ .Case("sparsecore-scs-vf", 8192) // 8K words
+ .Case("sparsecore-tac-vf", 0) // no simem
+ .Case("sparsecore-tec-gl", 0)
+ .Case("sparsecore-scs-gl", 8192) // 8K words
+ .Case("sparsecore-tac-gl", 0) // no simem
+ .Case("sparsecore-tec-gf", 0)
+ .Case("sparsecore-scs-gf", 8192) // 8K words
+ .Case("sparsecore-tac-gf", 0) // no simem
+ .Case("barnacore-cc-pf", 0); // no simem
+ case TPUAS_Timem:
+ return StringSwitch<unsigned>(getCPU())
+ .Cases("tensorcore-jf", "tensorcore-df", 0) // no timem
+ .Case("tensorcore-pf", 0) // no timem
+ .Case("tensorcore-vf", 0) // no timem
+ .Case("sparsecore-tec-vf", 16384) // 16K words
+ .Case("sparsecore-scs-vf", 16384) // 16K words
+ .Case("sparsecore-tac-vf", 16384) // 16K words
+ .Case("sparsecore-tec-gl", 16384) // 16K words
+ .Case("sparsecore-scs-gl", 16384) // 16K words
+ .Case("sparsecore-tac-gl", 16384) // 16K words
+ .Case("sparsecore-tec-gf", 16384) // 16K words
+ .Case("sparsecore-scs-gf", 16384) // 16K words
+ .Case("sparsecore-tac-gf", 16384) // 16K words
+ .Case("barnacore-cc-pf", 0); // no timem
+ case TPUAS_Spmem:
+ return StringSwitch<unsigned>(getCPU())
+ .Cases("tensorcore-jf", "tensorcore-df", 0) // no spmem
+ .Case("tensorcore-pf", 0) // no spmem
+ .Case("tensorcore-vf", 0) // no spmem
+ .Case("sparsecore-tec-vf", 2097152) // 8MiB total, word size 4 bytes
+ .Case("sparsecore-scs-vf", 2097152) // 8MiB total, word size 4 bytes
+ .Case("sparsecore-tac-vf", 2097152) // 8MiB total accessible,
+ .Case("sparsecore-tec-gl",
+ 1048576) // 4MiB total, word size 4 bytes
+ .Case("sparsecore-scs-gl", 1048576) // 4MiB total, word size 4 bytes
+ .Case("sparsecore-tac-gl", 1048576) // 4MiB total accessible,
+ // word size 4 bytes
+ .Case("sparsecore-tec-gf", 2097152) // 8MiB total, word size 4 bytes
+ .Case("sparsecore-scs-gf", 2097152) // 8MiB total, word size 4 bytes
+ .Case("sparsecore-tac-gf", 2097152) // 8MiB total accessible,
+ .Case("barnacore-cc-pf", 0); // no spmem
+ case TPUAS_Vmem:
+ return StringSwitch<unsigned>(getCPU())
+ .Cases("tensorcore-jf", "tensorcore-df",
+ 65536) // 32768kb total, word size is 512 bytes
+ .Case("tensorcore-pf",
+ 32768) // 2^(5+10+7+5-3)b total, word size is 512 bytes
+ .Case("tensorcore-vf",
+ 262144) // 2^(8+10+7+5-3)b total, word size is 512 bytes
+ .Cases("sparsecore-tec-vf", "sparsecore-scs-vf", "sparsecore-tac-vf",
+ "sparsecore-tec-gl", "sparsecore-scs-gl", "sparsecore-tac-gl",
+ "sparsecore-tec-gf", "sparsecore-scs-gf", "sparsecore-tac-gf",
+ 0) // no vmem
+ .Case("barnacore-cc-pf", 0); // no vmem
+ case TPUAS_Dreg:
+ return StringSwitch<unsigned>(getCPU())
+ .Cases("tensorcore-jf", "tensorcore-df", 0) // no dreg
+ .Case("tensorcore-pf", 0) // no dreg
+ .Case("tensorcore-vf", 0) // no dreg
+ .Cases("sparsecore-tec-vf", "sparsecore-scs-vf", "sparsecore-tac-vf",
+ 32) // 128b total, word size is 4 bytes
+ .Cases("sparsecore-tec-gl", "sparsecore-scs-gl", "sparsecore-tac-gl",
+ 32) // 128b total, word size is 4 bytes
+ .Cases("sparsecore-tec-gf", "sparsecore-scs-gf", "sparsecore-tac-gf",
+ 32) // 128b total, word size is 4 bytes
+ .Case("barnacore-cc-pf", 0); // no dreg
+ case TPUAS_Sflag:
+ return StringSwitch<unsigned>(getCPU())
+ .Cases("tensorcore-jf", "tensorcore-df",
+ 256) // 1kb total, word size 4 bytes
+ .Case("tensorcore-pf", 512) // 2kb total, word size 4 bytes
+ .Case("tensorcore-vf", 512) // 2kb total, word size 4 bytes
+ .Cases("sparsecore-tec-vf", "sparsecore-tac-vf", "sparsecore-tec-gl",
+ "sparsecore-tac-gl", "sparsecore-tec-gf", "sparsecore-tac-gf",
+ 32) // 128b total, word size 4 bytes
+ .Cases("sparsecore-scs-vf", "sparsecore-scs-gl", "sparsecore-scs-gf",
+ 7168) // 28kb total, word size 4 bytes
+ .Case("barnacore-cc-pf", 0); // No support as yet
+ case TPUAS_SflagAny:
+ case TPUAS_SflagOther:
+ return StringSwitch<unsigned>(getCPU())
+ .Cases("tensorcore-jf", "tensorcore-df",
+ 0) // No "other" core
+ .Case("tensorcore-pf", 0) // No "other" core
+ .Case("tensorcore-vf", 0) // No "other" core
+ .Cases("sparsecore-tec-vf", "sparsecore-tac-vf", "sparsecore-tec-gl",
+ "sparsecore-tac-gl", "sparsecore-tec-gf", "sparsecore-tac-gf",
+ 32) // 128b total, word size 4 bytes
+ .Cases("sparsecore-scs-vf", "sparsecore-scs-gl", "sparsecore-scs-gf",
+ 0) // No "other" core
+ .Case("barnacore-cc-pf", 0); // No "other" core
+ case TPUAS_SflagTile:
+ return StringSwitch<unsigned>(getCPU())
+ .Cases("tensorcore-jf", "tensorcore-df", "tensorcore-pf",
+ "tensorcore-vf", 0) // No tiles.
+ .Cases("sparsecore-scs-vf", "sparsecore-scs-gl", "sparsecore-scs-gf",
+ 7168)
+ .Cases("sparsecore-tec-vf", "sparsecore-tac-vf", "sparsecore-tec-gl",
+ "sparsecore-tac-gl", "sparsecore-tec-gf", "sparsecore-tac-gf",
+ 32)
+ .Case("barnacore-cc-pf", 0); // No support as yet
+ case TPUAS_Iova:
+ return StringSwitch<unsigned>(getCPU())
+ .Cases("tensorcore-jf", "tensorcore-df",
+ 0) // No IOVA
+ .Case("tensorcore-pf", 0) // No IOVA
+ .Case("tensorcore-vf", 0) // No IOVA
+ .Cases("sparsecore-tec-vf", "sparsecore-scs-vf", "sparsecore-tac-vf",
+ "sparsecore-tec-gl", "sparsecore-scs-gl", "sparsecore-tac-gl",
+ "sparsecore-tec-gf", "sparsecore-scs-gf", "sparsecore-tac-gf",
+ ~0L) // Arbitrary
+ .Case("barnacore-cc-pf", 0); // No IOVA
+ }
+ }
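+
+ // Note that the values above are word counts, not byte counts: per the
+ // per-case comments, SparseCore spmem is 2097152 words x 4 bytes = 8 MiB,
+ // and tensorcore-jf vmem is 65536 words x 512 bytes = 32 MiB.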
+
+ // Returns the number of VS fields.
+ unsigned getNumVs() const {
+ return StringSwitch<unsigned>(getCPU())
+ .Cases("tensorcore-jf", "tensorcore-df", 3)
+ .Case("tensorcore-pf", 3)
+ .Case("tensorcore-vf", 4)
+ .Cases("sparsecore-tec-vf", "sparsecore-scs-vf", "sparsecore-tac-vf",
+ "sparsecore-tec-gl", "sparsecore-scs-gl", "sparsecore-tac-gl",
+ "sparsecore-tec-gf", "sparsecore-scs-gf", "sparsecore-tac-gf", 4)
+ .Case("barnacore-cc-pf", 3);
+ }
+
+ // Returns the default immediate size of an immediate slot.
+ unsigned getImmediateSizeInBits() const {
+ return StringSwitch<unsigned>(getCPU())
+ .Cases("tensorcore-jf", "tensorcore-df", 16)
+ .Case("tensorcore-pf", 16)
+ .Case("tensorcore-vf", 20)
+ .Cases("sparsecore-tec-vf", "sparsecore-scs-vf", "sparsecore-tac-vf",
+ "sparsecore-tec-gl", "sparsecore-scs-gl", "sparsecore-tac-gl",
+ "sparsecore-tec-gf", "sparsecore-scs-gf", "sparsecore-tac-gf",
+ 20)
+ .Case("barnacore-cc-pf", 16);
+ }
+
+ // Returns the DMA core id of SC0 on SparseCore.
+ unsigned getDmaCoreIdOffset() const {
+ return StringSwitch<unsigned>(getCPU()).Cases(
+ "sparsecore-tec-vf", "sparsecore-scs-vf", "sparsecore-tac-vf",
+ "sparsecore-tec-gl", "sparsecore-scs-gl", "sparsecore-tac-gl",
+ "sparsecore-tec-gf", "sparsecore-scs-gf", "sparsecore-tac-gf", 4);
+ }
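+
+ // Note that there is no .Default case above: the DMA core id offset is only
+ // meaningful on SparseCore subtargets, and querying it for any other CPU
+ // would fall off the end of the StringSwitch.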
+
+ // Returns the default no-op opcode.
+ unsigned getDefaultNop() const {
+ return StringSwitch<unsigned>(getCPU())
+ .Cases("tensorcore-jf", "tensorcore-df", TPU::VNOP)
+ .Case("tensorcore-pf", TPU::VNOP)
+ .Case("tensorcore-vf", TPU::VNOP)
+ .Cases("sparsecore-tec-vf", "sparsecore-scs-vf", "sparsecore-tac-vf",
+ "sparsecore-tec-gl", "sparsecore-scs-gl", "sparsecore-tac-gl",
+ "sparsecore-tec-gf", "sparsecore-scs-gf", "sparsecore-tac-gf",
+ TPU::SNOP)
+ .Case("barnacore-cc-pf", TPU::bcNOP);
+ }
+
+ // Returns the number of immediate slots.
+ unsigned getNumberImmediateSlots() const {
+ return StringSwitch<unsigned>(getCPU())
+ .Cases("tensorcore-jf", "tensorcore-df", 6)
+ .Case("tensorcore-pf", 6)
+ .Case("tensorcore-vf", 6)
+ .Case("sparsecore-tec-vf", 6)
+ .Case("sparsecore-scs-vf", 6)
+ .Case("sparsecore-tac-vf", 6)
+ .Case("sparsecore-tec-gl", 6)
+ .Case("sparsecore-scs-gl", 6)
+ .Case("sparsecore-tac-gl", 6)
+ .Case("sparsecore-tec-gf", 6)
+ .Case("sparsecore-scs-gf", 6)
+ .Case("sparsecore-tac-gf", 6)
+ .Case("barnacore-cc-pf", 4);
+ }
+
+ // Returns the preferred allocation order for immediate slots.
+ SmallVector<int, 6> getImmediateSlotAllocOrder() const {
+ return StringSwitch<SmallVector<int, 6>>(getCPU())
+ .Cases("tensorcore-jf", "tensorcore-df", {0, 1, 2, 3, 4, 5})
+ .Case("tensorcore-pf", {0, 1, 2, 3, 4, 5})
+ .Case("tensorcore-vf", {4, 5, 0, 1, 2, 3})
+ .Case("sparsecore-tec-vf", {4, 5, 0, 1, 2, 3})
+ .Case("sparsecore-scs-vf", {4, 5, 0, 1, 2, 3})
+ .Case("sparsecore-tac-vf", {4, 5, 0, 1, 2, 3})
+ .Case("sparsecore-tec-gl", {4, 5, 0, 1, 2, 3})
+ .Case("sparsecore-scs-gl", {4, 5, 0, 1, 2, 3})
+ .Case("sparsecore-tac-gl", {4, 5, 0, 1, 2, 3})
+ .Case("sparsecore-tec-gf", {4, 5, 0, 1, 2, 3})
+ .Case("sparsecore-scs-gf", {4, 5, 0, 1, 2, 3})
+ .Case("sparsecore-tac-gf", {4, 5, 0, 1, 2, 3})
+ .Case("barnacore-cc-pf", {0, 1, 2, 3});
+ }
+
+ // Returns true if TPU ABI for function calls is enabled for subtarget.
+ bool isTPUABIEnabled() const;
+
+ // Returns the amount of additional structural latency required between the
+ // given address-producing instruction and a consuming Vector
+ // Load/Store/Store-Add instruction. See go/vxc-sc-isa#vector-addr-calc-delay
+ unsigned getVRegAddressCalcLatency(MachineInstr *DefMI) const;
+
+ // Same as above, for vector masks.
+ unsigned getVMaskAddressCalcLatency(MachineInstr *DefMI) const;
+
+ unsigned getFifoDepth(const FifoInfo *Info) const;
+
+ // Assumes that MI is a composed fifo instruction, and returns the maximum
+ // latency of the corresponding composed pop instruction.
+ unsigned getUncomposedPopLatency(MachineInstr *MI) const;
+
+ // *********************
+ // * Production errata *
+ // *********************
+
+ // Errata b/210042404
+ bool hasIndirectVregCbStreamCorruption() const {
+ return isVfcSparseCore() && isSparseCoreTec();
+ }
+
+ // Errata b/244231604
+ bool hasIncorrectCbregWriteBypass() const { return isVfcSparseCore(); }
+
+private:
+ FifoInfo *FillFifoInfo(const TargetRegisterClass *RegClass,
+ const DenseMap<unsigned, int> &ItemsPushed,
+ const DenseMap<unsigned, unsigned> &OpcToComposed);
+
+ // Populated by tblgen. Must be initialized before FrameLowering is
+ // constructed!
+ bool HasV1024 = false;
+ bool HasV8 = false;
+ bool HasV16 = false;
+ bool HasLPVF = false;
+ bool HasLPGL = false;
+ bool HasTranscendental = false;
+ bool HasVectorSflags = false;
+ bool HasScalarSflags = false;
+ bool HasMXU = false;
+ bool HasFatalRawHazard = false;
+ bool HasJfcTensorCore = false;
+ bool HasDfcTensorCore = false;
+ bool HasPfcTensorCore = false;
+ bool HasVfcTensorCore = false;
+ bool HasBarnacoreChannelControllerIsa = false;
+ bool HasVfcSparsecoreIsa = false;
+ bool HasGlcSparsecoreIsa = false;
+ bool HasGfcSparsecoreIsa = false;
+ bool HasPxcVPU = false;
+ bool HasGsft = false;
+ bool HasSMul24 = false;
+ bool HasSMul32 = false;
+ bool HasEarlyVxposeAllocation = false;
+ bool HasNeedsCompilerThrottling = false;
+ bool TPUABIOverride = false;
+
+ TPUFrameLowering FrameLowering;
+ TPUInstrInfo InstrInfo;
+ TPUTargetLowering TLInfo;
+ TPUSelectionDAGInfo TSInfo;
+ InstrItineraryData InstrItins;
+
+ SmallVector<FifoInfo *, 4> FifoInfos;
+ DenseMap<Register, FifoInfo *> FifoInfoByPhysReg;
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_TPU_TPUSUBTARGET_H
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUTargetMachine.cpp b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUTargetMachine.cpp
new file mode 100644
index 0000000..c21b33d
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUTargetMachine.cpp
@@ -0,0 +1,692 @@
+//===------ TPUTargetMachine.cpp - Define TargetMachine for TPU -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the info about TPU target spec.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TPUTargetMachine.h"
+
+#include "ScheduleDAGSwing.h"
+#include "TPU.h"
+#include "TPUAliasAnalysis.h"
+#include "TPUCriticalPathScheduler.h"
+#include "TPUMachineFunctionInfo.h"
+#include "TPUMachinePipelinerInfo.h"
+#include "TPUOriginalOrderScheduler.h"
+#include "TPUSchedule.h"
+#include "TPUTargetTransformInfo.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/ScopedNoAliasAA.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/LinkAllPasses.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/IPO/GlobalDCE.h"
+#include "llvm/Transforms/InstCombine/InstCombine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/ADCE.h"
+#include "llvm/Transforms/Scalar/DCE.h"
+#include "llvm/Transforms/Scalar/InstSimplifyPass.h"
+#include "llvm/Transforms/Scalar/SROA.h"
+
+#include <memory>
+
+using namespace llvm;
+
+cl::opt<bool> UseOriginalOrderScheduler(
+ "tpu-use-original-order-sched", cl::init(false),
+ cl::desc("Use the original order scheduler for pre-RA scheduling."));
+
+static cl::opt<bool> UseSwingScheduler(
+ "tpu-use-swing-sched", cl::init(false),
+ cl::desc("Use the swing scheduler for pre-RA scheduling."));
+
+static cl::opt<bool> UseCriticalPathScheduler(
+ "tpu-critical-path-sched", cl::init(false),
+ cl::desc("Use the critical path scheduler for pre-RA scheduling."));
+
+static cl::opt<bool> UseResourceSwingScheduler(
+ "tpu-use-resource-swing-sched", cl::init(false),
+ cl::desc(
+ "Use experimental variant of swing scheduler for pre-RA scheduling"));
+
+static cl::opt<bool> UseSwingModuloPipeliner(
+ "tpu-use-swing-modulo-sched", cl::init(false),
+ cl::desc("Use the tpu-specific swing-modulo scheduler instead of the "
+ "MachinePipeliner."));
+
+cl::opt<bool> EnableExperimentalCopyRotate(
+ "tpu-enable-copy-rotate", cl::init(false),
+ cl::desc("Enable the experimental copy rotation pass."));
+
+// TODO(thomasraoux): Fifo scheduling is disabled as it exposes a bug in fifo
+// analysis for pipelined loops without a prologue. Re-enable it once fixed.
+static cl::opt<bool> UseTPUFifoSchedulingPass(
+ "tpu-use-fifo-sched", cl::init(false),
+ cl::desc("Use pass to re-order fifo instructions before the "
+ "MachinePipeliner."));
+
+static cl::opt<bool>
+ UseVDelayPass("tpu-use-vdelay", cl::init(true),
+ cl::desc("Use pass merge Nops into VDelay."));
+
+cl::opt<bool>
+ NeedOverlayerPasses("tpu-enable-overlayer-passes", cl::init(false),
+ cl::desc("Enable passes needed for TPU overlayer."));
+
+static cl::opt<bool> SkipFastOpt(
+ "tpu-skip-fast-opt", cl::init(false),
+ cl::desc("Skip generic optimization passes being run for tensor-core"));
+
+static cl::opt<bool> EnableXLUOpts("enable-tpu-xlu-opt", cl::init(true),
+ cl::desc("Enable XLU opts pass"));
+
+static cl::opt<bool> EnablePipelinerSuperPass(
+ "tpu-enable-pipeliner-super-pass", cl::init(false),
+ cl::desc("Use the machine pipeliner super pass to optimize for spilling."));
+
+cl::opt<bool> SpillDebugEnabled("tpu-enable-spill-debug", cl::init(false),
+ cl::desc("Enable spill debugging on TPU."));
+
+extern "C" void LLVMInitializeTPUTarget() {
+ // Register the target.
+ RegisterTargetMachine<TPUTargetMachine> registered_target(getTheTPUTarget());
+
+ PassRegistry &Registry = *PassRegistry::getPassRegistry();
+ initializeBundlePackerPass(Registry);
+ initializeTPUBreakVResHoldPass(Registry);
+ initializeTPUAddrCalcDelayPass(Registry);
+ initializeFifoPseudoAllocPass(Registry);
+ initializeBundleTrackerTestPass(Registry);
+ initializePostBundleLowerPseudosPass(Registry);
+ initializeOverPredicatePass(Registry);
+ initializeTPUMachinePipelinerSuperPassPass(Registry);
+ initializeTPUMachinePipelinerInfoWrapperPassPass(Registry);
+ initializeTPUMachinePipelinerAnalysisPass(Registry);
+ initializeTPUMachinePipelinerPass(Registry);
+ initializeTPUMachineSSIPipelinerPass(Registry);
+ initializeTPUPipelineDovetailingPass(Registry);
+ initializeTPUEarlyBranchFoldingPass(Registry);
+ initializeTPULoopParallelPass(Registry);
+ initializeTPUPreSpillerPass(Registry);
+ initializeTPUVLIWPreparePass(Registry);
+ initializeTPUCodeGenPreparePass(Registry);
+ initializeTPUNopCoalescingPass(Registry);
+ initializeTPUPadFunctionsPass(Registry);
+ initializeTPULoopAnalysisPass(Registry);
+ initializeTPURematerializePass(Registry);
+ initializeTPUCrossCallSpillPackerPass(Registry);
+ initializeTPURawHazardPass(Registry);
+ initializeTPUEarlyPostISelMiscPass(Registry);
+ initializeTPUCopyRotatePass(Registry);
+ initializeTPUEmulateComplexAddressingPass(Registry);
+ initializeTPUMemAllocPass(Registry);
+ initializeTPUMemOpIntrinsicsPass(Registry);
+ initializeTPUGEPLoweringPass(Registry);
+ initializeTPUOptimizePreparePass(Registry);
+ initializeUnderPredicatePass(Registry);
+ initializeTPUFifoSchedulingPass(Registry);
+ initializeTPUVerifierPass(Registry);
+ initializeTPUEnsureProgramEndHaltPass(Registry);
+ initializeBarnaCoreRotateLoweringPass(Registry);
+ initializeMachineSchedulerFastPass(Registry);
+ initializeTPUXLUOptimizationsPass(Registry);
+ initializeTPURegisterPreparePass(Registry);
+ initializeTPUEventDebugPass(Registry);
+ initializeTPUSpillDebugPass(Registry);
+ initializeTPUExecutionProfilePass(Registry);
+ initializeTPUPrintMachineFunctionPass(Registry);
+ initializeTPULateIBufMissMitigationPass(Registry);
+ initializeTPUInvalidateFifoFillAnalysisPass(Registry);
+ initializeTPURemoveIdentityCopiesPass(Registry);
+ initializeTPUDAGToDAGISelPass(Registry);
+ initializeTPUOptimizeSpillToDregPass(Registry);
+ initializeTPUHardwareTraceDebugPass(Registry);
+ initializeTPUEarlyIfPredicatorPass(Registry);
+}
+
+static std::string computeDataLayout() {
+ // Data layout (keep in sync with clang/lib/Basic/Targets.cpp)
+ // LINT.IfChange
+ return "e" // Little endian
+ "-m:e" // ELF name manging
+ "-p:32:32" // 32-bit pointers, 32 bit aligned
+ "-i64:64" // 64 bit integers, 64 bit aligned
+ "-a:0:32" // 32 bit alignment of objects of aggregate type
+ "-n32" // 32 bit native integer width
+ "-S64"; // 64 bit natural stack alignment
+ // LINT.ThenChange(//depot/google3/platforms/xla/sparse_core/llvm/compiler_driver.cc)
+}
+
+static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
+ if (!RM.has_value())
+ return Reloc::PIC_;
+ return *RM;
+}
+
+TPUTargetMachine::TPUTargetMachine(const Target &TheTarget,
+ const Triple &TargetTriple, StringRef Cpu,
+ StringRef FeatureString,
+ const TargetOptions &Options,
+ std::optional<Reloc::Model> RelocationModel,
+ std::optional<CodeModel::Model> CodeModel,
+ CodeGenOpt::Level OptLevel, bool JIT)
+ : LLVMTargetMachine(
+ TheTarget, computeDataLayout(), TargetTriple, Cpu, FeatureString,
+ Options, getEffectiveRelocModel(RelocationModel),
+ getEffectiveCodeModel(CodeModel, CodeModel::Medium), OptLevel),
+ TLOF(new TargetLoweringObjectFileELF()) {
+ initAsmInfo();
+}
+
+TPUTargetMachine::~TPUTargetMachine() {
+ for (const SWPTargetPSV *PSV : SWPTargetPSVs)
+ delete PSV;
+}
+
+const TPUSubtarget *
+TPUTargetMachine::getSubtargetImpl(const llvm::Function &F) const {
+ Attribute CPUAttr = F.getFnAttribute("target-cpu");
+ Attribute FSAttr = F.getFnAttribute("target-features");
+ Attribute ABIAttr = F.getFnAttribute("enable-tpu-abi");
+
+ StringRef CPU = !CPUAttr.hasAttribute(Attribute::None)
+ ? CPUAttr.getValueAsString()
+ : (StringRef)TargetCPU;
+ StringRef FS = !FSAttr.hasAttribute(Attribute::None)
+ ? FSAttr.getValueAsString()
+ : (StringRef)TargetFS;
+ bool TPUABIOvrd = !ABIAttr.hasAttribute(Attribute::None);
+
+ SmallString<512> Key;
+ Key.reserve(CPU.size() + FS.size() + F.getName().size());
+ Key += CPU;
+ Key += FS;
+ Key += F.getName();
+
+ auto &I = SubtargetMap[Key];
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = std::make_unique<TPUSubtarget>(TargetTriple, CPU, FS, *this, Options,
+ getCodeModel(), TPUABIOvrd, OptLevel);
+ }
+ return I.get();
+}
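+
+// Note on the cache key above: the subtarget is keyed on CPU, feature string,
+// and function name because the per-function "target-cpu", "target-features",
+// and "enable-tpu-abi" attributes can differ between functions in one module.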
+
+TargetTransformInfo
+TPUTargetMachine::getTargetTransformInfo(const Function &F) const {
+ return TargetTransformInfo(TPUTTIImpl(this, F));
+}
+
+TPUPassConfig::TPUPassConfig(TPUTargetMachine &TPUTM,
+ PassManagerBase *PassManager, bool IsBarnaCore,
+ bool IsSparseCore, bool IsTensorCore)
+ : TargetPassConfig(TPUTM, *PassManager), IsBarnaCore(IsBarnaCore),
+ IsSparseCore(IsSparseCore), IsTensorCore(IsTensorCore) {
+ if (IsBarnaCore) {
+ // Disable LICM for barnacore. When the kernel is pipelined, registers
+ // automatically rotate every iteration, so we cannot have invariant
+ // registers.
+ disablePass(&MachineLICMID);
+ disablePass(&EarlyMachineLICMID);
+ }
+ if (!IsSparseCore)
+ // Block placement pass hurts overlay and creates extra branches.
+ disablePass(&MachineBlockPlacementID);
+}
+
+TPUTargetMachine &TPUPassConfig::getTPUTargetMachine() const {
+ return getTM<TPUTargetMachine>();
+}
+
+ScheduleDAGInstrs *
+TPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
+ SchedulerMode Mode = getSchedulingMode();
+ switch (Mode) {
+ case SchedulerMode::SwingResource: {
+ auto *DAG = createScheduleDAGSwingResource(C);
+ addScoreboardDependencies(DAG);
+ DAG->addMutation(createFifoOverflowMutation());
+ DAG->addMutation(createFifoVoidMutation());
+ DAG->addMutation(createVRegAddressCalcMutation());
+ DAG->addMutation(createBanZeroLatencyMutation());
+ return DAG;
+ }
+ case SchedulerMode::Swing: {
+ auto *DAG = new ScheduleDAGSwing(*C->MF, C->MLI, C->LIS, C->AA);
+ addScoreboardDependencies(DAG);
+ DAG->addMutation(createFifoOverflowMutation());
+ DAG->addMutation(createFifoVoidMutation());
+ DAG->addMutation(createVRegAddressCalcMutation());
+ DAG->addMutation(createBanZeroLatencyMutation());
+ return DAG;
+ }
+ case SchedulerMode::CriticalPath: {
+ auto *DAG = createTpuCriticalPathSchedLive(C);
+ addScoreboardDependencies(DAG);
+ DAG->addMutation(createFifoOverflowMutation());
+ DAG->addMutation(createFifoVoidMutation());
+ DAG->addMutation(createVRegAddressCalcMutation());
+ return DAG;
+ }
+ case SchedulerMode::OriginalOrder: {
+ return createTPUOriginalOrderScheduler(C);
+ }
+ }
+ llvm_unreachable("Invalid scheduler mode");
+}
+
+ScheduleDAGInstrs *
+TPUPassConfig::createPostMachineScheduler(MachineSchedContext *C) const {
+ ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
+ DAG->addMutation(createTPUEventDepsMutation());
+ DAG->addMutation(createFifoOverflowMutation());
+ DAG->addMutation(createFifoVoidMutation());
+ DAG->addMutation(createVResHoldMutation());
+ // Needs to be after VResHoldMutation.
+ DAG->addMutation(createVRegAddressCalcMutation());
+ addScoreboardDependencies(DAG);
+ return DAG;
+}
+
+TPUPassConfig::SchedulerMode TPUPassConfig::getSchedulingMode() const {
+ if (UseSwingScheduler)
+ return SchedulerMode::Swing;
+ if (UseCriticalPathScheduler)
+ return SchedulerMode::CriticalPath;
+ if (UseResourceSwingScheduler)
+ return SchedulerMode::SwingResource;
+ if (UseOriginalOrderScheduler)
+ return SchedulerMode::OriginalOrder;
+ // Default scheduler
+ if (IsTensorCore)
+ return SchedulerMode::CriticalPath;
+ // TODO(hgreving): Evaluate better options.
+ return SchedulerMode::Swing;
+}
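+
+// Illustrative note (not from this CL): the scheduler choice can be forced
+// via the cl::opts defined above, e.g. by passing -tpu-use-swing-sched or
+// -tpu-critical-path-sched to the compiler invocation (exact driver spelling
+// assumed here); with no flag, TensorCore defaults to the critical-path
+// scheduler and SparseCore/BarnaCore to the swing scheduler.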
+
+bool TPUPassConfig::isSoftwarePipelinerSuperPassEnabled() {
+ return EnablePipelinerSuperPass;
+}
+
+bool TPUPassConfig::isSoftwarePipeliningEnabled() {
+ if (UseSwingModuloPipeliner ||
+ (!IsTensorCore && UseSwingModuloPipeliner.getNumOccurrences() == 0))
+ return true;
+ return false;
+}
+
+TargetPassConfig *
+TPUTargetMachine::createPassConfig(PassManagerBase &PassManager) {
+ IsBarnaCore = getTargetCPU().str() == "barnacore-cc-pf";
+ IsSparseCore = getTargetCPU().str() == "sparsecore-tac-vf" ||
+ getTargetCPU().str() == "sparsecore-tec-vf" ||
+ getTargetCPU().str() == "sparsecore-scs-vf" ||
+ getTargetCPU().str() == "sparsecore-tac-gl" ||
+ getTargetCPU().str() == "sparsecore-tec-gl" ||
+ getTargetCPU().str() == "sparsecore-scs-gl" ||
+ getTargetCPU().str() == "sparsecore-tac-gf" ||
+ getTargetCPU().str() == "sparsecore-tec-gf" ||
+ getTargetCPU().str() == "sparsecore-scs-gf";
+ IsTensorCore = getTargetCPU().str() == "tensorcore-jf" ||
+ getTargetCPU().str() == "tensorcore-df" ||
+ getTargetCPU().str() == "tensorcore-pf" ||
+ getTargetCPU().str() == "tensorcore-vf";
+ TPUPassConfig *C = new TPUPassConfig(*this, &PassManager, IsBarnaCore,
+ IsSparseCore, IsTensorCore);
+ if (!IsSparseCore) {
+ // On SparseCore, we keep LLVM's generic machine scheduler, since we don't
+ // need to mitigate any compile-time issues on those subtargets at present;
+ // everywhere else we substitute the fast scheduler. In theory, we don't
+ // need it on BarnaCore either.
+ // TODO(hgreving): Investigate why BarnaCore breaks.
+ C->substitutePass(&MachineSchedulerID, &TPUMachineSchedulerFastID);
+ } else {
+ C->insertPass(&MachineBlockPlacementID, &TPULateIBufMissMitigationID);
+ }
+ // We want to reduce live ranges right after pre-RA scheduling as the
+ // scheduler may increase register pressure significantly
+ if (C->getSchedulingMode() == TPUPassConfig::SchedulerMode::CriticalPath) {
+ C->insertPass(&TPUMachineSchedulerFastID,
+ createTPUGreddyLiveRangeReductionPass());
+ }
+ C->insertPass(&MachineSchedulerID, &TPUInvalidateFifoFillAnalysisID);
+ C->insertPass(&PostMachineSchedulerID, &TPUInvalidateFifoFillAnalysisID);
+ if (IsBarnaCore)
+ C->insertPass(&TPUMachineSchedulerFastID,
+ createBarnaCoreRotateLoweringPass());
+ if (SpillDebugEnabled)
+ C->insertPass(&MachineSinkingID, createTPUSpillDebugPass());
+ C->insertPass(&VirtRegRewriterID, &TPUOptimizeSpillToDregID);
+ return C;
+}
+
+void TPUPassConfig::addOptimizedRegAlloc() {
+ TargetPassConfig::addOptimizedRegAlloc();
+}
+
+void TPUPassConfig::addPostRegAlloc() {
+ addPass(createTPURemoveIdentityCopiesPass());
+ if (EnableExperimentalCopyRotate) {
+ addPass(createTPUCopyRotatePass());
+ addPass(&MachineCopyPropagationID);
+ }
+ if (!IsSparseCore)
+ addPass(createTPUCrossCallSpillPackerPass());
+}
+
+void TPUPassConfig::addIRPasses() {
+ // Reduced set of optimization passes run for TensorCore programs. We run a
+ // minimal number of passes to reduce compile time.
+ if (IsTensorCore && !SkipFastOpt) {
+ TargetLibraryInfoImpl LibraryInfo;
+ LibraryInfo.disableAllFunctions();
+ addPass(new TargetLibraryInfoWrapperPass(LibraryInfo));
+ addPass(llvm::createBasicAAWrapperPass());
+ addPass(llvm::createScopedNoAliasAAWrapperPass());
+ addPass(llvm::createAlwaysInlinerLegacyPass(false));
+ if (getOptLevel() != llvm::CodeGenOpt::None) {
+ addPass(llvm::createDeadCodeEliminationPass());
+ addPass(llvm::createInstructionCombiningPass());
+ }
+ addPass(llvm::createTPUMemAllocPass(
+ reinterpret_cast<llvm::TPUTargetMachine *>(TM)));
+ if (getOptLevel() != llvm::CodeGenOpt::None) {
+ addPass(llvm::createCFGSimplificationPass());
+ addPass(llvm::createEarlyCSEPass());
+ addPass(llvm::createGVNPass());
+ addPass(llvm::createIndVarSimplifyPass());
+ addPass(llvm::createLICMPass());
+ addPass(llvm::createLoopRotatePass());
+ addPass(llvm::createLoopInstSimplifyPass());
+ addPass(llvm::createReassociatePass());
+ }
+ } else {
+ addPass(createTPUAAWrapperPass(IsSparseCore));
+ addPass(createTPUExternalAAWrapperPass());
+ TargetPassConfig::addIRPasses();
+ }
+ if (IsTensorCore && EnableXLUOpts &&
+ getOptLevel() != llvm::CodeGenOpt::None) {
+ addPass(llvm::createTPUXLUOptimizationsPass());
+ }
+}
+
+bool TPUPassConfig::addPreISel() {
+ addPass(createSROAPass());
+ addPass(llvm::createEarlyCSEPass());
+ addPass(createInstructionCombiningPass());
+ addPass(createTPUMemOpIntrinsicsPass());
+ addPass(createTPUGEPLoweringPass());
+ addPass(createInstSimplifyLegacyPass());
+ addPass(createTPUVLIWPreparePass());
+ addPass(createTPUCodeGenPreparePass(&getTPUTargetMachine()));
+ addPass(createTPUVerifierPass(&getTPUTargetMachine(),
+ /* IsLateCodegen = */ true));
+ return true;
+}
+
+// Run the if-predicator before the pre-RA scheduler to give it more flexibility.
+bool TPUPassConfig::addILPOpts() {
+ addPass(&EarlyIfPredicatorID);
+ addPass(createTPUEarlyIfPredicatorPass());
+ return false;
+}
+
+// Install an instruction selector pass.
+bool TPUPassConfig::addInstSelector() {
+ addPass(createTPUISelDag(getTPUTargetMachine()));
+#ifndef NDEBUG
+ addPass(createTPUMachineVerifierPass());
+#endif
+ addPass(createTPUEarlyPostISelMiscPass());
+ return false;
+}
+
+void TPUPassConfig::addSoftwarePipeliningPasses(TPUMachinePipelinerInfo *ExtMPI,
+ const Twine &DbgStr) {
+ if (TM->getOptLevel() >= CodeGenOpt::Default) {
+ // Pipeliner is disabled by default on tensorcore, but enabled on
+ // sparsecore/barnacore, unless explicitly disabled.
+ if (isSoftwarePipeliningEnabled()) {
+ addPass(createTPUMachinePipelinerInfoWrapperPass());
+ addPass(createTPUPreSpillerPass(
+ ExtMPI, /*UseHeuristicMode=*/!isSoftwarePipelinerSuperPassEnabled()));
+ addPass(createTPUMachinePipelinerPass(ExtMPI, DbgStr));
+ addPass(createTPUEarlyBranchFoldingPass(/*KeepPrologEpilog=*/true));
+ if (!IsBarnaCore) {
+ addPass(createTPUPipelineDovetailingPass());
+ addPass(createTPUMachineSSIPipelinerPass());
+ }
+ addPass(createTPUEarlyBranchFoldingPass(/*KeepPrologEpilog=*/false));
+ // This has been added in order to clean up redundant predicates created
+ // during software pipelining.
+ addPass(&MachineCSEID);
+ addPass(createTPUEventDebugPass());
+ // Software pipelining may produce dead code in some cases.
+ addPass(&DeadMachineInstructionElimID);
+ }
+ }
+}
+
+void TPUPassConfig::addSoftwarePipeliningAnalysisPass(
+ TPUMachinePipelinerInfo *ExtMPI) {
+ addPass(createTPUMachinePipelinerAnalysisPass(ExtMPI));
+}
+
+void TPUPassConfig::addMachineSSAOptimization() {
+ TargetPassConfig::addMachineSSAOptimization();
+ // FIXME(hgreving): This increases register pressure; we might want to
+ // selectively disable this through analysis and make it part of the super
+ // pass pipeline.
+ addPass(createTPUEmulateComplexAddressingPass());
+ if (UseTPUFifoSchedulingPass)
+ addPass(createTPUFifoSchedulingPass());
+ addPass(createTPURegisterPreparePass());
+ if (isSoftwarePipelinerSuperPassEnabled())
+ addPass(createTPUMachinePipelinerSuperPass(&getTPUTargetMachine()));
+ addSoftwarePipeliningPasses(/*ExtMPI=*/nullptr);
+}
+
+void TPUPassConfig::addPreRegAlloc() {
+ // Currently empty.
+}
+
+bool TPUPassConfig::addRegAssignAndRewriteOptimized() {
+ // TPU specific rematerialization pass.
+ addPass(createTPURematerializePass());
+ TargetPassConfig::addRegAssignAndRewriteOptimized();
+
+ return true;
+}
+
+// Implemented by targets that want to run passes immediately before
+// machine code is emitted.
+void TPUPassConfig::addPreEmitPass() {
+ if (IsSparseCore) {
+ // On SparseCore, we're running pseudo expansion twice, because we may need
+ // to scavenge registers, and we can't do this in a single scan.
+ addPass(&ExpandPostRAPseudosID);
+ }
+ addPass(createTPUOverPredicatePass());
+ if (NeedOverlayerPasses) {
+ addPass(createTPUEnsureProgramEndHaltPass());
+ }
+ addPass(createTPUAddrCalcDelayPass());
+ addPass(createTPUBreakVResHoldPass());
+ addPass(createTPUHardwareTraceDebugPass());
+ addPass(createTPUBundlePackerPass());
+ addPass(createTPUFifoPseudoAllocPass());
+ addPass(createTPUUnderPredicatePass());
+ addPass(createRawHazardPass());
+ addPass(createTPUPostBundleLowerPseudosPass());
+ // This is intentionally before Nop coalescing.
+ addPass(createTPULoopAnalysisPass());
+ // Merge Nop instructions after fixing branch delay slots.
+ if (UseVDelayPass)
+ addPass(createTPUNopCoalescingPass());
+ if (IsSparseCore) {
+ addPass(createTPUPadFunctionsPass());
+ }
+#ifndef NDEBUG
+ addPass(createTPUMachineVerifierPass());
+#endif
+}
+
+// Run passes after prolog-epilog insertion and before the second instruction
+// scheduling pass.
+void TPUPassConfig::addPreSched2() { addPass(&IfConverterID); }
+
+void TPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
+ AAM.registerFunctionAnalysis<TPUAA>();
+}
+
+MachineFunctionInfo *TPUTargetMachine::createMachineFunctionInfo(
+ BumpPtrAllocator &Allocator, const Function &F,
+ const TargetSubtargetInfo *STI) const {
+ return TPUMachineFunctionInfo::create<TPUMachineFunctionInfo>(Allocator, F,
+ STI);
+}
+
+void TPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
+ PB.registerPipelineEarlySimplificationEPCallback(
+ [this](ModulePassManager &PM, OptimizationLevel Level) {
+ PM.addPass(TPUOptimizePreparePass(*this));
+ PM.addPass(AlwaysInlinerPass(false));
+ PM.addPass(GlobalDCEPass());
+ // Clean up pointer casts and redundant GEP before MemAllocPass
+ PM.addPass(createModuleToFunctionPassAdaptor(
+ SROAPass(llvm::SROAOptions::PreserveCFG)));
+ PM.addPass(createModuleToFunctionPassAdaptor(ADCEPass()));
+ PM.addPass(createModuleToFunctionPassAdaptor(InstCombinePass()));
+ PM.addPass(createModuleToFunctionPassAdaptor(DCEPass()));
+ PM.addPass(TPUMemAllocPass(*this));
+ PM.addPass(createModuleToFunctionPassAdaptor(
+ createFunctionToLoopPassAdaptor(TPULoopParallelPass())));
+ PM.addPass(createModuleToFunctionPassAdaptor(
+ TPUVerifierPass(*this, /* IsLateCodegen = */ false)));
+ });
+
+ PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) {
+ FAM.registerPass([&] { return TPUAA(); });
+ });
+ PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) {
+ if (AAName == "tpu-aa") {
+ AAM.registerFunctionAnalysis<TPUAA>();
+ return true;
+ }
+ return false;
+ });
+}
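+
+// Illustrative note (not from this CL): the registerParseAACallback above is
+// what lets a new-pass-manager driver request the TPU alias analysis by name,
+// e.g. something like `opt -aa-pipeline=tpu-aa ...` (exact driver spelling
+// assumed here).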
+
+const PseudoSourceValue *
+TPUTargetMachine::getFifoPushPSV(const TargetRegisterClass *RegClass) const {
+ std::pair<const TargetRegisterClass *, PseudoSourceValueKind> Key = {
+ RegClass, PSV_FifoRegPush};
+ if (TargetPSVs.count(Key))
+ return &TargetPSVs.at(Key);
+ TargetPSVs.emplace(
+ Key,
+ TargetPSV(PseudoSourceValue::TargetCustom + TargetPSVs.size(), *this));
+ return &TargetPSVs.at(Key);
+}
+
+const PseudoSourceValue *
+TPUTargetMachine::getFifoPopPSV(const TargetRegisterClass *RegClass) const {
+ std::pair<const TargetRegisterClass *, PseudoSourceValueKind> Key = {
+ RegClass, PSV_FifoRegPop};
+ if (TargetPSVs.count(Key))
+ return &TargetPSVs.at(Key);
+ TargetPSVs.emplace(
+ Key,
+ TargetPSV(PseudoSourceValue::TargetCustom + TargetPSVs.size(), *this));
+ return &TargetPSVs.at(Key);
+}
+
+const PseudoSourceValue *TPUTargetMachine::getDTPrologPSV() const {
+ return getPSV(PSV_DTProlog);
+}
+
+const PseudoSourceValue *TPUTargetMachine::getDTEpilogPSV() const {
+ return getPSV(PSV_DTEpilog);
+}
+
+const PseudoSourceValue *TPUTargetMachine::getDTHeaderPSV() const {
+ return getPSV(PSV_DTHeader);
+}
+
+const TPUTargetMachine::SWPTargetPSV *
+TPUTargetMachine::getSWPIterationPSV(int loop, int iteration) const {
+ return getSWPPSV(loop, iteration);
+}
+
+bool TPUTargetMachine::isDTPrologPSV(const PseudoSourceValue *PSV) const {
+ if (!PSV)
+ return false;
+ return PSV == getDTPrologPSV();
+}
+
+bool TPUTargetMachine::isDTEpilogPSV(const PseudoSourceValue *PSV) const {
+ if (!PSV)
+ return false;
+ return PSV == getDTEpilogPSV();
+}
+
+bool TPUTargetMachine::isDTHeaderPSV(const PseudoSourceValue *PSV) const {
+ if (!PSV)
+ return false;
+ return PSV == getDTHeaderPSV();
+}
+
+bool TPUTargetMachine::isSWPIterationPSV(const PseudoSourceValue *PSV) const {
+ if (!PSV)
+ return false;
+ return PSV->kind() >=
+ PseudoSourceValue::TargetCustom + PSV_SWPIteration_Custom;
+}
+
+const PseudoSourceValue *
+TPUTargetMachine::getPSV(PseudoSourceValueKind Kind) const {
+ std::pair<const TargetRegisterClass *, PseudoSourceValueKind> Key = {nullptr,
+ Kind};
+ if (TargetPSVs.count(Key))
+ return &TargetPSVs.at(Key);
+ TargetPSVs.emplace(
+ Key,
+ TargetPSV(PseudoSourceValue::TargetCustom + TargetPSVs.size(), *this));
+ return &TargetPSVs.at(Key);
+}
+
+TPUTargetMachine::SWPTargetPSV::SWPTargetPSV(unsigned Kind,
+ const TargetMachine &TM, int loop,
+ int iteration)
+ : PseudoSourceValue(Kind, TM), loop(loop), iteration(iteration) {}
+
+const TPUTargetMachine::SWPTargetPSV *
+TPUTargetMachine::getSWPPSV(int loop, int iteration) const {
+ // We currently define all custom memory operands >=
+ // PseudoSourceValue::TargetCustom + PSV_SWPIteration_Custom to be of type
+ // SWPTargetPSV. There is one new unique memory operand per instruction
+ // (potentially heavy lifting).
+ SWPTargetPSV *SWPPSV =
+ new SWPTargetPSV(PseudoSourceValue::TargetCustom +
+ PSV_SWPIteration_Custom + SWPTargetPSVs.size(),
+ *this, loop, iteration);
+ SWPTargetPSVs.push_back(SWPPSV);
+ return SWPPSV;
+}
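+
+// Note that, unlike getPSV()/getFifoPushPSV(), getSWPPSV() is not memoized:
+// every call creates a fresh SWPTargetPSV, so repeated calls with the same
+// (loop, iteration) pair return distinct memory operands.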
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUTargetMachine.h b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUTargetMachine.h
new file mode 100644
index 0000000..69e3634
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUTargetMachine.h
@@ -0,0 +1,237 @@
+//===-- TPUTargetMachine.h - Define TargetMachine for TPU --- C++ ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the TPU specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_TPU_TPUTARGETMACHINE_H
+#define LLVM_LIB_TARGET_TPU_TPUTARGETMACHINE_H
+
+#include "TPUISelLowering.h"
+#include "TPUInstrInfo.h"
+#include "TPUSelectionDAGInfo.h"
+#include "TPUSubtarget.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class formatted_raw_ostream;
+class TPUMachinePipelinerInfo;
+
+class TPUTargetMachine : public LLVMTargetMachine {
+ std::unique_ptr<TargetLoweringObjectFile> TLOF;
+ mutable StringMap<std::unique_ptr<TPUSubtarget>> SubtargetMap;
+
+public:
+ // A special software-pipelining memory operand, one per loop and per
+ // iteration, used for iteration-parallel loops.
+ class SWPTargetPSV : public PseudoSourceValue {
+ public:
+ explicit SWPTargetPSV(unsigned Kind, const TargetMachine &TM, int loop,
+ int iteration);
+ bool isConstant(const MachineFrameInfo *) const override { return false; }
+ bool isAliased(const MachineFrameInfo *) const override { return false; }
+ bool mayAlias(const MachineFrameInfo *) const override { return false; }
+ int getLoop() const { return loop; }
+ int getIteration() const { return iteration; }
+
+ private:
+ // A unique ID for the loop the pseudo memory corresponds to.
+ int loop;
+ // The represented iteration in a software pipelined loop.
+ int iteration;
+ };
+
+private:
+ // A PseudoSourceValue allows us to generate an artificial area of memory that
+ // Alias Analysis knows is not addressable by any other load or store. We use
+ // these to link sequences of FIFO pushes and pops; each push or pop is
+ // modelled as mayLoad+mayStore. Each push is assigned a specific
+ // PseudoSourceValue (per FIFO register) so all pushes interfere with each
+ // other. Similarly pops get a different PSV. Pushes don't interfere with
+ // Pops, which means chains of pushes can be reordered with respect to chains
+ // of pops.
+ class TargetPSV : public PseudoSourceValue {
+ public:
+ using PseudoSourceValue::PseudoSourceValue;
+ bool isConstant(const MachineFrameInfo *) const override { return false; }
+ bool isAliased(const MachineFrameInfo *) const override { return false; }
+ bool mayAlias(const MachineFrameInfo *) const override { return false; }
+ };
+
+public:
+ TPUTargetMachine(const Target &TheTarget, const Triple &TargetTriple,
+ StringRef Cpu, StringRef FeatureString,
+ const TargetOptions &Options,
+ std::optional<Reloc::Model> RelocationModel,
+ std::optional<CodeModel::Model> CodeModel,
+ CodeGenOpt::Level OptLevel, bool JIT);
+ ~TPUTargetMachine() override;
+
+ const TPUSubtarget *getSubtargetImpl(const llvm::Function &F) const override;
+
+ TargetTransformInfo getTargetTransformInfo(const Function &F) const override;
+
+ // Pass Pipeline Configuration
+ TargetPassConfig *createPassConfig(PassManagerBase &PassManager) override;
+
+ MachineFunctionInfo *
+ createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F,
+ const TargetSubtargetInfo *STI) const override;
+
+
+ // New pass manager callbacks.
+ void registerPassBuilderCallbacks(PassBuilder &PB) override;
+
+ // New pass manager AA analysis default.
+ void registerDefaultAliasAnalyses(AAManager &) override;
+
+ // Only used by PEI, and there is no harm in returning true. We don't
+ // callee-save if the TPU ABI is disabled.
+ bool usesPhysRegsForValues() const override { return true; }
+
+ TargetLoweringObjectFile *getObjFileLowering() const override {
+ return TLOF.get();
+ }
+
+ bool isMachineVerifierClean() const override {
+ return false;
+ }
+
+ // Return a PseudoSourceValue for a push to the given FIFO register class.
+ // This allows linking FIFO pushes without polluting the alias information
+ // for loads and stores (and pops!).
+ const PseudoSourceValue *
+ getFifoPushPSV(const TargetRegisterClass *RegClass) const;
+
+ // Return a PseudoSourceValue for a pop to the given FIFO register class.
+ // This allows linking FIFO pops without polluting the alias information
+ // for loads and stores (and pushes!).
+ const PseudoSourceValue *
+ getFifoPopPSV(const TargetRegisterClass *RegClass) const;
+
+ const PseudoSourceValue *
+ getFifoPSV(bool IsPush, const TargetRegisterClass *RegClass) const {
+ return IsPush ? getFifoPushPSV(RegClass) : getFifoPopPSV(RegClass);
+ }
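+
+ // Illustrative sketch (not code from this CL): a backend pass could attach
+ // one of these PSVs to a FIFO push so that only other pushes to the same
+ // FIFO register class alias with it:
+ //   const PseudoSourceValue *PSV = TM.getFifoPSV(/*IsPush=*/true, RC);
+ //   MachineMemOperand *MMO = MF.getMachineMemOperand(
+ //       MachinePointerInfo(PSV),
+ //       MachineMemOperand::MOLoad | MachineMemOperand::MOStore,
+ //       /*Size=*/4, Align(4));
+ //   MIB.addMemOperand(MMO);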
+
+ enum PseudoSourceValueKind {
+ PSV_FifoRegPush,
+ PSV_FifoRegPop,
+ PSV_BarnaCore_ConcatReg,
+ PSV_BarnaCore_ShiftReg,
+ PSV_BarnaCoreChannel_LoopEnd,
+ PSV_DTProlog,
+ PSV_DTEpilog,
+ PSV_DTHeader,
+ PSV_SWPIteration,
+ };
+
+ // A static offset on top of PseudoSourceValue::TargetCustom for all
+ // SWPTargetPSV memory operands.
+ static constexpr int PSV_SWPIteration_Custom = 128;
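+
+ // The custom PSV kind space is thus split in two: kinds below
+ // TargetCustom + 128 are assigned to the map-backed TargetPSVs (TargetCustom
+ // plus a running count), while kinds >= TargetCustom + 128 identify
+ // SWPTargetPSVs; this is what isSWPIterationPSV() checks.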
+
+ // Gets or creates a PseudoSourceValue for instructions in a prolog of a
+ // potential dovetail of a software pipelined loop. The semantics are
+ // different from the other PSVs: Dovetail prolog PSVs are ignored, except
+ // they are assumed to never intersect with dovetail epilog PSVs.
+ const PseudoSourceValue *getDTPrologPSV() const;
+
+ // Same as the dovetail prolog PSV, but for the epilog.
+ const PseudoSourceValue *getDTEpilogPSV() const;
+
+ // Same as the dovetail prolog PSV, but for the dovetail loop header.
+ const PseudoSourceValue *getDTHeaderPSV() const;
+
+ // Creates and returns a new memory location.
+ const SWPTargetPSV *getSWPIterationPSV(int loop, int iteration) const;
+
+ // Return true if PSV is a dovetailed prolog PSV.
+ bool isDTPrologPSV(const PseudoSourceValue *PSV) const;
+
+ // Return true if PSV is a dovetailed epilog PSV.
+ bool isDTEpilogPSV(const PseudoSourceValue *PSV) const;
+
+ // Return true if PSV is a dovetailed header PSV.
+ bool isDTHeaderPSV(const PseudoSourceValue *PSV) const;
+
+ // Returns true if PSV is a software pipelining, per loop, per iteration PSV.
+ bool isSWPIterationPSV(const PseudoSourceValue *PSV) const;
+
+ const PseudoSourceValue *getPSV(PseudoSourceValueKind Kind) const;
+ const SWPTargetPSV *getSWPPSV(int loop, int iteration) const;
+
+private:
+ // We use a std::map here so we get stable addresses and can return pointers.
+ mutable std::map<
+ std::pair<const TargetRegisterClass *, PseudoSourceValueKind>, TargetPSV>
+ TargetPSVs;
+ // A map for all software pipeliner target PSVs, per loop.
+ mutable std::vector<const SWPTargetPSV *> SWPTargetPSVs;
+
+ bool IsBarnaCore = false;
+ bool IsSparseCore = false;
+ bool IsTensorCore = false;
+};
+
+// TPU Code Generator Pass Configuration Options.
+class TPUPassConfig : public TargetPassConfig {
+public:
+ TPUPassConfig(TPUTargetMachine &TPUTM, PassManagerBase *PassManager,
+ bool IsBarnaCore, bool IsSparseCore, bool IsTensorCore);
+
+ TPUTargetMachine &getTPUTargetMachine() const;
+
+ bool addPreISel() override;
+ bool addInstSelector() override;
+ void addPreSched2() override;
+ void addPreEmitPass() override;
+ void addMachineSSAOptimization() override;
+ void addPreRegAlloc() override;
+ bool addRegAssignAndRewriteOptimized() override;
+ bool addILPOpts() override;
+ void addIRPasses() override;
+ void addOptimizedRegAlloc() override;
+ void addPostRegAlloc() override;
+ void addSoftwarePipeliningPasses(TPUMachinePipelinerInfo *ExtMPI,
+ const Twine &DbgStr = "");
+ void addSoftwarePipeliningAnalysisPass(TPUMachinePipelinerInfo *ExtMPI);
+
+ ScheduleDAGInstrs *
+ createMachineScheduler(MachineSchedContext *C) const override;
+
+ ScheduleDAGInstrs *
+ createPostMachineScheduler(MachineSchedContext *C) const override;
+
+ enum class SchedulerMode {
+ Swing,
+ SwingResource,
+ CriticalPath,
+ OriginalOrder,
+ };
+
+ SchedulerMode getSchedulingMode() const;
+
+ // Returns whether software pipelining is enabled.
+ bool isSoftwarePipeliningEnabled();
+
+ // Returns whether we're running a software pipelining super pass
+ // (experimental).
+ bool isSoftwarePipelinerSuperPassEnabled();
+
+private:
+ bool IsBarnaCore = false;
+ bool IsSparseCore = false;
+ bool IsTensorCore = false;
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_TPU_TPUTARGETMACHINE_H
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUVerifier.cpp b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUVerifier.cpp
new file mode 100644
index 0000000..806e84b
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUVerifier.cpp
@@ -0,0 +1,642 @@
+//===----------------- TPUVerifier.cpp - Verify -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// TPUVerifier validates that the module doesn't contain any unsupported
+// functionality.
+//
+//===----------------------------------------------------------------------===//
+#include "TPU.h"
+#include "TPUSubtarget.h"
+#include "TPUTargetMachine.h"
+#include "third_party/llvm/llvm/lib/Target/GoogleTPU/TPUSubtarget.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsTPU.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+
+#define DEBUG_TYPE "tpu-verifier"
+using namespace llvm;
+
+cl::opt<bool>
+ TPUFatalVerifierError("tpu-fatal-verifier-error", cl::Hidden,
+ cl::init(true),
+ cl::desc("Make TPU verifier errors fatal."));
+
+cl::opt<bool>
+ TPUVerifierStrictIntoPtr("tpu-strict-inttoptr", cl::Hidden, cl::init(false),
+ cl::desc("Make the TPU verifier fail if LLVM's "
+ "inttoptr instruction in the code."));
+extern cl::opt<bool> SpillDebugEnabled;
+
+namespace {
+class TPUVerifier : public FunctionPass {
+public:
+ static char ID;
+ TPUVerifier() : FunctionPass(ID) {}
+ TPUVerifier(TPUTargetMachine *TM, bool IsLateCodegen)
+ : FunctionPass(ID), TM(TM), IsLateCodegen(IsLateCodegen) {}
+
+ bool runOnFunction(Function &F) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+
+ StringRef getPassName() const override { return "TPU verifier"; }
+
+private:
+ TPUTargetMachine *TM;
+ bool IsLateCodegen = false;
+};
+char TPUVerifier::ID = 0;
+} // namespace
+
+INITIALIZE_PASS(TPUVerifier, DEBUG_TYPE, "TPU verifier", false, false)
+
+Pass *llvm::createTPUVerifierPass(TPUTargetMachine *TM, bool IsLateCodegen) {
+ return new TPUVerifier(TM, IsLateCodegen);
+}
+
+static bool isIntrinsicMaskValid(IntrinsicInst *Intr, const TPUSubtarget &ST) {
+ if (!ST.isSparseCore())
+ return true;
+ switch (Intr->getIntrinsicID()) {
+ case Intrinsic::tpu_vld_msk:
+ case Intrinsic::tpu_vld_msk_strided:
+ case Intrinsic::tpu_vld_msk_idx_strided:
+ case Intrinsic::tpu_vld_cb_msk:
+ case Intrinsic::tpu_vld_cb_msk_strided:
+ case Intrinsic::tpu_vld_cb_upd_msk:
+ case Intrinsic::tpu_vld_cb_upd_msk_strided:
+ case Intrinsic::tpu_vld_msk_idx:
+ case Intrinsic::tpu_vld_msk_idx_np:
+ case Intrinsic::tpu_vld_cb_msk_idx:
+ case Intrinsic::tpu_vld_cb_msk_idx_np:
+ case Intrinsic::tpu_vst_msk:
+ case Intrinsic::tpu_vst_msk_strided:
+ case Intrinsic::tpu_vst_msk_idx_strided:
+ case Intrinsic::tpu_vst_msk_add_strided:
+ case Intrinsic::tpu_vst_msk_add:
+ case Intrinsic::tpu_vst_msk_idx:
+ case Intrinsic::tpu_vst_msk_idx_np:
+ case Intrinsic::tpu_vst_msk_idx_add:
+ case Intrinsic::tpu_vst_msk_idx_ret_add_np:
+ case Intrinsic::tpu_vst_msk_idx_add_np:
+ case Intrinsic::tpu_vst_cb_msk_idx_add:
+ case Intrinsic::tpu_vst_cb_msk_idx_add_np:
+ case Intrinsic::tpu_vst_cb_msk:
+ case Intrinsic::tpu_vst_cb_upd_msk:
+ case Intrinsic::tpu_vst_cb_msk_strided:
+ case Intrinsic::tpu_vst_cb_upd_msk_strided:
+ case Intrinsic::tpu_vst_cb_msk_add:
+ case Intrinsic::tpu_vst_cb_upd_msk_add:
+ case Intrinsic::tpu_vst_cb_msk_add_strided:
+ case Intrinsic::tpu_vst_cb_upd_msk_add_strided:
+ case Intrinsic::tpu_vst_cb_msk_idx:
+ if (cast<VectorType>(Intr->getOperand(0)->getType())->getElementType() !=
+ Type::getInt1Ty(Intr->getContext()))
+ return false;
+ if (ST.hasV16()) {
+ if (cast<VectorType>(Intr->getOperand(0)->getType())
+ ->getElementCount()
+ .getKnownMinValue() == 8)
+ return false;
+ }
+ break;
+ default:
+ break;
+ }
+ return true;
+}
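+
+// For example, on a SparseCore subtarget with 16-wide SIMD (hasV16), a
+// tpu_vst_msk whose mask operand (operand 0) has type <8 x i1> is rejected,
+// while a <16 x i1> mask passes; a mask whose element type is not i1 is
+// rejected regardless of width.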
+
+static bool isIntrinsicSupported(IntrinsicInst *Intr, const TPUSubtarget &ST,
+ bool IsLateCodegen) {
+ switch (Intr->getIntrinsicID()) {
+ case Intrinsic::memmove:
+ // We generally don't allow these in opt at present. llvm.memcpy can slip
+ // in from clang even with -mno-memops (copy constructor?). We explicitly
+ // allow memset via MLO.
+ return false;
+ // TensorCore Only
+ case Intrinsic::tpu_tc_transpose:
+ case Intrinsic::tpu_tc_transpose_end:
+ return ST.hasJfcTensorCore() || ST.hasDfcTensorCore() ||
+ ST.hasPfcTensorCore() || ST.hasVfcTensorCore();
+
+ // TensorCore, PF and newer
+ case Intrinsic::tpu_tc_transpose_packed:
+ case Intrinsic::tpu_tc_transpose_segmented_packed:
+ case Intrinsic::tpu_tc_transpose_end_packed:
+ case Intrinsic::tpu_tc_transpose_end_segmented_packed:
+ case Intrinsic::tpu_tc_transpose_segmented:
+ case Intrinsic::tpu_tc_transpose_end_segmented:
+ return ST.hasPfcTensorCore() || ST.hasVfcTensorCore();
+
+ // ViperFish and newer
+ case Intrinsic::tpu_waitle_yieldable:
+ return ST.hasVfcTensorCore();
+
+ // General DMA operations TensorCore
+ case Intrinsic::tpu_dma_hbm_to_smem_general:
+ case Intrinsic::tpu_dma_hbm_to_vmem_general:
+ case Intrinsic::tpu_dma_smem_to_hbm_general:
+ case Intrinsic::tpu_dma_vmem_to_hbm_general:
+ return ST.hasJfcTensorCore() || ST.hasDfcTensorCore() ||
+ ST.hasPfcTensorCore() || ST.hasVfcTensorCore();
+
+ // General DMA operations SparseCore
+ case Intrinsic::tpu_dma_hbm_to_hbm_sc_general:
+ case Intrinsic::tpu_dma_smem_to_smem_sc_general:
+ return ST.isSparseCore();
+
+ // Stream intrinsics
+ // TODO(b/186857259): consider *all* stream intrinsics
+ case Intrinsic::tpu_stream_linear_gather_spmem_to_smem:
+ case Intrinsic::tpu_stream_linear_gather_tilespmem_tileN_to_smem:
+ case Intrinsic::tpu_stream_indirect_gather_spmem_to_smem:
+ case Intrinsic::tpu_stream_indirect_gather_tilespmem_tileN_to_smem:
+ case Intrinsic::tpu_stream_strided_gather_spmem_to_smem:
+ case Intrinsic::tpu_stream_strided_gather_tilespmem_tileN_to_smem:
+ case Intrinsic::tpu_stream_linear_scatter_smem_to_spmem:
+ case Intrinsic::tpu_stream_linear_scatter_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_scatter_smem_to_spmem:
+ case Intrinsic::tpu_stream_indirect_scatter_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_strided_scatter_smem_to_spmem:
+ case Intrinsic::tpu_stream_strided_scatter_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_linear_scatter_add_s32_smem_to_spmem:
+ case Intrinsic::tpu_stream_linear_scatter_add_s32_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_scatter_add_s32_smem_to_spmem:
+ case Intrinsic::tpu_stream_indirect_scatter_add_s32_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_strided_scatter_add_s32_smem_to_spmem:
+ case Intrinsic::tpu_stream_strided_scatter_add_s32_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_linear_scatter_add_f32_smem_to_spmem:
+ case Intrinsic::tpu_stream_linear_scatter_add_f32_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_scatter_add_f32_smem_to_spmem:
+ case Intrinsic::tpu_stream_indirect_scatter_add_f32_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_strided_scatter_add_f32_smem_to_spmem:
+ case Intrinsic::tpu_stream_strided_scatter_add_f32_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_waiteqordone:
+ case Intrinsic::tpu_waitneordone:
+ case Intrinsic::tpu_waitgtordone:
+ case Intrinsic::tpu_waitgeordone:
+ case Intrinsic::tpu_waitltordone:
+ return ST.isSparseCore();
+
+ // SparseCore TEC and TAC only
+ case Intrinsic::tpu_stream_linear_gather_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_spmem_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_spmem_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_spmem_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_add_s32_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_add_s32_spmem_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_add_s32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_add_s32_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_add_s32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_gather_add_s32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_add_s32_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_add_s32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_strided_gather_add_s32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_add_f32_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_add_f32_spmem_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_gather_add_f32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_add_f32_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_gather_add_f32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_gather_add_f32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_add_f32_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_strided_gather_add_f32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_strided_gather_add_f32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_linear_scatter_tilespmem_to_hbm:
+ case Intrinsic::tpu_stream_linear_scatter_tilespmem_to_spmem:
+ case Intrinsic::tpu_stream_linear_scatter_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_scatter_tilespmem_to_hbm:
+ case Intrinsic::tpu_stream_indirect_scatter_tilespmem_to_spmem:
+ case Intrinsic::tpu_stream_indirect_scatter_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_strided_scatter_tilespmem_to_hbm:
+ case Intrinsic::tpu_stream_strided_scatter_tilespmem_to_spmem:
+ case Intrinsic::tpu_stream_strided_scatter_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_linear_scatter_add_s32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_linear_scatter_add_s32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_scatter_add_s32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_scatter_add_s32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_strided_scatter_add_s32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_strided_scatter_add_s32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_linear_scatter_add_f32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_linear_scatter_add_f32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_scatter_add_f32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_scatter_add_f32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_strided_scatter_add_f32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_strided_scatter_add_f32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_allocate_sflag_other:
+ case Intrinsic::tpu_syncset_both:
+ case Intrinsic::tpu_syncset_both_done:
+ case Intrinsic::tpu_syncset_other_done:
+ case Intrinsic::tpu_syncadd_both:
+ case Intrinsic::tpu_syncadd_other:
+ return ST.isSparseCoreTac() || ST.isSparseCoreTec();
+
+ // SparseCore TEC
+ case Intrinsic::tpu_stream_indirect_vreg_gather_spmem_to_smem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_tilespmem_tileN_to_smem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_spmem_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_add_s32_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_add_s32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_gather_add_s32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_add_f32_hbm_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_vreg_gather_add_f32_spmem_to_tilespmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_gather_add_f32_tilespmem_tileN_to_tilespmem:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_smem_to_spmem:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_tilespmem_to_hbm:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_tilespmem_to_spmem:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_add_s32_smem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_scatter_add_s32_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_add_s32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_scatter_add_s32_tilespmem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_add_f32_smem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_scatter_add_f32_smem_to_tilespmem_tileN:
+ case Intrinsic::tpu_stream_indirect_vreg_scatter_add_f32_tilespmem_to_spmem:
+ case Intrinsic::
+ tpu_stream_indirect_vreg_scatter_add_f32_tilespmem_to_tilespmem_tileN:
+ return ST.isSparseCoreTec();
+
+ // SparseCore SCS
+ case Intrinsic::tpu_waiteqordone_yieldable:
+ case Intrinsic::tpu_waitneordone_yieldable:
+ case Intrinsic::tpu_waitgtordone_yieldable:
+ case Intrinsic::tpu_waitgeordone_yieldable:
+ case Intrinsic::tpu_waitltordone_yieldable:
+ return ST.isSparseCoreScs();
+
+ // ViperFish TensorCore and SparseCore SCS
+ case Intrinsic::tpu_waiteq_yieldable:
+ case Intrinsic::tpu_waitne_yieldable:
+ case Intrinsic::tpu_waitgt_yieldable:
+ case Intrinsic::tpu_waitge_yieldable:
+ case Intrinsic::tpu_waitlt_yieldable:
+ case Intrinsic::tpu_waitdone_yieldable:
+ case Intrinsic::tpu_waitnotdone_yieldable:
+ return ST.isSparseCoreScs() || ST.hasVfcTensorCore();
+
+ // We enforce using the _macro intrinsic versions on SparseCore.
+ case Intrinsic::tpu_rsqrt:
+ case Intrinsic::tpu_pow2:
+ case Intrinsic::tpu_log2:
+ case Intrinsic::tpu_tanh:
+ case Intrinsic::tpu_rcp:
+ case Intrinsic::tpu_sigshft:
+ case Intrinsic::tpu_eup_pop:
+ case Intrinsic::tpu_eup_push:
+ // On SparseCore, do not emit these during codegen prepare; lower them
+ // during isel instead.
+ return !ST.isSparseCore();
+
+ case Intrinsic::tpu_spill_debug:
+ return SpillDebugEnabled;
+
+ default:
+ LLVM_DEBUG({
+ dbgs() << "Intrinsic not explicitly checked in verifier\n";
+ Intr->print(dbgs());
+ dbgs() << "\n";
+ });
+ // Not necessarily supported, but not explicitly checked here. May instead
+ // be disallowed during instruction selection via Predicates defined in
+ // *InstrInfo.td
+ return true;
+ }
+}
+
+static bool report_error(StringRef ErrorMsg, Value *V = nullptr) {
+ if (TPUFatalVerifierError) {
+ report_fatal_error(ErrorMsg);
+ }
+ dbgs() << ErrorMsg << "\n";
+ if (V) {
+ V->print(dbgs());
+ dbgs() << "\n";
+ }
+ return true;
+}
+
+static bool validateSimdSize(Function &F, const TPUSubtarget &ST) {
+ auto ValidateSimdType = [&](const Type *VTy) {
+ if (!ST.hasV16())
+ return true;
+ Type *SVITy = VectorType::get(Type::getInt32Ty(F.getContext()), 8,
+ /*Scalable=*/false);
+ Type *SVFTy = VectorType::get(Type::getFloatTy(F.getContext()), 8,
+ /*Scalable=*/false);
+ if (VTy == SVITy)
+ return false;
+ if (VTy == SVFTy)
+ return false;
+ if (isa<PointerType>(VTy) && !cast<PointerType>(VTy)->isOpaque()) {
+ if (VTy == SVITy->getPointerTo(TPUAS_TileSpmem))
+ return false;
+ if (VTy == SVFTy->getPointerTo(TPUAS_TileSpmem))
+ return false;
+ }
+ return true;
+ };
+ for (BasicBlock &B : F) {
+ for (Instruction &I : B) {
+ if (!ValidateSimdType(I.getType()))
+ return report_error("Unsupported SIMD type on this architecture.");
+ for (Use &U : I.operands()) {
+ if (!ValidateSimdType(U->getType()))
+ return report_error(
+ "Unsupported SIMD operand type on this architecture.");
+ }
+ }
+ }
+ return true;
+}
+
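+// Verifies a single function against the selected subtarget: SIMD widths,
+// intrinsic availability and operand constraints, and the address spaces used
+// by loads, stores and casts. IsLateCodegen enables the stricter
+// post-lowering expectations (e.g. folded dreg pointers for general DMAs).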
+static bool runImpl(Function &F, TPUTargetMachine *TM, bool IsLateCodegen) {
+ // SparseCore's core types (go/vfc-sc#core-type).
+ enum class StreamCoreType {
+ TEC_or_SCS = 0,
+ TAC = 1,
+ };
+ const TPUSubtarget &ST = TM->getSubtarget<TPUSubtarget>(F);
+ // Checking for valid subtarget.
+ assert(ST.getMemSize(TPUAS_Smem) != 0);
+  // We don't want to mix 8xSIMD and 16xSIMD, to avoid undetected bugs, since
+  // the selection DAG will try to widen vectors if it can.
+ if (ST.isSparseCore()) {
+ if (!validateSimdSize(F, ST))
+ return false;
+ }
+ for (auto &I : instructions(F)) {
+ unsigned TransposeWidthOp = 1;
+ unsigned TransposeHeightOp = 2;
+ if (IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(&I)) {
+ if (!isIntrinsicSupported(Intr, ST, IsLateCodegen)) {
+ return report_error("Intrinsic not supported on this subtarget.", &I);
+ }
+ if (!isIntrinsicMaskValid(Intr, ST)) {
+ return report_error("Intrinsic mask operand error.", &I);
+ }
+ switch (Intr->getIntrinsicID()) {
+ case llvm::Intrinsic::tpu_tc_transpose_packed:
+ case llvm::Intrinsic::tpu_tc_transpose_segmented_packed:
+ case llvm::Intrinsic::tpu_tc_transpose_end_packed:
+ case llvm::Intrinsic::tpu_tc_transpose_end_segmented_packed:
+ TransposeWidthOp = 2;
+ TransposeHeightOp = 3;
+ LLVM_FALLTHROUGH;
+ case llvm::Intrinsic::tpu_tc_transpose:
+ case llvm::Intrinsic::tpu_tc_transpose_segmented:
+ case llvm::Intrinsic::tpu_tc_transpose_end:
+ case llvm::Intrinsic::tpu_tc_transpose_end_segmented: {
+ ConstantInt *Width =
+ dyn_cast<ConstantInt>(Intr->getOperand(TransposeWidthOp));
+ ConstantInt *Height =
+ dyn_cast<ConstantInt>(Intr->getOperand(TransposeHeightOp));
+ if (Width == nullptr || Height == nullptr)
+ return report_error("Non constant size transpose unsupported.", &I);
+ if (Width->getZExtValue() % 8 != 0 || Width->getZExtValue() > 128 ||
+ Height->getZExtValue() % 8 != 0 || Height->getZExtValue() > 128)
+ return report_error("Invalid transpose height or width.", &I);
+ if ((ST.hasJfcTensorCore() || ST.hasDfcTensorCore()) &&
+ (Width->getZExtValue() != 128 || Height->getZExtValue() != 128))
+ return report_error("Short or narrow transpose not supported.", &I);
+ ConstantInt *Bus = dyn_cast<ConstantInt>(Intr->getOperand(3));
+ if (Bus == nullptr)
+ return report_error("Non constant XLU Bus.", &I);
+ if ((ST.hasJfcTensorCore() || ST.hasDfcTensorCore()) &&
+ Bus->getZExtValue() != 0)
+ return report_error("Invalid XLU Bus used.", &I);
+ } break;
+ case llvm::Intrinsic::tpu_stream_linear_gather_spmem_to_smem:
+ case llvm::Intrinsic::tpu_stream_linear_gather_tilespmem_tileN_to_smem:
+ case llvm::Intrinsic::tpu_stream_indirect_gather_spmem_to_smem:
+ case llvm::Intrinsic::tpu_stream_indirect_gather_tilespmem_tileN_to_smem:
+ case llvm::Intrinsic::tpu_stream_strided_gather_spmem_to_smem:
+ case llvm::Intrinsic::tpu_stream_strided_gather_tilespmem_tileN_to_smem:
+ case llvm::Intrinsic::tpu_stream_linear_scatter_smem_to_spmem:
+ case llvm::Intrinsic::tpu_stream_linear_scatter_smem_to_tilespmem_tileN:
+ case llvm::Intrinsic::tpu_stream_indirect_scatter_smem_to_spmem:
+ case llvm::Intrinsic::tpu_stream_indirect_scatter_smem_to_tilespmem_tileN:
+ case llvm::Intrinsic::tpu_stream_strided_scatter_smem_to_spmem:
+ case llvm::Intrinsic::tpu_stream_strided_scatter_smem_to_tilespmem_tileN:
+ case llvm::Intrinsic::tpu_stream_linear_scatter_add_s32_smem_to_spmem:
+ case llvm::Intrinsic::
+ tpu_stream_linear_scatter_add_s32_smem_to_tilespmem_tileN:
+ case llvm::Intrinsic::tpu_stream_indirect_scatter_add_s32_smem_to_spmem:
+ case llvm::Intrinsic::
+ tpu_stream_indirect_scatter_add_s32_smem_to_tilespmem_tileN:
+ case llvm::Intrinsic::tpu_stream_strided_scatter_add_s32_smem_to_spmem:
+ case llvm::Intrinsic::
+ tpu_stream_strided_scatter_add_s32_smem_to_tilespmem_tileN:
+ case llvm::Intrinsic::tpu_stream_linear_scatter_add_f32_smem_to_spmem:
+ case llvm::Intrinsic::
+ tpu_stream_linear_scatter_add_f32_smem_to_tilespmem_tileN:
+ case llvm::Intrinsic::tpu_stream_indirect_scatter_add_f32_smem_to_spmem:
+ case llvm::Intrinsic::
+ tpu_stream_indirect_scatter_add_f32_smem_to_tilespmem_tileN:
+ case llvm::Intrinsic::tpu_stream_strided_scatter_add_f32_smem_to_spmem:
+ case llvm::Intrinsic::
+ tpu_stream_strided_scatter_add_f32_smem_to_tilespmem_tileN: {
+ unsigned StreamControlOp = 1;
+ ConstantInt *Control =
+ dyn_cast<ConstantInt>(Intr->getOperand(StreamControlOp));
+ if (Control == nullptr) {
+ // Non-constant stream_control override is allowed and is handled in
+ // codegen prepare.
+ return true;
+ }
+ // Hardware raises a fatal error if the value of the sync_flag_core_type
+ // field is 1 and the stream transfer is initiated by SCS.
+ if (ST.isSparseCoreScs() && StreamCoreType(Control->getZExtValue() &
+ 0x1) == StreamCoreType::TAC)
+ return report_error("Invalid sync flag core type.", &I);
+ } break;
+ case llvm::Intrinsic::ctlz: {
+ ConstantInt *Control = dyn_cast<ConstantInt>(Intr->getOperand(1));
+ if (Control->getZExtValue() == 1)
+ return report_error("TPU defines clz zero as 32.");
+ } break;
+ case Intrinsic::tpu_dma_hbm_to_hbm_sc_simple:
+ case Intrinsic::tpu_dma_hbm_to_simem_sc_simple:
+ case Intrinsic::tpu_dma_hbm_to_smem_sc_simple:
+ case Intrinsic::tpu_dma_hbm_to_spmem_sc_simple:
+ case Intrinsic::tpu_dma_hbm_to_tilespmem_sc_simple:
+ case Intrinsic::tpu_dma_hbm_to_timem_sc_simple:
+ case Intrinsic::tpu_dma_smem_to_hbm_sc_simple:
+ case Intrinsic::tpu_dma_smem_to_smem_sc_simple:
+ case Intrinsic::tpu_dma_spmem_to_hbm_sc_simple:
+ case Intrinsic::tpu_dma_spmem_to_spmem_sc_simple:
+ case Intrinsic::tpu_dma_spmem_to_tilespmem_sc_simple:
+ case Intrinsic::tpu_dma_tilespmem_to_hbm_sc_simple:
+ case Intrinsic::tpu_dma_tilespmem_to_spmem_sc_simple:
+ case Intrinsic::tpu_dma_timem_to_hbm_sc_simple:
+ case Intrinsic::tpu_dma_hbm_to_iova_sc_simple:
+ case Intrinsic::tpu_dma_iova_to_hbm_sc_simple: {
+ Value *TraceEnableOpnd;
+ if (Intr->getIntrinsicID() ==
+ Intrinsic::tpu_dma_hbm_to_iova_sc_simple ||
+ Intr->getIntrinsicID() ==
+ Intrinsic::tpu_dma_iova_to_hbm_sc_simple) {
+ TraceEnableOpnd = Intr->getOperand(5);
+ } else {
+ TraceEnableOpnd = Intr->getOperand(4);
+ }
+ if (!isa<ConstantInt>(TraceEnableOpnd))
+ return report_error("trace_en field must be an immediate");
+ unsigned TraceEn = cast<ConstantInt>(TraceEnableOpnd)->getZExtValue();
+ if (TraceEn != 0 && TraceEn != 1)
+ return report_error("trace_en field must be zero or one, got " +
+ std::to_string(TraceEn));
+ break;
+ }
+ case Intrinsic::tpu_dma_hbm_to_hbm_sc_general:
+ case Intrinsic::tpu_dma_smem_to_smem_sc_general: {
+ Value *DregV = Intr->getOperand(7);
+ if (IsLateCodegen) {
+          // We take advantage of LLVM already classifying null pointers when
+          // checking that the DMA's dreg pointer points to null.
+ if (!isa<ConstantPointerNull>(DregV))
+ return report_error(
+ "Late verifier expects dreg pointer that points to null.", &I);
+ Value *SrcCoreIdV = Intr->getOperand(6);
+ Value *DstCoreIdV = Intr->getOperand(1);
+ if (!isa<ConstantInt>(SrcCoreIdV) || !isa<ConstantInt>(DstCoreIdV) ||
+ !cast<ConstantInt>(SrcCoreIdV)->isZero() ||
+ !cast<ConstantInt>(DstCoreIdV)->isZero())
+ return report_error(
+ "Late verifier expects src and dst sflag core id null.", &I);
+ } else {
+ if (isa<ConstantPointerNull>(DregV))
+ return report_error(
+ "Early verifier expects unfolded dreg pointer value.", &I);
+ }
+ } break;
+ default:
+ break;
+ }
+ } else if (StoreInst *Store = dyn_cast<StoreInst>(&I)) {
+ // If the address space isn't known to the subtarget at all, this might
+ // assert.
+ if (ST.getMemSize(Store->getPointerAddressSpace()) == 0)
+ report_error("Unsupported address space on this processor.");
+ } else if (LoadInst *Load = dyn_cast<LoadInst>(&I)) {
+ // If the address space isn't known to the subtarget at all, this might
+ // assert.
+ if (ST.getMemSize(Load->getPointerAddressSpace()) == 0)
+ report_error("Unsupported address space on this processor.");
+ if (Load->getType()->isVectorTy()) {
+        // We've added these checks to prevent bugs due to vector/scalar
+        // combine logic. We may relax them in the future. We explicitly
+        // allow vectors of 2 for a fragile clang emulation of cbr.
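+        // Allowed element counts: 8 on v8 subtargets (or 32/16 with LPVF),
+        // 16 on v16 subtargets (or 64/32 with LPVF), 1024 on v1024
+        // subtargets; 2 is always accepted for the cbr emulation.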
+ if (ST.hasV8() &&
+ cast<VectorType>(Load->getType())
+ ->getElementCount()
+ .getKnownMinValue() != 8 &&
+ cast<VectorType>(Load->getType())
+ ->getElementCount()
+ .getKnownMinValue() != 2) {
+ if (!ST.hasLPVF() || (ST.hasLPVF() &&
+ cast<VectorType>(Load->getType())
+ ->getElementCount()
+ .getKnownMinValue() != 32 &&
+ cast<VectorType>(Load->getType())
+ ->getElementCount()
+ .getKnownMinValue() != 16)) {
+ report_error("Vector load of vector != 8.");
+ }
+ } else if (ST.hasV16() &&
+ cast<VectorType>(Load->getType())
+ ->getElementCount()
+ .getKnownMinValue() != 16 &&
+ cast<VectorType>(Load->getType())
+ ->getElementCount()
+ .getKnownMinValue() != 2) {
+ if (!ST.hasLPVF() || (ST.hasLPVF() &&
+ cast<VectorType>(Load->getType())
+ ->getElementCount()
+ .getKnownMinValue() != 64 &&
+ cast<VectorType>(Load->getType())
+ ->getElementCount()
+ .getKnownMinValue() != 32)) {
+ report_error("Vector load of vector != 16.");
+ }
+ } else if (ST.hasV1024() &&
+ cast<VectorType>(Load->getType())
+ ->getElementCount()
+ .getKnownMinValue() != 1024 &&
+ cast<VectorType>(Load->getType())
+ ->getElementCount()
+ .getKnownMinValue() != 2) {
+ report_error("Vector load of vector != 1024.");
+ }
+ }
+ } else if (TPUVerifierStrictIntoPtr &&
+ I.getOpcode() == Instruction::IntToPtr) {
+ if (IsLateCodegen)
+ continue;
+ // On SparseCore, we enforce using tpu_inttoptr intrinsics more strictly.
+ if (ST.isSparseCore())
+ report_error("Use tpu_inttoptr intrinsics.");
+ } else if (AddrSpaceCastInst *ASCast = dyn_cast<AddrSpaceCastInst>(&I)) {
+ if (IsLateCodegen)
+ continue;
+ if (ST.isSparseCore())
+ report_error("Use tpu_addrspacecast intrinsics.");
+ // Make sure only TPU-known address spaces are in the code.
+ if (ST.getMemSize(ASCast->getSrcAddressSpace()) == 0)
+ report_error("Unsupported address space on this processor.");
+ if (ST.getMemSize(ASCast->getDestAddressSpace()) == 0)
+ report_error("Unsupported address space on this processor.");
+ }
+ }
+ return true;
+}
+
+PreservedAnalyses TPUVerifierPass::run(Function &F,
+ FunctionAnalysisManager &AF) {
+ runImpl(F, &TM, IsLateCodegen);
+ return PreservedAnalyses::all();
+}
+
+bool TPUVerifier::runOnFunction(Function &F) {
+ return runImpl(F, TM, IsLateCodegen);
+}
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUXLUOptimizations.cpp b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUXLUOptimizations.cpp
new file mode 100644
index 0000000..b55977b
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/TPUXLUOptimizations.cpp
@@ -0,0 +1,642 @@
+//===-- TPUXLUOptimizations.cpp - Optimizations over XLU ops ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Optimizations over XLU operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TPU.h"
+#include "TPUAliasSetTracker.h"
+#include "TPUIRUtils.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsTPU.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Verifier.h"
+
+#include <algorithm>
+
+#define DEBUG_TYPE "tpu-xlu-opt"
+
+using namespace llvm;
+using namespace llvm::TPU;
+
+namespace {
+
+cl::opt<bool> TPUXLUOptsPrintDeps(
+ "tpu-xlu-opts-print-deps", cl::init(false),
+ cl::desc(
+ "When debug is enabled print XLU dependencies between instructions."));
+
+using DependenciesSet = SparseBitVector<>;
+
+// Custom alias set that tracks dependencies of XLU operations per alias set.
+class TPUAliasSetWithDep : public TPUAliasSet {
+ DependenciesSet Deps;
+
+public:
+ DependenciesSet &getDeps() { return Deps; };
+ void merge(TPUAliasSet &&Other) override;
+};
+
+void TPUAliasSetWithDep::merge(TPUAliasSet &&Other) {
+ Deps |= static_cast<TPUAliasSetWithDep &&>(Other).Deps;
+ TPUAliasSet::merge(std::move(Other));
+}
+
+using TPUAliasSetTrackerWithDep = TPUAliasSetTracker<TPUAliasSetWithDep>;
+
+class XLUDepGraph;
+
+// Node representing an XLU operation the algorithm can operate on.
+// TODO(maggioni): In the first implementation I was tracking predecessors,
+// but the current algorithm is not using them, so I removed them for the time
+// being to make the Node class lighter.
+class Node {
+ enum class NodeType {
+ Transpose,
+ Rotate,
+ };
+
+public:
+ using NodeIdx = unsigned;
+ friend class XLUDepGraph;
+ ArrayRef<Instruction *> getPushes() const { return PushSequence; }
+ ArrayRef<Instruction *> getPops() const { return ReturnSequence; }
+ ArrayRef<NodeIdx> getSuccs() const { return Succs; }
+ unsigned getId() const { return Id; }
+ bool canMerge() const { return CanMerge; }
+ // Check if two nodes are compatible for merging.
+ bool canMergeWith(const Node &N) const;
+ NodeType getNodeType() const { return Type; }
+
+ Node(NodeType Ty, unsigned Id) : Type(Ty), Id(Id), CanMerge(true) {}
+
+private:
+ std::vector<Instruction *> PushSequence;
+ std::vector<Instruction *> ReturnSequence;
+ SmallVector<NodeIdx, 2> Succs;
+ NodeType Type;
+ unsigned Id;
+ bool CanMerge;
+};
+
+bool Node::canMergeWith(const Node &N) const {
+ // Nodes need to be mergeable
+ if (!canMerge() || !N.canMerge())
+ return false;
+ // Nodes need to be of the same type.
+ if (getNodeType() != N.getNodeType())
+ return false;
+ switch (getNodeType()) {
+ case Node::NodeType::Rotate: {
+    // Rotate nodes need to have the same operands, other than the value
+    // being rotated.
+ assert(getPushes().size() == 1 && N.getPushes().size() == 1);
+ if (TPU::getRotateAmount(*getPushes()[0]) !=
+ TPU::getRotateAmount(*N.getPushes()[0]))
+ return false;
+ if (TPU::getRotateBusIdx(*getPushes()[0]) !=
+ TPU::getRotateBusIdx(*N.getPushes()[0]))
+ return false;
+ break;
+ }
+ default:
+ break;
+ }
+  // If the two nodes have a dependency between them, bail.
+ return std::find(Succs.begin(), Succs.end(), N.getId()) == Succs.end();
+}
+
+// Tracks, for each instruction, the XLU nodes it depends on.
+class DependenciesTracker {
+ // Map tracking Dependencies of XLU nodes for every instruction.
+ DenseMap<const Instruction *, DependenciesSet> DepsPerInstr;
+
+public:
+ const DependenciesSet *getInstrDeps(const Instruction *I) const {
+ auto It = DepsPerInstr.find(I);
+ if (It != DepsPerInstr.end())
+ return &It->second;
+ return nullptr;
+ }
+ void addDepsForInstr(const Instruction *I, const DependenciesSet &DepSet) {
+ auto &DepsI = DepsPerInstr[I];
+ DepsI |= DepSet;
+ }
+ void trackDeps(const Instruction *I, const XLUDepGraph &Graph,
+ TPUAliasSetTrackerWithDep &AliasTracker);
+ void clear() { DepsPerInstr.clear(); }
+};
+
+class XLUDepGraph {
+ using NodeVector = std::vector<Node>;
+
+public:
+ using Iterator = NodeVector::iterator;
+
+private:
+ NodeVector Nodes;
+ DenseMap<const Instruction *, Node::NodeIdx> InstrSequenceId;
+ DependenciesTracker DepTracker;
+
+public:
+ Iterator begin() { return Nodes.begin(); }
+ Iterator end() { return Nodes.end(); }
+ void buildGraph(BasicBlock &BB);
+ void addPred(Node::NodeIdx Target, Node::NodeIdx Pred) {
+ Nodes[Pred].Succs.push_back(Target);
+ }
+ const DependenciesSet *getDepForInstr(const Instruction *I) const;
+ unsigned size() const { return Nodes.size(); }
+ Node &getNodeFromId(Node::NodeIdx Idx) {
+ assert(Idx < Nodes.size() && "Out of bounds");
+ return Nodes[Idx];
+ }
+ std::optional<Node::NodeIdx> getInstrSequenceId(const Instruction *I) const {
+ auto InstrIt = InstrSequenceId.find(I);
+ if (InstrIt == InstrSequenceId.end())
+ return std::nullopt;
+ return InstrIt->second;
+ }
+};
+
+// Add XLU dependencies for an instruction
+void DependenciesTracker::trackDeps(const Instruction *I,
+ const XLUDepGraph &Graph,
+ TPUAliasSetTrackerWithDep &AliasTracker) {
+ DependenciesSet &NewDeps = DepsPerInstr[I];
+ for (auto &U : I->operands()) {
+ const Instruction *UI = dyn_cast<Instruction>(U.get());
+ if (!UI)
+ continue;
+ auto DepsIt = DepsPerInstr.find(UI);
+ if (DepsIt != DepsPerInstr.end()) {
+ for (auto NIdx : DepsIt->second)
+ NewDeps.set(NIdx);
+ }
+ }
+
+ if (!I->mayReadOrWriteMemory() || Graph.getInstrSequenceId(I).has_value()) {
+ LLVM_DEBUG(if (TPUXLUOptsPrintDeps) {
+ auto SeqId = Graph.getInstrSequenceId(I);
+ dbgs() << "Deps for: ";
+ if (SeqId.has_value())
+ dbgs() << "Seq " << *SeqId << " ";
+ dbgs() << " " << *I << " - ";
+ for (auto D : NewDeps) {
+ dbgs() << D << " ";
+ }
+ dbgs() << "\n";
+ });
+ return;
+ }
+
+  // Callback to be called when an alias set is first found to be aliased.
+ auto AddSetDepsToInstr = [&](TPUAliasSet *AS) {
+ TPUAliasSetWithDep *ASDep = static_cast<TPUAliasSetWithDep *>(AS);
+ NewDeps |= ASDep->getDeps();
+ };
+ // Callback to be called when the memory operation has been added to an
+ // alias set.
+ auto AddInstrDepsToSet = [&](TPUAliasSet *AS) {
+ TPUAliasSetWithDep *ASDep = static_cast<TPUAliasSetWithDep *>(AS);
+ ASDep->getDeps() = NewDeps;
+ };
+ if (NewDeps.empty()) {
+    // We want to track only operations that depend on XLU ops we care
+    // about. So, if this instruction has no dependencies yet, check whether
+    // it depends on any memory operation we are tracking (if we track it,
+    // it depends on an XLU op), and if so add it to the tracker.
+ if (AliasTracker.aliasQuery(I, /*AddToTracker*/ false) !=
+ AliasResult::NoAlias)
+ AliasTracker.aliasQuery(I, /*AddToTracker*/ true, AddSetDepsToInstr,
+ AddInstrDepsToSet);
+ } else {
+ AliasTracker.aliasQuery(I, /*AddToTracker*/ true, AddSetDepsToInstr,
+ AddInstrDepsToSet);
+ }
+ LLVM_DEBUG(if (TPUXLUOptsPrintDeps) {
+ auto SeqId = Graph.getInstrSequenceId(I);
+ dbgs() << "Deps for: ";
+ if (SeqId.has_value())
+ dbgs() << "Seq " << *SeqId;
+ dbgs() << " " << *I << " - ";
+ for (auto D : NewDeps) {
+ dbgs() << D << " ";
+ }
+ dbgs() << "\n";
+ });
+}
+
+const DependenciesSet *XLUDepGraph::getDepForInstr(const Instruction *I) const {
+ return DepTracker.getInstrDeps(I);
+}
+
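+// Builds the XLU dependency graph for a basic block: every unpacked
+// transpose or rotate push/pop sequence becomes a node, and an edge A -> B is
+// recorded when sequence B depends, through SSA operands or aliasing memory
+// operations, on the result of sequence A. Nodes whose results may need full
+// f32 precision are marked as not mergeable.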
+void XLUDepGraph::buildGraph(BasicBlock &BB) {
+  // Alias tracker to check whether memory operations depend on one another.
+ TPUAliasSetTrackerWithDep AliasTracker(BB.getModule()->getDataLayout());
+ DepTracker.clear();
+ bool FoundXLUSequence = false;
+ DenseMap<Node::NodeIdx, DependenciesSet> DepsAdded;
+ for (auto &I : BB) {
+    // Track only unpacked transposes, as packed transposes are already in
+    // the form we are trying to transform to.
+ const bool IsTransposePush = isTransposePushNotPacked(I);
+ const bool IsRotatePush = isRotatePushNotPacked(I);
+    // Skip instructions until we find a transpose.
+ if (!FoundXLUSequence && !IsTransposePush && !IsRotatePush)
+ continue;
+ if (IsTransposePush || IsRotatePush) {
+ const Instruction *PreviousPush =
+ IsTransposePush ? TPU::getPreviousTransposePush(I, false) : nullptr;
+ Node *SequenceNode;
+ // Found a new transpose sequence. Create a new node.
+ if (PreviousPush == nullptr) {
+ FoundXLUSequence = true;
+ Nodes.emplace_back(IsTransposePush ? Node::NodeType::Transpose
+ : Node::NodeType::Rotate,
+ Nodes.size());
+ SequenceNode = &Nodes.back();
+ } else {
+ assert(InstrSequenceId.count(PreviousPush) &&
+ "Previous push instruction has no sequence id assigned");
+ SequenceNode = &Nodes[InstrSequenceId[PreviousPush]];
+ }
+ InstrSequenceId[&I] = SequenceNode->getId();
+ DepTracker.trackDeps(&I, *this, AliasTracker);
+ SequenceNode->PushSequence.push_back(&I);
+ // Different instructions of the transpose sequence could have different
+ // dependencies, so we need to try to add edges for all of them.
+ auto *Deps = DepTracker.getInstrDeps(&I);
+ if (Deps != nullptr) {
+ auto &CurrentDepsAdded = DepsAdded[SequenceNode->getId()];
+ for (auto NIdx : *Deps) {
+ if (CurrentDepsAdded.test_and_set(NIdx)) {
+ addPred(SequenceNode->getId(), NIdx);
+ }
+ }
+ }
+      // If this is the end of the sequence, add all dependencies of the
+      // pushes to the pops and mark the pops as part of the sequence.
+ if (isTransposeEnd(I) || IsRotatePush) {
+ auto DepIt = DepsAdded.find(SequenceNode->getId());
+ if (DepIt != DepsAdded.end()) {
+ DepIt->second.set(SequenceNode->getId());
+ for (auto *U : I.users()) {
+ Instruction *UI = cast<Instruction>(U);
+ assert(isXLUPop(*UI) &&
+ "Expected uses of transpose end to be pops");
+ InstrSequenceId[UI] = SequenceNode->getId();
+ DepTracker.addDepsForInstr(UI, DepIt->second);
+ }
+ assert(!I.user_empty());
+ DepsAdded.erase(DepIt);
+ }
+ }
+ } else if (isXLUPop(I)) {
+      // This is an XLU pop, but it might not be part of a transpose sequence.
+      // If it is, add it to the return sequence vector and track it.
+ auto SeqIt = InstrSequenceId.find(&I);
+ if (SeqIt != InstrSequenceId.end())
+ Nodes[SeqIt->second].ReturnSequence.push_back(&I);
+ DepTracker.trackDeps(&I, *this, AliasTracker);
+ } else {
+ DepTracker.trackDeps(&I, *this, AliasTracker);
+ }
+ }
+  // We need to check if the nodes are mergeable. A node is mergeable only
+  // if its input can be truncated to BF16 without loss of precision.
+  // TODO(maggioni): This is simpler than what LLO does right now.
+  // We don't try to prove that ANDs or shifts clear away bits of precision,
+  // and we don't try to look through loads and stores for now; we only
+  // consider stores (pessimizing the analysis).
+ DenseMap<const Instruction *, bool> CanProduceBF16Map;
+ for (auto It = BB.rbegin(), E = BB.rend(); It != E; ++It) {
+    // If this is a transpose push, whether it can produce BF16 depends on
+    // what we determined about the uses when we checked the pops (because we
+    // are iterating backwards, we visit the pops before the pushes).
+ if (InstrSequenceId.count(&*It) && !isXLUPop(*It)) {
+ assert(TPU::isTransposePushNotPacked(*It) ||
+ TPU::isRotatePushNotPacked(*It));
+ CanProduceBF16Map[&*It] = Nodes[InstrSequenceId[&*It]].canMerge();
+ continue;
+ }
+ CanProduceBF16Map[&*It] = true;
+ // Check all the uses and find uses that potentially need full precision.
+ for (auto &U : It->uses()) {
+ const Instruction *UI = cast<Instruction>(U.getUser());
+ if (reducesPrecisionToBF16(*UI, U.getOperandNo()))
+ continue;
+ auto PreserveOperands = preservesBF16OperandPrecision(*UI);
+
+      // We do not check values that escape the block or that might be used
+      // by PHIs (like in a loop). Pessimize in that case.
+ if (UI->getParent() != &BB || !CanProduceBF16Map.count(UI) ||
+ !(CanProduceBF16Map[UI] &&
+ std::find(PreserveOperands.begin(), PreserveOperands.end(),
+ U.getOperandNo()) != PreserveOperands.end())) {
+ if (isXLUPop(*It)) {
+ auto NodeIt = InstrSequenceId.find(&*It);
+ // Transpose pops can be used to pop other TRF things, so check this
+ // is actually part of a sequence.
+ if (NodeIt != InstrSequenceId.end())
+ Nodes[NodeIt->second].CanMerge = false;
+ }
+ // We might need full precision. Set CanProduceBF16 to false for the
+ // instruction.
+ CanProduceBF16Map[&*It] = false;
+ break;
+ }
+ }
+ }
+ LLVM_DEBUG(dbgs() << "Found " << Nodes.size() << " nodes\n"; int NNum = 0;
+ for (auto &N
+ : Nodes) {
+ dbgs() << "Node " << NNum << "\n";
+ dbgs() << "\tSuccs: ";
+ for (auto S : N.Succs) {
+ dbgs() << S << " ";
+ }
+ dbgs() << "CanMerge: " << N.canMerge() << "\n";
+ ++NNum;
+ });
+}
+
+// Class that performs the merge of XLU transpose ops.
+class TPUXLUBF16Merger {
+ XLUDepGraph Graph;
+ BasicBlock &BB;
+ const DataLayout &DL;
+
+ XLUDepGraph::Iterator mergeNodes(XLUDepGraph::Iterator N1,
+ XLUDepGraph::Iterator N2);
+
+public:
+ TPUXLUBF16Merger(BasicBlock &BB)
+ : BB(BB), DL(BB.getModule()->getDataLayout()) {}
+
+ bool run();
+};
+
+XLUDepGraph::Iterator TPUXLUBF16Merger::mergeNodes(XLUDepGraph::Iterator N1It,
+ XLUDepGraph::Iterator N2It) {
+ auto N1 = *N1It;
+ auto N2 = *N2It;
+ // Keep track of dependent nodes of N2 that we had to move before N1 to merge
+ // N2 with N1.
+ BitVector MovedNodes(Graph.size());
+ unsigned MovedInstructions = 0;
+ unsigned MovedNodesCount = 0;
+ SetVector<Instruction *> Deps(N2.getPushes().begin(), N2.getPushes().end());
+ BasicBlock::reverse_iterator It = N2.getPops().back()->getReverseIterator();
+ BasicBlock::reverse_iterator EndIt = N1.getPushes()[0]->getReverseIterator();
+ Instruction *InsertBefore = &*EndIt;
+ TPUAliasSetTracker<> TAT(DL);
+  // Helper that moves an instruction, or the whole XLU sequence it belongs
+  // to, before the current insertion point.
+ auto MoveInstruction = [&](Instruction *I, std::optional<uint32_t> SeqId) {
+    // If this is part of a sequence, move the whole sequence.
+ if (SeqId.has_value()) {
+ MovedNodes.set(SeqId.value());
+ while (It != EndIt) {
+ auto ItSeq = Graph.getInstrSequenceId(&*It);
+ if (!ItSeq.has_value())
+ break;
+ if (ItSeq.value() != SeqId.value())
+ break;
+ ++It;
+ }
+ auto &SeqNode = Graph.getNodeFromId(SeqId.value());
+ for (auto *P : SeqNode.getPushes()) {
+ P->moveBefore(InsertBefore);
+ Deps.insert(P);
+ }
+ for (auto *P : SeqNode.getPops()) {
+ P->moveBefore(InsertBefore);
+ Deps.insert(P);
+ }
+ MovedInstructions += SeqNode.getPushes().size();
+ MovedInstructions += SeqNode.getPops().size();
+ ++MovedNodesCount;
+ InsertBefore = SeqNode.getPushes()[0];
+ return;
+ }
+ I->moveBefore(InsertBefore);
+ InsertBefore = I;
+ Deps.insert(I);
+ ++MovedInstructions;
+ };
+ unsigned N1Id = N1.getId();
+ unsigned N2Id = N2.getId();
+ // Move instruction for merging.
+ while (It != EndIt) {
+ Instruction *I = (&*It++);
+ auto SeqId = Graph.getInstrSequenceId(I);
+ // Do not move instructions of the two nodes we are merging.
+ if (SeqId.has_value() && (SeqId.value() == N1Id || SeqId.value() == N2Id))
+ continue;
+ bool AddedToDeps = false;
+    // If any user of the current instruction is one of the instructions we
+    // determined we have to move, then move this instruction as well.
+ for (auto *U : I->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ if (Deps.count(UI)) {
+ AddedToDeps = true;
+ assert(
+ (!Graph.getDepForInstr(I) ||
+ !Graph.getDepForInstr(I)->test(N1.getId())) &&
+ "Adding dep that make us dependent on node we want to merge with");
+ MoveInstruction(I, SeqId);
+ break;
+ }
+ }
+ // Check for dependencies with previously moved instructions through
+ // memory.
+ if (!I->mayReadOrWriteMemory())
+ continue;
+ // We don't consider handled XLU operations as aliasing with other
+ // inaccessible mem intrinsics.
+ if (SeqId.has_value())
+ continue;
+    // If this instruction doesn't alias, we are good and don't need to move
+    // it; otherwise move it and track it. If the instruction is already in
+    // Deps, we don't need to move it; just track it.
+ if (!AddedToDeps && TAT.aliasQuery(I, false) != AliasResult::NoAlias) {
+ assert((!Graph.getDepForInstr(I) ||
+ !Graph.getDepForInstr(I)->test(N1.getId())) &&
+ "Adding dep that make us dependent on node we want to merge with");
+ MoveInstruction(I, SeqId);
+ }
+ if (Deps.count(I))
+ TAT.aliasQuery(I, true);
+ }
+ LLVM_DEBUG(dbgs() << "Moved instructions: " << MovedInstructions << "\n");
+ assert(N1.getPushes().size() == N2.getPushes().size());
+ // Merge the two XLU transpose sequences.
+ IRBuilder<> Builder(BB.getContext());
+ auto ForceToFloat = [&Builder](Value *V) {
+ assert(V->getType()->isVectorTy());
+ Value *Float = V;
+ if (!Float->getType()->isFPOrFPVectorTy()) {
+ Float = Builder.CreateBitCast(
+ Float, VectorType::get(
+ Builder.getFloatTy(),
+ cast<VectorType>(Float->getType())->getElementCount()));
+ }
+ return Float;
+ };
+ auto ForceToTypeOf = [&Builder](Value *V, Value *Of) {
+ assert(V->getType()->isVectorTy());
+ assert(Of->getType()->isVectorTy());
+ Value *Result = V;
+ if (Result->getType() != Of->getType()) {
+ Result = Builder.CreateBitCast(Result, Of->getType());
+ }
+ return Result;
+ };
+ Type *FloatVecTy = VectorType::get(Builder.getFloatTy(), 1024, false);
+ Function *PackIntr = llvm::Intrinsic::getDeclaration(
+ BB.getModule(), llvm::Intrinsic::tpu_pack, {FloatVecTy});
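+  // Merge scheme: each pair of corresponding pushes is combined into a single
+  // push of tpu.pack(N2, N1), so both inputs travel through the XLU as bf16
+  // halves of one 32-bit lane. After the pop, N1's result is recovered from
+  // the low 16 bits (shl 16) and N2's result from the high 16 bits
+  // (and 0xFFFF0000) of the popped vector.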
+ for (int I = N1.getPushes().size() - 1; I >= 0; --I) {
+ Instruction *Push1 = N1.getPushes()[I];
+ Instruction *Push2 = N2.getPushes()[I];
+ Builder.SetInsertPoint(N1.getPushes()[I]);
+ Instruction *Pack =
+ Builder.CreateCall(PackIntr, {ForceToFloat(Push2->getOperand(0)),
+ ForceToFloat(Push1->getOperand(0))});
+ Push1->replaceUsesOfWith(Push1->getOperand(0),
+ ForceToTypeOf(Pack, Push1->getOperand(0)));
+ Instruction *Pop1 = N1.getPops()[I];
+ Instruction *Pop2 = N2.getPops()[I];
+ Builder.SetInsertPoint(&BB, std::next(Pop1->getIterator()));
+ Instruction *UnpackL = cast<Instruction>(Builder.CreateShl(
+ Pop1, ConstantDataVector::getSplat(1024, Builder.getInt32(16))));
+ Instruction *UnpackU = cast<Instruction>(Builder.CreateAnd(
+ Pop1,
+ ConstantDataVector::getSplat(1024, Builder.getInt32(0xFFFF0000U))));
+ Pop1->replaceUsesWithIf(UnpackL, [UnpackL, UnpackU](Use &U) {
+ return U.getUser() != UnpackL && U.getUser() != UnpackU;
+ });
+ Pop2->replaceUsesWithIf(
+ UnpackU, [UnpackU](Use &U) { return U.getUser() != UnpackU; });
+ Pop2->eraseFromParent();
+ }
+ for (int I = N2.getPushes().size() - 1; I >= 0; --I) {
+ N2.getPushes()[I]->eraseFromParent();
+ }
+ auto NewIt = N1It;
+ if (MovedNodesCount > 0) {
+ NewIt = std::stable_partition(
+ N1It, N2It, [&](const Node &N) { return MovedNodes.test(N.getId()); });
+ }
+ LLVM_DEBUG(dbgs() << "Post reorder: "; for (auto &N
+ : Graph) {
+ dbgs() << N.getId() << " ";
+ } dbgs() << "\n";);
+ return NewIt;
+}
+
+bool TPUXLUBF16Merger::run() {
+ Graph.buildGraph(BB);
+ if (Graph.begin() == Graph.end())
+ return false;
+ auto It = Graph.begin();
+ auto E = std::prev(Graph.end());
+ BitVector Merged(Graph.size());
+ unsigned MergedNodes = 0;
+ bool Changed = false;
+ // We currently naively iterate the graph in node order and try to merge
+  // eagerly with the first node we find.
+ // TODO(maggioni): Investigate fancier algorithms.
+ while (It != E) {
+ auto &N = *It;
+ // If the node cannot be merged (preserve precision) or has been merged
+ // already continue.
+ if (!N.canMerge() || Merged.test(N.getId())) {
+ ++It;
+ continue;
+ }
+    // Helper to evaluate whether a node can be merged with the current node.
+    // Check that ToMerge can be merged, that it is of the same type as the
+    // current node, that it hasn't been merged already, and that it is not
+    // one of the successors of the current node.
+ auto ValidForMerge = [&](const Node &ToMerge) {
+ return !Merged.test(ToMerge.getId()) && N.canMergeWith(ToMerge);
+ };
+ auto ToMergeIt = std::find_if(std::next(It), Graph.end(), ValidForMerge);
+ if (ToMergeIt == Graph.end()) {
+ ++It;
+ // Didn't find anything to merge with
+ continue;
+ }
+ // Perform the merge
+ ++MergedNodes;
+ LLVM_DEBUG(dbgs() << "Merging Node " << N.getId() << " with node "
+ << ToMergeIt->getId() << "\n");
+ Merged.set(N.getId());
+ Merged.set(ToMergeIt->getId());
+ It = mergeNodes(It, ToMergeIt);
+ Changed = true;
+ }
+ (void)MergedNodes;
+ LLVM_DEBUG(dbgs() << "Num of merged nodes: " << MergedNodes << "\n");
+
+ return Changed;
+}
+
+class TPUXLUOptimizations : public FunctionPass {
+public:
+ static char ID;
+ TPUXLUOptimizations() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ }
+
+ StringRef getPassName() const override { return "TPU XLU optimizations"; }
+
+private:
+ bool processBasicBlock(BasicBlock &BB);
+};
+char TPUXLUOptimizations::ID = 0;
+
+} // namespace
+
+INITIALIZE_PASS(TPUXLUOptimizations, DEBUG_TYPE, "TPU XLU optimizations", false,
+ false)
+
+Pass *llvm::createTPUXLUOptimizationsPass() {
+ return new TPUXLUOptimizations();
+}
+
+bool TPUXLUOptimizations::processBasicBlock(BasicBlock &BB) {
+ return TPUXLUBF16Merger(BB).run();
+}
+
+bool TPUXLUOptimizations::runOnFunction(Function &F) {
+ bool Changed = false;
+ for (auto &BB : F) {
+ Changed |= processBasicBlock(BB);
+ }
+  // Verify that the IR is still valid after this pass.
+ assert(!verifyFunction(F, &dbgs()));
+ return Changed;
+}
diff --git a/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/tpu.blueprint b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/tpu.blueprint
new file mode 100644
index 0000000..8e838b6
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/lib/Target/GoogleTPU/tpu.blueprint
@@ -0,0 +1,49 @@
+include "devtools/blueprint/ncl/blueprint_file.ncl";
+include "devtools/blueprint/ncl/sanitizer.ncl";
+
+blueprint_file = ::blueprint::BlueprintFile(
+ project_name = "tpu",
+ teams_product_id = 1330132954551,
+ tech_lead = ["hgreving"],
+ mdb_groups = ["research-tap", "llvm-build"],
+ dev_mailing_list = "tf-codegen-backend@google.com",
+ buganizer_component_ids = [147961],
+
+ buildable_units = [
+ ::blueprint::BuildableUnit(
+ name = "tpu_llvm",
+ test_patterns = [
+ "//platforms/xla/service/jellyfish/llo_execution_tests:llvm_tests",
+ "//platforms/xla/sparse_core:sparsecore_tests",
+ ],
+ enable_release = false,
+ ),
+ ::blueprint::BuildableUnit(
+ name = "tpu_llvm_short",
+ test_patterns = [
+ "//third_party/llvm/llvm:googletpu_tests",
+ "//third_party/llvm/llvm:xla_llvm_gwsq_test",
+ "//third_party/llvm/llvm-project/clang/test/CodeGen:all",
+ ],
+ enable_release = false,
+ )],
+
+ continuous_integration_tests = [
+ ::blueprint::ContinuousIntegrationTest(
+ name = "tpu_llvm_guitar",
+ base_version_spec = "cl:HEAD",
+ buildable_unit_name = "tpu_llvm",
+ requester = "llvm-build",
+ cluster_name = "FORGE",
+ throttle = "never",
+ ),
+ ::blueprint::ContinuousIntegrationTest(
+ name = "tpu_llvm_guitar_short",
+ base_version_spec = "cl:HEAD",
+ buildable_unit_name = "tpu_llvm_short",
+ requester = "llvm-build",
+ cluster_name = "FORGE",
+ throttle = "never",
+ ),
+ ],
+);
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/LiveRangeReduction.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/LiveRangeReduction.ll
new file mode 100644
index 0000000..5665685
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/LiveRangeReduction.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -mcpu=tensorcore-jf -asm-verbose=false -misched-limit=1 -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu--"
+
+declare void @llvm.tpu.vtrace(i32)
+declare void @llvm.tpu.dma.vmem.to.hbm(i32 addrspace(204)*, <1024 x i32> addrspace(205)*, i32 addrspace(203)*, i32)
+declare void @llvm.tpu.waitge(i32 addrspace(204)*, i32)
+declare void @llvm.tpu.syncadd(i32 addrspace(204)*, i32)
+declare void @llvm.tpu.waiteq(i32 addrspace(204)*, i32)
+
+; Test that the live range reduction pass doesn't run into an infinite loop.
+; We set a low value for -misched-limit, which will be reached during live
+; range reduction; the pass should still make the next node in program order
+; available to avoid an infinite loop.
+; CHECK-LABEL: LiverRangeReduceHang
+; CHECK: _ = shalt
+define void @LiverRangeReduceHang() {
+entry:
+ %0 = load float, float* inttoptr (i32 160 to float*), align 32
+ %1 = fcmp une float %0, 0xFFF0000000000000
+ %2 = zext i1 %1 to i32
+ %3 = add nuw nsw i32 %2, %2
+ store i32 -8388608, i32* inttoptr (i32 160 to i32*), align 32
+ %4 = load float, float* inttoptr (i32 160 to float*), align 32
+ %5 = fcmp une float %4, 0xFFF0000000000000
+ %6 = zext i1 %5 to i32
+ %7 = add nuw nsw i32 %6, %3
+ store i32 %7, i32* inttoptr (i32 160 to i32*), align 32
+ ret void
+}
\ No newline at end of file
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/addrspacecast_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/addrspacecast_sc.ll
new file mode 100644
index 0000000..c5f130c
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/addrspacecast_sc.ll
@@ -0,0 +1,328 @@
+; RUN: opt -S -O2 -mcpu=sparsecore-tec-vf < %s \
+; RUN: | llc -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+; RUN: opt -S -O2 -mcpu=sparsecore-tec-vf -opaque-pointers < %s \
+; RUN: | llc -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -opaque-pointers | FileCheck %s
+; RUN: opt -S -O2 -mcpu=sparsecore-tec-gl -opaque-pointers < %s \
+; RUN: | llc -mcpu=sparsecore-tec-gl -asm-verbose=false -disable-cgp \
+; RUN: -opaque-pointers | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; Tests that we're lowering tpu.addrspacecast as designed.
+
+declare i32 addrspace(212)* @llvm.tpu.addrspacecast.p212i32(i32*)
+declare i32 addrspace(213)* @llvm.tpu.addrspacecast.p213i32(i32 addrspace(203)*)
+declare i32 addrspace(211)* @llvm.tpu.addrspacecast.tc(i32 addrspace(204)*)
+declare i32 addrspace(211)* @llvm.tpu.addrspacecast.scs(i32 addrspace(204)*)
+declare i32 addrspace(211)* @llvm.tpu.addrspacecast.tec(i32 addrspace(204)*, i32)
+declare i32 addrspace(211)* @llvm.tpu.addrspacecast.tac(i32 addrspace(204)*, i32)
+declare i32 addrspace(217)* @llvm.tpu.addrspacecast.tile.scs(i32 addrspace(204)*)
+declare i32 addrspace(217)* @llvm.tpu.addrspacecast.tile.tec(i32 addrspace(204)*, i32)
+declare i32 addrspace(217)* @llvm.tpu.addrspacecast.tile.tac(i32 addrspace(204)*, i32)
+declare i32* @llvm.tpu.alloca.smem(i32)
+declare i32 addrspace(204)* @llvm.tpu.alloca.sflag(i32)
+declare i32 addrspace(203)* @llvm.tpu.alloca.hbm(i32)
+declare i32 @llvm.tpu.ptrtoint.pi32(i32*)
+declare i32* @llvm.tpu.inttoptr.pi32(i32)
+declare i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32)
+declare void @llvm.tpu.dma.hbm.to.hbm.sc.general(i32 addrspace(211)*, i32, i32 addrspace(203)*, i32 addrspace(213)*, i32, i32 addrspace(204)*, i32, i32 addrspace(208)*, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.dma.smem.to.smem.sc.general(i32 addrspace(211)*, i32, i32*, i32 addrspace(212)*, i32, i32 addrspace(204)*, i32, i32 addrspace(208)*, i32, i32, i32)
+declare i32 @llvm.tpu.tileid()
+declare void @llvm.tpu.syncadd.tile(i32 addrspace(217)*, i32)
+declare i32 @llvm.tpu.sc.dma.core.id(i32)
+
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-vf" }
+
+@flag = addrspace(204) global i32 0, align 4
+@rflag = addrspace(204) global i32 0, align 4
+
+; CHECK-LABEL: smem_smemany_sflag_scs_cast:
+; CHECK: s[[dsfi:[0-9]+]] = simm.s32 rflag
+; CHECK: s[[srccid:[0-9]+]] = sshll.u32 s{{[0-9]+}}, $0xd
+; CHECK: s[[dstcid:[0-9]+]] = sshll.u32 s{{[0-9]+}}, $0xd
+; CHECK: s[[ssft:[0-9]+]] = sor.u32 s{{[0-9]+}}, s[[srccid]]
+; CHECK: s[[dsft:[0-9]+]] = sor.u32 s[[dsfi]], s[[dstcid]]
+; CHECK: [smem:s{{[0-9]+}}], [sflag:s[[dsft]]] = dma.general [smem:s{{[0-9]+}}], [sflag:s[[ssft]]], length:$0x4, [dreg:$0x0], stride_count:$0x4, ici_dest:s{{[0-9]+}}
+define void @smem_smemany_sflag_scs_cast(i32 %a, i32* %src, i32 %srccid, i32 %dstcid) #2 {
+entry:
+ %0 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %1 = call i32* @llvm.tpu.alloca.smem(i32 4)
+ %2 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %arrayidx = getelementptr inbounds i32, i32* %0, i32 0
+ %3 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %3, 2
+ %arrayidx1 = getelementptr inbounds i32, i32* %1, i32 %a
+ store i32 %add, i32* %arrayidx1, align 4
+ %desc = tail call i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32 0)
+ %cast_dst = tail call i32 addrspace(212)* @llvm.tpu.addrspacecast.p212i32(i32* %2)
+ %dst_sflag = tail call i32 addrspace(211)* @llvm.tpu.addrspacecast.scs(i32 addrspace(204)* @rflag)
+ call void @llvm.tpu.dma.smem.to.smem.sc.general(i32 addrspace(211)* %dst_sflag, i32 %dstcid, i32* %src, i32 addrspace(212)* %cast_dst, i32 4, i32 addrspace(204)* @flag, i32 %srccid, i32 addrspace(208)* %desc, i32 4, i32 0, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: hbm_hbmany_sflag_scs_cast:
+; CHECK: s[[dsfi:[0-9]+]] = simm.s32 rflag
+; CHECK: s[[srccid:[0-9]+]] = sshll.u32 s{{[0-9]+}}, $0xd
+; CHECK: s[[dstcid:[0-9]+]] = sshll.u32 s{{[0-9]+}}, $0xd
+; CHECK: s[[ssft:[0-9]+]] = sor.u32 s{{[0-9]+}}, s[[srccid]]
+; CHECK: s[[dsft:[0-9]+]] = sor.u32 s[[dsfi]], s{{[0-9]+}}
+; CHECK: [hbm:s{{[0-9]+}}], [sflag:s[[dsft]]] = dma.general [hbm:s{{[0-9]+}}], [sflag:s[[ssft]]], length:$0x4, [dreg:$0x0], stride_count:$0x4, ici_dest:s{{[0-9]+}}
+define void @hbm_hbmany_sflag_scs_cast(i32 %a, i32 addrspace(203)* %src, i32 %srccid, i32 %dstcid) #2 {
+entry:
+ %0 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %1 = call i32* @llvm.tpu.alloca.smem(i32 4)
+ %2 = call i32 addrspace(203)* @llvm.tpu.alloca.hbm(i32 5)
+ %arrayidx = getelementptr inbounds i32, i32* %0, i32 0
+ %3 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %3, 2
+ %arrayidx1 = getelementptr inbounds i32, i32* %1, i32 %a
+ store i32 %add, i32* %arrayidx1, align 4
+ %desc = tail call i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32 0)
+ %cast_dst = tail call i32 addrspace(213)* @llvm.tpu.addrspacecast.p213i32(i32 addrspace(203)* %2)
+ %dst_sflag = tail call i32 addrspace(211)* @llvm.tpu.addrspacecast.scs(i32 addrspace(204)* @rflag)
+ call void @llvm.tpu.dma.hbm.to.hbm.sc.general(i32 addrspace(211)* %dst_sflag, i32 %dstcid, i32 addrspace(203)* %src, i32 addrspace(213)* %cast_dst, i32 4, i32 addrspace(204)* @flag, i32 %srccid, i32 addrspace(208)* %desc, i32 4, i32 0, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: hbm_hbmany_sflag_tc_cast:
+; CHECK: s[[dsfi:[0-9]+]] = simm.s32 rflag
+; CHECK: s[[srccid:[0-9]+]] = sshll.u32 s{{[0-9]+}}, $0xd
+; CHECK: s[[dstcid:[0-9]+]] = sshll.u32 s{{[0-9]+}}, $0xd
+; CHECK: s[[ssft:[0-9]+]] = sor.u32 s{{[0-9]+}}, s[[srccid]]
+; CHECK: s[[dsft:[0-9]+]] = sor.u32 s[[dsfi]], s{{[0-9]+}}
+; CHECK: [hbm:s{{[0-9]+}}], [sflag:s[[dsft]]] = dma.general [hbm:s{{[0-9]+}}], [sflag:s[[ssft]]], length:$0x4, [dreg:$0x0], stride_count:$0x4, ici_dest:s{{[0-9]+}}
+define void @hbm_hbmany_sflag_tc_cast(i32 %a, i32 addrspace(203)* %src, i32 %srccid, i32 %dstcid) #2 {
+entry:
+ %0 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %1 = call i32* @llvm.tpu.alloca.smem(i32 4)
+ %2 = call i32 addrspace(203)* @llvm.tpu.alloca.hbm(i32 5)
+ %arrayidx = getelementptr inbounds i32, i32* %0, i32 0
+ %3 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %3, 2
+ %arrayidx1 = getelementptr inbounds i32, i32* %1, i32 %a
+ store i32 %add, i32* %arrayidx1, align 4
+ %desc = tail call i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32 0)
+ %cast_dst = tail call i32 addrspace(213)* @llvm.tpu.addrspacecast.p213i32(i32 addrspace(203)* %2)
+ %dst_sflag = tail call i32 addrspace(211)* @llvm.tpu.addrspacecast.tc(i32 addrspace(204)* @rflag)
+ call void @llvm.tpu.dma.hbm.to.hbm.sc.general(i32 addrspace(211)* %dst_sflag, i32 %dstcid, i32 addrspace(203)* %src, i32 addrspace(213)* %cast_dst, i32 4, i32 addrspace(204)* @flag, i32 %srccid, i32 addrspace(208)* %desc, i32 4, i32 0, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: smem_smemany_sflag_tec_0_cast:
+; CHECK: s[[dsfi:[0-9]+]] = simm.s32 rflag
+; CHECK: s[[dsft0:[0-9]+]] = sadd.s32 $0x1c00, s[[dsfi]]
+; CHECK: s[[dsft1:[0-9]+]] = sor.u32 s[[dsft0]], s{{[0-9]+}}
+; CHECK: [smem:s{{[0-9]+}}], [sflag:s[[dsft1]]] = dma.general [smem:s{{[0-9]+}}], [sflag:s{{[0-9]+}}], length:$0x4, [dreg:$0x0], stride_count:$0x4, ici_dest:s{{[0-9]+}}
+define void @smem_smemany_sflag_tec_0_cast(i32 %a, i32* %src, i32 %srccid, i32 %dstcid) #2 {
+entry:
+ %0 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %1 = call i32* @llvm.tpu.alloca.smem(i32 4)
+ %2 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %3 = call i32 addrspace(204)* @llvm.tpu.alloca.sflag(i32 5)
+ %arrayidx = getelementptr inbounds i32, i32* %0, i32 0
+ %4 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %4, 2
+ %arrayidx1 = getelementptr inbounds i32, i32* %1, i32 %a
+ store i32 %add, i32* %arrayidx1, align 4
+ %desc = tail call i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32 0)
+ %cast_dst = tail call i32 addrspace(212)* @llvm.tpu.addrspacecast.p212i32(i32* %2)
+ %dst_sflag = tail call i32 addrspace(211)* @llvm.tpu.addrspacecast.tec(i32 addrspace(204)* @rflag, i32 0)
+ call void @llvm.tpu.dma.smem.to.smem.sc.general(i32 addrspace(211)* %dst_sflag, i32 %dstcid, i32* %src, i32 addrspace(212)* %cast_dst, i32 4, i32 addrspace(204)* @flag, i32 %srccid, i32 addrspace(208)* %desc, i32 4, i32 0, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: smem_smemany_sflag_tec_15_cast:
+; CHECK: s[[dsfi:[0-9]+]] = simm.s32 rflag
+; CHECK: s[[dsft0:[0-9]+]] = sadd.s32 $0x1fc0, s[[dsfi]]
+; CHECK: s[[dsft1:[0-9]+]] = sor.u32 s[[dsft0]], s{{[0-9]+}}
+; CHECK: [smem:s{{[0-9]+}}], [sflag:s[[dsft1]]] = dma.general [smem:s{{[0-9]+}}], [sflag:s{{[0-9]+}}], length:$0x4, [dreg:$0x0], stride_count:$0x4, ici_dest:s{{[0-9]+}}
+define void @smem_smemany_sflag_tec_15_cast(i32 %a, i32* %src, i32 %srccid, i32 %dstcid) #2 {
+entry:
+ %0 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %1 = call i32* @llvm.tpu.alloca.smem(i32 4)
+ %2 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %3 = call i32 addrspace(204)* @llvm.tpu.alloca.sflag(i32 5)
+ %arrayidx = getelementptr inbounds i32, i32* %0, i32 0
+ %4 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %4, 2
+ %arrayidx1 = getelementptr inbounds i32, i32* %1, i32 %a
+ store i32 %add, i32* %arrayidx1, align 4
+ %desc = tail call i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32 0)
+ %cast_dst = tail call i32 addrspace(212)* @llvm.tpu.addrspacecast.p212i32(i32* %2)
+ %dst_sflag = tail call i32 addrspace(211)* @llvm.tpu.addrspacecast.tec(i32 addrspace(204)* @rflag, i32 15)
+ call void @llvm.tpu.dma.smem.to.smem.sc.general(i32 addrspace(211)* %dst_sflag, i32 %dstcid, i32* %src, i32 addrspace(212)* %cast_dst, i32 4, i32 addrspace(204)* @flag, i32 %srccid, i32 addrspace(208)* %desc, i32 4, i32 0, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: smem_smemany_sflag_tac_0_cast:
+; CHECK: s[[dsfi:[0-9]+]] = simm.s32 rflag
+; CHECK: s[[dsft0:[0-9]+]] = sadd.s32 $0x1c20, s[[dsfi]]
+; CHECK: s[[dsft1:[0-9]+]] = sor.u32 s[[dsft0]], s{{[0-9]+}}
+; CHECK: [smem:s{{[0-9]+}}], [sflag:s[[dsft1]]] = dma.general [smem:s{{[0-9]+}}], [sflag:s{{[0-9]+}}], length:$0x4, [dreg:$0x0], stride_count:$0x4, ici_dest:s{{[0-9]+}}
+define void @smem_smemany_sflag_tac_0_cast(i32 %a, i32* %src, i32 %srccid, i32 %dstcid) #2 {
+entry:
+ %0 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %1 = call i32* @llvm.tpu.alloca.smem(i32 4)
+ %2 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %3 = call i32 addrspace(204)* @llvm.tpu.alloca.sflag(i32 5)
+ %arrayidx = getelementptr inbounds i32, i32* %0, i32 0
+ %4 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %4, 2
+ %arrayidx1 = getelementptr inbounds i32, i32* %1, i32 %a
+ store i32 %add, i32* %arrayidx1, align 4
+ %desc = tail call i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32 0)
+ %cast_dst = tail call i32 addrspace(212)* @llvm.tpu.addrspacecast.p212i32(i32* %2)
+ %dst_sflag = tail call i32 addrspace(211)* @llvm.tpu.addrspacecast.tac(i32 addrspace(204)* @rflag, i32 0)
+ call void @llvm.tpu.dma.smem.to.smem.sc.general(i32 addrspace(211)* %dst_sflag, i32 %dstcid, i32* %src, i32 addrspace(212)* %cast_dst, i32 4, i32 addrspace(204)* @flag, i32 %srccid, i32 addrspace(208)* %desc, i32 4, i32 0, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: smem_smemany_sflag_tac_15_cast:
+; CHECK: s[[dsfi:[0-9]+]] = simm.s32 rflag
+; CHECK: s[[dsft0:[0-9]+]] = sadd.s32 $0x1fe0, s[[dsfi]]
+; CHECK: s[[dsft1:[0-9]+]] = sor.u32 s[[dsft0]], s{{[0-9]+}}
+; CHECK: [smem:s{{[0-9]+}}], [sflag:s[[dsft1]]] = dma.general [smem:s{{[0-9]+}}], [sflag:s{{[0-9]+}}], length:$0x4, [dreg:$0x0], stride_count:$0x4, ici_dest:s{{[0-9]+}}
+define void @smem_smemany_sflag_tac_15_cast(i32 %a, i32* %src, i32 %srccid, i32 %dstcid) #2 {
+entry:
+ %0 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %1 = call i32* @llvm.tpu.alloca.smem(i32 4)
+ %2 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %3 = call i32 addrspace(204)* @llvm.tpu.alloca.sflag(i32 5)
+ %arrayidx = getelementptr inbounds i32, i32* %0, i32 0
+ %4 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %4, 2
+ %arrayidx1 = getelementptr inbounds i32, i32* %1, i32 %a
+ store i32 %add, i32* %arrayidx1, align 4
+ %desc = tail call i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32 0)
+ %cast_dst = tail call i32 addrspace(212)* @llvm.tpu.addrspacecast.p212i32(i32* %2)
+ %dst_sflag = tail call i32 addrspace(211)* @llvm.tpu.addrspacecast.tac(i32 addrspace(204)* @rflag, i32 15)
+ call void @llvm.tpu.dma.smem.to.smem.sc.general(i32 addrspace(211)* %dst_sflag, i32 %dstcid, i32* %src, i32 addrspace(212)* %cast_dst, i32 4, i32 addrspace(204)* @flag, i32 %srccid, i32 addrspace(208)* %desc, i32 4, i32 0, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: smem_smemany_sflag_tec_tileid_cast:
+; CHECK: s[[tid:[0-9]+]] = stileid.u32
+; CHECK: s[[dsfi:[0-9]+]] = simm.s32 rflag
+; CHECK: s[[stid:[0-9]+]] = sshll.u32 s[[tid]], $0x6
+; CHECK: s[[doff:[0-9]+]] = sadd.s32 s[[dsfi]], s[[stid]]
+; CHECK: s[[dsft0:[0-9]+]] = sadd.s32 $0x1c00, s[[doff]]
+; CHECK: s[[dsft1:[0-9]+]] = sor.u32 s[[dsft0]], s{{[0-9]+}}
+; CHECK: [smem:s{{[0-9]+}}], [sflag:s[[dsft1]]] = dma.general [smem:s{{[0-9]+}}], [sflag:s{{[0-9]+}}], length:$0x4, [dreg:$0x0], stride_count:$0x4, ici_dest:s{{[0-9]+}}
+define void @smem_smemany_sflag_tec_tileid_cast(i32 %a, i32* %src, i32 %srccid, i32 %dstcid) #2 {
+entry:
+ %0 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %1 = call i32* @llvm.tpu.alloca.smem(i32 4)
+ %2 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %3 = call i32 addrspace(204)* @llvm.tpu.alloca.sflag(i32 5)
+ %arrayidx = getelementptr inbounds i32, i32* %0, i32 0
+ %4 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %4, 2
+ %arrayidx1 = getelementptr inbounds i32, i32* %1, i32 %a
+ store i32 %add, i32* %arrayidx1, align 4
+ %tileid = tail call i32 @llvm.tpu.tileid()
+ %desc = tail call i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32 0)
+ %cast_dst = tail call i32 addrspace(212)* @llvm.tpu.addrspacecast.p212i32(i32* %2)
+ %dst_sflag = tail call i32 addrspace(211)* @llvm.tpu.addrspacecast.tec(i32 addrspace(204)* @rflag, i32 %tileid)
+ call void @llvm.tpu.dma.smem.to.smem.sc.general(i32 addrspace(211)* %dst_sflag, i32 %dstcid, i32* %src, i32 addrspace(212)* %cast_dst, i32 4, i32 addrspace(204)* @flag, i32 %srccid, i32 addrspace(208)* %desc, i32 4, i32 0, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: smem_smemany_sflag_tac_tileid_cast:
+; CHECK: s[[tid:[0-9]+]] = stileid.u32
+; CHECK: s[[dsfi:[0-9]+]] = simm.s32 rflag
+; CHECK: s[[stid:[0-9]+]] = sshll.u32 s[[tid]], $0x6
+; CHECK: s[[doff:[0-9]+]] = sadd.s32 s[[dsfi]], s[[stid]]
+; CHECK: s[[dsft0:[0-9]+]] = sadd.s32 $0x1c20, s[[doff]]
+; CHECK: s[[dsft1:[0-9]+]] = sor.u32 s[[dsft0]], s{{[0-9]+}}
+; CHECK: [smem:s{{[0-9]+}}], [sflag:s[[dsft1]]] = dma.general [smem:s{{[0-9]+}}], [sflag:s{{[0-9]+}}], length:$0x4, [dreg:$0x0], stride_count:$0x4, ici_dest:s{{[0-9]+}}
+define void @smem_smemany_sflag_tac_tileid_cast(i32 %a, i32* %src, i32 %srccid, i32 %dstcid) #2 {
+entry:
+ %0 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %1 = call i32* @llvm.tpu.alloca.smem(i32 4)
+ %2 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %3 = call i32 addrspace(204)* @llvm.tpu.alloca.sflag(i32 5)
+ %arrayidx = getelementptr inbounds i32, i32* %0, i32 0
+ %4 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %4, 2
+ %arrayidx1 = getelementptr inbounds i32, i32* %1, i32 %a
+ store i32 %add, i32* %arrayidx1, align 4
+ %tileid = tail call i32 @llvm.tpu.tileid()
+ %desc = tail call i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32 0)
+ %cast_dst = tail call i32 addrspace(212)* @llvm.tpu.addrspacecast.p212i32(i32* %2)
+ %dst_sflag = tail call i32 addrspace(211)* @llvm.tpu.addrspacecast.tac(i32 addrspace(204)* @rflag, i32 %tileid)
+ call void @llvm.tpu.dma.smem.to.smem.sc.general(i32 addrspace(211)* %dst_sflag, i32 %dstcid, i32* %src, i32 addrspace(212)* %cast_dst, i32 4, i32 addrspace(204)* @flag, i32 %srccid, i32 addrspace(208)* %desc, i32 4, i32 0, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: sflag_tile_scs_cast
+; CHECK: s[[rflag:[0-9]+]] = simm.s32 rflag
+; CHECK: s[[enc:[0-9]+]] = sadd.s32 $0x100000, s[[rflag]]
+; CHECK: [sflag:s[[enc]]] = ssyncadd.tile.s32 $0xd
+define void @sflag_tile_scs_cast() #2 {
+entry:
+ %tileflag = call i32 addrspace(217)* @llvm.tpu.addrspacecast.tile.scs(i32 addrspace(204)* @rflag)
+ call void @llvm.tpu.syncadd.tile(i32 addrspace(217)* %tileflag, i32 13)
+ ret void
+}
+
+; CHECK-LABEL: sflag_tile_tec_cast
+; CHECK: s[[rflag:[0-9]+]] = simm.s32 rflag
+; CHECK: s[[enc:[0-9]+]] = sadd.s32 $0x70000, s[[rflag]]
+; CHECK: [sflag:s[[enc]]] = ssyncadd.tile.s32 $0xd
+define void @sflag_tile_tec_cast() #2 {
+entry:
+ %tileflag = call i32 addrspace(217)* @llvm.tpu.addrspacecast.tile.tec(i32 addrspace(204)* @rflag, i32 7)
+ call void @llvm.tpu.syncadd.tile(i32 addrspace(217)* %tileflag, i32 13)
+ ret void
+}
+
+; CHECK-LABEL: sflag_tile_tac_cast
+; CHECK: s[[rflag:[0-9]+]] = simm.s32 rflag
+; CHECK: s[[enc:[0-9]+]] = sadd.s32 $0x70020, s[[rflag]]
+; CHECK: [sflag:s[[enc]]] = ssyncadd.tile.s32 $0xd
+define void @sflag_tile_tac_cast() #2 {
+entry:
+ %tileflag = call i32 addrspace(217)* @llvm.tpu.addrspacecast.tile.tac(i32 addrspace(204)* @rflag, i32 7)
+ call void @llvm.tpu.syncadd.tile(i32 addrspace(217)* %tileflag, i32 13)
+ ret void
+}
+
+; CHECK-LABEL: sc_core_id
+; CHECK: s0 = sadd.s32 $0x4, s0
+define i32 @sc_core_id(i32 %cidx) #2 {
+entry:
+ %cid = call i32 @llvm.tpu.sc.dma.core.id(i32 %cidx)
+ ret i32 %cid
+}
+
+!smem.funcs.alloca = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12}
+!smem.start.alloca = !{!32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32}
+!hbm.funcs.alloca = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12}
+!hbm.start.alloca = !{!32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32}
+!sflag.funcs.alloca = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12}
+!sflag.start.alloca = !{!32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32}
+!sflagother.funcs.alloca = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12}
+!sflagother.start.alloca = !{!32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32}
+
+!0 = !{void (i32, i32*, i32, i32)* @smem_smemany_sflag_scs_cast}
+!1 = !{void (i32, i32 addrspace(203)*, i32, i32)* @hbm_hbmany_sflag_scs_cast}
+!2 = !{void (i32, i32 addrspace(203)*, i32, i32)* @hbm_hbmany_sflag_tc_cast}
+!3 = !{void (i32, i32*, i32, i32)* @smem_smemany_sflag_tec_0_cast}
+!4 = !{void (i32, i32*, i32, i32)* @smem_smemany_sflag_tec_15_cast}
+!5 = !{void (i32, i32*, i32, i32)* @smem_smemany_sflag_tac_0_cast}
+!6 = !{void (i32, i32*, i32, i32)* @smem_smemany_sflag_tac_15_cast}
+!7 = !{void (i32, i32*, i32, i32)* @smem_smemany_sflag_tec_tileid_cast}
+!8 = !{void (i32, i32*, i32, i32)* @smem_smemany_sflag_tac_tileid_cast}
+!9 = !{void ()* @sflag_tile_scs_cast}
+!10 = !{void ()* @sflag_tile_tec_cast}
+!11 = !{void ()* @sflag_tile_tac_cast}
+!12 = !{i32 (i32)* @sc_core_id}
+
+; Not perfect naming, but more readable as constants than !n, !{n+1}, etc.
+!32 = !{i32 0}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/aliasing.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/aliasing.ll
new file mode 100644
index 0000000..1fd4419
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/aliasing.ll
@@ -0,0 +1,35 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <8 x i32> addrspace(201)* @llvm.tpu.make.restrict.ptr.v8i32p201(<8 x i32> addrspace(201)*)
+declare <8 x float> addrspace(201)* @llvm.tpu.make.restrict.ptr.v8f32p201(<8 x float> addrspace(201)*)
+
+; Test that the store can be reordered across the load, as the user marked it
+; as no-alias.
+; CHECK-LABEL: create_restrict_ptri:
+; CHECK: { v0 = vld [tilespmem:s{{[0-9]+}}+$0x0];
+; CHECK: [tilespmem:s{{[0-9]+}}+$0x0] = vst v{{[0-9]+}};
+; CHECK-NEXT: _ = shalt }
+define <8 x i32> @create_restrict_ptri(<8 x i32> addrspace(201)* %in, i32 %offset) {
+ %noalias = call <8 x i32> addrspace(201)* @llvm.tpu.make.restrict.ptr.v8i32p201(<8 x i32> addrspace(201)* %in)
+ %addr = getelementptr <8 x i32>, <8 x i32> addrspace(201)* %noalias, i32 %offset
+ store <8 x i32> zeroinitializer, <8 x i32> addrspace(201)* %addr
+ %b = load <8 x i32>, <8 x i32> addrspace(201)* %in
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: create_restrict_ptrf:
+; CHECK: { v0 = vld [tilespmem:s{{[0-9]+}}+$0x0];
+; CHECK: [tilespmem:s{{[0-9]+}}+$0x0] = vst v{{[0-9]+}};
+; CHECK-NEXT: _ = shalt }
+define <8 x float> @create_restrict_ptrf(<8 x float> addrspace(201)* %in, i32 %offset) {
+ %noalias = call <8 x float> addrspace(201)* @llvm.tpu.make.restrict.ptr.v8f32p201(<8 x float> addrspace(201)* %in)
+ %addr = getelementptr <8 x float>, <8 x float> addrspace(201)* %noalias, i32 %offset
+ store <8 x float> zeroinitializer, <8 x float> addrspace(201)* %addr
+ %b = load <8 x float>, <8 x float> addrspace(201)* %in
+ ret <8 x float> %b
+}
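; For contrast, a minimal hypothetical sketch of the same pattern without
; llvm.tpu.make.restrict.ptr; here the backend has to assume %addr may alias %in,
; so the load cannot be hoisted above the store. @maybe_aliasing is illustrative only.

define <8 x i32> @maybe_aliasing(<8 x i32> addrspace(201)* %in, i32 %offset) {
  %addr = getelementptr <8 x i32>, <8 x i32> addrspace(201)* %in, i32 %offset
  store <8 x i32> zeroinitializer, <8 x i32> addrspace(201)* %addr
  %b = load <8 x i32>, <8 x i32> addrspace(201)* %in
  ret <8 x i32> %b
}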
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/aliasing_gf.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/aliasing_gf.ll
new file mode 100644
index 0000000..a25a11c
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/aliasing_gf.ll
@@ -0,0 +1,35 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-gf -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <16 x i32> addrspace(201)* @llvm.tpu.make.restrict.ptr.v16i32p201(<16 x i32> addrspace(201)*)
+declare <16 x float> addrspace(201)* @llvm.tpu.make.restrict.ptr.v16f32p201(<16 x float> addrspace(201)*)
+
+; Test that the store can be re-ordered across the load because the user has
+; marked the pointer as no-alias.
+; CHECK-LABEL: create_restrict_ptri:
+; CHECK: { v0 = vld [tilespmem:s{{[0-9]+}}+$0x0];
+; CHECK: [tilespmem:s{{[0-9]+}}+$0x0] = vst v{{[0-9]+}};
+; CHECK-NEXT: _ = shalt }
+define <16 x i32> @create_restrict_ptri(<16 x i32> addrspace(201)* %in, i32 %offset) {
+ %noalias = call <16 x i32> addrspace(201)* @llvm.tpu.make.restrict.ptr.v16i32p201(<16 x i32> addrspace(201)* %in)
+ %addr = getelementptr <16 x i32>, <16 x i32> addrspace(201)* %noalias, i32 %offset
+ store <16 x i32> zeroinitializer, <16 x i32> addrspace(201)* %addr
+ %b = load <16 x i32>, <16 x i32> addrspace(201)* %in
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: create_restrict_ptrf:
+; CHECK: { v0 = vld [tilespmem:s{{[0-9]+}}+$0x0];
+; CHECK: [tilespmem:s{{[0-9]+}}+$0x0] = vst v{{[0-9]+}};
+; CHECK-NEXT: _ = shalt }
+define <16 x float> @create_restrict_ptrf(<16 x float> addrspace(201)* %in, i32 %offset) {
+ %noalias = call <16 x float> addrspace(201)* @llvm.tpu.make.restrict.ptr.v16f32p201(<16 x float> addrspace(201)* %in)
+ %addr = getelementptr <16 x float>, <16 x float> addrspace(201)* %noalias, i32 %offset
+ store <16 x float> zeroinitializer, <16 x float> addrspace(201)* %addr
+ %b = load <16 x float>, <16 x float> addrspace(201)* %in
+ ret <16 x float> %b
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/alloc_constant_fold_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/alloc_constant_fold_sc.ll
new file mode 100644
index 0000000..50c21ea
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/alloc_constant_fold_sc.ll
@@ -0,0 +1,21 @@
+; RUN: opt -S -O2 -mcpu=sparsecore-tec-vf < %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; Tests that an expression built from a getelementptr on a literal pointer can be
+; constant folded by instruction combine, as needed by our memory allocation pass.
+
+; CHECK-LABEL: alloca_smem:
+; CHECK: %1 = tail call i32 addrspace(203)* @llvm.tpu.inttoptr.p203i32(i32 0)
+define void @alloca_smem(i32 %v, i32 %a) {
+ %1 = call i32 addrspace(203)* @llvm.tpu.alloca.hbm(i32 lshr (i32 mul (i32 ptrtoint (float* getelementptr (float, float* inttoptr(i32 0 to float*), i32 1) to i32), i32 2), i32 5))
+ %addr = getelementptr inbounds i32, i32 addrspace(203)* %1, i32 %a
+ ; We can't actually lower a store to hbm; this is just here so the alloca isn't dead.
+ store i32 %v, i32 addrspace(203)* %addr, align 32
+ ret void
+}
+
+declare i32 addrspace(203)* @llvm.tpu.alloca.hbm(i32) #0
+attributes #0 = { nounwind }
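; For reference, the alloca size operand in @alloca_smem folds to a plain constant:
;   ptrtoint (getelementptr (float, float* null, i32 1))  = sizeof(float) = 4
;   mul 4, 2 = 8;  lshr 8, 5 = 0
; so instruction combine should be able to reduce the whole expression to i32 0.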
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/antidep_liverange_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/antidep_liverange_sc.ll
new file mode 100644
index 0000000..f54f57c
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/antidep_liverange_sc.ll
@@ -0,0 +1,156 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -disable-cgp \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; This test is derived from the SparseCore embedding kernel's segmented reduce. It checks whether we
+; successfully break an anti-dependence, reducing the kernel to 5 bundles.
+
+; CHECK-LABEL: tile_execute:
+; CHECK: .LBB[[b1:[0-9]+]]_[[b2:[0-9]+]]:
+; CHECK: {
+; CHECK: {
+; CHECK: {
+; CHECK: {
+; CHECK: {{.*}} .LBB[[b1]]_[[b2]]
+; CHECK: {
+
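; As a rough illustration (pseudo-assembly, not taken from this kernel), the kind of
; anti-dependence being broken is a write-after-read on a register:
;   v1 = vadd.f32 v0, v2     ; reads v0
;   v0 = vld [tilespmem:s0]  ; writes v0, so it cannot move above the vadd as-is
; Renaming the load's destination to a fresh register removes the constraint and lets
; the scheduler overlap the two, helping keep the loop body at 5 bundles.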
+; Function Attrs: nounwind
+define void @tile_execute() {
+entry:
+ %0 = load i32, i32* inttoptr (i32 258 to i32*), align 4, !tbaa !49
+ %1 = load <8 x i32> addrspace(201)*, <8 x i32> addrspace(201)** inttoptr (i32 260 to <8 x i32> addrspace(201)**), align 4, !tbaa !49
+ %2 = load <8 x i32> addrspace(201)*, <8 x i32> addrspace(201)** inttoptr (i32 261 to <8 x i32> addrspace(201)**), align 4, !tbaa !49
+ %3 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** inttoptr (i32 262 to <8 x float> addrspace(201)**), align 4, !tbaa !49
+ %4 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** inttoptr (i32 263 to <8 x float> addrspace(201)**), align 4, !tbaa !49
+ %5 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** inttoptr (i32 264 to <8 x float> addrspace(201)**), align 8, !tbaa !49
+ %div = sdiv i32 %0, 8
+ %6 = load <8 x i32>, <8 x i32> addrspace(201)* %1, align 32, !tbaa !53
+ %7 = load <8 x i32>, <8 x i32> addrspace(201)* %2, align 32, !tbaa !53
+ %8 = bitcast <8 x float> addrspace(201)* %3 to <8 x i32> addrspace(201)*
+ %9 = load <8 x i32>, <8 x i32> addrspace(201)* %8, align 32, !tbaa !53
+ %10 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %6, <8 x i32> %7) #6, !noalias !54
+ %11 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %6, <8 x i32> %9) #6, !noalias !57
+ %cmp106.i = icmp sgt i32 %0, 7
+ br i1 %cmp106.i, label %for.body.lr.ph.i, label %_ZN10embeddings15SegmentedReduce7ComputeEiNS_15TmemVectorArrayIiEES2_NS1_IfEES3_PS3_.exit, !llvm.loop !60
+
+for.body.lr.ph.i: ; preds = %entry
+ %12 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %11, 1
+ %13 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %10, 1
+ %14 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %10, 0
+ %sub.i = add nsw i32 %div, -1
+ br label %for.body.i
+
+for.cond.loopexit.i: ; preds = %for.body22.i
+ %15 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %21, 1
+ %16 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %23, 1
+ %exitcond = icmp eq i32 %add.i, %div
+ br i1 %exitcond, label %_ZN10embeddings15SegmentedReduce7ComputeEiNS_15TmemVectorArrayIiEES2_NS1_IfEES3_PS3_.exit, label %for.body.i, !llvm.loop !60
+
+for.body.i: ; preds = %for.cond.loopexit.i, %for.body.lr.ph.i
+ %sorted_gains.0110.in.i = phi <8 x i32> [ %12, %for.body.lr.ph.i ], [ %16, %for.cond.loopexit.i ]
+ %sorted_segments.0109.i = phi <8 x i32> [ %14, %for.body.lr.ph.i ], [ %22, %for.cond.loopexit.i ]
+ %sorted_indices.0108.i = phi <8 x i32> [ %13, %for.body.lr.ph.i ], [ %15, %for.cond.loopexit.i ]
+ %i.0107.i = phi i32 [ 0, %for.body.lr.ph.i ], [ %add.i, %for.cond.loopexit.i ]
+ %sorted_gains.0110.i = bitcast <8 x i32> %sorted_gains.0110.in.i to <8 x float>
+ %cmp7.i = icmp eq i32 %i.0107.i, %sub.i
+ %add.i = add nuw nsw i32 %i.0107.i, 1
+ %cond.i = select i1 %cmp7.i, i32 %i.0107.i, i32 %add.i
+ %add.ptr.i80.i = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %1, i32 %cond.i
+ %17 = load <8 x i32>, <8 x i32> addrspace(201)* %add.ptr.i80.i, align 32, !tbaa !53
+ %add.ptr.i78.i = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %2, i32 %cond.i
+ %18 = load <8 x i32>, <8 x i32> addrspace(201)* %add.ptr.i78.i, align 32, !tbaa !53
+ %add.ptr.i.i = getelementptr inbounds <8 x float>, <8 x float> addrspace(201)* %3, i32 %cond.i
+ %19 = bitcast <8 x float> addrspace(201)* %add.ptr.i.i to <8 x i32> addrspace(201)*
+ %20 = load <8 x i32>, <8 x i32> addrspace(201)* %19, align 32, !tbaa !53
+ %21 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %17, <8 x i32> %18) #6, !noalias !67
+ %22 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %21, 0
+ %23 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %17, <8 x i32> %20) #6, !noalias !70
+ %mul.i = shl <8 x i32> %sorted_indices.0108.i, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+ %mul30.i = shl <8 x i32> %sorted_segments.0109.i, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+ br label %for.body22.i
+
+for.body22.i: ; preds = %for.body22.i, %for.body.i
+ %j.0104.i = phi i32 [ 0, %for.body.i ], [ %inc.i, %for.body22.i ]
+ %splat.splatinsert23.i = insertelement <8 x i32> undef, i32 %j.0104.i, i32 0
+ %splat.splat24.i = shufflevector <8 x i32> %splat.splatinsert23.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %add25.i = add <8 x i32> %splat.splat24.i, %mul.i
+ %24 = tail call <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %4, <8 x i32> %add25.i), !llvm.access.group !62
+ %mul27.i = fmul <8 x float> %24, %sorted_gains.0110.i
+ %25 = tail call { <8 x float>, <8 x i1> } @llvm.tpu.deprecated.segreduce.addf(<8 x i32> %sorted_segments.0109.i, <8 x float> %mul27.i), !llvm.access.group !62
+ %26 = extractvalue { <8 x float>, <8 x i1> } %25, 0
+ %add33.i = add <8 x i32> %splat.splat24.i, %mul30.i
+ tail call void @llvm.tpu.vst.msk.idx.add.p201v8f32.v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %5, <8 x i32> %add33.i, <8 x float> %26), !llvm.access.group !62
+ %inc.i = add nuw nsw i32 %j.0104.i, 1
+ %exitcond.i = icmp eq i32 %inc.i, 32
+ br i1 %exitcond.i, label %for.cond.loopexit.i, label %for.body22.i, !llvm.loop !60
+
+_ZN10embeddings15SegmentedReduce7ComputeEiNS_15TmemVectorArrayIiEES2_NS1_IfEES3_PS3_.exit: ; preds = %for.cond.loopexit.i, %entry
+ store i32 1, i32* inttoptr (i32 256 to i32*), align 256, !tbaa !49
+ ret void
+}
+
+; Function Attrs: argmemonly nounwind readonly
+declare <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>) #3
+
+; Function Attrs: inaccessiblememonly nounwind
+declare { <8 x float>, <8 x i1> } @llvm.tpu.deprecated.segreduce.addf(<8 x i32>, <8 x float>) #4
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.tpu.vst.msk.idx.add.p201v8f32.v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>, <8 x float>) #5
+
+; Function Attrs: inaccessiblememonly nounwind
+declare { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1>, <8 x i32>, <8 x i32>) #4
+
+attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tac-vf" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="256" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { argmemonly nounwind readonly }
+attributes #4 = { inaccessiblememonly nounwind }
+attributes #5 = { argmemonly nounwind willreturn }
+attributes #6 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+!smem.funcs.spill = !{!2}
+!tilespmem.funcs.spill = !{!2}
+!vmem.funcs.spill = !{!2}
+!smem.ranges.spill.start = !{!45}
+!smem.ranges.spill.limit = !{!46}
+!tilespmem.ranges.spill.start = !{!45}
+!tilespmem.ranges.spill.limit = !{!45}
+!vmem.ranges.spill.start = !{!45}
+!vmem.ranges.spill.limit = !{!45}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version google3-trunk (fee41517fe0f7ff9f0e204dd9200ebf32ca03cb8)"}
+!2 = !{void ()* @tile_execute}
+!45 = !{i32 0}
+!46 = !{i32 1024}
+!47 = !{i32 8192}
+!48 = !{i32 131072}
+!49 = !{!50, !50, i64 0}
+!50 = !{!"int", !51, i64 0}
+!51 = !{!"omnipotent char", !52, i64 0}
+!52 = !{!"Simple C++ TBAA"}
+!53 = !{!51, !51, i64 0}
+!54 = !{!55}
+!55 = distinct !{!55, !56, !"_ZN10embeddings10VectorSortIDv8_iEENS_10SortResultIT_EES1_S1_S3_: %agg.result"}
+!56 = distinct !{!56, !"_ZN10embeddings10VectorSortIDv8_iEENS_10SortResultIT_EES1_S1_S3_"}
+!57 = !{!58}
+!58 = distinct !{!58, !59, !"_ZN10embeddings10VectorSortIDv8_fEENS_10SortResultIT_EEDv8_iS5_S3_: %agg.result"}
+!59 = distinct !{!59, !"_ZN10embeddings10VectorSortIDv8_fEENS_10SortResultIT_EEDv8_iS5_S3_"}
+!60 = distinct !{!60, !61, !63, !64, !65, !66}
+!61 = !{!"llvm.loop.parallel_accesses", !62}
+!62 = distinct !{}
+!63 = !{!"llvm.loop.unroll.disable"}
+!64 = !{!"llvm.loop.vectorize.width", i32 1}
+!65 = !{!"llvm.loop.interleave.count", i32 1}
+!66 = !{!"llvm.loop.vectorize.enable", i1 true}
+!67 = !{!68}
+!68 = distinct !{!68, !69, !"_ZN10embeddings10VectorSortIDv8_iEENS_10SortResultIT_EES1_S1_S3_: %agg.result"}
+!69 = distinct !{!69, !"_ZN10embeddings10VectorSortIDv8_iEENS_10SortResultIT_EES1_S1_S3_"}
+!70 = !{!71}
+!71 = distinct !{!71, !72, !"_ZN10embeddings10VectorSortIDv8_fEENS_10SortResultIT_EEDv8_iS5_S3_: %agg.result"}
+!72 = distinct !{!72, !"_ZN10embeddings10VectorSortIDv8_fEENS_10SortResultIT_EEDv8_iS5_S3_"}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/barnacore.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/barnacore.ll
new file mode 100644
index 0000000..4687b81
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/barnacore.ll
@@ -0,0 +1,224 @@
+; RUN: llc < %s -march=googletpu -mcpu=barnacore-cc-pf -disable-cgp -print-encoding-annotations | FileCheck %s
+; REQUIRES: tpu
+
+declare [12 x <8 x float>] @llvm.tpu.bc.insertvalue.loopindex([12 x <8 x float>], <8 x float>)
+declare <8 x float> @llvm.tpu.bc.load.aliaddr(<8 x float> addrspace(207)*, i32) readonly argmemonly
+declare void @llvm.tpu.bc.store.aliaddr(<8 x float>, <8 x float> addrspace(207)*, i32) argmemonly
+declare <8 x float> addrspace(207)* @llvm.tpu.inttoptr.p207v8f32(i32) nounwind
+
+; Takes a set of 12 vectors in v0..v11. Loads a value from bmem into
+; [v0 + (loop_index)].
+; CHECK-LABEL: insertvalue_loopindex_1:
+; CHECK: v0.ali = vld.f32 [bmem:$0x2a]
+; CHECK-NEXT: #HALT
+define [12 x <8 x float>] @insertvalue_loopindex_1( [12 x <8 x float>] %regs) {
+ %a = call <8 x float> addrspace(207)* @llvm.tpu.inttoptr.p207v8f32(i32 42)
+ %b = load <8 x float>, <8 x float> addrspace(207)* %a
+ %c = call [12 x <8 x float>] @llvm.tpu.bc.insertvalue.loopindex([12 x <8 x float>] %regs, <8 x float> %b)
+ ret [12 x <8 x float>] %c
+}
+
+; CHECK-LABEL: insertvalue_loopindex_carried:
+; CHECK: v0.ali = vld.f32 [bmem:$0x2a]
+; CHECK: v0.ali = vld.f32 [bmem:$0x2]
+; CHECK-NEXT: #HALT
+define [12 x <8 x float>] @insertvalue_loopindex_carried( [12 x <8 x float>] %regs) {
+ %a = call <8 x float> addrspace(207)* @llvm.tpu.inttoptr.p207v8f32(i32 42)
+ %b = load <8 x float>, <8 x float> addrspace(207)* %a
+ %c = call [12 x <8 x float>] @llvm.tpu.bc.insertvalue.loopindex([12 x <8 x float>] %regs, <8 x float> %b)
+ %d = call <8 x float> addrspace(207)* @llvm.tpu.inttoptr.p207v8f32(i32 2)
+ %e = load <8 x float>, <8 x float> addrspace(207)* %d
+ %f = call [12 x <8 x float>] @llvm.tpu.bc.insertvalue.loopindex([12 x <8 x float>] %c, <8 x float> %e)
+ ret [12 x <8 x float>] %f
+}
+
+; CHECK-LABEL: load_aliaddr:
+; CHECK: v0 = vld.f32 [bmem:$0x2a] ali_addr:$0x2;
+define <8 x float> @load_aliaddr() {
+ %a = call <8 x float> addrspace(207)* @llvm.tpu.inttoptr.p207v8f32(i32 42)
+ %b = call <8 x float> @llvm.tpu.bc.load.aliaddr(<8 x float> addrspace(207)* %a, i32 2)
+ ret <8 x float> %b
+}
+
+; CHECK-LABEL: consecutive_regs:
+; Check that the second return value is allocated v12.
+; CHECK: v12{{(.ali)?}} = vld.f32 [bmem:$0x2a]
+; CHECK-NEXT: #HALT
+; FIXME: The .ali on v12 is incorrect. It happens because the whole returned
+; struct is flattened, so we can't correctly work out that the last argument
+; isn't an aggregate.
+define {[12 x <8 x float>], <8 x float>} @consecutive_regs( [12 x <8 x float>] %regs) {
+ %a = call <8 x float> addrspace(207)* @llvm.tpu.inttoptr.p207v8f32(i32 42)
+ %b = load <8 x float>, <8 x float> addrspace(207)* %a
+
+ %r1 = insertvalue {[12 x <8 x float>], <8 x float>} undef, [12 x <8 x float>] %regs, 0
+ %r2 = insertvalue {[12 x <8 x float>], <8 x float>} %r1, <8 x float> %b, 1
+ ret {[12 x <8 x float>], <8 x float>} %r2
+}
+
+; CHECK-LABEL: store_aliaddr:
+; CHECK: [bmem:$0x2a] = vst.f32 v0 ali_addr:$0x3;
+define void @store_aliaddr(<8 x float> %a) {
+ %b = call <8 x float> addrspace(207)* @llvm.tpu.inttoptr.p207v8f32(i32 42)
+ call void @llvm.tpu.bc.store.aliaddr(<8 x float> %a, <8 x float> addrspace(207)* %b, i32 3)
+ ret void
+}
+
+; CHECK-LABEL: storer_aliaddr:
+; CHECK: [bmem:s0] = vst.f32 v0 ali_addr:$0x3;
+define void @storer_aliaddr(<8 x float> %a, <8 x float> addrspace(207)* %addr) {
+ call void @llvm.tpu.bc.store.aliaddr(<8 x float> %a, <8 x float> addrspace(207)* %addr, i32 3)
+ ret void
+}
+
+declare <8 x float> @llvm.tpu.bc.extractvalue.loopindex([14 x <8 x float>])
+
+; CHECK-LABEL: storer_alisrc
+; CHECK: [bmem:s0] = vst.f32 v0.ali
+define void @storer_alisrc([14 x <8 x float>] %accum, <8 x float> addrspace(207)* %addr) {
+ %a = call <8 x float> @llvm.tpu.bc.extractvalue.loopindex([14 x <8 x float>] %accum)
+ store <8 x float> %a, <8 x float> addrspace(207)* %addr
+ ret void
+}
+
+; CHECK-LABEL: storer_alisrc_aliaddr
+; CHECK: [bmem:$0x2a] = vst.f32 v3.ali ali_addr:$0x1
+define void @storer_alisrc_aliaddr([3 x <8 x float>] %dummy, [14 x <8 x float>] %accum) {
+ %a = call <8 x float> @llvm.tpu.bc.extractvalue.loopindex([14 x <8 x float>] %accum)
+ %b = call <8 x float> addrspace(207)* @llvm.tpu.inttoptr.p207v8f32(i32 42)
+ call void @llvm.tpu.bc.store.aliaddr(<8 x float> %a, <8 x float> addrspace(207)* %b, i32 1)
+ ret void
+}
+
+declare void @llvm.tpu.bc.store.concat(<8 x float>) inaccessiblememonly
+
+; CHECK-LABEL: store_concat:
+; CHECK: (concat_reg) = vst.f32 v0
+define void @store_concat(<8 x float> %v) {
+ call void @llvm.tpu.bc.store.concat(<8 x float> %v)
+ ret void
+}
+
+declare <8 x float> @llvm.tpu.bc.extractvalue.loopindex.a5v8f([5 x <8 x float>])
+
+; CHECK-LABEL: store_concat_alisrc:
+; CHECK: (concat_reg) = vst.f32 v1.ali
+define void @store_concat_alisrc(<8 x float> %dummy, [5 x <8 x float>] %accum) {
+ %a = call <8 x float> @llvm.tpu.bc.extractvalue.loopindex.a5v8f([5 x <8 x float>] %accum)
+ call void @llvm.tpu.bc.store.concat(<8 x float> %a)
+ ret void
+}
+
+
+; CHECK-LABEL: vadd_f32_rr:
+; CHECK: v0 = vadd.f32 v1, v0
+define <8 x float> @vadd_f32_rr(<8 x float> %a, <8 x float> %b) {
+ %c = fadd <8 x float> %a, %b
+ ret <8 x float> %c
+}
+
+; CHECK-LABEL: vadd_f32_ri:
+; CHECK: v0 = vadd.f32 $1.0, v0
+define <8 x float> @vadd_f32_ri(<8 x float> %a, <8 x float> %b) {
+ %c = fadd <8 x float> %a, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
+ ret <8 x float> %c
+}
+
+; CHECK-LABEL: vor_i32_rs:
+; CHECK: v0 = vor.u32 s0, v1
+define <8 x i32> @vor_i32_rs(<8 x i32> %a, <8 x i32> %b, i32 %c) {
+ %d = insertelement <8 x i32> undef, i32 %c, i32 0
+ %e = shufflevector <8 x i32> %d, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %f = or <8 x i32> %b, %e
+ ret <8 x i32> %f
+}
+
+declare [5 x <8 x float>] @llvm.tpu.bc.insertvalue.loopindex.a5vf32([5 x <8 x float>], <8 x float>)
+
+; CHECK-LABEL: vmul_f32_rr_alidst:
+; CHECK: v0.ali = vmul.f32 v5, v5
+define [5 x <8 x float>] @vmul_f32_rr_alidst([5 x <8 x float>] %accum, <8 x float> %b) {
+ %c = fmul <8 x float> %b, %b
+ %ins = call [5 x <8 x float>] @llvm.tpu.bc.insertvalue.loopindex.a5vf32([5 x <8 x float>] %accum, <8 x float> %c)
+ ret [5 x <8 x float>] %ins
+}
+
+; CHECK-LABEL: vmul_f32_rr_alix_alidst:
+; CHECK: v0.ali = vmul.f32 v10, v5.ali
+define [5 x <8 x float>] @vmul_f32_rr_alix_alidst([5 x <8 x float>] %accum, [ 5 x <8 x float>] %xs, <8 x float> %b) {
+ %x = call <8 x float> @llvm.tpu.bc.extractvalue.loopindex.a5v8f([5 x <8 x float>] %xs)
+ %c = fmul <8 x float> %x, %b
+ %ins = call [5 x <8 x float>] @llvm.tpu.bc.insertvalue.loopindex.a5vf32([5 x <8 x float>] %accum, <8 x float> %c)
+ ret [5 x <8 x float>] %ins
+}
+
+; CHECK-LABEL: vadd_f32_ri_alix_alidst:
+; CHECK: v0.ali = vadd.f32 $1.0, v5.ali
+define [5 x <8 x float>] @vadd_f32_ri_alix_alidst([5 x <8 x float>] %accum, [ 5 x <8 x float>] %xs, <8 x float> %b) {
+ %x = call <8 x float> @llvm.tpu.bc.extractvalue.loopindex.a5v8f([5 x <8 x float>] %xs)
+ %c = fadd <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
+ %ins = call [5 x <8 x float>] @llvm.tpu.bc.insertvalue.loopindex.a5vf32([5 x <8 x float>] %accum, <8 x float> %c)
+ ret [5 x <8 x float>] %ins
+}
+
+; CHECK-LABEL: vmul_f32_rr_alix_aliy_alidst:
+; CHECK: v0.ali = vmul.f32 v0.ali, v5.ali
+define [5 x <8 x float>] @vmul_f32_rr_alix_aliy_alidst([5 x <8 x float>] %accum, [ 5 x <8 x float>] %xs, <8 x float> %b) {
+ %x = call <8 x float> @llvm.tpu.bc.extractvalue.loopindex.a5v8f([5 x <8 x float>] %xs)
+ %y = call <8 x float> @llvm.tpu.bc.extractvalue.loopindex.a5v8f([5 x <8 x float>] %accum)
+ %c = fmul <8 x float> %x, %y
+ %ins = call [5 x <8 x float>] @llvm.tpu.bc.insertvalue.loopindex.a5vf32([5 x <8 x float>] %accum, <8 x float> %c)
+ ret [5 x <8 x float>] %ins
+}
+
+; CHECK-LABEL: vsub_f32_rr_aliy_alix_alidst:
+; CHECK: v0.ali = vsub.f32 v5.ali, v5.ali
+define [5 x <8 x float>] @vsub_f32_rr_aliy_alix_alidst([5 x <8 x float>] %accum, [ 5 x <8 x float>] %xs, <8 x float> %b) {
+ %x = call <8 x float> @llvm.tpu.bc.extractvalue.loopindex.a5v8f([5 x <8 x float>] %xs)
+ %c = fsub <8 x float> %x, %x
+ %ins = call [5 x <8 x float>] @llvm.tpu.bc.insertvalue.loopindex.a5vf32([5 x <8 x float>] %accum, <8 x float> %c)
+ ret [5 x <8 x float>] %ins
+}
+
+; CHECK-LABEL: vsub_f32_rr_aliy_alidst:
+; CHECK: v0.ali = vsub.f32 v5.ali, v10
+define [5 x <8 x float>] @vsub_f32_rr_aliy_alidst([5 x <8 x float>] %accum, [ 5 x <8 x float>] %xs, <8 x float> %b) {
+ %x = call <8 x float> @llvm.tpu.bc.extractvalue.loopindex.a5v8f([5 x <8 x float>] %xs)
+ %c = fsub <8 x float> %x, %b
+ %ins = call [5 x <8 x float>] @llvm.tpu.bc.insertvalue.loopindex.a5vf32([5 x <8 x float>] %accum, <8 x float> %c)
+ ret [5 x <8 x float>] %ins
+}
+
+declare <8 x float> @llvm.tpu.bc.select.predicate(i32, <8 x float>, <8 x float>)
+
+; CHECK-LABEL: vsel_pred:
+; CHECK: v0 = vmov @new_token_id v1
+define <8 x float> @vsel_pred(<8 x float> %a, <8 x float> %b) {
+ %c = call <8 x float> @llvm.tpu.bc.select.predicate(i32 19, <8 x float> %a, <8 x float> %b)
+ ret <8 x float> %c
+}
+
+; CHECK-LABEL: vsel_pred2:
+; CHECK: v0 = vimm.f32 @!new_feature_id $1
+define <8 x float> @vsel_pred2(<8 x float> %a, <8 x float> %b) {
+ %c = call <8 x float> @llvm.tpu.bc.select.predicate(i32 2, <8 x float> %a, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
+ ret <8 x float> %c
+}
+
+declare <8 x float> @llvm.tpu.clamp.symmetric(<8 x float>, <8 x float>) readnone
+
+; CHECK-LABEL: clamp_symm:
+; CHECK: v0 = vclamps.f32 v0, v1
+define <8 x float> @clamp_symm(<8 x float> %a, <8 x float> %b) {
+ %c = call <8 x float> @llvm.tpu.clamp.symmetric(<8 x float> %a, <8 x float> %b) readnone
+ ret <8 x float> %c
+}
+
+; CHECK-LABEL: cmpselect:
+; CHECK: vm{{[0-9]+}} = veq.s32 v0, $0x1 }
+; CHECK: (slot_v0) v0 = vsel vm{{[0-9]+}}, $0x0, v1;
+define <8 x float> @cmpselect(<8 x i32> %a, <8 x float> %b) {
+ %c = icmp eq <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %r = select <8 x i1> %c, <8 x float> zeroinitializer, <8 x float> %b
+ ret <8 x float> %r
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/barrier_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/barrier_sc.ll
new file mode 100644
index 0000000..63e73eb
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/barrier_sc.ll
@@ -0,0 +1,63 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare void @llvm.tpu.barrier(i32, i32)
+
+; CHECK-LABEL: barrier0i
+; CHECK: [bflag:$0x0] = sbarrier.arrive $0xf
+define void @barrier0i() {
+ call void @llvm.tpu.barrier(i32 0, i32 15)
+ ret void
+}
+
+; CHECK-LABEL: barrier1i
+; CHECK: [bflag:$0x1] = sbarrier.arrive $0xf
+define void @barrier1i() {
+ call void @llvm.tpu.barrier(i32 1, i32 15)
+ ret void
+}
+
+; CHECK-LABEL: barrier2i
+; CHECK: [bflag:$0x2] = sbarrier.arrive $0xf
+define void @barrier2i() {
+ call void @llvm.tpu.barrier(i32 2, i32 15)
+ ret void
+}
+
+; CHECK-LABEL: barrier3i
+; CHECK: [bflag:$0x3] = sbarrier.arrive $0xf
+define void @barrier3i() {
+ call void @llvm.tpu.barrier(i32 3, i32 15)
+ ret void
+}
+
+; CHECK-LABEL: barrier0r
+; CHECK: [bflag:$0x0] = sbarrier.arrive s0
+define void @barrier0r(i32 %m) {
+ call void @llvm.tpu.barrier(i32 0, i32 %m)
+ ret void
+}
+
+; CHECK-LABEL: barrier1r
+; CHECK: [bflag:$0x1] = sbarrier.arrive s0
+define void @barrier1r(i32 %m) {
+ call void @llvm.tpu.barrier(i32 1, i32 %m)
+ ret void
+}
+
+; CHECK-LABEL: barrier2r
+; CHECK: [bflag:$0x2] = sbarrier.arrive s0
+define void @barrier2r(i32 %m) {
+ call void @llvm.tpu.barrier(i32 2, i32 %m)
+ ret void
+}
+
+; CHECK-LABEL: barrier3r
+; CHECK: [bflag:$0x3] = sbarrier.arrive s0
+define void @barrier3r(i32 %m) {
+ call void @llvm.tpu.barrier(i32 3, i32 %m)
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/branch.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/branch.ll
new file mode 100644
index 0000000..a1cbcce
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/branch.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=true -disable-cgp \
+; RUN: -tpu-enable-early-if-predicator=false | FileCheck %s
+; REQUIRES: tpu
+
+; Test that simple branching works.
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; CHECK-LABEL: br_forward:
+; CHECK: [smem:s0] = sst @!p0 s1
+; CHECK-NEXT: _ = shalt
+define i1 @br_forward(i1 %p, i32* %x, i32 %y) {
+ br i1 %p, label %next, label %one
+
+one:
+ store i32 %y, i32* %x
+ br label %next
+next:
+ ret i1 %p
+}
+
+; CHECK-LABEL: br_backward:
+; CHECK: bb.0:
+; CHECK-NEXT: s2 = sadd.s32 $0x1, s4
+; CHECK-NEXT: p2 = por !p2, !p2
+; CHECK: .Ltmp0
+; CHECK-NEXT: { (pc) = sbr.rel @p0 .LBB1_4-.Ltmp0 }
+; CHECK-NEXT: { _ = snop }
+; CHECK-NEXT: { _ = snop }
+; CHECK: { s3 = sld [smem:s1+$0x0] }
+; CHECK: (pc) = sbr.rel @p1 .LBB1_4-.Ltmp1
+; CHECK-NEXT: { _ = snop }
+; CHECK-NEXT: { [smem:s0] = sst s3 }
+; CHECK: (pc) = sbr.rel @p2 .LBB1_1-.Ltmp2
+; CHECK-NEXT: { _ = snop }
+; CHECK-NEXT: { [smem:s0] = sst s2 }
+define i1 @br_backward(i1 %p1, i1 %p2, i1 %p3, i1 %p4, i32* %x, i32* %xx, i32* %xxx, i32 %y, i32 %z) {
+entry:
+ %zz = add i32 %z, 1
+ br label %begin
+
+begin:
+ br i1 %p1, label %next, label %one
+
+one:
+ %yy = load i32, i32* %xx
+ store i32 %yy, i32* %x
+ br i1 %p2, label %next, label %two
+
+two:
+ store i32 %zz, i32* %x
+ br i1 %p3, label %next, label %begin
+
+three:
+ %zzz = load i32, i32* %xxx
+ store i32 %zzz, i32* %x
+ br i1 %p4, label %next, label %begin
+
+next:
+ ret i1 %p1
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/bundle_packer_corner_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/bundle_packer_corner_sc.ll
new file mode 100644
index 0000000..d327893
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/bundle_packer_corner_sc.ll
@@ -0,0 +1,140 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+; This test checks that we successfully avoid a vnop in a software-pipelined loop that the bundle
+; packer heuristic for bottom edges should handle.
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; CHECK-LABEL: .LBB0_4:
+; CHECK-NOT: vnop
+; CHECK-NOT: vdelay
+; CHECK: mov
+
+define void @tile_execute() {
+entry:
+ %0 = load i32, i32* inttoptr (i32 258 to i32*), align 4, !tbaa !4
+ %1 = load <8 x i32> addrspace(201)*, <8 x i32> addrspace(201)** inttoptr (i32 260 to <8 x i32> addrspace(201)**), align 4, !tbaa !4
+ %2 = load <8 x i32> addrspace(201)*, <8 x i32> addrspace(201)** inttoptr (i32 261 to <8 x i32> addrspace(201)**), align 4, !tbaa !4
+ %3 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** inttoptr (i32 262 to <8 x float> addrspace(201)**), align 4, !tbaa !4
+ %4 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** inttoptr (i32 263 to <8 x float> addrspace(201)**), align 4, !tbaa !4
+ %5 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** inttoptr (i32 264 to <8 x float> addrspace(201)**), align 8, !tbaa !4
+ %div = sdiv i32 %0, 8
+ %6 = load <8 x i32>, <8 x i32> addrspace(201)* %1, align 32, !tbaa !8
+ %7 = load <8 x i32>, <8 x i32> addrspace(201)* %2, align 32, !tbaa !8
+ %8 = bitcast <8 x float> addrspace(201)* %3 to <8 x i32> addrspace(201)*
+ %9 = load <8 x i32>, <8 x i32> addrspace(201)* %8, align 32, !tbaa !8
+ %10 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %6, <8 x i32> %7), !noalias !9
+ %11 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %6, <8 x i32> %9), !noalias !12
+ %cmp106.i = icmp sgt i32 %0, 7
+ br i1 %cmp106.i, label %for.body.lr.ph.i, label %_ZN10embeddings15SegmentedReduce7ComputeEiNS_15TmemVectorArrayIiEES2_NS1_IfEES3_PS3_.exit
+
+for.body.lr.ph.i: ; preds = %entry
+ %12 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %11, 1
+ %13 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %10, 1
+ %14 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %10, 0
+ %sub.i = add nsw i32 %div, -1
+ %15 = bitcast <8 x float> addrspace(201)* %5 to <8 x i32> addrspace(201)*
+ br label %for.body.i
+
+for.cond.loopexit.i: ; preds = %for.body22.i
+ %16 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %23, 0
+ %17 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %23, 1
+ %18 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %24, 1
+ %exitcond = icmp eq i32 %add.i, %div
+ br i1 %exitcond, label %_ZN10embeddings15SegmentedReduce7ComputeEiNS_15TmemVectorArrayIiEES2_NS1_IfEES3_PS3_.exit, label %for.body.i
+
+for.body.i: ; preds = %for.cond.loopexit.i, %for.body.lr.ph.i
+ %sorted_gains.0110.in.i = phi <8 x i32> [ %12, %for.body.lr.ph.i ], [ %18, %for.cond.loopexit.i ]
+ %sorted_segments.0109.i = phi <8 x i32> [ %14, %for.body.lr.ph.i ], [ %16, %for.cond.loopexit.i ]
+ %sorted_indices.0108.i = phi <8 x i32> [ %13, %for.body.lr.ph.i ], [ %17, %for.cond.loopexit.i ]
+ %i.0107.i = phi i32 [ 0, %for.body.lr.ph.i ], [ %add.i, %for.cond.loopexit.i ]
+ %sorted_gains.0110.i = bitcast <8 x i32> %sorted_gains.0110.in.i to <8 x float>
+ %cmp7.i = icmp eq i32 %i.0107.i, %sub.i
+ %add.i = add nuw nsw i32 %i.0107.i, 1
+ %cond.i = select i1 %cmp7.i, i32 %i.0107.i, i32 %add.i
+ %add.ptr.i81.i = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %1, i32 %cond.i
+ %19 = load <8 x i32>, <8 x i32> addrspace(201)* %add.ptr.i81.i, align 32, !tbaa !8
+ %add.ptr.i79.i = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %2, i32 %cond.i
+ %20 = load <8 x i32>, <8 x i32> addrspace(201)* %add.ptr.i79.i, align 32, !tbaa !8
+ %add.ptr.i.i = getelementptr inbounds <8 x float>, <8 x float> addrspace(201)* %3, i32 %cond.i
+ %21 = bitcast <8 x float> addrspace(201)* %add.ptr.i.i to <8 x i32> addrspace(201)*
+ %22 = load <8 x i32>, <8 x i32> addrspace(201)* %21, align 32, !tbaa !8
+ %23 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %19, <8 x i32> %20), !noalias !15
+ %24 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %19, <8 x i32> %22), !noalias !18
+ %mul.i = shl <8 x i32> %sorted_indices.0108.i, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+ %mul26.i = shl <8 x i32> %sorted_segments.0109.i, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+ br label %for.body22.i
+
+for.body22.i: ; preds = %for.body22.i, %for.body.i
+ %j.0104.i = phi i32 [ 0, %for.body.i ], [ %inc.i, %for.body22.i ]
+ %splat.splatinsert.i = insertelement <8 x i32> undef, i32 %j.0104.i, i32 0
+ %splat.splat.i = shufflevector <8 x i32> %splat.splatinsert.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %add23.i = add <8 x i32> %splat.splat.i, %mul.i
+ %25 = tail call <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %4, <8 x i32> %add23.i), !llvm.access.group !21
+ %mul25.i = fmul <8 x float> %25, %sorted_gains.0110.i
+ %26 = tail call { <8 x float>, <8 x i1> } @llvm.tpu.deprecated.segreduce.addf(<8 x i32> %sorted_segments.0109.i, <8 x float> %mul25.i)
+ %27 = extractvalue { <8 x float>, <8 x i1> } %26, 0
+ %28 = extractvalue { <8 x float>, <8 x i1> } %26, 1
+ %add29.i = add <8 x i32> %splat.splat.i, %mul26.i
+ %29 = tail call <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %5, <8 x i32> %add29.i), !llvm.access.group !21
+ %add31.i = fadd <8 x float> %27, %29
+ %30 = bitcast <8 x float> %add31.i to <8 x i32>
+ tail call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> %28, <8 x i32> addrspace(201)* %15, <8 x i32> %add29.i, <8 x i32> %30)
+ %inc.i = add nuw nsw i32 %j.0104.i, 1
+ %exitcond.i = icmp eq i32 %inc.i, 32
+ br i1 %exitcond.i, label %for.cond.loopexit.i, label %for.body22.i, !llvm.loop !22
+
+_ZN10embeddings15SegmentedReduce7ComputeEiNS_15TmemVectorArrayIiEES2_NS1_IfEES3_PS3_.exit: ; preds = %for.cond.loopexit.i, %entry
+ store i32 1, i32* inttoptr (i32 256 to i32*), align 256, !tbaa !4
+ ret void
+}
+
+; Function Attrs: argmemonly nounwind readonly
+declare <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>)
+
+; Function Attrs: inaccessiblememonly nounwind
+declare { <8 x float>, <8 x i1> } @llvm.tpu.deprecated.segreduce.addf(<8 x i32>, <8 x float>)
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, <8 x i32>)
+
+; Function Attrs: inaccessiblememonly nounwind
+declare { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1>, <8 x i32>, <8 x i32>)
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+!smem.spill.start = !{!2}
+!smem.spill.limit = !{!3}
+!vmem.spill.start = !{!2}
+!vmem.spill.limit = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version google3-trunk (835c81923efee0cef1c64b25a34cf0872fa1e634)"}
+!2 = !{i32 0}
+!3 = !{i32 1024}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"int", !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C++ TBAA"}
+!8 = !{!6, !6, i64 0}
+!9 = !{!10}
+!10 = distinct !{!10, !11, !"_ZN10embeddings10VectorSortIDv8_iEENS_10SortResultIT_EES1_S1_S3_: %agg.result"}
+!11 = distinct !{!11, !"_ZN10embeddings10VectorSortIDv8_iEENS_10SortResultIT_EES1_S1_S3_"}
+!12 = !{!13}
+!13 = distinct !{!13, !14, !"_ZN10embeddings10VectorSortIDv8_fEENS_10SortResultIT_EEDv8_iS5_S3_: %agg.result"}
+!14 = distinct !{!14, !"_ZN10embeddings10VectorSortIDv8_fEENS_10SortResultIT_EEDv8_iS5_S3_"}
+!15 = !{!16}
+!16 = distinct !{!16, !17, !"_ZN10embeddings10VectorSortIDv8_iEENS_10SortResultIT_EES1_S1_S3_: %agg.result"}
+!17 = distinct !{!17, !"_ZN10embeddings10VectorSortIDv8_iEENS_10SortResultIT_EES1_S1_S3_"}
+!18 = !{!19}
+!19 = distinct !{!19, !20, !"_ZN10embeddings10VectorSortIDv8_fEENS_10SortResultIT_EEDv8_iS5_S3_: %agg.result"}
+!20 = distinct !{!20, !"_ZN10embeddings10VectorSortIDv8_fEENS_10SortResultIT_EEDv8_iS5_S3_"}
+!21 = distinct !{}
+!22 = distinct !{!22, !23, !24, !25, !26, !27}
+!23 = !{!"llvm.loop.parallel_accesses", !21}
+!24 = !{!"llvm.loop.unroll.disable"}
+!25 = !{!"llvm.loop.vectorize.width", i32 1}
+!26 = !{!"llvm.loop.interleave.count", i32 1}
+!27 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/bundle_packer_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/bundle_packer_sc.ll
new file mode 100644
index 0000000..f9f41d8
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/bundle_packer_sc.ll
@@ -0,0 +1,195 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -disable-cgp | FileCheck %s -check-prefixes CHECK-NO-NOP,CHECK
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -disable-cgp -tpu-latencies=%S/Inputs/long_push1.yml | FileCheck %s -check-prefixes CHECK-LONG1,CHECK
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -disable-cgp -tpu-latencies=%S/Inputs/long_push2.yml | FileCheck %s -check-prefixes CHECK-LONG2,CHECK
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -disable-cgp -tpu-latencies=%S/Inputs/long_push3.yml | FileCheck %s -check-prefixes CHECK-LONG3,CHECK
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; This test is taken from the segmented reduce SparseCore kernel. It checks whether we
+; successfully recognize clearance due to different push/pop stages. The different
+; latency overrides lead to different instruction orders and pipeline stages before
+; bundle packing, exposing the following cases to the bundle packer:
+;
+; Default:
+; 3 x COPY
+; POP stage 4
+; PUSH stage 1
+; POP stage 4
+;
+; long_push1:
+; 4 x COPY
+; PUSH stage 1
+; POP stage 5
+; POP stage 5
+;
+; long_push2:
+; 2 x COPY
+; POP stage 3
+; PUSH stage 1
+; POP stage 3
+;
+; long_push3:
+; 10 x COPY
+; POP stage 14
+; PUSH stage 1
+; POP stage 13
+
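; Rough intuition for the clearance (an assumed reading of the stage numbers above):
; when a FIFO push lands in an earlier software-pipeline stage than its matching pop,
; for example PUSH at stage 1 and POP at stage 4 in the default case, the work issued
; in the stages in between already provides the required separation, so the bundle
; packer should not need vdelay/vnop filler between the push and the pop.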
+; Function Attrs: argmemonly nounwind readonly
+declare <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>)
+
+; Function Attrs: inaccessiblememonly nounwind
+declare { <8 x float>, <8 x i1> } @llvm.tpu.deprecated.segreduce.addf(<8 x i32>, <8 x float>)
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, <8 x i32>)
+
+; Function Attrs: inaccessiblememonly nounwind
+declare { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1>, <8 x i32>, <8 x i32>)
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+!smem.spill.start = !{!2}
+!smem.spill.limit = !{!3}
+!vmem.spill.start = !{!2}
+!vmem.spill.limit = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version google3-trunk (676c29694c5444ca3c63067770dfac0f37158797)"}
+!2 = !{i32 0}
+!3 = !{i32 1024}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"int", !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C++ TBAA"}
+!8 = !{!6, !6, i64 0}
+!9 = !{!10}
+!10 = distinct !{!10, !11, !"_ZN10embeddings10VectorSortIDv8_iEENS_10SortResultIT_EES1_S1_S3_: %agg.result"}
+!11 = distinct !{!11, !"_ZN10embeddings10VectorSortIDv8_iEENS_10SortResultIT_EES1_S1_S3_"}
+!12 = !{!13}
+!13 = distinct !{!13, !14, !"_ZN10embeddings10VectorSortIDv8_fEENS_10SortResultIT_EEDv8_iS5_S3_: %agg.result"}
+!14 = distinct !{!14, !"_ZN10embeddings10VectorSortIDv8_fEENS_10SortResultIT_EEDv8_iS5_S3_"}
+!15 = !{!16}
+!16 = distinct !{!16, !17, !"_ZN10embeddings10VectorSortIDv8_iEENS_10SortResultIT_EES1_S1_S3_: %agg.result"}
+!17 = distinct !{!17, !"_ZN10embeddings10VectorSortIDv8_iEENS_10SortResultIT_EES1_S1_S3_"}
+!18 = !{!19}
+!19 = distinct !{!19, !20, !"_ZN10embeddings10VectorSortIDv8_fEENS_10SortResultIT_EEDv8_iS5_S3_: %agg.result"}
+!20 = distinct !{!20, !"_ZN10embeddings10VectorSortIDv8_fEENS_10SortResultIT_EEDv8_iS5_S3_"}
+!21 = distinct !{}
+!22 = distinct !{!22, !23, !24, !25, !26, !27}
+!23 = !{!"llvm.loop.parallel_accesses", !21}
+!24 = !{!"llvm.loop.unroll.disable"}
+!25 = !{!"llvm.loop.vectorize.width", i32 1}
+!26 = !{!"llvm.loop.interleave.count", i32 1}
+!27 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+; The different latency cases exhibit different schedules; some cases expose nops.
+; TODO(hgreving): the nops get reduced by the fifo stage difference analysis, but
+; check whether we can do even better.
+
+; CHECK-LABEL: tile_execute:
+; CHECK-LABEL: .LBB0_4:
+; CHECK-NO-NOP-NOT: { _ = vdelay
+; CHECK-NO-NOP-NOT: { _ = vnop
+; CHECK-NO-NOP: {
+; CHECK-NO-NOP: {
+; CHECK-NO-NOP: {{.*}} .LBB0_4
+; CHECK-NO-NOP: {
+; CHECK-LONG1-NOT: { _ = vdelay
+; CHECK-LONG1-NOT: { _ = vnop
+; CHECK-LONG1: {
+; CHECK-LONG1: {
+; CHECK-LONG1: {
+; CHECK-LONG1: {{.*}} .LBB0_4
+; CHECK-LONG1: {
+; CHECK-LONG2-NOT: { _ = vdelay
+; CHECK-LONG2-NOT: { _ = vnop
+; CHECK-LONG2: {
+; CHECK-LONG2: {
+; CHECK-LONG2: {{.*}} .LBB0_4
+; CHECK-LONG2: {
+; CHECK-LONG3-NOT: _ = vdelay
+; CHECK-LONG3: {
+; CHECK-LONG3: {
+; CHECK-LONG3: {{.*}} .LBB0_4
+; CHECK-LONG3: {
+
+define void @tile_execute() {
+entry:
+ %0 = load i32, i32* inttoptr (i32 258 to i32*), align 4, !tbaa !4
+ %1 = load <8 x i32> addrspace(201)*, <8 x i32> addrspace(201)** inttoptr (i32 260 to <8 x i32> addrspace(201)**), align 4, !tbaa !4
+ %2 = load <8 x i32> addrspace(201)*, <8 x i32> addrspace(201)** inttoptr (i32 261 to <8 x i32> addrspace(201)**), align 4, !tbaa !4
+ %3 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** inttoptr (i32 262 to <8 x float> addrspace(201)**), align 4, !tbaa !4
+ %4 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** inttoptr (i32 263 to <8 x float> addrspace(201)**), align 4, !tbaa !4
+ %5 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** inttoptr (i32 264 to <8 x float> addrspace(201)**), align 8, !tbaa !4
+ %div = sdiv i32 %0, 8
+ %6 = load <8 x i32>, <8 x i32> addrspace(201)* %1, align 32, !tbaa !8
+ %7 = load <8 x i32>, <8 x i32> addrspace(201)* %2, align 32, !tbaa !8
+ %8 = bitcast <8 x float> addrspace(201)* %3 to <8 x i32> addrspace(201)*
+ %9 = load <8 x i32>, <8 x i32> addrspace(201)* %8, align 32, !tbaa !8
+ %10 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %6, <8 x i32> %7) #6, !noalias !9
+ %11 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %6, <8 x i32> %9) #6, !noalias !12
+ %cmp106.i = icmp sgt i32 %0, 7
+ br i1 %cmp106.i, label %for.body.lr.ph.i, label %_ZN10embeddings15SegmentedReduce7ComputeEiNS_15TmemVectorArrayIiEES2_NS1_IfEES3_PS3_.exit
+
+for.body.lr.ph.i: ; preds = %entry
+ %12 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %11, 1
+ %13 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %10, 1
+ %14 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %10, 0
+ %sub.i = add nsw i32 %div, -1
+ %15 = bitcast <8 x float> addrspace(201)* %5 to <8 x i32> addrspace(201)*
+ br label %for.body.i
+
+for.cond.loopexit.i: ; preds = %for.body22.i
+ %16 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %23, 0
+ %17 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %23, 1
+ %18 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %24, 1
+ %exitcond = icmp eq i32 %add.i, %div
+ br i1 %exitcond, label %_ZN10embeddings15SegmentedReduce7ComputeEiNS_15TmemVectorArrayIiEES2_NS1_IfEES3_PS3_.exit, label %for.body.i
+
+for.body.i: ; preds = %for.cond.loopexit.i, %for.body.lr.ph.i
+ %sorted_gains.0110.in.i = phi <8 x i32> [ %12, %for.body.lr.ph.i ], [ %18, %for.cond.loopexit.i ]
+ %sorted_segments.0109.i = phi <8 x i32> [ %14, %for.body.lr.ph.i ], [ %16, %for.cond.loopexit.i ]
+ %sorted_indices.0108.i = phi <8 x i32> [ %13, %for.body.lr.ph.i ], [ %17, %for.cond.loopexit.i ]
+ %i.0107.i = phi i32 [ 0, %for.body.lr.ph.i ], [ %add.i, %for.cond.loopexit.i ]
+ %sorted_gains.0110.i = bitcast <8 x i32> %sorted_gains.0110.in.i to <8 x float>
+ %cmp7.i = icmp eq i32 %i.0107.i, %sub.i
+ %add.i = add nuw nsw i32 %i.0107.i, 1
+ %cond.i = select i1 %cmp7.i, i32 %i.0107.i, i32 %add.i
+ %add.ptr.i81.i = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %1, i32 %cond.i
+ %19 = load <8 x i32>, <8 x i32> addrspace(201)* %add.ptr.i81.i, align 32, !tbaa !8
+ %add.ptr.i79.i = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %2, i32 %cond.i
+ %20 = load <8 x i32>, <8 x i32> addrspace(201)* %add.ptr.i79.i, align 32, !tbaa !8
+ %add.ptr.i.i = getelementptr inbounds <8 x float>, <8 x float> addrspace(201)* %3, i32 %cond.i
+ %21 = bitcast <8 x float> addrspace(201)* %add.ptr.i.i to <8 x i32> addrspace(201)*
+ %22 = load <8 x i32>, <8 x i32> addrspace(201)* %21, align 32, !tbaa !8
+ %23 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %19, <8 x i32> %20) #6, !noalias !15
+ %24 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %19, <8 x i32> %22) #6, !noalias !18
+ %mul.i = shl <8 x i32> %sorted_indices.0108.i, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+ %mul26.i = shl <8 x i32> %sorted_segments.0109.i, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+ br label %for.body22.i
+
+for.body22.i: ; preds = %for.body22.i, %for.body.i
+ %j.0104.i = phi i32 [ 0, %for.body.i ], [ %inc.i, %for.body22.i ]
+ %splat.splatinsert.i = insertelement <8 x i32> undef, i32 %j.0104.i, i32 0
+ %splat.splat.i = shufflevector <8 x i32> %splat.splatinsert.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %add23.i = add <8 x i32> %splat.splat.i, %mul.i
+ %25 = tail call <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %4, <8 x i32> %add23.i), !llvm.access.group !21
+ %mul25.i = fmul <8 x float> %25, %sorted_gains.0110.i
+ %26 = tail call { <8 x float>, <8 x i1> } @llvm.tpu.deprecated.segreduce.addf(<8 x i32> %sorted_segments.0109.i, <8 x float> %mul25.i)
+ %27 = extractvalue { <8 x float>, <8 x i1> } %26, 0
+ %28 = extractvalue { <8 x float>, <8 x i1> } %26, 1
+ %add29.i = add <8 x i32> %splat.splat.i, %mul26.i
+ %29 = tail call <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %5, <8 x i32> %add29.i), !llvm.access.group !21
+ %add31.i = fadd <8 x float> %27, %29
+ %30 = bitcast <8 x float> %add31.i to <8 x i32>
+ tail call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> %28, <8 x i32> addrspace(201)* %15, <8 x i32> %add29.i, <8 x i32> %30)
+ %inc.i = add nuw nsw i32 %j.0104.i, 1
+ %exitcond.i = icmp eq i32 %inc.i, 32
+ br i1 %exitcond.i, label %for.cond.loopexit.i, label %for.body22.i, !llvm.loop !22
+
+_ZN10embeddings15SegmentedReduce7ComputeEiNS_15TmemVectorArrayIiEES2_NS1_IfEES3_PS3_.exit: ; preds = %for.cond.loopexit.i, %entry
+ store i32 1, i32* inttoptr (i32 256 to i32*), align 256, !tbaa !4
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/bundle_packing.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/bundle_packing.ll
new file mode 100644
index 0000000..3abc189
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/bundle_packing.ll
@@ -0,0 +1,49 @@
+; RUN: llc < %s -mcpu=tensorcore-jf -asm-verbose=false -disable-cgp \
+; RUN: -tpu-skip-fast-opt -tpu-use-resource-swing-sched | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <1024 x i32> @llvm.tpu.vld.shuffle.i32(<1024 x i32> addrspace(205)*, i32, i32)
+declare i32 @llvm.tpu.rsqrt.v1024f32(<1024 x float>)
+declare <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32)
+declare <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32) nounwind
+
+; Test that vpop doesn't get packed with both two ALU ops and a load, as it needs
+; to steal a slot from one of them.
+; CHECK-LABEL: packing:
+; CHECK: vld.sshfl
+; CHECK: }
+; CHECK: vpop
+define <1024 x i32> @packing(<1024 x i32> %x, <1024 x i32> %y, <1024 x float> %z, <1024 x i32> %w) {
+entry:
+ %f = call i32 @llvm.tpu.rsqrt.v1024f32(<1024 x float> %z)
+ %r = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f)
+ %ri = bitcast <1024 x float> %r to <1024 x i32>
+ %k = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 1)
+ %l = call <1024 x i32> @llvm.tpu.vld.shuffle.i32(<1024 x i32> addrspace(205)* nonnull %k, i32 123, i32 19088743)
+ %a = add <1024 x i32> %x, %w
+ %b = add <1024 x i32> %y, %w
+ %c = add <1024 x i32> %a, %b
+ %d = add <1024 x i32> %l, %ri
+ %e = add <1024 x i32> %d, %c
+ ret <1024 x i32> %e
+}
+
+declare void @llvm.tpu.syncadd(i32 addrspace(204)*, i32)
+
+; Check that we don't use more than 3 vector-to-scalar slots in one bundle.
+; CHECK-LABEL: overuseVs:
+; CHECK: s{{[0-9]+}}
+; CHECK: }
+; CHECK: s{{[0-9]+}}
+; CHECK: s{{[0-9]+}}
+; CHECK: s{{[0-9]+}}
+; CHECK: shalt
+define <1024 x i32> @overuseVs(i32 addrspace(204)* %s0, i32 %s1, <1024 x i32> addrspace(205)* %s2, <1024 x i32> addrspace(205)* %s3, <1024 x i32> %x) {
+ call void @llvm.tpu.syncadd(i32 addrspace(204)* %s0, i32 %s1)
+ %r = load <1024 x i32>, <1024 x i32> addrspace(205)* %s2
+ store <1024 x i32> %x, <1024 x i32> addrspace(205)* %s3
+ ret <1024 x i32> %r
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/call_remat_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/call_remat_sc.ll
new file mode 100644
index 0000000..0196401
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/call_remat_sc.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -tpu-enable-tpu-abi-tec -tpu-enable-tpu-abi-scs | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; Tests that all vector values are rematerialized from vcmask instructions
+; instead of being spilled and restored across the call.
+
+declare void @llvm.tpu.init.stack(i32, i32)
+declare <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32)
+declare void @bar(<8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>,
+ <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>)
+
+; CHECK-LABEL: foo
+; CHECK: scall.abs bar
+; CHECK-NOT: vld
+; CHECK: vcmask
+
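; Roughly, the register allocator has two ways to keep these constant vectors live
; across the call (sketched here for illustration):
;   spill: vst the value to memory before "scall.abs bar" and vld it back afterwards
;   remat: re-create the value after the call from a vcmask immediate, with no
;          memory traffic
; The CHECK-NOT: vld / CHECK: vcmask pair above asserts that the second one is used.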
+define void @foo() #1 {
+ tail call void @llvm.tpu.init.stack(i32 1024, i32 512)
+ tail call void @bar(<8 x i32> noundef <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>,
+ <8 x i32> noundef <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>,
+ <8 x i32> noundef <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>,
+ <8 x i32> noundef <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>,
+ <8 x i32> noundef <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>,
+ <8 x i32> noundef <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>,
+ <8 x i32> noundef <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>,
+ <8 x i32> noundef <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>,
+ <8 x i32> noundef <i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71>,
+ <8 x i32> noundef <i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79>)
+ %1 = tail call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 80)
+ store <8 x i32> <i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87>, <8 x i32> addrspace(201)* %1, align 32
+ %2 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %1, i32 1
+ store <8 x i32> <i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>, <8 x i32> addrspace(201)* %2, align 32
+ %3 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %1, i32 2
+ store <8 x i32> <i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103>, <8 x i32> addrspace(201)* %3, align 32
+ %4 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %1, i32 3
+ store <8 x i32> <i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111>, <8 x i32> addrspace(201)* %4, align 32
+ %5 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %1, i32 4
+ store <8 x i32> <i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119>, <8 x i32> addrspace(201)* %5, align 32
+ %6 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %1, i32 5
+ store <8 x i32> <i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>, <8 x i32> addrspace(201)* %6, align 32
+ %7 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %1, i32 6
+ store <8 x i32> <i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135>, <8 x i32> addrspace(201)* %7, align 32
+ %8 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %1, i32 7
+ store <8 x i32> <i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143>, <8 x i32> addrspace(201)* %8, align 32
+ %9 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %1, i32 8
+ store <8 x i32> <i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151>, <8 x i32> addrspace(201)* %9, align 32
+ %10 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %1, i32 9
+ store <8 x i32> <i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159>, <8 x i32> addrspace(201)* %10, align 32
+ store i32 1, i32* inttoptr (i32 256 to i32*), align 256
+ ret void
+}
+
+attributes #1 = { "target-cpu"="sparsecore-tec-vf" }
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/cgp.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/cgp.ll
new file mode 100644
index 0000000..6aa125f
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/cgp.ll
@@ -0,0 +1,171 @@
+; RUN: opt < %s -S -mcpu=sparsecore-tec-vf -enable-new-pm=0 -tpu-codegen-prepare \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+; CHECK-LABEL: @f
+; CHECK: icmp slt i32 %7, 32
+
+declare void @llvm.tpu.nop() inaccessiblememonly nounwind
+
+define void @f(i32) {
+ %2 = load i32, i32* inttoptr (i32 256 to i32*), align 256
+ %3 = icmp sgt i32 %2, 0
+ br i1 %3, label %5, label %4
+
+4: ; preds = %5, %1
+ ret void
+
+5: ; preds = %1, %5
+ %6 = phi i32 [ %7, %5 ], [ 0, %1 ]
+ tail call void @llvm.tpu.nop()
+ %7 = add nuw nsw i32 %6, 1
+ %8 = icmp ult i32 %7, 32
+ %9 = icmp slt i32 %7, %2
+ %10 = and i1 %8, %9
+ br i1 %10, label %5, label %4
+}
+
+; CHECK-LABEL: @unwrap_slt
+; CHECK: icmp slt i32 %7, 31
+
+define void @unwrap_slt(i32) {
+ %2 = load i32, i32* inttoptr (i32 256 to i32*), align 256
+ %3 = icmp sgt i32 %2, 0
+ br i1 %3, label %5, label %4
+
+4: ; preds = %5, %1
+ ret void
+
+5: ; preds = %1, %5
+ %6 = phi i32 [ %7, %5 ], [ 0, %1 ]
+ tail call void @llvm.tpu.nop()
+ %7 = add nsw i32 %6, 1
+ %a = add i32 %7, 1
+ %8 = icmp ult i32 %a, 32
+ %9 = icmp slt i32 %7, %2
+ %10 = and i1 %8, %9
+ br i1 %10, label %5, label %4
+}
+
+; CHECK-LABEL: @cmpulteven
+; CHECK: %1 = lshr i32 %x, 1
+; CHECK: icmp slt i32 %1, 1
+define i1 @cmpulteven(i32 %x) {
+ %a = icmp ult i32 %x, 2
+ ret i1 %a
+}
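; The rewrite above is sound: the lshr result is non-negative as an i32, so
; "slt 1" is the same as "== 0", and (%x >> 1) == 0 holds exactly when %x is 0 or 1,
; i.e. exactly when the original "ult %x, 2" is true.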
+
+declare void @llvm.tpu.syncadd(i32 addrspace(204)*, i32)
+declare void @llvm.tpu.waitge(i32 addrspace(204)*, i32)
+
+; CHECK-LABEL: @dmadone
+; CHECK: call void @llvm.tpu.waitge(i32 addrspace(204)* %flag, i32 232)
+; CHECK-NEXT: call void @llvm.tpu.syncadd(i32 addrspace(204)* %flag, i32 -232)
+; CHECK-NEXT: ret void
+define void @dmadone(i32 addrspace(204)* %flag){
+entry:
+ call void @llvm.tpu.waitge(i32 addrspace(204)* %flag, i32 16)
+ call void @llvm.tpu.syncadd(i32 addrspace(204)* %flag, i32 -16)
+ call void @llvm.tpu.waitge(i32 addrspace(204)* %flag, i32 16)
+ call void @llvm.tpu.syncadd(i32 addrspace(204)* %flag, i32 -16)
+ call void @llvm.tpu.waitge(i32 addrspace(204)* %flag, i32 200)
+ call void @llvm.tpu.syncadd(i32 addrspace(204)* %flag, i32 -200)
+ ret void
+}
+
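+; Negative test: pairs whose syncadd does not exactly cancel the waited amount
+; (-15 and 0 below) must be left untouched, as the CHECK lines verify.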
+; CHECK-LABEL: @dmadone_negativetest
+; CHECK: call void @llvm.tpu.waitge(i32 addrspace(204)* %flag, i32 16)
+; CHECK-NEXT: call void @llvm.tpu.syncadd(i32 addrspace(204)* %flag, i32 -15)
+; CHECK-NEXT: call void @llvm.tpu.waitge(i32 addrspace(204)* %flag, i32 16)
+; CHECK-NEXT: call void @llvm.tpu.syncadd(i32 addrspace(204)* %flag, i32 -16)
+; CHECK-NEXT: call void @llvm.tpu.waitge(i32 addrspace(204)* %flag, i32 200)
+; CHECK-NEXT: call void @llvm.tpu.syncadd(i32 addrspace(204)* %flag, i32 0)
+; CHECK-NEXT: ret void
+define void @dmadone_negativetest(i32 addrspace(204)* %flag){
+entry:
+ call void @llvm.tpu.waitge(i32 addrspace(204)* %flag, i32 16)
+ call void @llvm.tpu.syncadd(i32 addrspace(204)* %flag, i32 -15)
+ call void @llvm.tpu.waitge(i32 addrspace(204)* %flag, i32 16)
+ call void @llvm.tpu.syncadd(i32 addrspace(204)* %flag, i32 -16)
+ call void @llvm.tpu.waitge(i32 addrspace(204)* %flag, i32 200)
+ call void @llvm.tpu.syncadd(i32 addrspace(204)* %flag, i32 0)
+ ret void
+}
+
+declare <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr(i32)
+
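+; Tests that the constant-base GEP and the loop-invariant +512 offset are
+; reassociated onto the variable base so the constant parts fold together,
+; as the CHECK lines below verify.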
+; CHECK-LABEL: @gep_reassos
+; CHECK: %0 = call i8 addrspace(205)* @llvm.tpu.inttoptr.p205i8(i32 %base)
+; CHECK: %1 = getelementptr i8, i8 addrspace(205)* %0, i32 add (i32 ptrtoint (i8 addrspace(205)* getelementptr (i8, i8 addrspace(205)* null, i32 8768) to i32), i32 512)
+define void @gep_reassos(i32 %cond, i32 %base) {
+entry:
+ %gep0 = getelementptr i8, i8 addrspace(205)* getelementptr (i8, i8 addrspace(205)* null, i32 8768), i32 %base
+ %c = icmp sgt i32 %cond, 0
+ br label %for.body.i
+
+
+for.body.i:
+ %i = phi i32 [ 0, %entry ], [ %ic, %for.body.i ]
+ %gep = getelementptr i8, i8 addrspace(205)* %gep0, i32 512
+ %bc = bitcast i8 addrspace(205)* %gep to <1024 x i32> addrspace(205)*
+ %r = load <1024 x i32>, <1024 x i32> addrspace(205)* %bc
+ %addr = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr(i32 %i)
+ store <1024 x i32> %r, <1024 x i32> addrspace(205)* %addr, align 4
+ %ic = add i32 %i, 1
+ %cmp.i = icmp slt i32 %ic, 10000
+ br i1 %cmp.i, label %for.body.i, label %exit
+
+exit:
+ ret void
+}
+
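+; Same as above with a pointer base: the two constant offsets fold into a
+; single 8768 + 512 = 9280 offset, per the CHECK line below.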
+; CHECK-LABEL: @gep_reassos2
+; CHECK: getelementptr i8, i8 addrspace(205)* %base, i32 9280
+define void @gep_reassos2(i32 %cond, i8 addrspace(205)* %base) {
+entry:
+ %gep0 = getelementptr i8, i8 addrspace(205)* %base, i32 8768
+ %c = icmp sgt i32 %cond, 0
+ br label %for.body.i
+
+
+for.body.i:
+ %i = phi i32 [ 0, %entry ], [ %ic, %for.body.i ]
+ %gep = getelementptr i8, i8 addrspace(205)* %gep0, i32 512
+ %bc = bitcast i8 addrspace(205)* %gep to <1024 x i32> addrspace(205)*
+ %r = load <1024 x i32>, <1024 x i32> addrspace(205)* %bc
+ %addr = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr(i32 %i)
+ store <1024 x i32> %r, <1024 x i32> addrspace(205)* %addr, align 4
+ %ic = add i32 %i, 1
+ %cmp.i = icmp slt i32 %ic, 10000
+ br i1 %cmp.i, label %for.body.i, label %exit
+
+exit:
+ ret void
+}
+
+
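+; Tests that the address computation defined in the entry block is sunk into
+; the loop body next to its load, as the CHECK lines below show (the
+; getelementptr and bitcast reappear inside for.body.i).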
+; CHECK-LABEL: @gep_sink
+; CHECK: for.body.i:
+; CHECK: [[v1:%[0-9]+]] = getelementptr i8, i8 addrspace(205)* getelementptr (i8, i8 addrspace(205)* null, i32 8768), i32 %base
+; CHECK: [[v2:%[0-9]+]] = bitcast i8 addrspace(205)* [[v1]] to <1024 x i32> addrspace(205)*
+; CHECK: %r = load <1024 x i32>, <1024 x i32> addrspace(205)* [[v2]]
+define void @gep_sink(i32 %cond, i32 %base) {
+entry:
+ %gep = getelementptr i8, i8 addrspace(205)* getelementptr (i8, i8 addrspace(205)* null, i32 8768), i32 %base
+ %bc = bitcast i8 addrspace(205)* %gep to <1024 x i32> addrspace(205)*
+ %c = icmp sgt i32 %cond, 0
+ br label %for.body.i
+
+
+for.body.i:
+ %i = phi i32 [ 0, %entry ], [ %ic, %for.body.i ]
+ %r = load <1024 x i32>, <1024 x i32> addrspace(205)* %bc
+ %addr = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr(i32 %i)
+ store <1024 x i32> %r, <1024 x i32> addrspace(205)* %addr, align 4
+ %ic = add i32 %i, 1
+ %cmp.i = icmp slt i32 %ic, 10000
+ br i1 %cmp.i, label %for.body.i, label %exit
+
+exit:
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/circular_buffer_maa.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/circular_buffer_maa.ll
new file mode 100644
index 0000000..df2b505
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/circular_buffer_maa.ll
@@ -0,0 +1,250 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -disable-cgp \
+; RUN: -tpu-latencies=%S/Inputs/load_one_cycle.yml \
+; RUN: | FileCheck --check-prefixes CHECK,CHECK-VF %s
+; RUN: llc < %s -mcpu=sparsecore-tec-gl -disable-cgp \
+; RUN: -tpu-latencies=%S/Inputs/load_one_cycle.yml \
+; RUN: | FileCheck --check-prefixes CHECK,CHECK-GL %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; This test checks that we're able to relax some of the ordering rules
+; with respect to circular buffer vector/scalar loads and stores. Some of
+; the tests are the same as in bundle_packer_sc.mir:cbreg_machine_alias,
+; but run through the optimizer.
+
+; The Viperfish version includes erratum b/244231604.
+
+declare <8 x i32> @llvm.tpu.vld.cb.msk.v8i32(<8 x i1>, x86_mmx, i32)
+declare x86_mmx @llvm.tpu.cbreg.add.offset(x86_mmx, i32)
+declare void @llvm.tpu.vst.cb.msk.v8i32(<8 x i1>, x86_mmx, i32, <8 x i32>)
+declare x86_mmx @llvm.tpu.wrcbreg.size(x86_mmx, i32)
+declare i32 @llvm.tpu.sld.cb(x86_mmx, i32)
+declare i32 @llvm.tpu.sld.cb.upd(x86_mmx, i32)
+declare void @llvm.tpu.sst.cb(i32, x86_mmx, i32)
+declare void @llvm.tpu.sst.cb.upd(i32, x86_mmx, i32)
+
+; CHECK-LABEL: bb_0
+; CHECK-VF: { v{{[0-9]+}} = vld.cb.msk {{.*}}}
+; CHECK-VF: {
+; CHECK-GL: { v{{[0-9]+}} = vld.cb.msk {{.*}};
+; CHECK-NEXT: cbreg.add
+define <8 x i32> @bb_0(x86_mmx %cb, i32 %base, <8 x i1> %m, i32 %off) {
+entry:
+ %val = tail call <8 x i32> @llvm.tpu.vld.cb.msk.v8i32(<8 x i1> %m,
+ x86_mmx %cb,
+ i32 %base)
+ %r = tail call x86_mmx @llvm.tpu.cbreg.add.offset(x86_mmx %cb, i32 %off)
+ tail call void @llvm.tpu.sst.cb(i32 0, x86_mmx %r, i32 1)
+ ret <8 x i32> %val
+}
+
+; CHECK-LABEL: bb_1
+; CHECK-VF: { v{{[0-9]+}} = vld.cb.msk {{.*}}}
+; CHECK-VF: {
+; CHECK-GL: { v{{[0-9]+}} = vld.cb.msk {{.*}};
+; CHECK-NEXT: cbreg.add
+define <8 x i32> @bb_1(x86_mmx %cb, i32 %base, <8 x i1> %m, i32 %off) {
+entry:
+ %val = tail call <8 x i32> @llvm.tpu.vld.cb.msk.v8i32(<8 x i1> %m,
+ x86_mmx %cb,
+ i32 %base)
+ %r = tail call x86_mmx @llvm.tpu.cbreg.add.offset(x86_mmx %cb, i32 5)
+ tail call void @llvm.tpu.sst.cb(i32 0, x86_mmx %r, i32 1)
+ ret <8 x i32> %val
+}
+
+; CHECK-LABEL: bb_2
+; CHECK-VF: { v{{[0-9]+}} = vld.cb.msk {{.*}}}
+; CHECK-VF: {
+; CHECK-GL: { v{{[0-9]+}} = vld.cb.msk {{.*}};
+; CHECK-NEXT: wrcbreg
+define <8 x i32> @bb_2(x86_mmx %cb, i32 %base, <8 x i1> %m, i32 %off) {
+entry:
+ %val = tail call <8 x i32> @llvm.tpu.vld.cb.msk.v8i32(<8 x i1> %m,
+ x86_mmx %cb,
+ i32 %base)
+ %r = tail call x86_mmx @llvm.tpu.wrcbreg.size(x86_mmx %cb, i32 %off)
+ tail call void @llvm.tpu.sst.cb(i32 0, x86_mmx %r, i32 1)
+ ret <8 x i32> %val
+}
+
+; CHECK-LABEL: bb_3
+; CHECK-VF: { [tilespmem:s{{[0-9]+}}+$0x0 cbreg:$0x0] = vst.cb.msk {{.*}}}
+; CHECK-GL: { [tilespmem:s{{[0-9]+}}+$0x0 cbreg:$0x0] = vst.cb.msk {{.*}};
+; CHECK-NEXT: wrcbreg
+define x86_mmx @bb_3(x86_mmx %cb, <8 x i32> %val, i32 %base, <8 x i1> %m, i32 %off) {
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.v8i32(<8 x i1> %m,
+ x86_mmx %cb,
+ i32 %base,
+ <8 x i32> %val)
+ %r = tail call x86_mmx @llvm.tpu.wrcbreg.size(x86_mmx %cb, i32 5)
+ ret x86_mmx %r
+}
+
+; CHECK-LABEL: bb_4
+; CHECK-VF: { [tilespmem:s{{[0-9]+}}+$0x0 cbreg:$0x0] = vst.cb.msk {{.*}}}
+; CHECK-GL: { [tilespmem:s{{[0-9]+}}+$0x0 cbreg:$0x0] = vst.cb.msk {{.*}};
+; CHECK-NEXT: wrcbreg
+define x86_mmx @bb_4(x86_mmx %cb, <8 x i32> %val, i32 %base, <8 x i1> %m, i32 %off) {
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.v8i32(<8 x i1> %m,
+ x86_mmx %cb,
+ i32 %base,
+ <8 x i32> %val)
+ %r = tail call x86_mmx @llvm.tpu.wrcbreg.size(x86_mmx %cb, i32 %off)
+ ret x86_mmx %r
+}
+
+; CHECK-LABEL: bb_5
+; CHECK-VF: { [tilespmem:s{{[0-9]+}}+$0x0 cbreg:$0x0] = vst.cb.msk {{.*}}}
+; CHECK-GL: { [tilespmem:s{{[0-9]+}}+$0x0 cbreg:$0x0] = vst.cb.msk {{.*}};
+; CHECK-NEXT: s{{[0-9]+}} = sld.cb.upd
+define i32 @bb_5(x86_mmx %cb, <8 x i32> %val, i32 %base, <8 x i1> %m, i32 %off) {
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.v8i32(<8 x i1> %m,
+ x86_mmx %cb,
+ i32 %base,
+ <8 x i32> %val)
+ %r = tail call i32 @llvm.tpu.sld.cb.upd(x86_mmx %cb, i32 %off)
+ ret i32 %r
+}
+
+; CHECK-LABEL: bb_6
+; CHECK-VF: { [tilespmem:s{{[0-9]+}}+$0x0 cbreg:$0x0] = vst.cb.msk {{.*}}}
+; CHECK-GL: { [tilespmem:s{{[0-9]+}}+$0x0 cbreg:$0x0] = vst.cb.msk {{.*}};
+; CHECK-NEXT: [smem:s{{[0-9]+}} cbreg:$0x0] = sst.cb.upd
+define void @bb_6(x86_mmx %cb, <8 x i32> %val0, i32 %val1, i32 %base, <8 x i1> %m, i32 %off) {
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.v8i32(<8 x i1> %m,
+ x86_mmx %cb,
+ i32 %base,
+ <8 x i32> %val0)
+ tail call void @llvm.tpu.sst.cb.upd(i32 %val1, x86_mmx %cb, i32 %off)
+ ret void
+}
+
+; The following tests check that we're able to reorder some limited cases
+; of circular buffer instructions:
+
+; Tests reordering circular buffer, no update, same register.
+
+; CHECK-LABEL: aa_0
+; CHECK: { s{{[0-9]+}} = sld.cb [smem:s{{[0-9]+}} cbreg:$0x0] }
+; CHECK-NEXT: { s{{[0-9]+}} = sld.cb [smem:s{{[0-9]+}} cbreg:$0x1] }
+; CHECK-NEXT: { [tilespmem:s{{[0-9]+}}+$0x0 cbreg:$0x0] = vst.cb.msk {{.*}};
+define i32 @aa_0(x86_mmx %cb0, x86_mmx %cb1, <8 x i32> %val, <8 x i1> %m, i32 %pad, i32 %off, i32 %base) {
+; The %pad value makes sure $s0 is used and avoids an antidep in the test.
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.v8i32(<8 x i1> %m,
+ x86_mmx %cb0,
+ i32 %base,
+ <8 x i32> %val)
+ %r0 = tail call i32 @llvm.tpu.sld.cb(x86_mmx %cb0, i32 %off)
+ %r1 = tail call i32 @llvm.tpu.sld.cb(x86_mmx %cb1, i32 %off)
+ %r = add i32 %r0, %r1
+ ret i32 %r
+}
+
+; Tests reordering circular buffer, no update, same register.
+
+; CHECK-LABEL: aa_1
+; CHECK: { [smem:s{{[0-9]+}} cbreg:$0x0] = sst.cb s{{[0-9]+}} }
+; CHECK-NEXT: { [tilespmem:s{{[0-9]+}}+$0x0 cbreg:$0x0] = vst.cb.msk {{.*}};
+; CHECK-NEXT: [smem:s{{[0-9]+}} cbreg:$0x1] = sst.cb s{{[0-9]+}}
+define void @aa_1(x86_mmx %cb0, x86_mmx %cb1, <8 x i32> %val0, i32 %val1, i32 %base, <8 x i1> %m, i32 %off) {
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.v8i32(<8 x i1> %m,
+ x86_mmx %cb0,
+ i32 %base,
+ <8 x i32> %val0)
+ tail call void @llvm.tpu.sst.cb(i32 %val1, x86_mmx %cb0, i32 %off)
+ tail call void @llvm.tpu.sst.cb(i32 %val1, x86_mmx %cb1, i32 %off)
+ ret void
+}
+
+; Tests reordering circular buffer update, but different register.
+
+; CHECK-LABEL: aa_2
+; CHECK-VF: { [tilespmem:s{{[0-9]+}}+$0x0 cbreg:$0x0] = vst.cb.msk {{.*}};
+; CHECK-VF-NEXT: s{{[0-9]+}} = sld.cb.upd [smem:s{{[0-9]+}} cbreg:$0x1] }
+; CHECK-VF-NEXT: { s{{[0-9]+}} = sld.cb.upd [smem:s{{[0-9]+}} cbreg:$0x0] }
+; CHECK-GL: { s{{[0-9]+}} = sld.cb.upd [smem:s{{[0-9]+}} cbreg:$0x1] }
+; CHECK-GL-NEXT: { [tilespmem:s{{[0-9]+}}+$0x0 cbreg:$0x0] = vst.cb.msk {{.*}};
+; CHECK-GL-NEXT: s{{[0-9]+}} = sld.cb.upd [smem:s{{[0-9]+}} cbreg:$0x0] }
+define i32 @aa_2(x86_mmx %cb0, x86_mmx %cb1, <8 x i32> %val, <8 x i1> %m, i32 %pad, i32 %off, i32 %base) {
+; The %pad value makes sure $s0 is used and avoids an antidep in the test.
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.v8i32(<8 x i1> %m,
+ x86_mmx %cb0,
+ i32 %base,
+ <8 x i32> %val)
+ %r1 = tail call i32 @llvm.tpu.sld.cb.upd(x86_mmx %cb1, i32 %off)
+ %r0 = tail call i32 @llvm.tpu.sld.cb.upd(x86_mmx %cb0, i32 %off)
+ %r = add i32 %r0, %r1
+ ret i32 %r
+}
+
+; Tests reordering circular buffer update, but different register.
+
+; CHECK-LABEL: aa_3
+; CHECK-VF: { [tilespmem:s{{[0-9]+}}+$0x0 cbreg:$0x0] = vst.cb.msk {{.*}};
+; CHECK-VF-NEXT: [smem:s{{[0-9]+}} cbreg:$0x1] = sst.cb.upd s{{[0-9]+}} }
+; CHECK-VF-NEXT: { [smem:s{{[0-9]+}} cbreg:$0x0] = sst.cb.upd s{{[0-9]+}}
+; CHECK-GL: { [smem:s{{[0-9]+}} cbreg:$0x1] = sst.cb.upd s{{[0-9]+}} }
+; CHECK-GL-NEXT: { [tilespmem:s{{[0-9]+}}+$0x0 cbreg:$0x0] = vst.cb.msk {{.*}};
+; CHECK-GL-NEXT: [smem:s{{[0-9]+}} cbreg:$0x0] = sst.cb.upd s{{[0-9]+}}
+define void @aa_3(x86_mmx %cb0, x86_mmx %cb1, <8 x i32> %val0, i32 %val1, i32 %base, <8 x i1> %m, i32 %off) {
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.v8i32(<8 x i1> %m,
+ x86_mmx %cb0,
+ i32 %base,
+ <8 x i32> %val0)
+ tail call void @llvm.tpu.sst.cb.upd(i32 %val1, x86_mmx %cb1, i32 %off)
+ tail call void @llvm.tpu.sst.cb.upd(i32 %val1, x86_mmx %cb0, i32 %off)
+ ret void
+}
+
+; Tests preventing reordering circular buffer update, one different register.
+
+; CHECK-LABEL: aa_4
+; CHECK-VF: { [tilespmem:s{{[0-9]+}}+$0x0 cbreg:$0x0] = vst.cb.msk {{.*}}}
+; CHECK-VF-NEXT: { s{{[0-9]+}} = sld.cb.upd [smem:s{{[0-9]+}} cbreg:$0x0] }
+; CHECK-VF-NEXT: { s{{[0-9]+}} = sld.cb.upd [smem:s{{[0-9]+}} cbreg:$0x1] }
+; CHECK-GL: { [tilespmem:s{{[0-9]+}}+$0x0 cbreg:$0x0] = vst.cb.msk {{.*}};
+; CHECK-GL-NEXT: s{{[0-9]+}} = sld.cb.upd [smem:s{{[0-9]+}} cbreg:$0x0] }
+; CHECK-GL-NEXT: { s{{[0-9]+}} = sld.cb.upd [smem:s{{[0-9]+}} cbreg:$0x1] }
+define i32 @aa_4(x86_mmx %cb0, x86_mmx %cb1, <8 x i32> %val, <8 x i1> %m, i32 %pad, i32 %off, i32 %base) {
+; The %pad value makes sure $s0 is used and avoids an antidep in the test.
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.v8i32(<8 x i1> %m,
+ x86_mmx %cb0,
+ i32 %base,
+ <8 x i32> %val)
+ %r0 = tail call i32 @llvm.tpu.sld.cb.upd(x86_mmx %cb0, i32 %off)
+ %r1 = tail call i32 @llvm.tpu.sld.cb.upd(x86_mmx %cb1, i32 %off)
+ %r = add i32 %r0, %r1
+ ret i32 %r
+}
+
+; Tests preventing reordering circular buffer update, one different register.
+
+; CHECK-LABEL: aa_5
+; CHECK-VF: { [tilespmem:s{{[0-9]+}}+$0x0 cbreg:$0x0] = vst.cb.msk {{.*}}}
+; CHECK-VF-NEXT: { [smem:s{{[0-9]+}} cbreg:$0x0] = sst.cb.upd s{{[0-9]+}} }
+; CHECK-VF-NEXT: { [smem:s{{[0-9]+}} cbreg:$0x1] = sst.cb.upd s{{[0-9]+}}
+; CHECK-GL: { [tilespmem:s{{[0-9]+}}+$0x0 cbreg:$0x0] = vst.cb.msk {{.*}};
+; CHECK-GL-NEXT: [smem:s{{[0-9]+}} cbreg:$0x0] = sst.cb.upd s{{[0-9]+}} }
+; CHECK-GL-NEXT: { [smem:s{{[0-9]+}} cbreg:$0x1] = sst.cb.upd s{{[0-9]+}}
+define void @aa_5(x86_mmx %cb0, x86_mmx %cb1, <8 x i32> %val0, i32 %val1, i32 %base, <8 x i1> %m, i32 %off) {
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.v8i32(<8 x i1> %m,
+ x86_mmx %cb0,
+ i32 %base,
+ <8 x i32> %val0)
+ tail call void @llvm.tpu.sst.cb.upd(i32 %val1, x86_mmx %cb0, i32 %off)
+ tail call void @llvm.tpu.sst.cb.upd(i32 %val1, x86_mmx %cb1, i32 %off)
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/circular_buffer_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/circular_buffer_sc.ll
new file mode 100644
index 0000000..f679035
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/circular_buffer_sc.ll
@@ -0,0 +1,225 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32* @llvm.tpu.inttoptr.p0i32(i32)
+declare i32 @llvm.tpu.ptrtoint.p0i32(i32*)
+declare <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32)
+declare i32 addrspace(201)* @llvm.tpu.inttoptr.p201i32(i32)
+
+declare i32* @llvm.tpu.rdcbreg.smem.base(x86_mmx)
+declare i32 addrspace(201)* @llvm.tpu.rdcbreg.tilespmem.base(x86_mmx)
+declare i32 @llvm.tpu.rdcbreg.size(x86_mmx)
+declare i32 @llvm.tpu.rdcbreg.offset(x86_mmx)
+declare x86_mmx @llvm.tpu.wrcbreg.smem.base(x86_mmx, i32*)
+declare x86_mmx @llvm.tpu.wrcbreg.tilespmem.base(x86_mmx, i32 addrspace(201)*)
+declare x86_mmx @llvm.tpu.wrcbreg.size(x86_mmx, i32)
+declare x86_mmx @llvm.tpu.wrcbreg.offset(x86_mmx, i32)
+
+; Tests that we can allocate cbregs.
+
+; CHECK-LABEL: cbreg_allocate:
+; CHECK-DAG: cbreg:$0x0 {{.*}} wrcbreg
+; CHECK-DAG: cbreg:$0x0 {{.*}} wrcbreg
+; CHECK-DAG: cbreg:$0x0 {{.*}} wrcbreg
+; CHECK-DAG: cbreg:$0x1 {{.*}} wrcbreg
+; CHECK-DAG: cbreg:$0x1 {{.*}} wrcbreg
+; CHECK-DAG: cbreg:$0x1 {{.*}} wrcbreg
+
+define x86_mmx @cbreg_allocate() {
+entry:
+ %g = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 0)
+ %a0 = getelementptr inbounds i32, i32* %g, i32 0
+ %l = load i32, i32* %a0, align 4
+ %cmp.i = icmp slt i32 %l, 15
+ br label %bb.m
+
+bb.m:
+ %a1 = getelementptr inbounds i32, i32* %g, i32 128
+ %a2 = getelementptr inbounds i32, i32* %g, i32 256
+ %a3 = getelementptr inbounds i32, i32* %g, i32 129
+ %a4 = getelementptr inbounds i32, i32* %g, i32 257
+ %b0 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 128)
+ %s0 = load i32, i32* %a1, align 4
+ %o0 = load i32, i32* %a2, align 4
+ %b1 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 129)
+ %s1 = load i32, i32* %a3, align 4
+ %o1 = load i32, i32* %a4, align 4
+ %cb0_0 = tail call x86_mmx @llvm.tpu.wrcbreg.smem.base(x86_mmx undef, i32* %b0)
+ %cb0_1 = tail call x86_mmx @llvm.tpu.wrcbreg.size(x86_mmx %cb0_0, i32 %s0)
+ %cb0_2 = tail call x86_mmx @llvm.tpu.wrcbreg.offset(x86_mmx %cb0_1, i32 %o0)
+ %cb1_0 = tail call x86_mmx @llvm.tpu.wrcbreg.smem.base(x86_mmx undef, i32* %b1)
+ %cb1_1 = tail call x86_mmx @llvm.tpu.wrcbreg.size(x86_mmx %cb1_0, i32 %s1)
+ %cb1_2 = tail call x86_mmx @llvm.tpu.wrcbreg.offset(x86_mmx %cb1_1, i32 %o1)
+ %result_b = tail call i32* @llvm.tpu.rdcbreg.smem.base(x86_mmx %cb1_2)
+ %result_s = tail call i32 @llvm.tpu.rdcbreg.size(x86_mmx %cb1_2)
+ %result_o = tail call i32 @llvm.tpu.rdcbreg.offset(x86_mmx %cb1_2)
+ %result_bi = tail call i32 @llvm.tpu.ptrtoint.p0i32(i32* %result_b)
+ %a1w = getelementptr inbounds i32, i32* %g, i32 1024
+ %a2w = getelementptr inbounds i32, i32* %g, i32 1036
+ %a3w = getelementptr inbounds i32, i32* %g, i32 1042
+ store i32 %result_bi, i32* %a1w, align 4
+ store i32 %result_s, i32* %a2w, align 4
+ store i32 %result_o, i32* %a3w, align 4
+ ret x86_mmx %cb0_2
+}
+
+; Tests that we are able to create and resolve cbreg copies.
+
+; CHECK-LABEL: cbreg_copy_smem_base:
+; CHECK-DAG: s[[s0:[0-9]+]] = rdcbreg [cbreg:$0x0 metadata:$0x0
+; CHECK-DAG: cbreg:$0x1 metadata:$0x0] = wrcbreg s[[s0]]
+; CHECK-DAG: s[[s1:[0-9]+]] = rdcbreg [cbreg:$0x0 metadata:$0x1
+; CHECK-DAG: cbreg:$0x1 metadata:$0x1] = wrcbreg s[[s1]]
+; CHECK-DAG: s[[s2:[0-9]+]] = rdcbreg [cbreg:$0x0 metadata:$0x2
+; CHECK-DAG: cbreg:$0x1 metadata:$0x2] = wrcbreg s[[s2]]
+
+define void @cbreg_copy_smem_base() {
+entry:
+ %g = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 0)
+ %a0 = getelementptr inbounds i32, i32* %g, i32 0
+ %l = load i32, i32* %a0, align 4
+ %a1t = getelementptr inbounds i32, i32* %g, i32 128
+ %a2t = getelementptr inbounds i32, i32* %g, i32 256
+ %bt = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 128)
+ %st = load i32, i32* %a1t, align 4
+ %ot = load i32, i32* %a2t, align 4
+ %cbt_0 = tail call x86_mmx @llvm.tpu.wrcbreg.smem.base(x86_mmx undef, i32* %bt)
+ %cbt_1 = tail call x86_mmx @llvm.tpu.wrcbreg.size(x86_mmx %cbt_0, i32 %st)
+ %cbt_2 = tail call x86_mmx @llvm.tpu.wrcbreg.offset(x86_mmx %cbt_1, i32 %ot)
+ %cmp.i = icmp slt i32 %l, 15
+ br i1 %cmp.i, label %bb.t, label %bb.f
+
+bb.t:
+ br label %exit
+
+bb.f:
+ %a1f = getelementptr inbounds i32, i32* %g, i32 1024
+ %a2f = getelementptr inbounds i32, i32* %g, i32 1036
+ %bf = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 512)
+ %sf = load i32, i32* %a1f, align 4
+ %of = load i32, i32* %a2f, align 4
+ %cbf_0 = tail call x86_mmx @llvm.tpu.wrcbreg.smem.base(x86_mmx undef, i32* %bf)
+ %cbf_1 = tail call x86_mmx @llvm.tpu.wrcbreg.size(x86_mmx %cbf_0, i32 %sf)
+ %cbf_2 = tail call x86_mmx @llvm.tpu.wrcbreg.offset(x86_mmx %cbf_1, i32 %of)
+ br label %exit
+
+exit:
+ %cb_2 = phi x86_mmx [ %cbt_2, %bb.t ], [ %cbf_2, %bb.f ]
+ %a1w = getelementptr inbounds i32, i32* %g, i32 1024
+ %a2w = getelementptr inbounds i32, i32* %g, i32 1036
+ %a3w = getelementptr inbounds i32, i32* %g, i32 1042
+ %result_b = tail call i32* @llvm.tpu.rdcbreg.smem.base(x86_mmx %cb_2)
+ %result_s = tail call i32 @llvm.tpu.rdcbreg.size(x86_mmx %cb_2)
+ %result_o = tail call i32 @llvm.tpu.rdcbreg.offset(x86_mmx %cbt_2)
+ %result_bi = tail call i32 @llvm.tpu.ptrtoint.p0i32(i32* %result_b)
+ store i32 %result_bi, i32* %a1w, align 4
+ store i32 %result_s, i32* %a2w, align 4
+ store i32 %result_o, i32* %a3w, align 4
+ ret void
+}
+
+; CHECK-LABEL: cbreg_copy_tilespmem_base:
+; CHECK-DAG: s[[s0:[0-9]+]] = rdcbreg [cbreg:$0x0 metadata:$0x0
+; CHECK-DAG: cbreg:$0x1 metadata:$0x0] = wrcbreg s[[s0]]
+; CHECK-DAG: s[[s1:[0-9]+]] = rdcbreg [cbreg:$0x0 metadata:$0x1
+; CHECK-DAG: cbreg:$0x1 metadata:$0x1] = wrcbreg s[[s1]]
+; CHECK-DAG: s[[s2:[0-9]+]] = rdcbreg [cbreg:$0x0 metadata:$0x2
+; CHECK-DAG: cbreg:$0x1 metadata:$0x2] = wrcbreg s[[s2]]
+
+define void @cbreg_copy_tilespmem_base() {
+entry:
+ %g = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 0)
+ %g2 = tail call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 1024)
+ %a0 = getelementptr inbounds i32, i32* %g, i32 0
+ %l = load i32, i32* %a0, align 4
+ %a1t = getelementptr inbounds i32, i32* %g, i32 128
+ %a2t = getelementptr inbounds i32, i32* %g, i32 256
+ %bt = tail call i32 addrspace(201)* @llvm.tpu.inttoptr.p201i32(i32 128)
+ %st = load i32, i32* %a1t, align 4
+ %ot = load i32, i32* %a2t, align 4
+ %cbt_0 = tail call x86_mmx @llvm.tpu.wrcbreg.tilespmem.base(x86_mmx undef, i32 addrspace(201)* %bt)
+ %cbt_1 = tail call x86_mmx @llvm.tpu.wrcbreg.size(x86_mmx %cbt_0, i32 %st)
+ %cbt_2 = tail call x86_mmx @llvm.tpu.wrcbreg.offset(x86_mmx %cbt_1, i32 %ot)
+ %cmp.i = icmp slt i32 %l, 15
+ br i1 %cmp.i, label %bb.t, label %bb.f
+
+bb.t:
+ br label %exit
+
+bb.f:
+ %a1f = getelementptr inbounds i32, i32* %g, i32 1024
+ %a2f = getelementptr inbounds i32, i32* %g, i32 1036
+ %bf = tail call i32 addrspace(201)* @llvm.tpu.inttoptr.p201i32(i32 512)
+ %sf = load i32, i32* %a1f, align 4
+ %of = load i32, i32* %a2f, align 4
+ %cbf_0 = tail call x86_mmx @llvm.tpu.wrcbreg.tilespmem.base(x86_mmx undef, i32 addrspace(201)* %bf)
+ %cbf_1 = tail call x86_mmx @llvm.tpu.wrcbreg.size(x86_mmx %cbf_0, i32 %sf)
+ %cbf_2 = tail call x86_mmx @llvm.tpu.wrcbreg.offset(x86_mmx %cbf_1, i32 %of)
+ br label %exit
+
+exit:
+ %cb_2 = phi x86_mmx [ %cbt_2, %bb.t ], [ %cbf_2, %bb.f ]
+ %a1w = getelementptr inbounds i32, i32* %g, i32 1024
+ %a2w = getelementptr inbounds i32, i32* %g, i32 1036
+ %a3w = getelementptr inbounds i32, i32* %g, i32 1036
+ %result_b = tail call i32 addrspace(201)* @llvm.tpu.rdcbreg.tilespmem.base(x86_mmx %cb_2)
+ %result_s = tail call i32 @llvm.tpu.rdcbreg.size(x86_mmx %cb_2)
+ %result_o = tail call i32 @llvm.tpu.rdcbreg.offset(x86_mmx %cbt_2)
+ %result_bi = ptrtoint i32 addrspace(201)* %result_b to i32
+ store i32 %result_bi, i32* %a1w, align 4
+ store i32 %result_s, i32* %a2w, align 4
+ store i32 %result_o, i32* %a3w, align 4
+ ret void
+}
+
+; Tests that select is lowered to a predicated cbreg copy.
+
+; CHECK-LABEL: cbreg_select_smem:
+; CHECK: [cbreg:$0x0 metadata:$0x0] = wrcbreg $0x80
+; CHECK: [cbreg:$0x1 metadata:$0x0] = wrcbreg $0x200
+; CHECK: p[[p0:[0-9]+]] = slt.s32 s{{[0-9]+}}, $0xf
+; CHECK-NEXT: s[[s0:[0-9]+]] = rdcbreg @!p[[p0]] [cbreg:$0x1 metadata:$0x0]
+; CHECK-NEXT: [cbreg:$0x0 metadata:$0x0] = wrcbreg @!p[[p0]] s[[s0]]
+; CHECK-NEXT: s[[s0]] = rdcbreg @!p[[p0]] [cbreg:$0x1 metadata:$0x1]
+; CHECK-NEXT: [cbreg:$0x0 metadata:$0x1] = wrcbreg @!p[[p0]] s[[s0]]
+; CHECK-NEXT: s[[s0]] = rdcbreg @!p[[p0]] [cbreg:$0x1 metadata:$0x2]
+; CHECK-NEXT: [cbreg:$0x0 metadata:$0x2] = wrcbreg @!p[[p0]] s[[s0]]
+
+define void @cbreg_select_smem() {
+entry:
+ %g = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 0)
+ %a0 = getelementptr inbounds i32, i32* %g, i32 0
+ %ar = getelementptr inbounds i32, i32* %g, i32 1036
+ %l = load i32, i32* %a0, align 4
+ %bt = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 128)
+ %bf = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 512)
+ %cbt = tail call x86_mmx @llvm.tpu.wrcbreg.smem.base(x86_mmx undef, i32* %bt)
+ %cbf = tail call x86_mmx @llvm.tpu.wrcbreg.smem.base(x86_mmx undef, i32* %bf)
+ %cmp.i = icmp slt i32 %l, 15
+ %cb_cond = select i1 %cmp.i, x86_mmx %cbt, x86_mmx %cbf
+ %result_s = tail call i32 @llvm.tpu.rdcbreg.size(x86_mmx %cb_cond)
+ store i32 %result_s, i32* %ar, align 4
+ ret void
+}
+
+declare void @llvm.tpu.vst.cb.upd.msk(<8 x i1>, x86_mmx, i32, <8 x i32>)
+
+; Tests that the cb.upd instruction and rdcbreg stay ordered.
+
+; CHECK-LABEL: cbreg_rdcbreg_upd:
+; CHECK: = vst.cb.upd.msk
+; CHECK: s{{[0-9]+}} = rdcbreg
+; CHECK: s{{[0-9]+}} = rdcbreg
+; CHECK: s{{[0-9]+}} = rdcbreg
+define i32 @cbreg_rdcbreg_upd(x86_mmx %cb0, x86_mmx %cb1, <8 x i1> %m, <8 x i32> %v0) {
+ tail call void @llvm.tpu.vst.cb.upd.msk(<8 x i1> %m, x86_mmx %cb0, i32 0, <8 x i32> %v0)
+ %r0 = tail call i32 @llvm.tpu.rdcbreg.size(x86_mmx %cb0)
+ %r1 = tail call i32 @llvm.tpu.rdcbreg.offset(x86_mmx %cb0)
+ %r2 = tail call i32 @llvm.tpu.rdcbreg.size(x86_mmx %cb1)
+ %r3 = add i32 %r0, %r1
+ %r = add i32 %r3, %r2
+ ret i32 %r
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/clear_tile_ibuf.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/clear_tile_ibuf.ll
new file mode 100644
index 0000000..7b4920f
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/clear_tile_ibuf.ll
@@ -0,0 +1,29 @@
+; RUN: opt -S -O2 -mcpu=sparsecore-scs-vf < %s \
+; RUN: | llc -mcpu=sparsecore-scs-vf -asm-verbose=false -disable-cgp \
+; RUN: -opaque-pointers | FileCheck %s
+
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; This test checks that the tpu_clear_ibuf intrinsic is correctly lowered to an
+; sfence followed by a task.clear_ibuf with an empty task.
+
+declare void @llvm.tpu.clear.ibuf(i32 addrspace(208)*)
+declare i32 addrspace(208)* @llvm.tpu.allocate.dreg(i32, i32)
+
+; CHECK-LABEL: SCS:
+; CHECK: sfence
+; CHECK-DAG: [dreg:$0x5] = wrdreg $-0x1
+; CHECK-DAG: [dreg:$0x6] = wrdreg $-0x1
+; CHECK-DAG: [[S0:s[0-9]+]] = simm.s32 $0x5
+; CHECK-NEXT: task.clear_ibuf [dreg:[[S0]]], $0x2ffff
+define void @SCS() #0 section ".text.scs" {
+entry:
+ %dreg = call i32 addrspace(208)* @llvm.tpu.allocate.dreg(i32 2, i32 5)
+ call void @llvm.tpu.clear.ibuf(i32 addrspace(208)* %dreg)
+ ret void
+}
+
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/compiler_metadata_test.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/compiler_metadata_test.ll
new file mode 100644
index 0000000..35a07ea
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/compiler_metadata_test.ll
@@ -0,0 +1,89 @@
+; RUN: llc %s -o - -mcpu=tensorcore-jf -tpu-report-used-spillslots | FileCheck %s
+; REQUIRES: tpu
+
+; Test emission of function metadata
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; External functions
+declare void @g()
+declare void @g1(<1024 x float> %a)
+declare void @g2(float %a)
+declare void @g3(float %a, <1024 x float> %b)
+
+; Make sure we record spill slots and spill instructions correctly for VMem
+; spills and emit them as function metadata.
+; CHECK-LABEL: f_vmem
+; CHECK: shalt
+; CHECK: .section function_metadata.f_vmem,"",@progbits
+; CHECK: .Lvector_spill_slots_num0:
+; CHECK: .long 1
+; CHECK: .Lscalar_spill_slots_num0:
+; CHECK: .long 0
+; CHECK: .Lmax_cross_call_scalar_regs0:
+; CHECK: .long 0
+; CHECK: .Lmax_cross_call_vector_regs0:
+; CHECK: .long 1
+
+define void @f_vmem(<1024 x float> %a) {
+ call void @g()
+ call void @g1(<1024 x float> %a)
+ ret void
+}
+
+; Make sure we record spill slots and spill instructions correctly for SMem
+; spills and emit them as function metadata.
+; CHECK-LABEL: f_smem
+; CHECK: shalt
+; CHECK: .section function_metadata.f_smem,"",@progbits
+; CHECK: .Lvector_spill_slots_num1:
+; CHECK: .long 0
+; CHECK: .Lscalar_spill_slots_num1:
+; CHECK: .long 1
+; CHECK: .Lmax_cross_call_scalar_regs1:
+; CHECK: .long 1
+; CHECK: .Lmax_cross_call_vector_regs1:
+; CHECK: .long 0
+
+; CHECK: // --- End of Compiler Statistics for @f_smem ---
+define void @f_smem(float %a) {
+ call void @g()
+ call void @g2(float %a)
+ ret void
+}
+
+; Make sure we record spill slots and spill instructions correctly for VMem/SMem
+; spills when they are interleaved
+; CHECK-LABEL: f_smem_vmem
+; CHECK: shalt
+; CHECK: .section function_metadata.f_smem_vmem,"",@progbits
+; CHECK: .Lvector_spill_slots_num2:
+; CHECK: .long 1
+; CHECK: .Lscalar_spill_slots_num2:
+; CHECK: .long 1
+; CHECK: .Lmax_cross_call_scalar_regs2:
+; CHECK: .long 1
+; CHECK: .Lmax_cross_call_vector_regs2:
+; CHECK: .long 1
+
+define void @f_smem_vmem(float %a, <1024 x float> %b) {
+ call void @g()
+ call void @g3(float %a, <1024 x float> %b)
+ ret void
+}
+
+!smem.funcs.spill = !{!0, !1, !2}
+!smem.ranges.spill.start = !{!3, !3, !3}
+!smem.ranges.spill.limit = !{!4, !4, !4}
+!vmem.funcs.spill = !{!0, !1, !2}
+!vmem.ranges.spill.start = !{!3, !3, !3}
+!vmem.ranges.spill.limit = !{!4, !4, !4}
+
+
+!0 = !{void (<1024 x float>)* @f_vmem}
+!1 = !{void (float)* @f_smem}
+!2 = !{void (float, <1024 x float>)* @f_smem_vmem}
+
+!3 = !{i32 4}
+!4 = !{i32 100}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/compiler_statistics_test.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/compiler_statistics_test.ll
new file mode 100644
index 0000000..5a2663f
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/compiler_statistics_test.ll
@@ -0,0 +1,94 @@
+; RUN: llc %s -o - -mcpu=tensorcore-jf -asm-verbose=true | FileCheck %s
+; REQUIRES: tpu
+
+; Test emission of compiler statistics with verbose assembly.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; External functions
+declare void @g()
+declare void @g1(<1024 x float> %a)
+declare void @g2(float %a)
+declare void @g3(float %a, <1024 x float> %b)
+
+; Make sure we record spill slots and spill instructions correctly for VMem
+; spills
+; CHECK-LABEL: f_vmem
+; CHECK: shalt
+; CHECK: // --- Compiler Statistics for @f_vmem ---
+; CHECK: // NumberOfBundles = 8
+; CHECK: // NumberOfStaticVDelayWaits = 0
+; CHECK: // NumberOfVectorSpills = 1
+; CHECK: // NumberOfScalarSpills = 0
+; CHECK: // NumberOfVectorReloads = 1
+; CHECK: // NumberOfScalarReloads = 0
+; CHECK: // VectorUsedFrameIndices = 1
+; CHECK: // ScalarUsedFrameIndices = 0
+; CHECK: // MaxCrossCallScalarRegs = 0
+; CHECK: // MaxCrossCallVectorRegs = 1
+; CHECK: // --- End of Compiler Statistics for @f_vmem ---
+define void @f_vmem(<1024 x float> %a) {
+ call void @g()
+ call void @g1(<1024 x float> %a)
+ ret void
+}
+
+; Make sure we record spill slots and spill instructions correctly for SMem
+; spills
+; CHECK-LABEL: f_smem
+; CHECK: shalt
+; CHECK: // --- Compiler Statistics for @f_smem ---
+; CHECK: // NumberOfBundles = 6
+; CHECK: // NumberOfStaticVDelayWaits = 0
+; CHECK: // NumberOfVectorSpills = 0
+; CHECK: // NumberOfScalarSpills = 1
+; CHECK: // NumberOfVectorReloads = 0
+; CHECK: // NumberOfScalarReloads = 1
+; CHECK: // VectorUsedFrameIndices = 0
+; CHECK: // ScalarUsedFrameIndices = 1
+; CHECK: // MaxCrossCallScalarRegs = 1
+; CHECK: // MaxCrossCallVectorRegs = 0
+; CHECK: // --- End of Compiler Statistics for @f_smem ---
+define void @f_smem(float %a) {
+ call void @g()
+ call void @g2(float %a)
+ ret void
+}
+
+; Make sure we record spill slots and spill instructions correctly for VMem/SMem
+; spills when they are interleaved
+; CHECK-LABEL: f_smem_vmem
+; CHECK: shalt
+; CHECK: // --- Compiler Statistics for @f_smem_vmem ---
+; CHECK: // NumberOfBundles = 8
+; CHECK: // NumberOfStaticVDelayWaits = 0
+; CHECK: // NumberOfVectorSpills = 1
+; CHECK: // NumberOfScalarSpills = 1
+; CHECK: // NumberOfVectorReloads = 1
+; CHECK: // NumberOfScalarReloads = 1
+; CHECK: // VectorUsedFrameIndices = 1
+; CHECK: // ScalarUsedFrameIndices = 1
+; CHECK: // MaxCrossCallScalarRegs = 1
+; CHECK: // MaxCrossCallVectorRegs = 1
+; CHECK: // --- End of Compiler Statistics for @f_smem_vmem ---
+define void @f_smem_vmem(float %a, <1024 x float> %b) {
+ call void @g()
+ call void @g3(float %a, <1024 x float> %b)
+ ret void
+}
+
+!smem.funcs.spill = !{!0, !1, !2}
+!smem.ranges.spill.start = !{!3, !3, !3}
+!smem.ranges.spill.limit = !{!4, !4, !4}
+!vmem.funcs.spill = !{!0, !1, !2}
+!vmem.ranges.spill.start = !{!3, !3, !3}
+!vmem.ranges.spill.limit = !{!4, !4, !4}
+
+
+!0 = !{void (<1024 x float>)* @f_vmem}
+!1 = !{void (float)* @f_smem}
+!2 = !{void (float, <1024 x float>)* @f_smem_vmem}
+
+!3 = !{i32 4}
+!4 = !{i32 100}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/continuations_main_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/continuations_main_sc.ll
new file mode 100644
index 0000000..76d2521
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/continuations_main_sc.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -mcpu=sparsecore-scs-vf -disable-cgp \
+; RUN: -enable-continuations | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; This test checks that -enable-continuations adds a br.abs to
+; @_section_cstart instead of a shalt on sparsecore-scs cores.
+
+; Use special section start symbol
+@_section_cstart = common addrspace(215) global i32 0
+
+; CHECK-LABEL: @execute_lowered
+; CHECK: shalt
+define void @execute_lowered(<8 x i32> addrspace(201)* %a) #2 {
+entry:
+
+ %r = load <8 x i32>, <8 x i32> addrspace(201)* %a, align 32
+ ret void
+}
+
+; CHECK-LABEL: @access_lowered
+; CHECK: shalt
+define void @access_lowered(<8 x i32> addrspace(201)* %a) #1 {
+entry:
+
+ %r = load <8 x i32>, <8 x i32> addrspace(201)* %a, align 32
+ ret void
+}
+
+; CHECK-LABEL: @main_lowered
+; CHECK: sbr.abs _section_cstart
+define void @main_lowered(<8 x i32> addrspace(201)* %a) #0 {
+entry:
+
+ %r = load <8 x i32>, <8 x i32> addrspace(201)* %a, align 32
+ ret void
+}
+
+attributes #0 = { "implicit-section-name"=".text.tile_execute" "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "implicit-section-name"=".text.tile_access" "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "implicit-section-name"=".text.scs" "target-cpu"="sparsecore-tec-vf" }
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/continuations_opt_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/continuations_opt_sc.ll
new file mode 100644
index 0000000..f9fdd3e
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/continuations_opt_sc.ll
@@ -0,0 +1,17 @@
+; RUN: opt < %s -S -O2 -mcpu=sparsecore-scs-vf -disable-cgp \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; Tests that @_section_cstart is not optimized out by globaldce or other passes.
+
+; Use special section start symbol
+@_section_cstart = common addrspace(215) global i32 0
+
+; CHECK: @_section_cstart
+
+attributes #0 = { "implicit-section-name"=".text.tile_execute" "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "implicit-section-name"=".text.tile_access" "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "implicit-section-name"=".text.scs" "target-cpu"="sparsecore-tec-vf" }
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/continuations_symbol.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/continuations_symbol.ll
new file mode 100644
index 0000000..329a51c
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/continuations_symbol.ll
@@ -0,0 +1,14 @@
+; RUN: opt < %s -S -O2 -mcpu=sparsecore-scs-vf --enable-continuations \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; Tests that @_section_cstart is inserted if continuations are enabled.
+
+; CHECK: @_section_cstart
+
+attributes #0 = { "implicit-section-name"=".text.tile_execute" "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "implicit-section-name"=".text.tile_access" "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "implicit-section-name"=".text.scs" "target-cpu"="sparsecore-tec-vf" }
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/continuations_tail_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/continuations_tail_sc.ll
new file mode 100644
index 0000000..4a005e0
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/continuations_tail_sc.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -mcpu=sparsecore-scs-vf -disable-cgp \
+; RUN: -enable-continuations | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; This test checks that -enable-continuations-exit adds a br.abs to
+; @_section_cfallthrough instead of a shalt on sparsecore-scs cores.
+
+declare void @llvm.tpu.dma.hbm.to.simem.sc.simple(i32 addrspace(211)*, i32 addrspace(203)*, i32 addrspace(215)*, i32, i32) argmemonly nounwind
+
+; Use special section fallthrough symbol
+@_section_cfallthrough = common addrspace(215) global i32 0
+
+; CHECK-LABEL: tail_lowered:
+; CHECK: (pc) = sbr.abs _section_cfallthrough
+; CHECK: s[[ft:[0-9]+]] = simm.s32 _section_cfallthrough
+; CHECK: [simem:s[[ft]]], [sflag:s{{[0-9]+}}] = dma.local [hbm:s{{[0-9]+}}], $0x4
+define void @tail_lowered(i32 addrspace(203)* %src, i32 addrspace(215)* %dst, i32 addrspace(211)* %rflag) #0 {
+ call void @llvm.tpu.dma.hbm.to.simem.sc.simple(i32 addrspace(211)* %rflag, i32 addrspace(203)* %src, i32 addrspace(215)* @_section_cfallthrough, i32 4, i32 0)
+ ret void
+}
+
+attributes #0 = { "implicit-section-name"=".text.tile_execute" "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "implicit-section-name"=".text.tile_access" "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "implicit-section-name"=".text.scs" "target-cpu"="sparsecore-tec-vf" }
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/copy_rotate_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/copy_rotate_sc.ll
new file mode 100644
index 0000000..a4006a4
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/copy_rotate_sc.ll
@@ -0,0 +1,202 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf \
+; RUN: -tpu-enable-copy-rotate -stop-after=tpu-copy-rotate \
+; RUN: -instcombine-max-iterations=0 | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; Tests that all copies have been rotated to the bottom of the
+; loop (experimental pass) for GFC analysis.
+
+; The LLVM IR in this test has been copied from the Clang output of
+; sorted_deduplicate_unroll3_predicated.
+
+; CHECK-LABEL: bb.5 (%ir-block.20):
+; CHECK-NOT: COPY
+; CHECK: scBUNDLE
+; CHECK: scBUNDLE
+; CHECK: scBUNDLE
+; CHECK: scBUNDLE
+; CHECK: scBUNDLE
+; CHECK: scBUNDLE
+; CHECK: scBUNDLE
+; CHECK: scBUNDLE
+; CHECK: BRcond
+; CHECK: COPY
+; CHECK: COPY
+; CHECK: COPY
+; CHECK: COPY
+; CHECK: COPY
+
+; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind readnone willreturn
+define dso_local void @tile_access() #0 section ".text.tile_access" {
+ ret void
+}
+
+; Function Attrs: alwaysinline nounwind
+define dso_local void @tile_execute() #1 section ".text.tile_execute" {
+ %1 = load i32, i32* inttoptr (i32 256 to i32*), align 256, !tbaa !10
+ %2 = load i32, i32* inttoptr (i32 257 to i32*), align 4, !tbaa !10
+ %3 = sdiv i32 %1, 8
+ %4 = inttoptr i32 %2 to <8 x i32> addrspace(201)*
+ %5 = icmp sgt i32 %1, 7
+ br i1 %5, label %.lr.ph, label %_ZN10embeddings17SortedDeduplicate7ComputeENS_20TileSpmemVectorArrayIiEEPS2_PiS3_.exit, !llvm.loop !14
+
+.lr.ph: ; preds = %0
+ %6 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %4, i32 0, i32 0
+ %7 = load i32, i32 addrspace(201)* %6, align 32
+ %8 = add nsw i32 %7, -1
+ %9 = insertelement <8 x i32> poison, i32 %8, i64 0
+ %10 = shufflevector <8 x i32> %9, <8 x i32> poison, <8 x i32> zeroinitializer
+ %11 = load i32, i32* inttoptr (i32 259 to i32*), align 4, !tbaa !10
+ %12 = load i32, i32* inttoptr (i32 258 to i32*), align 4, !tbaa !10
+ %13 = inttoptr i32 %12 to <8 x i32> addrspace(201)*
+ %14 = inttoptr i32 %11 to <8 x i32> addrspace(201)*
+ br label %15
+
+15: ; preds = %.lr.ph, %50
+ %.0717 = phi <8 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, %.lr.ph ], [ %.3, %50 ]
+ %.0516716 = phi <8 x i32> [ %10, %.lr.ph ], [ %.3519, %50 ]
+ %.0520715 = phi i32 [ 0, %.lr.ph ], [ %51, %50 ]
+ %16 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %4, i32 %.0520715
+ %17 = load <8 x i32>, <8 x i32> addrspace(201)* %16, align 32, !tbaa !22, !alias.scope !23, !noalias !26, !llvm.access.group !17
+ %18 = tail call <8 x i32> @llvm.tpu.vshift.insert.v8i32(<8 x i32> %.0516716, <8 x i32> %17, i32 7)
+ %19 = icmp ne <8 x i32> %18, %17
+ %20 = tail call <8 x i32> @llvm.tpu.vmpcnt.ones(<8 x i1> %19)
+ %21 = tail call <8 x i32> @llvm.tpu.mprefix.v8i32(<8 x i1> %19)
+ %22 = add <8 x i32> %21, %.0717
+ tail call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> %19, <8 x i32> addrspace(201)* %13, <8 x i32> %22, <8 x i32> %17), !alias.scope !23, !noalias !26, !llvm.access.group !17
+ %23 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %14, i32 %.0520715
+ store <8 x i32> %22, <8 x i32> addrspace(201)* %23, align 32, !tbaa !22, !alias.scope !23, !noalias !26, !llvm.access.group !17
+ %24 = add <8 x i32> %20, %.0717
+ %25 = add nuw nsw i32 %.0520715, 1
+ %26 = icmp slt i32 %25, %3
+ br i1 %26, label %27, label %37, !llvm.loop !14
+
+27: ; preds = %15
+ %28 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %4, i32 %25
+ %29 = load <8 x i32>, <8 x i32> addrspace(201)* %28, align 32, !tbaa !22, !alias.scope !37, !noalias !38, !llvm.access.group !17
+ %30 = tail call <8 x i32> @llvm.tpu.vshift.insert.v8i32(<8 x i32> %17, <8 x i32> %29, i32 7)
+ %31 = icmp ne <8 x i32> %30, %29
+ %32 = tail call <8 x i32> @llvm.tpu.vmpcnt.ones(<8 x i1> %31)
+ %33 = tail call <8 x i32> @llvm.tpu.mprefix.v8i32(<8 x i1> %31)
+ %34 = add <8 x i32> %33, %24
+ tail call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> %31, <8 x i32> addrspace(201)* %13, <8 x i32> %34, <8 x i32> %29), !alias.scope !37, !noalias !38, !llvm.access.group !17
+ %35 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %14, i32 %25
+ store <8 x i32> %34, <8 x i32> addrspace(201)* %35, align 32, !tbaa !22, !alias.scope !37, !noalias !38, !llvm.access.group !17
+ %36 = add <8 x i32> %32, %24
+ br label %37, !llvm.loop !14
+
+37: ; preds = %27, %15
+ %.2518 = phi <8 x i32> [ %29, %27 ], [ %17, %15 ]
+ %.2 = phi <8 x i32> [ %36, %27 ], [ %24, %15 ]
+ %38 = add nuw nsw i32 %.0520715, 2
+ %39 = icmp slt i32 %38, %3
+ br i1 %39, label %40, label %50, !llvm.loop !14
+
+40: ; preds = %37
+ %41 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %4, i32 %38
+ %42 = load <8 x i32>, <8 x i32> addrspace(201)* %41, align 32, !tbaa !22, !alias.scope !39, !noalias !40, !llvm.access.group !17
+ %43 = tail call <8 x i32> @llvm.tpu.vshift.insert.v8i32(<8 x i32> %.2518, <8 x i32> %42, i32 7)
+ %44 = icmp ne <8 x i32> %43, %42
+ %45 = tail call <8 x i32> @llvm.tpu.vmpcnt.ones(<8 x i1> %44)
+ %46 = tail call <8 x i32> @llvm.tpu.mprefix.v8i32(<8 x i1> %44)
+ %47 = add <8 x i32> %46, %.2
+ tail call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> %44, <8 x i32> addrspace(201)* %13, <8 x i32> %47, <8 x i32> %42), !alias.scope !39, !noalias !40, !llvm.access.group !17
+ %48 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %14, i32 %38
+ store <8 x i32> %47, <8 x i32> addrspace(201)* %48, align 32, !tbaa !22, !alias.scope !39, !noalias !40, !llvm.access.group !17
+ %49 = add <8 x i32> %45, %.2
+ br label %50, !llvm.loop !14
+
+50: ; preds = %40, %37
+ %.3519 = phi <8 x i32> [ %42, %40 ], [ %.2518, %37 ]
+ %.3 = phi <8 x i32> [ %49, %40 ], [ %.2, %37 ]
+ %51 = add nuw nsw i32 %.0520715, 3
+ %52 = icmp slt i32 %51, %3
+ br i1 %52, label %15, label %_ZN10embeddings17SortedDeduplicate7ComputeENS_20TileSpmemVectorArrayIiEEPS2_PiS3_.exit, !llvm.loop !14
+
+_ZN10embeddings17SortedDeduplicate7ComputeENS_20TileSpmemVectorArrayIiEEPS2_PiS3_.exit: ; preds = %50, %0
+ %.0.lcssa = phi <8 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, %0 ], [ %.3, %50 ]
+ %53 = extractelement <8 x i32> %.0.lcssa, i64 0
+ %54 = add nsw i32 %53, 1
+ store i32 %54, i32* inttoptr (i32 256 to i32*), align 256, !tbaa !10
+ ret void
+}
+
+; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind readnone willreturn
+define dso_local void @scs() #2 section ".text.scs" {
+ ret void
+}
+
+; Function Attrs: alwaysinline nofree nosync nounwind readnone
+declare <8 x i32> @llvm.tpu.vshift.insert.v8i32(<8 x i32>, <8 x i32>, i32) #3
+
+; Function Attrs: alwaysinline nofree nosync nounwind readnone
+declare <8 x i32> @llvm.tpu.vmpcnt.ones(<8 x i1>) #3
+
+; Function Attrs: alwaysinline nofree nosync nounwind readnone
+declare <8 x i32> @llvm.tpu.mprefix.v8i32(<8 x i1>) #3
+
+; Function Attrs: alwaysinline argmemonly nounwind writeonly
+declare void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, <8 x i32>) #4
+
+attributes #0 = { alwaysinline mustprogress nofree norecurse nosync nounwind readnone willreturn "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tac-vf" }
+attributes #1 = { alwaysinline nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #2 = { alwaysinline mustprogress nofree norecurse nosync nounwind readnone willreturn "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-scs-vf" }
+attributes #3 = { alwaysinline nofree nosync nounwind readnone }
+attributes #4 = { alwaysinline argmemonly nounwind writeonly }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+!smem.funcs.spill = !{!3, !4, !5}
+!smem.ranges.spill.start = !{!6, !6, !6}
+!smem.ranges.spill.limit = !{!7, !7, !8}
+!tilespmem.funcs.spill = !{!3, !4, !5}
+!tilespmem.ranges.spill.start = !{!6, !6, !6}
+!tilespmem.ranges.spill.limit = !{!9, !9, !6}
+!vmem.funcs.spill = !{!3, !4, !5}
+!vmem.ranges.spill.start = !{!6, !6, !6}
+!vmem.ranges.spill.limit = !{!6, !6, !6}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"frame-pointer", i32 2}
+!2 = !{!"clang version google3-trunk (b223e5f8468cbed5cffe0d872de8feac2a73b030)"}
+!3 = !{void ()* @tile_access}
+!4 = !{void ()* @tile_execute}
+!5 = !{void ()* @scs}
+!6 = !{i32 0}
+!7 = !{i32 2048}
+!8 = !{i32 16384}
+!9 = !{i32 131072}
+!10 = !{!11, !11, i64 0}
+!11 = !{!"int", !12, i64 0}
+!12 = !{!"omnipotent char", !13, i64 0}
+!13 = !{!"Simple C++ TBAA"}
+!14 = distinct !{!14, !15, !16, !18, !19, !20, !21}
+!15 = !{!"llvm.loop.mustprogress"}
+!16 = !{!"llvm.loop.parallel_accesses", !17}
+!17 = distinct !{}
+!18 = !{!"llvm.loop.unroll.disable"}
+!19 = !{!"llvm.loop.vectorize.width", i32 1}
+!20 = !{!"llvm.loop.interleave.count", i32 1}
+!21 = !{!"llvm.loop.vectorize.enable", i1 true}
+!22 = !{!12, !12, i64 0}
+!23 = !{!24}
+!24 = distinct !{!24, !25, !"loop.parallel"}
+!25 = distinct !{!25, !"<unnamed loop>"}
+!26 = !{!27, !28, !30, !32, !33, !35}
+!27 = distinct !{!27, !25, !"loop.parallel"}
+!28 = distinct !{!28, !29, !"loop.parallel"}
+!29 = distinct !{!29, !"<unnamed loop>"}
+!30 = distinct !{!30, !31, !"loop.parallel"}
+!31 = distinct !{!31, !"<unnamed loop>"}
+!32 = distinct !{!32, !31, !"loop.parallel"}
+!33 = distinct !{!33, !34, !"loop.parallel"}
+!34 = distinct !{!34, !"<unnamed loop>"}
+!35 = distinct !{!35, !36, !"loop.parallel"}
+!36 = distinct !{!36, !"<unnamed loop>"}
+!37 = !{!28}
+!38 = !{!24, !27, !30, !32, !33, !35}
+!39 = !{!33}
+!40 = !{!24, !27, !28, !30, !32, !35}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/cross_lane_gf_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/cross_lane_gf_sc.ll
new file mode 100644
index 0000000..b860c93
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/cross_lane_gf_sc.ll
@@ -0,0 +1,704 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-gf -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare { <16 x i32>, <16 x i32>, <16 x i1> } @llvm.tpu.dupcnti.v16i32(<16 x i1>, <16 x i32>) readnone nounwind
+declare { <16 x i32>, <16 x i32>, <16 x i1> } @llvm.tpu.dupcntf.v16f32(<16 x i1>, <16 x float>) readnone nounwind
+declare { <16 x i32>, <16 x i32>, <16 x i1> } @llvm.tpu.uniquei.v16i32(<16 x i1>, <16 x i32>) readnone nounwind
+declare { <16 x i32>, <16 x i32>, <16 x i1> } @llvm.tpu.uniquef.v16f32(<16 x i1>, <16 x float>) readnone nounwind
+declare { <16 x i32>, <16 x i32>, <16 x i1> } @llvm.tpu.sort.ascdi.v16i32.v16i32(<16 x i1>, <16 x i32>, <16 x i32>) readnone nounwind
+declare { <16 x i32>, <16 x float>, <16 x i1> } @llvm.tpu.sort.ascdi.v16i32.v16f32(<16 x i1>, <16 x i32>, <16 x float>) readnone nounwind
+declare { <16 x float>, <16 x float>, <16 x i1> } @llvm.tpu.sort.ascdf.v16f32.v16f32(<16 x i1>, <16 x float>, <16 x float>) readnone nounwind
+declare { <16 x float>, <16 x i32>, <16 x i1> } @llvm.tpu.sort.ascdf.v16f32.v16i32(<16 x i1>, <16 x float>, <16 x i32>) readnone nounwind
+declare { <16 x i32>, <16 x i32>, <16 x i1> } @llvm.tpu.sort.dscdi.v16i32.v16i32(<16 x i1>, <16 x i32>, <16 x i32>) readnone nounwind
+declare { <16 x i32>, <16 x float>, <16 x i1> } @llvm.tpu.sort.dscdi.v16i32.v16f32(<16 x i1>, <16 x i32>, <16 x float>) readnone nounwind
+declare { <16 x float>, <16 x float>, <16 x i1> } @llvm.tpu.sort.dscdf.v16f32.v16f32(<16 x i1>, <16 x float>, <16 x float>) readnone nounwind
+declare { <16 x float>, <16 x i32>, <16 x i1> } @llvm.tpu.sort.dscdf.v16f32.v16i32(<16 x i1>, <16 x float>, <16 x i32>) readnone nounwind
+declare { <16 x i32>, <16 x i1> } @llvm.tpu.add.scan1xNi.v16i32(<16 x i1>, <16 x i32>) readnone nounwind
+declare { <16 x float>, <16 x i1> } @llvm.tpu.add.scan1xNf.v16f32(<16 x i1>, <16 x float>) readnone nounwind
+declare { <16 x i32>, <16 x i1> } @llvm.tpu.add.seg.scan1xNi.v16i32(<16 x i1>, <16 x i32>) readnone nounwind
+declare { <16 x float>, <16 x i1> } @llvm.tpu.add.seg.scan1xNf.v16f32(<16 x i1>, <16 x float>) readnone nounwind
+declare { <16 x i32>, <16 x i1> } @llvm.tpu.min.seg.scan1xNi.v16i32(<16 x i1>, <16 x i32>) readnone nounwind
+declare { <16 x float>, <16 x i1> } @llvm.tpu.min.seg.scan1xNf.v16f32(<16 x i1>, <16 x float>) readnone nounwind
+declare { <16 x i32>, <16 x i1> } @llvm.tpu.min.seg.index.scan1xNi.v16i32(<16 x i1>, <16 x i32>) readnone nounwind
+declare { <16 x i32>, <16 x i1> } @llvm.tpu.min.seg.index.scan1xNf.v16f32(<16 x i1>, <16 x float>) readnone nounwind
+declare { <16 x i32>, <16 x i1> } @llvm.tpu.max.seg.scan1xNi.v16i32(<16 x i1>, <16 x i32>) readnone nounwind
+declare { <16 x float>, <16 x i1> } @llvm.tpu.max.seg.scan1xNf.v16f32(<16 x i1>, <16 x float>) readnone nounwind
+declare { <16 x i32>, <16 x i1> } @llvm.tpu.max.seg.index.scan1xNi.v16i32(<16 x i1>, <16 x i32>) readnone nounwind
+declare { <16 x i32>, <16 x i1> } @llvm.tpu.max.seg.index.scan1xNf.v16f32(<16 x i1>, <16 x float>) readnone nounwind
+declare { <16 x i32>, <16 x i1> } @llvm.tpu.min.scan1xNi.v16i32(<16 x i1>, <16 x i32>) readnone nounwind
+declare { <16 x float>, <16 x i1> } @llvm.tpu.min.scan1xNf.v16f32(<16 x i1>, <16 x float>) readnone nounwind
+declare { <16 x i32>, <16 x i1> } @llvm.tpu.min.index.scan1xNi.v16i32(<16 x i1>, <16 x i32>) readnone nounwind
+declare { <16 x i32>, <16 x i1> } @llvm.tpu.min.index.scan1xNf.v16f32(<16 x i1>, <16 x float>) readnone nounwind
+declare { <16 x i32>, <16 x i1> } @llvm.tpu.max.scan1xNi.v16i32(<16 x i1>, <16 x i32>) readnone nounwind
+declare { <16 x float>, <16 x i1> } @llvm.tpu.max.scan1xNf.v16f32(<16 x i1>, <16 x float>) readnone nounwind
+declare { <16 x i32>, <16 x i1> } @llvm.tpu.max.index.scan1xNi.v16i32(<16 x i1>, <16 x i32>) readnone nounwind
+declare { <16 x i32>, <16 x i1> } @llvm.tpu.max.index.scan1xNf.v16f32(<16 x i1>, <16 x float>) readnone nounwind
+
+; We deliberately make this a change-detector test because we really care that
+; there are 7 nops and one pop per cycle.
+; CHECK-LABEL: vdupcnti:
+; CHECK: { (xrf1) = vdupcnt.msk.u32 vm0, v0;
+; CHECK-NEXT: _ = sdelay $0x8 }
+; CHECK-NEXT: { _, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf1)
+define <16 x i32> @vdupcnti(<16 x i1> %m, <16 x i32> %v) {
+ %a = call { <16 x i32>, <16 x i32>, <16 x i1> } @llvm.tpu.dupcnti.v16i32(<16 x i1> %m, <16 x i32> %v)
+ %co = extractvalue { <16 x i32>, <16 x i32>, <16 x i1> } %a, 1
+ %mo = extractvalue { <16 x i32>, <16 x i32>, <16 x i1> } %a, 2
+ %b = select <16 x i1> %mo, <16 x i32> %co, <16 x i32> %v
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vdupcntf:
+; CHECK: { (xrf1) = vdupcnt.msk.f32 vm0, v0;
+; CHECK-NEXT: _ = sdelay $0x8 }
+; CHECK-NEXT: { _, v{{[0-9]+}}, _ = vpop (xrf1)
+define <16 x i32> @vdupcntf(<16 x i1> %m, <16 x float> %v) {
+ %a = call { <16 x i32>, <16 x i32>, <16 x i1> } @llvm.tpu.dupcntf.v16f32(<16 x i1> %m, <16 x float> %v)
+ %co = extractvalue { <16 x i32>, <16 x i32>, <16 x i1> } %a, 1
+ ret <16 x i32> %co
+}
+
+; CHECK-LABEL: vuniquei:
+; CHECK: { (xrf1) = vunique.msk.u32 vm0, v0;
+; CHECK-NEXT: _ = sdelay $0x8 }
+; CHECK-NEXT: { _, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf1)
+define <16 x i32> @vuniquei(<16 x i1> %m, <16 x i32> %v) {
+ %a = call { <16 x i32>, <16 x i32>, <16 x i1> } @llvm.tpu.uniquei.v16i32(<16 x i1> %m, <16 x i32> %v)
+ %co = extractvalue { <16 x i32>, <16 x i32>, <16 x i1> } %a, 1
+ %mo = extractvalue { <16 x i32>, <16 x i32>, <16 x i1> } %a, 2
+ %b = select <16 x i1> %mo, <16 x i32> %co, <16 x i32> %v
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vuniquef:
+; CHECK: { (xrf1) = vunique.msk.f32 vm0, v0;
+; CHECK-NEXT: _ = sdelay $0x8 }
+; CHECK-NEXT: { _, v{{[0-9]+}}, _ = vpop (xrf1)
+define <16 x i32> @vuniquef(<16 x i1> %m, <16 x float> %v) {
+ %a = call { <16 x i32>, <16 x i32>, <16 x i1> } @llvm.tpu.uniquef.v16f32(<16 x i1> %m, <16 x float> %v)
+ %co = extractvalue { <16 x i32>, <16 x i32>, <16 x i1> } %a, 1
+ ret <16 x i32> %co
+}
+
+; CHECK-LABEL: vsort.ascd.i.u32:
+; CHECK: { (xrf1) = vsort.ascd.msk.u32 vm0, v0, v1;
+; CHECK-NEXT: _ = sdelay $0x8 }
+; CHECK-NEXT: { v0, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf1) }
+define <16 x i32> @vsort.ascd.i.u32(<16 x i1> %m, <16 x i32> %k, <16 x i32> %v) {
+ %a = call { <16 x i32>, <16 x i32>, <16 x i1> } @llvm.tpu.sort.ascdi.v16i32.v16i32(<16 x i1> %m, <16 x i32> %k, <16 x i32> %v)
+ %ko = extractvalue { <16 x i32>, <16 x i32>, <16 x i1> } %a, 0
+ %vo = extractvalue { <16 x i32>, <16 x i32>, <16 x i1> } %a, 1
+ %mo = extractvalue { <16 x i32>, <16 x i32>, <16 x i1> } %a, 2
+ %b = select <16 x i1> %mo, <16 x i32> %ko, <16 x i32> %vo
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vsort.ascd.f.u32:
+; CHECK: { (xrf1) = vsort.ascd.msk.u32 vm0, v0, v1;
+; CHECK-NEXT: _ = sdelay $0x8 }
+; CHECK-NEXT: { v0, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf1) }
+define <16 x i32> @vsort.ascd.f.u32(<16 x i1> %m, <16 x i32> %k, <16 x float> %v) {
+ %a = call { <16 x i32>, <16 x float>, <16 x i1> } @llvm.tpu.sort.ascdi.v16i32.v16f32(<16 x i1> %m, <16 x i32> %k, <16 x float> %v)
+ %ko = extractvalue { <16 x i32>, <16 x float>, <16 x i1> } %a, 0
+ %vo = extractvalue { <16 x i32>, <16 x float>, <16 x i1> } %a, 1
+ %voi = bitcast <16 x float> %vo to <16 x i32>
+ %mo = extractvalue { <16 x i32>, <16 x float>, <16 x i1> } %a, 2
+ %b = select <16 x i1> %mo, <16 x i32> %ko, <16 x i32> %voi
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vsort.ascd.f.f32:
+; CHECK: { (xrf1) = vsort.ascd.msk.f32 vm0, v0, v1;
+; CHECK-NEXT: _ = sdelay $0x8 }
+; CHECK-NEXT: { v0, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf1) }
+define <16 x float> @vsort.ascd.f.f32(<16 x i1> %m, <16 x float> %k, <16 x float> %v) {
+ %a = call { <16 x float>, <16 x float>, <16 x i1> } @llvm.tpu.sort.ascdf.v16f32.v16f32(<16 x i1> %m, <16 x float> %k, <16 x float> %v)
+ %ko = extractvalue { <16 x float>, <16 x float>, <16 x i1> } %a, 0
+ %vo = extractvalue { <16 x float>, <16 x float>, <16 x i1> } %a, 1
+ %mo = extractvalue { <16 x float>, <16 x float>, <16 x i1> } %a, 2
+ %b = select <16 x i1> %mo, <16 x float> %ko, <16 x float> %vo
+ ret <16 x float> %b
+}
+
+; CHECK-LABEL: vsort.ascd.i.f32:
+; CHECK: { (xrf1) = vsort.ascd.msk.f32 vm0, v0, v1;
+; CHECK-NEXT: _ = sdelay $0x8 }
+; CHECK-NEXT: { v0, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf1) }
+define <16 x float> @vsort.ascd.i.f32(<16 x i1> %m, <16 x float> %k, <16 x i32> %v) {
+ %a = call { <16 x float>, <16 x i32>, <16 x i1> } @llvm.tpu.sort.ascdf.v16f32.v16i32(<16 x i1> %m, <16 x float> %k, <16 x i32> %v)
+ %ko = extractvalue { <16 x float>, <16 x i32>, <16 x i1> } %a, 0
+ %vo = extractvalue { <16 x float>, <16 x i32>, <16 x i1> } %a, 1
+ %vof = bitcast <16 x i32> %vo to <16 x float>
+ %mo = extractvalue { <16 x float>, <16 x i32>, <16 x i1> } %a, 2
+ %b = select <16 x i1> %mo, <16 x float> %ko, <16 x float> %vof
+ ret <16 x float> %b
+}
+
+; CHECK-LABEL: vsort.dscd.i.u32:
+; CHECK: { (xrf1) = vsort.dscd.msk.u32 vm0, v0, v1;
+; CHECK-NEXT: _ = sdelay $0x8 }
+; CHECK-NEXT: { v0, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf1) }
+define <16 x i32> @vsort.dscd.i.u32(<16 x i1> %m, <16 x i32> %k, <16 x i32> %v) {
+ %a = call { <16 x i32>, <16 x i32>, <16 x i1> } @llvm.tpu.sort.dscdi.v16i32.v16i32(<16 x i1> %m, <16 x i32> %k, <16 x i32> %v)
+ %ko = extractvalue { <16 x i32>, <16 x i32>, <16 x i1> } %a, 0
+ %vo = extractvalue { <16 x i32>, <16 x i32>, <16 x i1> } %a, 1
+ %mo = extractvalue { <16 x i32>, <16 x i32>, <16 x i1> } %a, 2
+ %b = select <16 x i1> %mo, <16 x i32> %ko, <16 x i32> %vo
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vsort.dscd.f.u32:
+; CHECK: { (xrf1) = vsort.dscd.msk.u32 vm0, v0, v1;
+; CHECK-NEXT: _ = sdelay $0x8 }
+; CHECK-NEXT: { v0, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf1) }
+define <16 x i32> @vsort.dscd.f.u32(<16 x i1> %m, <16 x i32> %k, <16 x float> %v) {
+ %a = call { <16 x i32>, <16 x float>, <16 x i1> } @llvm.tpu.sort.dscdi.v16i32.v16f32(<16 x i1> %m, <16 x i32> %k, <16 x float> %v)
+ %ko = extractvalue { <16 x i32>, <16 x float>, <16 x i1> } %a, 0
+ %vo = extractvalue { <16 x i32>, <16 x float>, <16 x i1> } %a, 1
+ %voi = bitcast <16 x float> %vo to <16 x i32>
+ %mo = extractvalue { <16 x i32>, <16 x float>, <16 x i1> } %a, 2
+ %b = select <16 x i1> %mo, <16 x i32> %ko, <16 x i32> %voi
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vsort.dscd.f.f32:
+; CHECK: { (xrf1) = vsort.dscd.msk.f32 vm0, v0, v1;
+; CHECK-NEXT: _ = sdelay $0x8 }
+; CHECK-NEXT: { v0, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf1) }
+define <16 x float> @vsort.dscd.f.f32(<16 x i1> %m, <16 x float> %k, <16 x float> %v) {
+ %a = call { <16 x float>, <16 x float>, <16 x i1> } @llvm.tpu.sort.dscdf.v16f32.v16f32(<16 x i1> %m, <16 x float> %k, <16 x float> %v)
+ %ko = extractvalue { <16 x float>, <16 x float>, <16 x i1> } %a, 0
+ %vo = extractvalue { <16 x float>, <16 x float>, <16 x i1> } %a, 1
+ %mo = extractvalue { <16 x float>, <16 x float>, <16 x i1> } %a, 2
+ %b = select <16 x i1> %mo, <16 x float> %ko, <16 x float> %vo
+ ret <16 x float> %b
+}
+
+; CHECK-LABEL: vsort.dscd.i.f32:
+; CHECK: { (xrf1) = vsort.dscd.msk.f32 vm0, v0, v1;
+; CHECK-NEXT: _ = sdelay $0x8 }
+; CHECK-NEXT: { v0, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf1) }
+define <16 x float> @vsort.dscd.i.f32(<16 x i1> %m, <16 x float> %k, <16 x i32> %v) {
+ %a = call { <16 x float>, <16 x i32>, <16 x i1> } @llvm.tpu.sort.dscdf.v16f32.v16i32(<16 x i1> %m, <16 x float> %k, <16 x i32> %v)
+ %ko = extractvalue { <16 x float>, <16 x i32>, <16 x i1> } %a, 0
+ %vo = extractvalue { <16 x float>, <16 x i32>, <16 x i1> } %a, 1
+ %vof = bitcast <16 x i32> %vo to <16 x float>
+ %mo = extractvalue { <16 x float>, <16 x i32>, <16 x i1> } %a, 2
+ %b = select <16 x i1> %mo, <16 x float> %ko, <16 x float> %vof
+ ret <16 x float> %b
+}
+
+; CHECK-LABEL: vadd.scan1xNi:
+; CHECK: (xrf0) = vadd.scan.msk.s32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x i32> @vadd.scan1xNi(<16 x i1> %m, <16 x i32> %v) {
+ %a = call { <16 x i32>, <16 x i1> } @llvm.tpu.add.scan1xNi.v16i32(<16 x i1> %m, <16 x i32> %v)
+ %po = extractvalue { <16 x i32>, <16 x i1> } %a, 0
+ %mo = extractvalue { <16 x i32>, <16 x i1> } %a, 1
+ %b = select <16 x i1> %mo, <16 x i32> %po, <16 x i32> %v
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vadd.scan1xNf:
+; CHECK: (xrf0) = vadd.scan.msk.f32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x float> @vadd.scan1xNf(<16 x i1> %m, <16 x float> %v) {
+ %a = call { <16 x float>, <16 x i1> } @llvm.tpu.add.scan1xNf.v16f32(<16 x i1> %m, <16 x float> %v)
+ %po = extractvalue { <16 x float>, <16 x i1> } %a, 0
+ %mo = extractvalue { <16 x float>, <16 x i1> } %a, 1
+ %b = select <16 x i1> %mo, <16 x float> %po, <16 x float> %v
+ ret <16 x float> %b
+}
+
+; CHECK-LABEL: vadd.seg.scan1xN.s32:
+; CHECK: (xrf0) = vadd.seg.scan.s32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x i32> @vadd.seg.scan1xN.s32(<16 x i1> %m, <16 x i32> %v) {
+ %a = call { <16 x i32>, <16 x i1> } @llvm.tpu.add.seg.scan1xNi.v16i32(<16 x i1> %m, <16 x i32> %v)
+ %po = extractvalue { <16 x i32>, <16 x i1> } %a, 0
+ %mo = extractvalue { <16 x i32>, <16 x i1> } %a, 1
+ %b = select <16 x i1> %mo, <16 x i32> %po, <16 x i32> %v
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vadd.seg.scan1xNf32:
+; CHECK: (xrf0) = vadd.seg.scan.f32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x float> @vadd.seg.scan1xNf32(<16 x i1> %m, <16 x float> %v) {
+ %a = call { <16 x float>, <16 x i1> } @llvm.tpu.add.seg.scan1xNf.v16f32(<16 x i1> %m, <16 x float> %v)
+ %po = extractvalue { <16 x float>, <16 x i1> } %a, 0
+ %mo = extractvalue { <16 x float>, <16 x i1> } %a, 1
+ %b = select <16 x i1> %mo, <16 x float> %po, <16 x float> %v
+ ret <16 x float> %b
+}
+
+; Tests that the following intrinsics are DCE'd, because there is no data
+; dependency and no other memory dependency.
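+; The tests above keep their calls alive by consuming the returned struct; the
+; calls below discard every result and the intrinsics are declared readnone
+; nounwind, so nothing anchors them.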
+; CHECK-LABEL: elim_vdupcnti:
+; CHECK-NOT: { (xrf1) = vdupcnt
+define void @elim_vdupcnti(<16 x i1> %m, <16 x i32> %v) {
+ %a = call { <16 x i32>, <16 x i32>, <16 x i1> } @llvm.tpu.dupcnti.v16i32(<16 x i1> %m, <16 x i32> %v)
+ ret void
+}
+
+; Tests: see above
+; CHECK-LABEL: elim_vdupcntf:
+; CHECK-NOT: { (xrf1) = vdupcnt
+define void @elim_vdupcntf(<16 x i1> %m, <16 x float> %v) {
+ %a = call { <16 x i32>, <16 x i32>, <16 x i1> } @llvm.tpu.dupcntf.v16f32(<16 x i1> %m, <16 x float> %v)
+ ret void
+}
+
+; Tests: see above
+; CHECK-LABEL: elim_uniquei:
+; CHECK-NOT: { (xrf1) = vunique
+define void @elim_uniquei(<16 x i32> %id, <16 x i1> %m, <16 x i32> %v) {
+ %a = call { <16 x i32>, <16 x i32>, <16 x i1> } @llvm.tpu.uniquei.v16i32(<16 x i1> %m, <16 x i32> %v)
+ ret void
+}
+
+; Tests: see above
+; CHECK-LABEL: elim_uniquef:
+; CHECK-NOT: { (xrf1) = vunique
+define void @elim_uniquef(<16 x i32> %id, <16 x i1> %m, <16 x float> %v) {
+ %a = call { <16 x i32>, <16 x i32>, <16 x i1> } @llvm.tpu.uniquef.v16f32(<16 x i1> %m, <16 x float> %v)
+ ret void
+}
+
+; Tests: see above
+; CHECK-LABEL: elim_sort_ascdi:
+; CHECK-NOT: { (xrf1) = vsort.ascd
+define void @elim_sort_ascdi(<16 x i1> %m, <16 x i32> %k, <16 x i32> %v) {
+ %a = call { <16 x i32>, <16 x i32>, <16 x i1> } @llvm.tpu.sort.ascdi.v16i32.v16i32(<16 x i1> %m,
+ <16 x i32> %k,
+ <16 x i32> %v)
+ ret void
+}
+
+; Tests: see above
+; CHECK-LABEL: elim_sort_ascdf:
+; CHECK-NOT: { (xrf1) = vsort.ascd
+define void @elim_sort_ascdf(<16 x i1> %m, <16 x float> %k, <16 x float> %v) {
+ %a = call { <16 x float>, <16 x float>, <16 x i1> } @llvm.tpu.sort.ascdf.v16f32.v16f32(<16 x i1> %m,
+ <16 x float> %k,
+ <16 x float> %v)
+ ret void
+}
+
+; Tests: see above
+; CHECK-LABEL: elim_sort_dscdi:
+; CHECK-NOT: { (xrf1) = vsort.dscd
+define void @elim_sort_dscdi(<16 x i1> %m, <16 x i32> %k, <16 x i32> %v) {
+ %a = call { <16 x i32>, <16 x i32>, <16 x i1> } @llvm.tpu.sort.dscdi.v16i32.v16i32(<16 x i1> %m,
+ <16 x i32> %k,
+ <16 x i32> %v)
+ ret void
+}
+
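+; Tests: see above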
+; CHECK-LABEL: elim_sort_dscdf:
+; CHECK-NOT: { (xrf1) = vsort.dscd
+define void @elim_sort_dscdf(<16 x i1> %m, <16 x float> %k, <16 x float> %v) {
+ %a = call { <16 x float>, <16 x float>, <16 x i1> } @llvm.tpu.sort.dscdf.v16f32.v16f32(<16 x i1> %m,
+ <16 x float> %k,
+ <16 x float> %v)
+ ret void
+}
+
+; Tests: see above
+; CHECK-LABEL: elim_vadd_scan1xNi:
+; CHECK-NOT: (xrf0) = vadd.scan
+define void @elim_vadd_scan1xNi(<16 x i1> %m, <16 x i32> %v) {
+ %a = call { <16 x i32>, <16 x i1> } @llvm.tpu.add.scan1xNi.v16i32(<16 x i1> %m, <16 x i32> %v)
+ ret void
+}
+
+; Tests: see above
+; CHECK-LABEL: elim_vadd_scan1xNf:
+; CHECK-NOT: (xrf0) = vadd.scan
+define void @elim_vadd_scan1xNf(<16 x i1> %m, <16 x float> %v) {
+ %a = call { <16 x float>, <16 x i1> } @llvm.tpu.add.scan1xNf.v16f32(<16 x i1> %m, <16 x float> %v)
+ ret void
+}
+
+; Tests: see above
+; CHECK-LABEL: elim_vadd_seg_scan1xNi:
+; CHECK-NOT: (xrf0) = vadd.seg.scan
+define void @elim_vadd_seg_scan1xNi(<16 x i1> %m, <16 x i32> %v) {
+ %a = call { <16 x i32>, <16 x i1> } @llvm.tpu.add.seg.scan1xNi.v16i32(<16 x i1> %m, <16 x i32> %v)
+ ret void
+}
+
+; Tests: see above
+; CHECK-LABEL: elim_vadd_seg_scan1xNf:
+; CHECK-NOT: (xrf0) = vadd.seg.scan
+define void @elim_vadd_seg_scan1xNf(<16 x i1> %m, <16 x float> %v) {
+ %a = call { <16 x float>, <16 x i1> } @llvm.tpu.add.seg.scan1xNf.v16f32(<16 x i1> %m, <16 x float> %v)
+ ret void
+}
+
+; CHECK-LABEL: vmin.seg.scan1xN.u32:
+; CHECK: (xrf0) = vmin.seg.scan.u32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x i32> @vmin.seg.scan1xN.u32(<16 x i1> %m, <16 x i32> %v) {
+ %a = call { <16 x i32>, <16 x i1> } @llvm.tpu.min.seg.scan1xNi.v16i32(<16 x i1> %m, <16 x i32> %v)
+ %po = extractvalue { <16 x i32>, <16 x i1> } %a, 0
+ %mo = extractvalue { <16 x i32>, <16 x i1> } %a, 1
+ %b = select <16 x i1> %mo, <16 x i32> %po, <16 x i32> %v
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vmin.seg.scan1xNf32:
+; CHECK: (xrf0) = vmin.seg.scan.f32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x float> @vmin.seg.scan1xNf32(<16 x i1> %m, <16 x float> %v) {
+ %a = call { <16 x float>, <16 x i1> } @llvm.tpu.min.seg.scan1xNf.v16f32(<16 x i1> %m, <16 x float> %v)
+ %po = extractvalue { <16 x float>, <16 x i1> } %a, 0
+ %mo = extractvalue { <16 x float>, <16 x i1> } %a, 1
+ %b = select <16 x i1> %mo, <16 x float> %po, <16 x float> %v
+ ret <16 x float> %b
+}
+
+; CHECK-LABEL: vmin.seg.index.scan1xN.u32:
+; CHECK: (xrf0) = vmin.seg.index.scan.u32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x i32> @vmin.seg.index.scan1xN.u32(<16 x i1> %m, <16 x i32> %v) {
+ %a = call { <16 x i32>, <16 x i1> } @llvm.tpu.min.seg.index.scan1xNi.v16i32(<16 x i1> %m, <16 x i32> %v)
+ %po = extractvalue { <16 x i32>, <16 x i1> } %a, 0
+ %mo = extractvalue { <16 x i32>, <16 x i1> } %a, 1
+ %b = select <16 x i1> %mo, <16 x i32> %po, <16 x i32> %v
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vmin.seg.index.scan1xNf32:
+; CHECK: (xrf0) = vmin.seg.index.scan.f32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x i32> @vmin.seg.index.scan1xNf32(<16 x i1> %m, <16 x float> %v, <16 x i32> %pp) {
+ %a = call { <16 x i32>, <16 x i1> } @llvm.tpu.min.seg.index.scan1xNf.v16f32(<16 x i1> %m, <16 x float> %v)
+ %po = extractvalue { <16 x i32>, <16 x i1> } %a, 0
+ %mo = extractvalue { <16 x i32>, <16 x i1> } %a, 1
+ %b = select <16 x i1> %mo, <16 x i32> %po, <16 x i32> %pp
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vmax.seg.scan1xN.u32:
+; CHECK: (xrf0) = vmax.seg.scan.u32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x i32> @vmax.seg.scan1xN.u32(<16 x i1> %m, <16 x i32> %v) {
+ %a = call { <16 x i32>, <16 x i1> } @llvm.tpu.max.seg.scan1xNi.v16i32(<16 x i1> %m, <16 x i32> %v)
+ %po = extractvalue { <16 x i32>, <16 x i1> } %a, 0
+ %mo = extractvalue { <16 x i32>, <16 x i1> } %a, 1
+ %b = select <16 x i1> %mo, <16 x i32> %po, <16 x i32> %v
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vmax.seg.scan1xNf32:
+; CHECK: (xrf0) = vmax.seg.scan.f32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x float> @vmax.seg.scan1xNf32(<16 x i1> %m, <16 x float> %v) {
+ %a = call { <16 x float>, <16 x i1> } @llvm.tpu.max.seg.scan1xNf.v16f32(<16 x i1> %m, <16 x float> %v)
+ %po = extractvalue { <16 x float>, <16 x i1> } %a, 0
+ %mo = extractvalue { <16 x float>, <16 x i1> } %a, 1
+ %b = select <16 x i1> %mo, <16 x float> %po, <16 x float> %v
+ ret <16 x float> %b
+}
+
+; CHECK-LABEL: vmax.seg.index.scan1xN.u32:
+; CHECK: (xrf0) = vmax.seg.index.scan.u32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x i32> @vmax.seg.index.scan1xN.u32(<16 x i1> %m, <16 x i32> %v) {
+ %a = call { <16 x i32>, <16 x i1> } @llvm.tpu.max.seg.index.scan1xNi.v16i32(<16 x i1> %m, <16 x i32> %v)
+ %po = extractvalue { <16 x i32>, <16 x i1> } %a, 0
+ %mo = extractvalue { <16 x i32>, <16 x i1> } %a, 1
+ %b = select <16 x i1> %mo, <16 x i32> %po, <16 x i32> %v
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vmax.seg.index.scan1xNf32:
+; CHECK: (xrf0) = vmax.seg.index.scan.f32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x i32> @vmax.seg.index.scan1xNf32(<16 x i1> %m, <16 x float> %v, <16 x i32> %pp) {
+ %a = call { <16 x i32>, <16 x i1> } @llvm.tpu.max.seg.index.scan1xNf.v16f32(<16 x i1> %m, <16 x float> %v)
+ %po = extractvalue { <16 x i32>, <16 x i1> } %a, 0
+ %mo = extractvalue { <16 x i32>, <16 x i1> } %a, 1
+ %b = select <16 x i1> %mo, <16 x i32> %po, <16 x i32> %pp
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vmin.scan1xN.msk.u32:
+; CHECK: (xrf0) = vmin.scan.msk.u32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x i32> @vmin.scan1xN.msk.u32(<16 x i1> %m, <16 x i32> %v) {
+ %a = call { <16 x i32>, <16 x i1> } @llvm.tpu.min.scan1xNi.v16i32(<16 x i1> %m, <16 x i32> %v)
+ %po = extractvalue { <16 x i32>, <16 x i1> } %a, 0
+ %mo = extractvalue { <16 x i32>, <16 x i1> } %a, 1
+ %b = select <16 x i1> %mo, <16 x i32> %po, <16 x i32> %v
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vmin.scan1xN.msk.f32:
+; CHECK: (xrf0) = vmin.scan.msk.f32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x float> @vmin.scan1xN.msk.f32(<16 x i1> %m, <16 x float> %v) {
+ %a = call { <16 x float>, <16 x i1> } @llvm.tpu.min.scan1xNf.v16f32(<16 x i1> %m, <16 x float> %v)
+ %po = extractvalue { <16 x float>, <16 x i1> } %a, 0
+ %mo = extractvalue { <16 x float>, <16 x i1> } %a, 1
+ %b = select <16 x i1> %mo, <16 x float> %po, <16 x float> %v
+ ret <16 x float> %b
+}
+
+; CHECK-LABEL: vmin.index.scan1xN.u32:
+; CHECK: (xrf0) = vmin.index.scan.u32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x i32> @vmin.index.scan1xN.u32(<16 x i1> %m, <16 x i32> %v) {
+ %a = call { <16 x i32>, <16 x i1> } @llvm.tpu.min.index.scan1xNi.v16i32(<16 x i1> %m, <16 x i32> %v)
+ %po = extractvalue { <16 x i32>, <16 x i1> } %a, 0
+ %mo = extractvalue { <16 x i32>, <16 x i1> } %a, 1
+ %b = select <16 x i1> %mo, <16 x i32> %po, <16 x i32> %v
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vmin.index.scan1xNf32:
+; CHECK: (xrf0) = vmin.index.scan.f32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x i32> @vmin.index.scan1xNf32(<16 x i1> %m, <16 x float> %v, <16 x i32> %pp) {
+ %a = call { <16 x i32>, <16 x i1> } @llvm.tpu.min.index.scan1xNf.v16f32(<16 x i1> %m, <16 x float> %v)
+ %po = extractvalue { <16 x i32>, <16 x i1> } %a, 0
+ %mo = extractvalue { <16 x i32>, <16 x i1> } %a, 1
+ %b = select <16 x i1> %mo, <16 x i32> %po, <16 x i32> %pp
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vmax.scan1xN.msk.u32:
+; CHECK: (xrf0) = vmax.scan.msk.u32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x i32> @vmax.scan1xN.msk.u32(<16 x i1> %m, <16 x i32> %v) {
+ %a = call { <16 x i32>, <16 x i1> } @llvm.tpu.max.scan1xNi.v16i32(<16 x i1> %m, <16 x i32> %v)
+ %po = extractvalue { <16 x i32>, <16 x i1> } %a, 0
+ %mo = extractvalue { <16 x i32>, <16 x i1> } %a, 1
+ %b = select <16 x i1> %mo, <16 x i32> %po, <16 x i32> %v
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vmax.scan1xN.msk.f32:
+; CHECK: (xrf0) = vmax.scan.msk.f32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x float> @vmax.scan1xN.msk.f32(<16 x i1> %m, <16 x float> %v) {
+ %a = call { <16 x float>, <16 x i1> } @llvm.tpu.max.scan1xNf.v16f32(<16 x i1> %m, <16 x float> %v)
+ %po = extractvalue { <16 x float>, <16 x i1> } %a, 0
+ %mo = extractvalue { <16 x float>, <16 x i1> } %a, 1
+ %b = select <16 x i1> %mo, <16 x float> %po, <16 x float> %v
+ ret <16 x float> %b
+}
+
+; CHECK-LABEL: vmax.index.scan1xN.u32:
+; CHECK: (xrf0) = vmax.index.scan.u32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x i32> @vmax.index.scan1xN.u32(<16 x i1> %m, <16 x i32> %v) {
+ %a = call { <16 x i32>, <16 x i1> } @llvm.tpu.max.index.scan1xNi.v16i32(<16 x i1> %m, <16 x i32> %v)
+ %po = extractvalue { <16 x i32>, <16 x i1> } %a, 0
+ %mo = extractvalue { <16 x i32>, <16 x i1> } %a, 1
+ %b = select <16 x i1> %mo, <16 x i32> %po, <16 x i32> %v
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vmax.index.scan1xNf32:
+; CHECK: (xrf0) = vmax.index.scan.f32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x i32> @vmax.index.scan1xNf32(<16 x i1> %m, <16 x float> %v, <16 x i32> %pp) {
+ %a = call { <16 x i32>, <16 x i1> } @llvm.tpu.max.index.scan1xNf.v16f32(<16 x i1> %m, <16 x float> %v)
+ %po = extractvalue { <16 x i32>, <16 x i1> } %a, 0
+ %mo = extractvalue { <16 x i32>, <16 x i1> } %a, 1
+ %b = select <16 x i1> %mo, <16 x i32> %po, <16 x i32> %pp
+ ret <16 x i32> %b
+}
+
+; Packed cross-lane instructions.
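+; These operate on <32 x bfloat> inputs (two bf16 values per 32-bit lane); tests
+; that keep a packed bf16 result widen the returned <16 x i1> mask with
+; @llvm.tpu.16i1.to.32i1 before the select.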
+declare { <32 x bfloat>, <16 x i1> } @llvm.tpu.add.half.scan2xNbf16.v32bf16.v16i1(<16 x i1>, <32 x bfloat>) readnone nounwind
+declare { <16 x float>, <16 x float>, <16 x i1> } @llvm.tpu.add.full.scan2xNbf16.v16f32.v32bf16(<16 x i1>, <32 x bfloat>) readnone nounwind
+declare { <32 x bfloat>, <16 x i1> } @llvm.tpu.min.scan2xNbf16.v32bf16.v16i1(<16 x i1>, <32 x bfloat>) readnone nounwind
+declare { <32 x bfloat>, <16 x i1> } @llvm.tpu.max.scan2xNbf16.v32bf16.v16i1(<16 x i1>, <32 x bfloat>) readnone nounwind
+declare { <16 x i32>, <16 x i1> } @llvm.tpu.min.index.scan2xNbf16.v16i32.v16i1.v32bf16(<16 x i1>, <32 x bfloat>) readnone nounwind
+declare { <16 x i32>, <16 x i1> } @llvm.tpu.max.index.scan2xNbf16.v16i32.v16i1.v32bf16(<16 x i1>, <32 x bfloat>) readnone nounwind
+declare { <32 x bfloat>, <16 x i1> } @llvm.tpu.add.half.seg.scan2xNbf16.v32bf16.v16i1(<16 x i1>, <32 x bfloat>) readnone nounwind
+declare { <16 x float>, <16 x float>, <16 x i1> } @llvm.tpu.add.full.seg.scan2xNbf16.v16f32.v32bf16(<16 x i1>, <32 x bfloat>) readnone nounwind
+declare { <32 x bfloat>, <16 x i1> } @llvm.tpu.min.seg.scan2xNbf16.v32bf16.v16i1(<16 x i1>, <32 x bfloat>) readnone nounwind
+declare { <32 x bfloat>, <16 x i1> } @llvm.tpu.max.seg.scan2xNbf16.v32bf16.v16i1(<16 x i1>, <32 x bfloat>) readnone nounwind
+declare { <16 x i32>, <16 x i1> } @llvm.tpu.min.seg.index.scan2xNbf16.v16i32.v16i1.v32bf16(<16 x i1>, <32 x bfloat>) readnone nounwind
+declare { <16 x i32>, <16 x i1> } @llvm.tpu.max.seg.index.scan2xNbf16.v16i32.v16i1.v32bf16(<16 x i1>, <32 x bfloat>) readnone nounwind
+declare <32 x i1> @llvm.tpu.16i1.to.32i1(<16 x i1>) readnone nounwind
+
+; CHECK-LABEL: vadd.half.scan2xNbf16:
+; CHECK: (xrf0) = vadd.scan.msk.bf16
+; CHECK: _ = sdelay $0x4
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <32 x bfloat> @vadd.half.scan2xNbf16(<16 x i1> %m, <32 x bfloat> %v) #0 {
+ %a = call { <32 x bfloat>, <16 x i1> } @llvm.tpu.add.half.scan2xNbf16.v32bf16.v16i1(<16 x i1> %m, <32 x bfloat> %v)
+ %po = extractvalue { <32 x bfloat>, <16 x i1> } %a, 0
+ %mo = extractvalue { <32 x bfloat>, <16 x i1> } %a, 1
+ %ms = call <32 x i1> @llvm.tpu.16i1.to.32i1(<16 x i1> %mo)
+ %b = select <32 x i1> %ms, <32 x bfloat> %po, <32 x bfloat> %v
+ ret <32 x bfloat> %b
+}
+
+; CHECK-LABEL: vadd.full.scan2xNbf16:
+; CHECK: (xrf0) = vadd.scan.msk.bf16.f32
+; CHECK: _ = sdelay $0x4
+; CHECK-NEXT: v{{[0-9]+}}, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x float> @vadd.full.scan2xNbf16(<16 x i1> %m, <32 x bfloat> %v) #0 {
+ %a = call { <16 x float>, <16 x float>, <16 x i1> } @llvm.tpu.add.full.scan2xNbf16.v16f32.v32bf16(<16 x i1> %m, <32 x bfloat> %v)
+ %po0 = extractvalue { <16 x float>, <16 x float>, <16 x i1> } %a, 0
+ %po1 = extractvalue { <16 x float>, <16 x float>, <16 x i1> } %a, 1
+ %mo = extractvalue { <16 x float>, <16 x float>, <16 x i1> } %a, 2
+ %b = select <16 x i1> %mo, <16 x float> %po0, <16 x float> %po1
+ ret <16 x float> %b
+}
+
+; CHECK-LABEL: vmin.scan2xNbf16:
+; CHECK: (xrf0) = vmin.scan.msk.bf16
+; CHECK: _ = sdelay $0x4
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <32 x bfloat> @vmin.scan2xNbf16(<16 x i1> %m, <32 x bfloat> %v) #0 {
+ %a = call { <32 x bfloat>, <16 x i1> } @llvm.tpu.min.scan2xNbf16.v32bf16.v16i1(<16 x i1> %m, <32 x bfloat> %v)
+ %po = extractvalue { <32 x bfloat>, <16 x i1> } %a, 0
+ %mo = extractvalue { <32 x bfloat>, <16 x i1> } %a, 1
+ %ms = call <32 x i1> @llvm.tpu.16i1.to.32i1(<16 x i1> %mo)
+ %b = select <32 x i1> %ms, <32 x bfloat> %po, <32 x bfloat> %v
+ ret <32 x bfloat> %b
+}
+
+; CHECK-LABEL: vmax.scan2xNbf16:
+; CHECK: (xrf0) = vmax.scan.msk.bf16
+; CHECK: _ = sdelay $0x4
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <32 x bfloat> @vmax.scan2xNbf16(<16 x i1> %m, <32 x bfloat> %v) #0 {
+ %a = call { <32 x bfloat>, <16 x i1> } @llvm.tpu.max.scan2xNbf16.v32bf16.v16i1(<16 x i1> %m, <32 x bfloat> %v)
+ %po = extractvalue { <32 x bfloat>, <16 x i1> } %a, 0
+ %mo = extractvalue { <32 x bfloat>, <16 x i1> } %a, 1
+ %ms = call <32 x i1> @llvm.tpu.16i1.to.32i1(<16 x i1> %mo)
+ %b = select <32 x i1> %ms, <32 x bfloat> %po, <32 x bfloat> %v
+ ret <32 x bfloat> %b
+}
+
+; CHECK-LABEL: vmin.index.scan2xNbf16:
+; CHECK: (xrf0) = vmin.index.scan.bf16
+; CHECK: _ = sdelay $0x4
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x i32> @vmin.index.scan2xNbf16(<16 x i1> %m, <32 x bfloat> %v, <16 x i32> %pp) #0 {
+ %a = call { <16 x i32>, <16 x i1> } @llvm.tpu.min.index.scan2xNbf16.v16i32.v16i1.v32bf16(<16 x i1> %m, <32 x bfloat> %v)
+ %po = extractvalue { <16 x i32>, <16 x i1> } %a, 0
+ %mo = extractvalue { <16 x i32>, <16 x i1> } %a, 1
+ %b = select <16 x i1> %mo, <16 x i32> %po, <16 x i32> %pp
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vmax.index.scan2xNbf16:
+; CHECK: (xrf0) = vmax.index.scan.bf16
+; CHECK: _ = sdelay $0x4
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x i32> @vmax.index.scan2xNbf16(<16 x i1> %m, <32 x bfloat> %v, <16 x i32> %pp) #0 {
+ %a = call { <16 x i32>, <16 x i1> } @llvm.tpu.max.index.scan2xNbf16.v16i32.v16i1.v32bf16(<16 x i1> %m, <32 x bfloat> %v)
+ %po = extractvalue { <16 x i32>, <16 x i1> } %a, 0
+ %mo = extractvalue { <16 x i32>, <16 x i1> } %a, 1
+ %b = select <16 x i1> %mo, <16 x i32> %po, <16 x i32> %pp
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vadd.half.seg.scan2xNbf16:
+; CHECK: (xrf0) = vadd.seg.scan.bf16
+; CHECK: _ = sdelay $0x4
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <32 x bfloat> @vadd.half.seg.scan2xNbf16(<16 x i1> %m, <32 x bfloat> %v) #0 {
+ %a = call { <32 x bfloat>, <16 x i1> } @llvm.tpu.add.half.seg.scan2xNbf16.v32bf16.v16i1(<16 x i1> %m, <32 x bfloat> %v)
+ %po = extractvalue { <32 x bfloat>, <16 x i1> } %a, 0
+ %mo = extractvalue { <32 x bfloat>, <16 x i1> } %a, 1
+ %ms = call <32 x i1> @llvm.tpu.16i1.to.32i1(<16 x i1> %mo)
+ %b = select <32 x i1> %ms, <32 x bfloat> %po, <32 x bfloat> %v
+ ret <32 x bfloat> %b
+}
+
+; CHECK-LABEL: vadd.full.seg.scan2xNbf16:
+; CHECK: (xrf0) = vadd.seg.scan.bf16.f32
+; CHECK: _ = sdelay $0x4
+; CHECK-NEXT: v{{[0-9]+}}, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x float> @vadd.full.seg.scan2xNbf16(<16 x i1> %m, <32 x bfloat> %v) #0 {
+ %a = call { <16 x float>, <16 x float>, <16 x i1> } @llvm.tpu.add.full.seg.scan2xNbf16.v16f32.v32bf16(<16 x i1> %m, <32 x bfloat> %v)
+ %po0 = extractvalue { <16 x float>, <16 x float>, <16 x i1> } %a, 0
+ %po1 = extractvalue { <16 x float>, <16 x float>, <16 x i1> } %a, 1
+ %mo = extractvalue { <16 x float>, <16 x float>, <16 x i1> } %a, 2
+ %b = select <16 x i1> %mo, <16 x float> %po0, <16 x float> %po1
+ ret <16 x float> %b
+}
+
+; CHECK-LABEL: vmin.seg.scan2xNbf16:
+; CHECK: (xrf0) = vmin.seg.scan.bf16
+; CHECK: _ = sdelay $0x4
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <32 x bfloat> @vmin.seg.scan2xNbf16(<16 x i1> %m, <32 x bfloat> %v) #0 {
+ %a = call { <32 x bfloat>, <16 x i1> } @llvm.tpu.min.seg.scan2xNbf16.v32bf16.v16i1(<16 x i1> %m, <32 x bfloat> %v)
+ %po = extractvalue { <32 x bfloat>, <16 x i1> } %a, 0
+ %mo = extractvalue { <32 x bfloat>, <16 x i1> } %a, 1
+ %ms = call <32 x i1> @llvm.tpu.16i1.to.32i1(<16 x i1> %mo)
+ %b = select <32 x i1> %ms, <32 x bfloat> %po, <32 x bfloat> %v
+ ret <32 x bfloat> %b
+}
+
+; CHECK-LABEL: vmax.seg.scan2xNbf16:
+; CHECK: (xrf0) = vmax.seg.scan.bf16
+; CHECK: _ = sdelay $0x4
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <32 x bfloat> @vmax.seg.scan2xNbf16(<16 x i1> %m, <32 x bfloat> %v) #0 {
+ %a = call { <32 x bfloat>, <16 x i1> } @llvm.tpu.max.seg.scan2xNbf16.v32bf16.v16i1(<16 x i1> %m, <32 x bfloat> %v)
+ %po = extractvalue { <32 x bfloat>, <16 x i1> } %a, 0
+ %mo = extractvalue { <32 x bfloat>, <16 x i1> } %a, 1
+ %ms = call <32 x i1> @llvm.tpu.16i1.to.32i1(<16 x i1> %mo)
+ %b = select <32 x i1> %ms, <32 x bfloat> %po, <32 x bfloat> %v
+ ret <32 x bfloat> %b
+}
+
+; CHECK-LABEL: vmin.seg.index.scan2xNbf16:
+; CHECK: (xrf0) = vmin.seg.index.scan.bf16
+; CHECK: _ = sdelay $0x4
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x i32> @vmin.seg.index.scan2xNbf16(<16 x i1> %m, <32 x bfloat> %v, <16 x i32> %pp) #0 {
+ %a = call { <16 x i32>, <16 x i1> } @llvm.tpu.min.seg.index.scan2xNbf16.v16i32.v16i1.v32bf16(<16 x i1> %m, <32 x bfloat> %v)
+ %po = extractvalue { <16 x i32>, <16 x i1> } %a, 0
+ %mo = extractvalue { <16 x i32>, <16 x i1> } %a, 1
+ %b = select <16 x i1> %mo, <16 x i32> %po, <16 x i32> %pp
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vmax.seg.index.scan2xNbf16:
+; CHECK: (xrf0) = vmax.seg.index.scan.bf16
+; CHECK: _ = sdelay $0x4
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x i32> @vmax.seg.index.scan2xNbf16(<16 x i1> %m, <32 x bfloat> %v, <16 x i32> %pp) #0 {
+ %a = call { <16 x i32>, <16 x i1> } @llvm.tpu.max.seg.index.scan2xNbf16.v16i32.v16i1.v32bf16(<16 x i1> %m, <32 x bfloat> %v)
+ %po = extractvalue { <16 x i32>, <16 x i1> } %a, 0
+ %mo = extractvalue { <16 x i32>, <16 x i1> } %a, 1
+ %b = select <16 x i1> %mo, <16 x i32> %po, <16 x i32> %pp
+ ret <16 x i32> %b
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/cross_lane_gl_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/cross_lane_gl_sc.ll
new file mode 100644
index 0000000..5606fcf
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/cross_lane_gl_sc.ll
@@ -0,0 +1,233 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-gl -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; Ghostlite specific cross-lane instructions.
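+; Unlike the generic tests, the _laneids variants below also read result 0 of
+; the returned struct, so both vector outputs of the vpop are live.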
+declare { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.uniquei.v8i32(<8 x i1>, <8 x i32>) readnone nounwind
+declare { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.uniquef.v8f32(<8 x i1>, <8 x float>) readnone nounwind
+declare { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.dupcnti.v8i32(<8 x i1>, <8 x i32>) readnone nounwind
+declare { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.dupcntf.v8f32(<8 x i1>, <8 x float>) readnone nounwind
+
+; Packed cross-lane instructions.
+declare { <16 x bfloat>, <8 x i1> } @llvm.tpu.add.half.scan2xNbf16.v16bf16.v8i1(<8 x i1>, <16 x bfloat>) readnone nounwind
+declare { <8 x float>, <8 x float>, <8 x i1> } @llvm.tpu.add.full.scan2xNbf16.v8f32.v16bf16(<8 x i1>, <16 x bfloat>) readnone nounwind
+declare { <16 x bfloat>, <8 x i1> } @llvm.tpu.min.scan2xNbf16.v16bf16.v8i1(<8 x i1>, <16 x bfloat>) readnone nounwind
+declare { <16 x bfloat>, <8 x i1> } @llvm.tpu.max.scan2xNbf16.v16bf16.v8i1(<8 x i1>, <16 x bfloat>) readnone nounwind
+declare { <8 x i32>, <8 x i1> } @llvm.tpu.min.index.scan2xNbf16.v8i32.v8i1.v16bf16(<8 x i1>, <16 x bfloat>) readnone nounwind
+declare { <8 x i32>, <8 x i1> } @llvm.tpu.max.index.scan2xNbf16.v8i32.v8i1.v16bf16(<8 x i1>, <16 x bfloat>) readnone nounwind
+declare { <16 x bfloat>, <8 x i1> } @llvm.tpu.add.half.seg.scan2xNbf16.v16bf16.v8i1(<8 x i1>, <16 x bfloat>) readnone nounwind
+declare { <8 x float>, <8 x float>, <8 x i1> } @llvm.tpu.add.full.seg.scan2xNbf16.v8f32.v16bf16(<8 x i1>, <16 x bfloat>) readnone nounwind
+declare { <16 x bfloat>, <8 x i1> } @llvm.tpu.min.seg.scan2xNbf16.v16bf16.v8i1(<8 x i1>, <16 x bfloat>) readnone nounwind
+declare { <16 x bfloat>, <8 x i1> } @llvm.tpu.max.seg.scan2xNbf16.v16bf16.v8i1(<8 x i1>, <16 x bfloat>) readnone nounwind
+declare { <8 x i32>, <8 x i1> } @llvm.tpu.min.seg.index.scan2xNbf16.v8i32.v8i1.v16bf16(<8 x i1>, <16 x bfloat>) readnone nounwind
+declare { <8 x i32>, <8 x i1> } @llvm.tpu.max.seg.index.scan2xNbf16.v8i32.v8i1.v16bf16(<8 x i1>, <16 x bfloat>) readnone nounwind
+declare <16 x i1> @llvm.tpu.8i1.to.16i1(<8 x i1>) readnone nounwind
+
+; CHECK-LABEL: vuniquei_laneids:
+; CHECK: { (xrf1) = vunique.msk.u32 vm0, v0;
+; CHECK-NEXT: _ = sdelay $0x7 }
+; CHECK-NEXT: { v{{[0-9]+}}, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf1)
+define <8 x i32> @vuniquei_laneids(<8 x i1> %m, <8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.uniquei.v8i32(<8 x i1> %m, <8 x i32> %v)
+ %ids = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 0
+ %co = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 1
+ %mo = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x i32> %ids, <8 x i32> %co
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vuniquef_laneids:
+; CHECK: { (xrf1) = vunique.msk.f32 vm0, v0;
+; CHECK-NEXT: _ = sdelay $0x7 }
+; CHECK-NEXT: { v{{[0-9]+}}, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf1)
+define <8 x i32> @vuniquef_laneids(<8 x i1> %m, <8 x float> %v) {
+ %a = call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.uniquef.v8f32(<8 x i1> %m, <8 x float> %v)
+ %ids = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 0
+ %co = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 1
+ %mo = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x i32> %ids, <8 x i32> %co
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: dupcnti_laneids:
+; CHECK: { (xrf1) = vdupcnt.msk.u32 vm0, v0;
+; CHECK-NEXT: _ = sdelay $0x7 }
+; CHECK-NEXT: { v{{[0-9]+}}, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf1)
+define <8 x i32> @dupcnti_laneids(<8 x i1> %m, <8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.dupcnti.v8i32(<8 x i1> %m, <8 x i32> %v)
+ %ids = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 0
+ %co = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 1
+ %mo = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x i32> %ids, <8 x i32> %co
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: dupcntf_laneids:
+; CHECK: { (xrf1) = vdupcnt.msk.f32 vm0, v0;
+; CHECK-NEXT: _ = sdelay $0x7 }
+; CHECK-NEXT: { v{{[0-9]+}}, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf1)
+define <8 x i32> @dupcntf_laneids(<8 x i1> %m, <8 x float> %v) {
+ %a = call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.dupcntf.v8f32(<8 x i1> %m, <8 x float> %v)
+ %ids = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 0
+ %co = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 1
+ %mo = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x i32> %ids, <8 x i32> %co
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vadd.half.scan2xNbf16:
+; CHECK: (xrf0) = vadd.scan.msk.bf16
+; CHECK: _ = sdelay $0x4
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x bfloat> @vadd.half.scan2xNbf16(<8 x i1> %m, <16 x bfloat> %v) #0 {
+ %a = call { <16 x bfloat>, <8 x i1> } @llvm.tpu.add.half.scan2xNbf16.v16bf16.v8i1(<8 x i1> %m, <16 x bfloat> %v)
+ %po = extractvalue { <16 x bfloat>, <8 x i1> } %a, 0
+ %mo = extractvalue { <16 x bfloat>, <8 x i1> } %a, 1
+ %ms = call <16 x i1> @llvm.tpu.8i1.to.16i1(<8 x i1> %mo)
+ %b = select <16 x i1> %ms, <16 x bfloat> %po, <16 x bfloat> %v
+ ret <16 x bfloat> %b
+}
+
+; CHECK-LABEL: vadd.full.scan2xNbf16:
+; CHECK: (xrf0) = vadd.scan.msk.bf16.f32
+; CHECK: _ = sdelay $0x4
+; CHECK-NEXT: v{{[0-9]+}}, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf0)
+define <8 x float> @vadd.full.scan2xNbf16(<8 x i1> %m, <16 x bfloat> %v) #0 {
+ %a = call { <8 x float>, <8 x float>, <8 x i1> } @llvm.tpu.add.full.scan2xNbf16.v8f32.v16bf16(<8 x i1> %m, <16 x bfloat> %v)
+ %po0 = extractvalue { <8 x float>, <8 x float>, <8 x i1> } %a, 0
+ %po1 = extractvalue { <8 x float>, <8 x float>, <8 x i1> } %a, 1
+ %mo = extractvalue { <8 x float>, <8 x float>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x float> %po0, <8 x float> %po1
+ ret <8 x float> %b
+}
+
+; CHECK-LABEL: vmin.scan2xNbf16:
+; CHECK: (xrf0) = vmin.scan.msk.bf16
+; CHECK: _ = sdelay $0x4
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x bfloat> @vmin.scan2xNbf16(<8 x i1> %m, <16 x bfloat> %v) #0 {
+ %a = call { <16 x bfloat>, <8 x i1> } @llvm.tpu.min.scan2xNbf16.v16bf16.v8i1(<8 x i1> %m, <16 x bfloat> %v)
+ %po = extractvalue { <16 x bfloat>, <8 x i1> } %a, 0
+ %mo = extractvalue { <16 x bfloat>, <8 x i1> } %a, 1
+ %ms = call <16 x i1> @llvm.tpu.8i1.to.16i1(<8 x i1> %mo)
+ %b = select <16 x i1> %ms, <16 x bfloat> %po, <16 x bfloat> %v
+ ret <16 x bfloat> %b
+}
+
+; CHECK-LABEL: vmax.scan2xNbf16:
+; CHECK: (xrf0) = vmax.scan.msk.bf16
+; CHECK: _ = sdelay $0x4
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x bfloat> @vmax.scan2xNbf16(<8 x i1> %m, <16 x bfloat> %v) #0 {
+ %a = call { <16 x bfloat>, <8 x i1> } @llvm.tpu.max.scan2xNbf16.v16bf16.v8i1(<8 x i1> %m, <16 x bfloat> %v)
+ %po = extractvalue { <16 x bfloat>, <8 x i1> } %a, 0
+ %mo = extractvalue { <16 x bfloat>, <8 x i1> } %a, 1
+ %ms = call <16 x i1> @llvm.tpu.8i1.to.16i1(<8 x i1> %mo)
+ %b = select <16 x i1> %ms, <16 x bfloat> %po, <16 x bfloat> %v
+ ret <16 x bfloat> %b
+}
+
+; CHECK-LABEL: vmin.index.scan2xNbf16:
+; CHECK: (xrf0) = vmin.index.scan.bf16
+; CHECK: _ = sdelay $0x4
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <8 x i32> @vmin.index.scan2xNbf16(<8 x i1> %m, <16 x bfloat> %v, <8 x i32> %pp) #0 {
+ %a = call { <8 x i32>, <8 x i1> } @llvm.tpu.min.index.scan2xNbf16.v8i32.v8i1.v16bf16(<8 x i1> %m, <16 x bfloat> %v)
+ %po = extractvalue { <8 x i32>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x i32>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x i32> %po, <8 x i32> %pp
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vmax.index.scan2xNbf16:
+; CHECK: (xrf0) = vmax.index.scan.bf16
+; CHECK: _ = sdelay $0x4
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <8 x i32> @vmax.index.scan2xNbf16(<8 x i1> %m, <16 x bfloat> %v, <8 x i32> %pp) #0 {
+ %a = call { <8 x i32>, <8 x i1> } @llvm.tpu.max.index.scan2xNbf16.v8i32.v8i1.v16bf16(<8 x i1> %m, <16 x bfloat> %v)
+ %po = extractvalue { <8 x i32>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x i32>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x i32> %po, <8 x i32> %pp
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vadd.half.seg.scan2xNbf16:
+; CHECK: (xrf0) = vadd.seg.scan.bf16
+; CHECK: _ = sdelay $0x4
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x bfloat> @vadd.half.seg.scan2xNbf16(<8 x i1> %m, <16 x bfloat> %v) #0 {
+ %a = call { <16 x bfloat>, <8 x i1> } @llvm.tpu.add.half.seg.scan2xNbf16.v16bf16.v8i1(<8 x i1> %m, <16 x bfloat> %v)
+ %po = extractvalue { <16 x bfloat>, <8 x i1> } %a, 0
+ %mo = extractvalue { <16 x bfloat>, <8 x i1> } %a, 1
+ %ms = call <16 x i1> @llvm.tpu.8i1.to.16i1(<8 x i1> %mo)
+ %b = select <16 x i1> %ms, <16 x bfloat> %po, <16 x bfloat> %v
+ ret <16 x bfloat> %b
+}
+
+; CHECK-LABEL: vadd.full.seg.scan2xNbf16:
+; CHECK: (xrf0) = vadd.seg.scan.bf16.f32
+; CHECK: _ = sdelay $0x4
+; CHECK-NEXT: v{{[0-9]+}}, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf0)
+define <8 x float> @vadd.full.seg.scan2xNbf16(<8 x i1> %m, <16 x bfloat> %v) #0 {
+ %a = call { <8 x float>, <8 x float>, <8 x i1> } @llvm.tpu.add.full.seg.scan2xNbf16.v8f32.v16bf16(<8 x i1> %m, <16 x bfloat> %v)
+ %po0 = extractvalue { <8 x float>, <8 x float>, <8 x i1> } %a, 0
+ %po1 = extractvalue { <8 x float>, <8 x float>, <8 x i1> } %a, 1
+ %mo = extractvalue { <8 x float>, <8 x float>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x float> %po0, <8 x float> %po1
+ ret <8 x float> %b
+}
+
+; CHECK-LABEL: vmin.seg.scan2xNbf16:
+; CHECK: (xrf0) = vmin.seg.scan.bf16
+; CHECK: _ = sdelay $0x4
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x bfloat> @vmin.seg.scan2xNbf16(<8 x i1> %m, <16 x bfloat> %v) #0 {
+ %a = call { <16 x bfloat>, <8 x i1> } @llvm.tpu.min.seg.scan2xNbf16.v16bf16.v8i1(<8 x i1> %m, <16 x bfloat> %v)
+ %po = extractvalue { <16 x bfloat>, <8 x i1> } %a, 0
+ %mo = extractvalue { <16 x bfloat>, <8 x i1> } %a, 1
+ %ms = call <16 x i1> @llvm.tpu.8i1.to.16i1(<8 x i1> %mo)
+ %b = select <16 x i1> %ms, <16 x bfloat> %po, <16 x bfloat> %v
+ ret <16 x bfloat> %b
+}
+
+; CHECK-LABEL: vmax.seg.scan2xNbf16:
+; CHECK: (xrf0) = vmax.seg.scan.bf16
+; CHECK: _ = sdelay $0x4
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <16 x bfloat> @vmax.seg.scan2xNbf16(<8 x i1> %m, <16 x bfloat> %v) #0 {
+ %a = call { <16 x bfloat>, <8 x i1> } @llvm.tpu.max.seg.scan2xNbf16.v16bf16.v8i1(<8 x i1> %m, <16 x bfloat> %v)
+ %po = extractvalue { <16 x bfloat>, <8 x i1> } %a, 0
+ %mo = extractvalue { <16 x bfloat>, <8 x i1> } %a, 1
+ %ms = call <16 x i1> @llvm.tpu.8i1.to.16i1(<8 x i1> %mo)
+ %b = select <16 x i1> %ms, <16 x bfloat> %po, <16 x bfloat> %v
+ ret <16 x bfloat> %b
+}
+
+; CHECK-LABEL: vmin.seg.index.scan2xNbf16:
+; CHECK: (xrf0) = vmin.seg.index.scan.bf16
+; CHECK: _ = sdelay $0x4
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <8 x i32> @vmin.seg.index.scan2xNbf16(<8 x i1> %m, <16 x bfloat> %v, <8 x i32> %pp) #0 {
+ %a = call { <8 x i32>, <8 x i1> } @llvm.tpu.min.seg.index.scan2xNbf16.v8i32.v8i1.v16bf16(<8 x i1> %m, <16 x bfloat> %v)
+ %po = extractvalue { <8 x i32>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x i32>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x i32> %po, <8 x i32> %pp
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vmax.seg.index.scan2xNbf16:
+; CHECK: (xrf0) = vmax.seg.index.scan.bf16
+; CHECK: _ = sdelay $0x4
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <8 x i32> @vmax.seg.index.scan2xNbf16(<8 x i1> %m, <16 x bfloat> %v, <8 x i32> %pp) #0 {
+ %a = call { <8 x i32>, <8 x i1> } @llvm.tpu.max.seg.index.scan2xNbf16.v8i32.v8i1.v16bf16(<8 x i1> %m, <16 x bfloat> %v)
+ %po = extractvalue { <8 x i32>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x i32>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x i32> %po, <8 x i32> %pp
+ ret <8 x i32> %b
+}
+
+attributes #0 = { "implicit-section-name"=".text.tile_execute" "target-cpu"="sparsecore-tec-gl" }
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/cross_lane_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/cross_lane_sc.ll
new file mode 100644
index 0000000..8d9d6b0
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/cross_lane_sc.ll
@@ -0,0 +1,550 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+; RUN: llc < %s -mcpu=sparsecore-tec-gl -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.dupcnti.v8i32(<8 x i1>, <8 x i32>) readnone nounwind
+declare { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.dupcntf.v8f32(<8 x i1>, <8 x float>) readnone nounwind
+declare { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.uniquei.v8i32(<8 x i1>, <8 x i32>) readnone nounwind
+declare { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.uniquef.v8f32(<8 x i1>, <8 x float>) readnone nounwind
+declare { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32.v8i32(<8 x i1>, <8 x i32>, <8 x i32>) readnone nounwind
+declare { <8 x i32>, <8 x float>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32.v8f32(<8 x i1>, <8 x i32>, <8 x float>) readnone nounwind
+declare { <8 x float>, <8 x float>, <8 x i1> } @llvm.tpu.sort.ascdf.v8f32.v8f32(<8 x i1>, <8 x float>, <8 x float>) readnone nounwind
+declare { <8 x float>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdf.v8f32.v8i32(<8 x i1>, <8 x float>, <8 x i32>) readnone nounwind
+declare { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.dscdi.v8i32.v8i32(<8 x i1>, <8 x i32>, <8 x i32>) readnone nounwind
+declare { <8 x i32>, <8 x float>, <8 x i1> } @llvm.tpu.sort.dscdi.v8i32.v8f32(<8 x i1>, <8 x i32>, <8 x float>) readnone nounwind
+declare { <8 x float>, <8 x float>, <8 x i1> } @llvm.tpu.sort.dscdf.v8f32.v8f32(<8 x i1>, <8 x float>, <8 x float>) readnone nounwind
+declare { <8 x float>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.dscdf.v8f32.v8i32(<8 x i1>, <8 x float>, <8 x i32>) readnone nounwind
+declare { <8 x i32>, <8 x i1> } @llvm.tpu.add.scan1xNi.v8i32(<8 x i1>, <8 x i32>) readnone nounwind
+declare { <8 x float>, <8 x i1> } @llvm.tpu.add.scan1xNf.v8f32(<8 x i1>, <8 x float>) readnone nounwind
+declare { <8 x i32>, <8 x i1> } @llvm.tpu.add.seg.scan1xNi.v8i32(<8 x i1>, <8 x i32>) readnone nounwind
+declare { <8 x float>, <8 x i1> } @llvm.tpu.add.seg.scan1xNf.v8f32(<8 x i1>, <8 x float>) readnone nounwind
+declare { <8 x float>, <8 x i1> } @llvm.tpu.deprecated.segreduce.addf(<8 x i32>, <8 x float>) readnone nounwind
+declare { <8 x i32>, <8 x i1> } @llvm.tpu.min.seg.scan1xNi.v8i32(<8 x i1>, <8 x i32>) readnone nounwind
+declare { <8 x float>, <8 x i1> } @llvm.tpu.min.seg.scan1xNf.v8f32(<8 x i1>, <8 x float>) readnone nounwind
+declare { <8 x i32>, <8 x i1> } @llvm.tpu.min.seg.index.scan1xNi.v8i32(<8 x i1>, <8 x i32>) readnone nounwind
+declare { <8 x i32>, <8 x i1> } @llvm.tpu.min.seg.index.scan1xNf.v8f32(<8 x i1>, <8 x float>) readnone nounwind
+declare { <8 x i32>, <8 x i1> } @llvm.tpu.max.seg.scan1xNi.v8i32(<8 x i1>, <8 x i32>) readnone nounwind
+declare { <8 x float>, <8 x i1> } @llvm.tpu.max.seg.scan1xNf.v8f32(<8 x i1>, <8 x float>) readnone nounwind
+declare { <8 x i32>, <8 x i1> } @llvm.tpu.max.seg.index.scan1xNi.v8i32(<8 x i1>, <8 x i32>) readnone nounwind
+declare { <8 x i32>, <8 x i1> } @llvm.tpu.max.seg.index.scan1xNf.v8f32(<8 x i1>, <8 x float>) readnone nounwind
+declare { <8 x i32>, <8 x i1> } @llvm.tpu.min.scan1xNi.v8i32(<8 x i1>, <8 x i32>) readnone nounwind
+declare { <8 x float>, <8 x i1> } @llvm.tpu.min.scan1xNf.v8f32(<8 x i1>, <8 x float>) readnone nounwind
+declare { <8 x i32>, <8 x i1> } @llvm.tpu.min.index.scan1xNi.v8i32(<8 x i1>, <8 x i32>) readnone nounwind
+declare { <8 x i32>, <8 x i1> } @llvm.tpu.min.index.scan1xNf.v8f32(<8 x i1>, <8 x float>) readnone nounwind
+declare { <8 x i32>, <8 x i1> } @llvm.tpu.max.scan1xNi.v8i32(<8 x i1>, <8 x i32>) readnone nounwind
+declare { <8 x float>, <8 x i1> } @llvm.tpu.max.scan1xNf.v8f32(<8 x i1>, <8 x float>) readnone nounwind
+declare { <8 x i32>, <8 x i1> } @llvm.tpu.max.index.scan1xNi.v8i32(<8 x i1>, <8 x i32>) readnone nounwind
+declare { <8 x i32>, <8 x i1> } @llvm.tpu.max.index.scan1xNf.v8f32(<8 x i1>, <8 x float>) readnone nounwind
+
+; We deliberately make this a change-detector test because we really care that
+; there are 7 nops and one pop per cycle.
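+; (The seven delay slots show up as the `_ = sdelay $0x7` between the push into
+; xrf1 and the vpop checked below.)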
+; CHECK-LABEL: vdupcnti:
+; CHECK: { (xrf1) = vdupcnt.msk.u32 vm0, v0;
+; CHECK-NEXT: _ = sdelay $0x7 }
+; CHECK-NEXT: { _, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf1)
+define <8 x i32> @vdupcnti(<8 x i1> %m, <8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.dupcnti.v8i32(<8 x i1> %m, <8 x i32> %v)
+ %co = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 1
+ %mo = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x i32> %co, <8 x i32> %v
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vdupcntf:
+; CHECK: { (xrf1) = vdupcnt.msk.f32 vm0, v0;
+; CHECK-NEXT: _ = sdelay $0x7 }
+; CHECK-NEXT: { _, v{{[0-9]+}}, _ = vpop (xrf1)
+define <8 x i32> @vdupcntf(<8 x i1> %m, <8 x float> %v) {
+ %a = call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.dupcntf.v8f32(<8 x i1> %m, <8 x float> %v)
+ %co = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 1
+ ret <8 x i32> %co
+}
+
+; CHECK-LABEL: vuniquei:
+; CHECK: { (xrf1) = vunique.msk.u32 vm0, v0;
+; CHECK-NEXT: _ = sdelay $0x7 }
+; CHECK-NEXT: { _, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf1)
+define <8 x i32> @vuniquei(<8 x i1> %m, <8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.uniquei.v8i32(<8 x i1> %m, <8 x i32> %v)
+ %co = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 1
+ %mo = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x i32> %co, <8 x i32> %v
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vuniquef:
+; CHECK: { (xrf1) = vunique.msk.f32 vm0, v0;
+; CHECK-NEXT: _ = sdelay $0x7 }
+; CHECK-NEXT: { _, v{{[0-9]+}}, _ = vpop (xrf1)
+define <8 x i32> @vuniquef(<8 x i1> %m, <8 x float> %v) {
+ %a = call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.uniquef.v8f32(<8 x i1> %m, <8 x float> %v)
+ %co = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 1
+ ret <8 x i32> %co
+}
+
+; CHECK-LABEL: vsort.ascd.i.u32:
+; CHECK: { (xrf1) = vsort.ascd.msk.u32 vm0, v0, v1;
+; CHECK-NEXT: _ = sdelay $0x7 }
+; CHECK-NEXT: { v0, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf1) }
+define <8 x i32> @vsort.ascd.i.u32(<8 x i1> %m, <8 x i32> %k, <8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32.v8i32(<8 x i1> %m, <8 x i32> %k, <8 x i32> %v)
+ %ko = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 0
+ %vo = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 1
+ %mo = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x i32> %ko, <8 x i32> %vo
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vsort.ascd.f.u32:
+; CHECK: { (xrf1) = vsort.ascd.msk.u32 vm0, v0, v1;
+; CHECK-NEXT: _ = sdelay $0x7 }
+; CHECK-NEXT: { v0, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf1) }
+define <8 x i32> @vsort.ascd.f.u32(<8 x i1> %m, <8 x i32> %k, <8 x float> %v) {
+ %a = call { <8 x i32>, <8 x float>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32.v8f32(<8 x i1> %m, <8 x i32> %k, <8 x float> %v)
+ %ko = extractvalue { <8 x i32>, <8 x float>, <8 x i1> } %a, 0
+ %vo = extractvalue { <8 x i32>, <8 x float>, <8 x i1> } %a, 1
+ %voi = bitcast <8 x float> %vo to <8 x i32>
+ %mo = extractvalue { <8 x i32>, <8 x float>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x i32> %ko, <8 x i32> %voi
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vsort.ascd.f.f32:
+; CHECK: { (xrf1) = vsort.ascd.msk.f32 vm0, v0, v1;
+; CHECK-NEXT: _ = sdelay $0x7 }
+; CHECK-NEXT: { v0, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf1) }
+define <8 x float> @vsort.ascd.f.f32(<8 x i1> %m, <8 x float> %k, <8 x float> %v) {
+ %a = call { <8 x float>, <8 x float>, <8 x i1> } @llvm.tpu.sort.ascdf.v8f32.v8f32(<8 x i1> %m, <8 x float> %k, <8 x float> %v)
+ %ko = extractvalue { <8 x float>, <8 x float>, <8 x i1> } %a, 0
+ %vo = extractvalue { <8 x float>, <8 x float>, <8 x i1> } %a, 1
+ %mo = extractvalue { <8 x float>, <8 x float>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x float> %ko, <8 x float> %vo
+ ret <8 x float> %b
+}
+
+; CHECK-LABEL: vsort.ascd.i.f32:
+; CHECK: { (xrf1) = vsort.ascd.msk.f32 vm0, v0, v1;
+; CHECK-NEXT: _ = sdelay $0x7 }
+; CHECK-NEXT: { v0, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf1) }
+define <8 x float> @vsort.ascd.i.f32(<8 x i1> %m, <8 x float> %k, <8 x i32> %v) {
+ %a = call { <8 x float>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdf.v8f32.v8i32(<8 x i1> %m, <8 x float> %k, <8 x i32> %v)
+ %ko = extractvalue { <8 x float>, <8 x i32>, <8 x i1> } %a, 0
+ %vo = extractvalue { <8 x float>, <8 x i32>, <8 x i1> } %a, 1
+ %vof = bitcast <8 x i32> %vo to <8 x float>
+ %mo = extractvalue { <8 x float>, <8 x i32>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x float> %ko, <8 x float> %vof
+ ret <8 x float> %b
+}
+
+; CHECK-LABEL: vsort.dscd.i.u32:
+; CHECK: { (xrf1) = vsort.dscd.msk.u32 vm0, v0, v1;
+; CHECK-NEXT: _ = sdelay $0x7 }
+; CHECK-NEXT: { v0, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf1) }
+define <8 x i32> @vsort.dscd.i.u32(<8 x i1> %m, <8 x i32> %k, <8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.dscdi.v8i32.v8i32(<8 x i1> %m, <8 x i32> %k, <8 x i32> %v)
+ %ko = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 0
+ %vo = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 1
+ %mo = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x i32> %ko, <8 x i32> %vo
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vsort.dscd.f.u32:
+; CHECK: { (xrf1) = vsort.dscd.msk.u32 vm0, v0, v1;
+; CHECK-NEXT: _ = sdelay $0x7 }
+; CHECK-NEXT: { v0, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf1) }
+define <8 x i32> @vsort.dscd.f.u32(<8 x i1> %m, <8 x i32> %k, <8 x float> %v) {
+ %a = call { <8 x i32>, <8 x float>, <8 x i1> } @llvm.tpu.sort.dscdi.v8i32.v8f32(<8 x i1> %m, <8 x i32> %k, <8 x float> %v)
+ %ko = extractvalue { <8 x i32>, <8 x float>, <8 x i1> } %a, 0
+ %vo = extractvalue { <8 x i32>, <8 x float>, <8 x i1> } %a, 1
+ %voi = bitcast <8 x float> %vo to <8 x i32>
+ %mo = extractvalue { <8 x i32>, <8 x float>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x i32> %ko, <8 x i32> %voi
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vsort.dscd.f.f32:
+; CHECK: { (xrf1) = vsort.dscd.msk.f32 vm0, v0, v1;
+; CHECK-NEXT: _ = sdelay $0x7 }
+; CHECK-NEXT: { v0, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf1) }
+define <8 x float> @vsort.dscd.f.f32(<8 x i1> %m, <8 x float> %k, <8 x float> %v) {
+ %a = call { <8 x float>, <8 x float>, <8 x i1> } @llvm.tpu.sort.dscdf.v8f32.v8f32(<8 x i1> %m, <8 x float> %k, <8 x float> %v)
+ %ko = extractvalue { <8 x float>, <8 x float>, <8 x i1> } %a, 0
+ %vo = extractvalue { <8 x float>, <8 x float>, <8 x i1> } %a, 1
+ %mo = extractvalue { <8 x float>, <8 x float>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x float> %ko, <8 x float> %vo
+ ret <8 x float> %b
+}
+
+; CHECK-LABEL: vsort.dscd.i.f32:
+; CHECK: { (xrf1) = vsort.dscd.msk.f32 vm0, v0, v1;
+; CHECK-NEXT: _ = sdelay $0x7 }
+; CHECK-NEXT: { v0, v{{[0-9]+}}, vm{{[0-9]+}} = vpop (xrf1) }
+define <8 x float> @vsort.dscd.i.f32(<8 x i1> %m, <8 x float> %k, <8 x i32> %v) {
+ %a = call { <8 x float>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.dscdf.v8f32.v8i32(<8 x i1> %m, <8 x float> %k, <8 x i32> %v)
+ %ko = extractvalue { <8 x float>, <8 x i32>, <8 x i1> } %a, 0
+ %vo = extractvalue { <8 x float>, <8 x i32>, <8 x i1> } %a, 1
+ %vof = bitcast <8 x i32> %vo to <8 x float>
+ %mo = extractvalue { <8 x float>, <8 x i32>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x float> %ko, <8 x float> %vof
+ ret <8 x float> %b
+}
+
+; CHECK-LABEL: vadd.scan1xNi:
+; CHECK: (xrf0) = vadd.scan.msk.s32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <8 x i32> @vadd.scan1xNi(<8 x i1> %m, <8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i1> } @llvm.tpu.add.scan1xNi.v8i32(<8 x i1> %m, <8 x i32> %v)
+ %po = extractvalue { <8 x i32>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x i32>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x i32> %po, <8 x i32> %v
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vadd.scan1xNf:
+; CHECK: (xrf0) = vadd.scan.msk.f32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <8 x float> @vadd.scan1xNf(<8 x i1> %m, <8 x float> %v) {
+ %a = call { <8 x float>, <8 x i1> } @llvm.tpu.add.scan1xNf.v8f32(<8 x i1> %m, <8 x float> %v)
+ %po = extractvalue { <8 x float>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x float>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x float> %po, <8 x float> %v
+ ret <8 x float> %b
+}
+
+; CHECK-LABEL: vadd.seg.scan1xN.s32:
+; CHECK: (xrf0) = vadd.seg.scan.s32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <8 x i32> @vadd.seg.scan1xN.s32(<8 x i1> %m, <8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i1> } @llvm.tpu.add.seg.scan1xNi.v8i32(<8 x i1> %m, <8 x i32> %v)
+ %po = extractvalue { <8 x i32>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x i32>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x i32> %po, <8 x i32> %v
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vadd.seg.scan1xNf32:
+; CHECK: (xrf0) = vadd.seg.scan.f32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <8 x float> @vadd.seg.scan1xNf32(<8 x i1> %m, <8 x float> %v) {
+ %a = call { <8 x float>, <8 x i1> } @llvm.tpu.add.seg.scan1xNf.v8f32(<8 x i1> %m, <8 x float> %v)
+ %po = extractvalue { <8 x float>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x float>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x float> %po, <8 x float> %v
+ ret <8 x float> %b
+}
+
+; CHECK-LABEL: vsegreduce_add:
+; CHECK: { (xrf0) = vsegadd.xlane v0, v1;
+; CHECK-NEXT: _ = sdelay $0x3 }
+; CHECK-NEXT: { v0, _, _ = vpop (xrf0)
+define <8 x float> @vsegreduce_add(<8 x i32> %s, <8 x float> %v) {
+ %a = call { <8 x float>, <8 x i1>} @llvm.tpu.deprecated.segreduce.addf(<8 x i32> %s, <8 x float> %v)
+ %vo = extractvalue { <8 x float>, <8 x i1>} %a, 0
+ ret <8 x float> %vo
+}
+
+; Tests that the following intrinsics are DCE'd, because there is no data
+; dependency and no other memory dependency.
+; CHECK-LABEL: elim_vdupcnti:
+; CHECK-NOT: { (xrf1) = vdupcnt
+define void @elim_vdupcnti(<8 x i1> %m, <8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.dupcnti.v8i32(<8 x i1> %m, <8 x i32> %v)
+ ret void
+}
+
+; Tests: see above
+; CHECK-LABEL: elim_vdupcntf:
+; CHECK-NOT: { (xrf1) = vdupcnt
+define void @elim_vdupcntf(<8 x i1> %m, <8 x float> %v) {
+ %a = call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.dupcntf.v8f32(<8 x i1> %m, <8 x float> %v)
+ ret void
+}
+
+; Tests: see above
+; CHECK-LABEL: elim_uniquei:
+; CHECK-NOT: { (xrf1) = vunique
+define void @elim_uniquei(<8 x i32> %id, <8 x i1> %m, <8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.uniquei.v8i32(<8 x i1> %m, <8 x i32> %v)
+ ret void
+}
+
+; Tests: see above
+; CHECK-LABEL: elim_uniquef:
+; CHECK-NOT: { (xrf1) = vunique
+define void @elim_uniquef(<8 x i32> %id, <8 x i1> %m, <8 x float> %v) {
+ %a = call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.uniquef.v8f32(<8 x i1> %m, <8 x float> %v)
+ ret void
+}
+
+; Tests: see above
+; CHECK-LABEL: elim_sort_ascdi:
+; CHECK-NOT: { (xrf1) = vsort.ascd
+define void @elim_sort_ascdi(<8 x i1> %m, <8 x i32> %k, <8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32.v8i32(<8 x i1> %m,
+ <8 x i32> %k,
+ <8 x i32> %v)
+ ret void
+}
+
+; Tests: see above
+; CHECK-LABEL: elim_sort_ascdf:
+; CHECK-NOT: { (xrf1) = vsort.ascd
+define void @elim_sort_ascdf(<8 x i1> %m, <8 x float> %k, <8 x float> %v) {
+ %a = call { <8 x float>, <8 x float>, <8 x i1> } @llvm.tpu.sort.ascdf.v8f32.v8f32(<8 x i1> %m,
+ <8 x float> %k,
+ <8 x float> %v)
+ ret void
+}
+
+; Tests: see above
+; CHECK-LABEL: elim_sort_dscdi:
+; CHECK-NOT: { (xrf1) = vsort.dscd
+define void @elim_sort_dscdi(<8 x i1> %m, <8 x i32> %k, <8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.dscdi.v8i32.v8i32(<8 x i1> %m,
+ <8 x i32> %k,
+ <8 x i32> %v)
+ ret void
+}
+
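+; Tests: see above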
+; CHECK-LABEL: elim_sort_dscdf:
+; CHECK-NOT: { (xrf1) = vsort.dscd
+define void @elim_sort_dscdf(<8 x i1> %m, <8 x float> %k, <8 x float> %v) {
+ %a = call { <8 x float>, <8 x float>, <8 x i1> } @llvm.tpu.sort.dscdf.v8f32.v8f32(<8 x i1> %m,
+ <8 x float> %k,
+ <8 x float> %v)
+ ret void
+}
+
+; Tests: see above
+; CHECK-LABEL: elim_vadd_scan1xNi:
+; CHECK-NOT: (xrf0) = vadd.scan
+define void @elim_vadd_scan1xNi(<8 x i1> %m, <8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i1> } @llvm.tpu.add.scan1xNi.v8i32(<8 x i1> %m, <8 x i32> %v)
+ ret void
+}
+
+; Tests: see above
+; CHECK-LABEL: elim_vadd_scan1xNf:
+; CHECK-NOT: (xrf0) = vadd.scan
+define void @elim_vadd_scan1xNf(<8 x i1> %m, <8 x float> %v) {
+ %a = call { <8 x float>, <8 x i1> } @llvm.tpu.add.scan1xNf.v8f32(<8 x i1> %m, <8 x float> %v)
+ ret void
+}
+
+; Tests: see above
+; CHECK-LABEL: elim_vadd_seg_scan1xNi:
+; CHECK-NOT: (xrf0) = vadd.seg.scan
+define void @elim_vadd_seg_scan1xNi(<8 x i1> %m, <8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i1> } @llvm.tpu.add.seg.scan1xNi.v8i32(<8 x i1> %m, <8 x i32> %v)
+ ret void
+}
+
+; Tests: see above
+; CHECK-LABEL: elim_vadd_seg_scan1xNf:
+; CHECK-NOT: (xrf0) = vadd.seg.scan
+define void @elim_vadd_seg_scan1xNf(<8 x i1> %m, <8 x float> %v) {
+ %a = call { <8 x float>, <8 x i1> } @llvm.tpu.add.seg.scan1xNf.v8f32(<8 x i1> %m, <8 x float> %v)
+ ret void
+}
+
+; CHECK-LABEL: vmin.seg.scan1xN.u32:
+; CHECK: (xrf0) = vmin.seg.scan.u32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <8 x i32> @vmin.seg.scan1xN.u32(<8 x i1> %m, <8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i1> } @llvm.tpu.min.seg.scan1xNi.v8i32(<8 x i1> %m, <8 x i32> %v)
+ %po = extractvalue { <8 x i32>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x i32>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x i32> %po, <8 x i32> %v
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vmin.seg.scan1xNf32:
+; CHECK: (xrf0) = vmin.seg.scan.f32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <8 x float> @vmin.seg.scan1xNf32(<8 x i1> %m, <8 x float> %v) {
+ %a = call { <8 x float>, <8 x i1> } @llvm.tpu.min.seg.scan1xNf.v8f32(<8 x i1> %m, <8 x float> %v)
+ %po = extractvalue { <8 x float>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x float>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x float> %po, <8 x float> %v
+ ret <8 x float> %b
+}
+
+; CHECK-LABEL: vmin.seg.index.scan1xN.u32:
+; CHECK: (xrf0) = vmin.seg.index.scan.u32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <8 x i32> @vmin.seg.index.scan1xN.u32(<8 x i1> %m, <8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i1> } @llvm.tpu.min.seg.index.scan1xNi.v8i32(<8 x i1> %m, <8 x i32> %v)
+ %po = extractvalue { <8 x i32>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x i32>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x i32> %po, <8 x i32> %v
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vmin.seg.index.scan1xNf32:
+; CHECK: (xrf0) = vmin.seg.index.scan.f32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <8 x i32> @vmin.seg.index.scan1xNf32(<8 x i1> %m, <8 x float> %v, <8 x i32> %pp) {
+ %a = call { <8 x i32>, <8 x i1> } @llvm.tpu.min.seg.index.scan1xNf.v8f32(<8 x i1> %m, <8 x float> %v)
+ %po = extractvalue { <8 x i32>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x i32>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x i32> %po, <8 x i32> %pp
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vmax.seg.scan1xN.u32:
+; CHECK: (xrf0) = vmax.seg.scan.u32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <8 x i32> @vmax.seg.scan1xN.u32(<8 x i1> %m, <8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i1> } @llvm.tpu.max.seg.scan1xNi.v8i32(<8 x i1> %m, <8 x i32> %v)
+ %po = extractvalue { <8 x i32>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x i32>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x i32> %po, <8 x i32> %v
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vmax.seg.scan1xNf32:
+; CHECK: (xrf0) = vmax.seg.scan.f32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <8 x float> @vmax.seg.scan1xNf32(<8 x i1> %m, <8 x float> %v) {
+ %a = call { <8 x float>, <8 x i1> } @llvm.tpu.max.seg.scan1xNf.v8f32(<8 x i1> %m, <8 x float> %v)
+ %po = extractvalue { <8 x float>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x float>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x float> %po, <8 x float> %v
+ ret <8 x float> %b
+}
+
+; CHECK-LABEL: vmax.seg.index.scan1xN.u32:
+; CHECK: (xrf0) = vmax.seg.index.scan.u32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <8 x i32> @vmax.seg.index.scan1xN.u32(<8 x i1> %m, <8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i1> } @llvm.tpu.max.seg.index.scan1xNi.v8i32(<8 x i1> %m, <8 x i32> %v)
+ %po = extractvalue { <8 x i32>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x i32>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x i32> %po, <8 x i32> %v
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vmax.seg.index.scan1xNf32:
+; CHECK: (xrf0) = vmax.seg.index.scan.f32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <8 x i32> @vmax.seg.index.scan1xNf32(<8 x i1> %m, <8 x float> %v, <8 x i32> %pp) {
+ %a = call { <8 x i32>, <8 x i1> } @llvm.tpu.max.seg.index.scan1xNf.v8f32(<8 x i1> %m, <8 x float> %v)
+ %po = extractvalue { <8 x i32>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x i32>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x i32> %po, <8 x i32> %pp
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vmin.scan1xN.msk.u32:
+; CHECK: (xrf0) = vmin.scan.msk.u32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <8 x i32> @vmin.scan1xN.msk.u32(<8 x i1> %m, <8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i1> } @llvm.tpu.min.scan1xNi.v8i32(<8 x i1> %m, <8 x i32> %v)
+ %po = extractvalue { <8 x i32>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x i32>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x i32> %po, <8 x i32> %v
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vmin.scan1xN.msk.f32:
+; CHECK: (xrf0) = vmin.scan.msk.f32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <8 x float> @vmin.scan1xN.msk.f32(<8 x i1> %m, <8 x float> %v) {
+ %a = call { <8 x float>, <8 x i1> } @llvm.tpu.min.scan1xNf.v8f32(<8 x i1> %m, <8 x float> %v)
+ %po = extractvalue { <8 x float>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x float>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x float> %po, <8 x float> %v
+ ret <8 x float> %b
+}
+
+; CHECK-LABEL: vmin.index.scan1xN.u32:
+; CHECK: (xrf0) = vmin.index.scan.u32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <8 x i32> @vmin.index.scan1xN.u32(<8 x i1> %m, <8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i1> } @llvm.tpu.min.index.scan1xNi.v8i32(<8 x i1> %m, <8 x i32> %v)
+ %po = extractvalue { <8 x i32>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x i32>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x i32> %po, <8 x i32> %v
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vmin.index.scan1xNf32:
+; CHECK: (xrf0) = vmin.index.scan.f32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <8 x i32> @vmin.index.scan1xNf32(<8 x i1> %m, <8 x float> %v, <8 x i32> %pp) {
+ %a = call { <8 x i32>, <8 x i1> } @llvm.tpu.min.index.scan1xNf.v8f32(<8 x i1> %m, <8 x float> %v)
+ %po = extractvalue { <8 x i32>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x i32>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x i32> %po, <8 x i32> %pp
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vmax.scan1xN.msk.u32:
+; CHECK: (xrf0) = vmax.scan.msk.u32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <8 x i32> @vmax.scan1xN.msk.u32(<8 x i1> %m, <8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i1> } @llvm.tpu.max.scan1xNi.v8i32(<8 x i1> %m, <8 x i32> %v)
+ %po = extractvalue { <8 x i32>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x i32>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x i32> %po, <8 x i32> %v
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vmax.scan1xN.msk.f32:
+; CHECK: (xrf0) = vmax.scan.msk.f32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <8 x float> @vmax.scan1xN.msk.f32(<8 x i1> %m, <8 x float> %v) {
+ %a = call { <8 x float>, <8 x i1> } @llvm.tpu.max.scan1xNf.v8f32(<8 x i1> %m, <8 x float> %v)
+ %po = extractvalue { <8 x float>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x float>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x float> %po, <8 x float> %v
+ ret <8 x float> %b
+}
+
+; CHECK-LABEL: vmax.index.scan1xN.u32:
+; CHECK: (xrf0) = vmax.index.scan.u32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <8 x i32> @vmax.index.scan1xN.u32(<8 x i1> %m, <8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i1> } @llvm.tpu.max.index.scan1xNi.v8i32(<8 x i1> %m, <8 x i32> %v)
+ %po = extractvalue { <8 x i32>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x i32>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x i32> %po, <8 x i32> %v
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vmax.index.scan1xNf32:
+; CHECK: (xrf0) = vmax.index.scan.f32
+; CHECK: _ = sdelay $0x3
+; CHECK-NEXT: v{{[0-9]+}}, _, vm{{[0-9]+}} = vpop (xrf0)
+define <8 x i32> @vmax.index.scan1xNf32(<8 x i1> %m, <8 x float> %v, <8 x i32> %pp) {
+ %a = call { <8 x i32>, <8 x i1> } @llvm.tpu.max.index.scan1xNf.v8f32(<8 x i1> %m, <8 x float> %v)
+ %po = extractvalue { <8 x i32>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x i32>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x i32> %po, <8 x i32> %pp
+ ret <8 x i32> %b
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/deadstores_bf16_gl_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/deadstores_bf16_gl_sc.ll
new file mode 100644
index 0000000..9a42442
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/deadstores_bf16_gl_sc.ll
@@ -0,0 +1,54 @@
+; RUN: opt -S -O2 -mcpu=sparsecore-tec-gl < %s \
+; RUN: | llc -mcpu=sparsecore-tec-vf -asm-verbose=false \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <32 x i8> @llvm.tpu.vld.msk.v32i8.p201v32i8(<8 x i1>, <32 x i8> addrspace(201)*)
+declare void @llvm.tpu.vst.msk.p201v16bf16.v16bf16(<8 x i1>, <16 x bfloat> addrspace(201)*, <16 x bfloat>)
+declare <8 x i1> @llvm.tpu.16i1.to.8i1(<16 x i1>)
+declare <16 x bfloat> @llvm.tpu.unpack.c.l.u8.bf16(<32 x i8>)
+
+; Tests that the optimizer eliminates dead stores when it can, and keeps them when it
+; cannot. Implicitly tests redundant load elimination.
+
+; CHECK-LABEL: vstmsk_samemasks:
+; CHECK: { v{{[0-9]+}} = vld.msk
+; CHECK-NOT: { v{{[0-9]+}} = vld.msk
+; CHECK: { [tilespmem:s{{[0-9]+}}+$0x0] = vst.msk
+; Can't optimize away the second store, because we can't rewrite it in terms of
+; llvm.masked.store/load when the mask vector length doesn't match the stored value type.
+; CHECK: { [tilespmem:s{{[0-9]+}}+$0x0] = vst.msk
+define void @vstmsk_samemasks(<32 x i8> addrspace(201)* %lbase, <8 x i1> %lm,
+ <16 x bfloat> addrspace(201)* %sbase, <16 x i1> %sm) #0 {
+entry:
+ %0 = tail call <32 x i8> @llvm.tpu.vld.msk.v32i8.p201v32i8(<8 x i1> %lm, <32 x i8> addrspace(201)* %lbase)
+ %1 = tail call <16 x bfloat> @llvm.tpu.unpack.c.l.u8.bf16(<32 x i8> %0)
+ %sm8 = tail call <8 x i1> @llvm.tpu.16i1.to.8i1(<16 x i1> %sm)
+ tail call void @llvm.tpu.vst.msk.p201v16bf16.v16bf16(<8 x i1> %sm8, <16 x bfloat> addrspace(201)* %sbase, <16 x bfloat> %1)
+ tail call void @llvm.tpu.vst.msk.p201v16bf16.v16bf16(<8 x i1> %sm8, <16 x bfloat> addrspace(201)* %sbase, <16 x bfloat> %1)
+ ret void
+}
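+
+; A minimal sketch of why the second store above survives (illustrative, not part of
+; the CL): to let generic dead-store elimination reason about the two stores, the
+; target store would have to be rewritten in terms of LLVM's masked-store intrinsic,
+; roughly
+;   llvm.masked.store(<16 x bfloat> %val, ptr addrspace(201) %sbase, <16 x i1> %mask)
+; but only the narrowed <8 x i1> mask is available here, so the mask element count (8)
+; no longer matches the stored <16 x bfloat> value and the conversion is skipped.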
+
+; CHECK-LABEL: vstmsk_diffmasks:
+; CHECK: { v{{[0-9]+}} = vld.msk
+; CHECK-NOT: { v{{[0-9]+}} = vld.msk
+; CHECK: { [tilespmem:s{{[0-9]+}}+$0x0] = vst.msk
+; CHECK: { [tilespmem:s{{[0-9]+}}+$0x0] = vst.msk
+define void @vstmsk_diffmasks(<32 x i8> addrspace(201)* %lbase, <8 x i1> %lm,
+ <16 x bfloat> addrspace(201)* %sbase, <16 x i1> %sm1, <16 x i1> %sm2) #0 {
+entry:
+ %0 = tail call <32 x i8> @llvm.tpu.vld.msk.v32i8.p201v32i8(<8 x i1> %lm, <32 x i8> addrspace(201)* %lbase)
+ %1 = tail call <32 x i8> @llvm.tpu.vld.msk.v32i8.p201v32i8(<8 x i1> %lm, <32 x i8> addrspace(201)* %lbase)
+ %2 = tail call <16 x bfloat> @llvm.tpu.unpack.c.l.u8.bf16(<32 x i8> %0)
+ %3 = tail call <16 x bfloat> @llvm.tpu.unpack.c.l.u8.bf16(<32 x i8> %1)
+ %sm81 = tail call <8 x i1> @llvm.tpu.16i1.to.8i1(<16 x i1> %sm1)
+ %sm82 = tail call <8 x i1> @llvm.tpu.16i1.to.8i1(<16 x i1> %sm2)
+ tail call void @llvm.tpu.vst.msk.p201v16bf16.v16bf16(<8 x i1> %sm81, <16 x bfloat> addrspace(201)* %sbase, <16 x bfloat> %2)
+ tail call void @llvm.tpu.vst.msk.p201v16bf16.v16bf16(<8 x i1> %sm82, <16 x bfloat> addrspace(201)* %sbase, <16 x bfloat> %3)
+ ret void
+}
+
+attributes #0 = { "implicit-section-name"=".text.tile_execute" "target-cpu"="sparsecore-tec-gl" }
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/deadstores_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/deadstores_sc.ll
new file mode 100644
index 0000000..e04e3a5
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/deadstores_sc.ll
@@ -0,0 +1,79 @@
+; RUN: opt -S -O2 -mcpu=sparsecore-tec-vf < %s \
+; RUN: | llc -mcpu=sparsecore-tec-vf -asm-verbose=false \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <8 x i32> @llvm.tpu.vld.msk.idx.v8i32.p201v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>)
+declare <8 x i32> @llvm.tpu.vld.msk.v8i32.p201v8i32(<8 x i1>, <8 x i32> addrspace(201)*)
+declare void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, <8 x i32>)
+declare void @llvm.tpu.vst.msk.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>)
+
+; Tests that the optimizer eliminates dead stores when it can, and keeps them when it
+; cannot. Implicitly tests redundant load elimination.
+
+; CHECK-LABEL: vstmsk_samemasks:
+; CHECK: { v{{[0-9]+}} = vld.msk
+; CHECK-NOT: { v{{[0-9]+}} = vld.msk
+; CHECK: { [tilespmem:s{{[0-9]+}}+$0x0] = vst.msk
+; CHECK-NOT: { [tilespmem:s{{[0-9]+}}+$0x0] = vst.msk
+define void @vstmsk_samemasks(<8 x i32> addrspace(201)* %lbase, <8 x i1> %lm,
+ <8 x i32> addrspace(201)* %sbase, <8 x i1> %sm) {
+entry:
+ %0 = tail call <8 x i32> @llvm.tpu.vld.msk.v8i32.p201v8i32(<8 x i1> %lm, <8 x i32> addrspace(201)* %lbase)
+ %1 = tail call <8 x i32> @llvm.tpu.vld.msk.v8i32.p201v8i32(<8 x i1> %lm, <8 x i32> addrspace(201)* %lbase)
+ tail call void @llvm.tpu.vst.msk.p201v8i32.v8i32(<8 x i1> %sm, <8 x i32> addrspace(201)* %sbase, <8 x i32> %0)
+ tail call void @llvm.tpu.vst.msk.p201v8i32.v8i32(<8 x i1> %sm, <8 x i32> addrspace(201)* %sbase, <8 x i32> %0)
+ ret void
+}
+
+; CHECK-LABEL: vstmsk_diffmasks:
+; CHECK: { v{{[0-9]+}} = vld.idx.msk
+; CHECK: { [tilespmem:s{{[0-9]+}}+$0x0] = vst.msk
+; CHECK: { [tilespmem:s{{[0-9]+}}+$0x0] = vst.msk
+define void @vstmsk_diffmasks(<8 x i32> addrspace(201)* %lbase, <8 x i32> %loff, <8 x i1> %lm,
+ <8 x i32> addrspace(201)* %sbase, <8 x i1> %sm1, <8 x i1> %sm2) {
+entry:
+ %0 = tail call <8 x i32> @llvm.tpu.vld.msk.idx.v8i32.p201v8i32(<8 x i1> %lm, <8 x i32> addrspace(201)* %lbase, <8 x i32> %loff)
+ %1 = tail call <8 x i32> @llvm.tpu.vld.msk.idx.v8i32.p201v8i32(<8 x i1> %lm, <8 x i32> addrspace(201)* %lbase, <8 x i32> %loff)
+ tail call void @llvm.tpu.vst.msk.p201v8i32.v8i32(<8 x i1> %sm1, <8 x i32> addrspace(201)* %sbase, <8 x i32> %0)
+ tail call void @llvm.tpu.vst.msk.p201v8i32.v8i32(<8 x i1> %sm2, <8 x i32> addrspace(201)* %sbase, <8 x i32> %0)
+ ret void
+}
+
+; CHECK-LABEL: vstmsk_idx:
+; CHECK: { v{{[0-9]+}} = vld.idx.msk
+; CHECK: { [tilespmem:v{{[0-9]+}}+s{{[0-9]+}}+$0x0] = vst.idx.msk
+; CHECK: { [tilespmem:v{{[0-9]+}}+s{{[0-9]+}}+$0x0] = vst.idx.msk
+define void @vstmsk_idx(<8 x i32> addrspace(201)* %lbase, <8 x i32> %loff, <8 x i1> %lm,
+ <8 x i32> addrspace(201)* %sbase, <8 x i32> %soff, <8 x i1> %sm) {
+entry:
+ %0 = tail call <8 x i32> @llvm.tpu.vld.msk.idx.v8i32.p201v8i32(<8 x i1> %lm, <8 x i32> addrspace(201)* %lbase, <8 x i32> %loff)
+ %1 = tail call <8 x i32> @llvm.tpu.vld.msk.idx.v8i32.p201v8i32(<8 x i1> %lm, <8 x i32> addrspace(201)* %lbase, <8 x i32> %loff)
+ tail call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> %sm, <8 x i32> addrspace(201)* %sbase, <8 x i32> %soff, <8 x i32> %0)
+ tail call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> %sm, <8 x i32> addrspace(201)* %sbase, <8 x i32> %soff, <8 x i32> %0)
+ ret void
+}
+
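+; Tests that a masked load from the same address, with the same mask, as the
+; immediately preceding masked store is forwarded from the store (no vld.msk is
+; emitted), while the indexed-store case below must conservatively keep the load.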
+; CHECK-LABEL: vstvldmsk_sameaddress:
+; CHECK: { [tilespmem:s{{[0-9]+}}+$0x0] = vst.msk
+; CHECK-NOT: { v{{[0-9]+}} = vld.msk
+define <8 x i32> @vstvldmsk_sameaddress(<8 x i32> addrspace(201)* %base, <8 x i1> %m, <8 x i32> %v) {
+entry:
+ tail call void @llvm.tpu.vst.msk.p201v8i32.v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %base, <8 x i32> %v)
+ %r = tail call <8 x i32> @llvm.tpu.vld.msk.v8i32.p201v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %base)
+ ret <8 x i32> %r
+}
+
+; CHECK-LABEL: vstidxvldmsk_sameaddress:
+; CHECK: { [tilespmem:v{{[0-9]+}}+s{{[0-9]+}}+$0x0] = vst.idx.msk
+; CHECK: { v{{[0-9]+}} = vld.msk
+define <8 x i32> @vstidxvldmsk_sameaddress(<8 x i32> addrspace(201)* %base, <8 x i1> %m,
+ <8 x i32> %soff, <8 x i32> %v) {
+entry:
+ tail call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %base, <8 x i32> %soff, <8 x i32> %v)
+ %r = tail call <8 x i32> @llvm.tpu.vld.msk.v8i32.p201v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %base)
+ ret <8 x i32> %r
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/disable_licm_parallel_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/disable_licm_parallel_sc.ll
new file mode 100644
index 0000000..33a8854
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/disable_licm_parallel_sc.ll
@@ -0,0 +1,53 @@
+; RUN: opt -S -O2 < %s -mcpu=sparsecore-tec-vf \
+; RUN: -parallel-licm-disable=true -licm-disable-threshold=2 \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare void @llvm.tpu.loop.parallel()
+declare i32* @llvm.tpu.inttoptr.pi32(i32)
+
+; Tests that LICM gets disabled via loop metadata when the parallel-LICM heuristic is
+; enabled; this test case uses a manual unroller threshold of 2.
+
+; CHECK-LABEL: for.body.i:
+; CHECK: %idx0 = getelementptr
+; CHECK: %idx1 = getelementptr
+; CHECK: br i1
+; CHECK: !{{[0-9]+}} = !{!"llvm.licm.disable"}
+
+define void @func_vst_idx_ret_add_noparallel_loop(<8 x i32> addrspace(201)* %base, <8 x i32> %off, <8 x i32> %val, <8 x i1> %m) {
+entry:
+ %0 = call i32* @llvm.tpu.inttoptr.pi32(i32 0)
+ %1 = call i32* @llvm.tpu.inttoptr.pi32(i32 32)
+ %2 = call i32* @llvm.tpu.inttoptr.pi32(i32 64)
+ br label %for.body.i
+
+for.body.i:
+ %i = phi i32 [ 0, %entry ], [ %ic, %for.body.i ]
+ %idx0 = getelementptr inbounds i32, i32* %0, i32 64
+ %3 = load i32, i32* %idx0, align 4
+ store i32 %3, i32* %1, align 4
+ tail call void @llvm.tpu.loop.parallel()
+ %idx1 = getelementptr inbounds i32, i32* %0, i32 32
+ %4 = load i32, i32* %idx1, align 4
+ store i32 %4, i32* %2, align 4
+ tail call void @llvm.tpu.loop.parallel()
+ %cmp.i = icmp slt i32 %i, 15
+ %ic = add i32 %i, 1
+ br i1 %cmp.i, label %for.body.i, label %exit, !llvm.loop !1
+
+exit:
+ ret void
+}
+
+!1 = distinct !{!1, !2, !4, !5, !6, !7}
+!2 = !{!"llvm.loop.parallel_accesses", !3}
+!3 = distinct !{}
+!4 = !{!"llvm.loop.unroll.disable"}
+!5 = !{!"llvm.loop.vectorize.width", i32 1}
+!6 = !{!"llvm.loop.interleave.count", i32 1}
+!7 = !{!"llvm.loop.vectorize.enable", i1 true}
+!8 = distinct !{!8, !4}
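+
+; Conceptually (an illustrative sketch, not the pass's verbatim output), disabling LICM
+; for this loop amounts to attaching a loop-metadata node carrying the
+; llvm.licm.disable string to the latch branch, e.g.
+;   br i1 %cmp.i, label %for.body.i, label %exit, !llvm.loop !9
+;   !9 = distinct !{!9, !10}
+;   !10 = !{!"llvm.licm.disable"}
+; which is what the CHECK line for !"llvm.licm.disable" above matches.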
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/div_i32.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/div_i32.ll
new file mode 100644
index 0000000..b858543
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/div_i32.ll
@@ -0,0 +1,342 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -tpu-emulate-signed-divrem | FileCheck %s --check-prefixes=CHECK,CHECK-SC
+; REQUIRES: tpu
+
+; Tests different configurations of udiv and urem. The differences between SparseCore and TensorCore
+; are due to (1) different fifo overflow protections (see -tpu-conservative-fifo-oflw), (2) different
+; default [s|v]nop, (3) different pre-RA schedulers, and/or (4) different pass configurations.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; Tests that udiv and urem get combined into one drf instruction.
+
+; CHECK-LABEL: func_div_mod:
+; CHECK: (drf) = sdivrem.u32 s{{[0-9]+}}, s{{[0-9]+}}
+; CHECK-SC: _ = sdelay $0x8
+; CHECK: s[[s0:[0-9]+]] = spop (drf)
+; CHECK: s[[s1:[0-9]+]] = spop (drf)
+; CHECK: s{{[0-9]+}} = ssub.s32 s[[s0]], s[[s1]]
+define i32 @func_div_mod(i32 %x, i32 %y) {
+ %a = udiv i32 %x, %y
+ %b = urem i32 %x, %y
+ %r = sub i32 %a, %b
+ ret i32 %r
+}
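+
+; A rough sketch of the fifo model assumed by these tests (illustrative): the divide
+; unit is a result fifo (drf); a single sdivrem.u32 push produces both results, which
+; are popped in order after the documented latency (in this test the quotient first,
+; then the remainder):
+;   (drf) = sdivrem.u32 x, y   ; one push covers both the udiv and the urem
+;   q = spop (drf)             ; quotient
+;   r = spop (drf)             ; remainder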
+
+; Tests that udiv w/o urem works, and that the mod pop does not get lost.
+
+; CHECK-LABEL: func_div_tec:
+; CHECK: (drf) = sdiv.u32 s{{[0-9]+}}, s{{[0-9]+}}
+; CHECK: _ = sdelay $0x8
+; CHECK: s{{[0-9]+}} = spop (drf)
+define i32 @func_div_tec(i32 %x, i32 %y) #2 {
+ %r = udiv i32 %x, %y
+ ret i32 %r
+}
+
+; CHECK-LABEL: func_div_tac:
+; CHECK: (drf) = sdiv.u32 s{{[0-9]+}}, s{{[0-9]+}}
+; CHECK: _ = sdelay $0x8
+; CHECK: s{{[0-9]+}} = spop (drf)
+define i32 @func_div_tac(i32 %x, i32 %y) #1 {
+ %r = udiv i32 %x, %y
+ ret i32 %r
+}
+
+; CHECK-LABEL: func_div_scs:
+; CHECK: (drf) = sdiv.u32 s{{[0-9]+}}, s{{[0-9]+}}
+; CHECK: _ = sdelay $0x8
+; CHECK: s{{[0-9]+}} = spop (drf)
+define i32 @func_div_scs(i32 %x, i32 %y) #0 {
+ %r = udiv i32 %x, %y
+ ret i32 %r
+}
+
+; Tests back-to-back divide.
+
+; CHECK-LABEL: func_div_div:
+; CHECK: (drf) = sdiv.u32 s{{[0-9]+}}, s{{[0-9]+}}
+; CHECK-SC: _ = sdelay $0x8
+; CHECK: s[[s0:[0-9]+]] = spop (drf)
+; CHECK-SC: (drf) = sdiv.u32 s[[s0]], s{{[0-9]+}}
+; CHECK-SC: _ = sdelay $0x8
+; CHECK-SC: s{{[0-9]+}} = spop (drf)
+define i32 @func_div_div(i32 %x, i32 %y) {
+ %r1 = udiv i32 %x, %y
+ %r2 = udiv i32 %r1, %y
+ ret i32 %r2
+}
+
+; Same as previous, but for multiple divs.
+
+; CHECK-LABEL: func_div_multiple:
+; CHECK: { (drf) = sdiv.u32 s0, s1
+; CHECK-SC: { (drf) = sdiv.u32 s2, s3;
+; CHECK-SC: _ = sdelay $0x7 }
+; CHECK-SC: { s[[s0:[0-9]+]] = spop (drf)
+; CHECK-SC: { s[[s1:[0-9]+]] = spop (drf)
+; CHECK: s{{[0-9]+}} = ssub.s32 s[[s0]], s[[s1]]
+define i32 @func_div_multiple(i32 %x, i32 %y, i32 %u, i32 %v) {
+ %r1 = udiv i32 %x, %y
+ %r2 = udiv i32 %u, %v
+ %r = sub i32 %r1, %r2
+ ret i32 %r
+}
+
+; Tests that umod w/o udiv works, and that the mod pop does not get lost.
+
+; CHECK-LABEL: func_mod:
+; CHECK: (drf) = srem.u32 s{{[0-9]+}}, s{{[0-9]+}}
+; CHECK-SC: _ = sdelay $0x8
+; CHECK: s[[s0:[0-9]+]] = spop (drf)
+define i32 @func_mod(i32 %x, i32 %y) {
+ %r = urem i32 %x, %y
+ ret i32 %r
+}
+
+; Same as previous, but for multiple mods.
+
+; CHECK-LABEL: func_mod_multiple:
+; CHECK: { (drf) = srem.u32 s0, s1
+; CHECK-SC: { (drf) = srem.u32 s2, s3;
+; CHECK-SC: _ = sdelay $0x7 }
+; CHECK-SC: { s[[s0:[0-9]+]] = spop (drf)
+; CHECK-SC: { s[[s1:[0-9]+]] = spop (drf)
+; CHECK: s{{[0-9]+}} = ssub.s32 s[[s0]], s[[s1]]
+define i32 @func_mod_multiple(i32 %x, i32 %y, i32 %u, i32 %v) {
+ %r1 = urem i32 %x, %y
+ %r2 = urem i32 %u, %v
+ %r = sub i32 %r1, %r2
+ ret i32 %r
+}
+
+; Tests that multiple udiv/umod pairs get combined into separate drf instructions,
+; that the push/pops can get re-ordered, and that their respective pop order is
+; correct.
+
+; CHECK-LABEL: func_div_mod_multiple:
+; CHECK: { (drf) = sdivrem.u32 s2, s3;
+; CHECK-SC: _ = sdelay $0x1 }
+; CHECK-SC: { (drf) = sdivrem.u32 s0, s1;
+; CHECK-SC: _ = sdelay $0x6 }
+; CHECK-SC: s[[s2:[0-9]+]] = spop (drf)
+; CHECK-SC: s[[s3:[0-9]+]] = spop (drf)
+; CHECK-SC: s[[s0:[0-9]+]] = spop (drf)
+; CHECK-SC: s[[s1:[0-9]+]] = spop (drf)
+; CHECK-SC: s[[s4:[0-9]+]] = ssub.s32 s[[s3]], s[[s2]]
+; CHECK-SC: s[[s5:[0-9]+]] = ssub.s32 s[[s0]], s[[s1]]
+; CHECK-SC: s{{[0-9]+}} = sadd.s32 s[[s5]], s[[s4]]
+define i32 @func_div_mod_multiple(i32 %x, i32 %y, i32 %u, i32 %v) {
+ %a = udiv i32 %x, %y
+ %b = udiv i32 %u, %v
+ %c = urem i32 %u, %v
+ %d = urem i32 %x, %y
+ %r1 = sub i32 %a, %d
+ %r2 = sub i32 %b, %c
+ %r = sub i32 %r1, %r2
+ ret i32 %r
+}
+
+; Tests that udiv and urem get combined into one drf instruction,
+; immediate version.
+
+; CHECK-LABEL: func_div_mod_imm:
+; CHECK: (drf) = sdivrem.u32 s{{[0-9]+}}, $0x3
+; CHECK-SC: _ = sdelay $0x8
+; CHECK: s[[s0:[0-9]+]] = spop (drf)
+; CHECK: s[[s1:[0-9]+]] = spop (drf)
+; CHECK: s{{[0-9]+}} = ssub.s32 s[[s0]], s[[s1]]
+define i32 @func_div_mod_imm(i32 %x) {
+ %a = udiv i32 %x, 3
+ %b = urem i32 %x, 3
+ %r = sub i32 %a, %b
+ ret i32 %r
+}
+
+; Tests that udiv w/o urem works, and that the mod pop does not get lost,
+; immediate version.
+
+; CHECK-LABEL: func_div_imm:
+; CHECK: (drf) = sdiv.u32 s{{[0-9]+}}, $0x57
+; CHECK-SC: _ = sdelay $0x8
+; CHECK: s[[s0:[0-9]+]] = spop (drf)
+define i32 @func_div_imm(i32 %x) {
+ %r = udiv i32 %x, 87
+ ret i32 %r
+}
+
+; Same as previous, but for multiple divs, immediate version.
+
+; CHECK-LABEL: func_div_multiple_imm:
+; CHECK: { (drf) = sdiv.u32 s0, $0x401
+; CHECK-SC: { (drf) = sdiv.u32 s1, $0xbd1;
+; CHECK-SC: _ = sdelay $0x7 }
+; CHECK-SC: { s[[s0:[0-9]+]] = spop (drf)
+; CHECK-SC: { s[[s1:[0-9]+]] = spop (drf)
+; CHECK-NOT: s[[s2]] = spop (drf)
+; CHECK: s{{[0-9]+}} = ssub.s32 s[[s0]], s[[s1]]
+define i32 @func_div_multiple_imm(i32 %x, i32 %u) {
+ %r1 = udiv i32 %x, 1025
+ %r2 = udiv i32 %u, 3025
+ %r = sub i32 %r1, %r2
+ ret i32 %r
+}
+
+; Tests that umod w/o udiv works, and that the mod pop does not get lost,
+; immediate version.
+
+; CHECK-LABEL: func_mod_imm:
+; CHECK: (drf) = srem.u32 s{{[0-9]+}}, $0x100400
+; CHECK-SC: _ = sdelay $0x8
+; CHECK: s[[s0:[0-9]+]] = spop (drf)
+define i32 @func_mod_imm(i32 %x) {
+ %r = urem i32 %x, 1049600
+ ret i32 %r
+}
+
+; Same as previous, but for multiple mods, immediate version.
+
+; CHECK-LABEL: func_mod_multiple_imm:
+; CHECK: { (drf) = srem.u32 s0, $0x5
+; CHECK-SC: { (drf) = srem.u32 s1, $0x15;
+; CHECK-SC: _ = sdelay $0x7 }
+; CHECK-SC: { s[[s0:[0-9]+]] = spop (drf)
+; CHECK-SC: { s[[s1:[0-9]+]] = spop (drf)
+; CHECK-NOT: s[[s2]] = spop (drf)
+; CHECK: s{{[0-9]+}} = ssub.s32 s[[s0]], s[[s1]]
+define i32 @func_mod_multiple_imm(i32 %x, i32 %u) {
+ %r1 = urem i32 %x, 5
+ %r2 = urem i32 %u, 21
+ %r = sub i32 %r1, %r2
+ ret i32 %r
+}
+
+; Tests that multiple udiv/umod pairs get combined into separate drf instructions,
+; that the push/pops can get re-ordered, and that their respective pop order is
+; correct; immediate version.
+
+; CHECK-LABEL: func_div_mod_multiple_imm:
+; CHECK: { (drf) = sdivrem.u32 s1, $0x65;
+; CHECK-SC: _ = sdelay $0x1 }
+; CHECK-SC: { (drf) = sdivrem.u32 s0, $0x64;
+; CHECK-SC: _ = sdelay $0x6 }
+; CHECK-SC: { s[[s1:[0-9]+]] = spop (drf)
+; CHECK-SC: { s[[s2:[0-9]+]] = spop (drf)
+; CHECK-SC: { s[[s0:[0-9]+]] = spop (drf)
+; CHECK-SC: { s[[s3:[0-9]+]] = spop (drf)
+; CHECK-SC: s[[s4:[0-9]+]] = ssub.s32 s[[s2]], s[[s1]]
+; CHECK-SC: s[[s5:[0-9]+]] = ssub.s32 s[[s0]], s[[s3]]
+; CHECK-SC: s{{[0-9]+}} = sadd.s32 s[[s5]], s[[s4]]
+define i32 @func_div_mod_multiple_imm(i32 %x, i32 %u) {
+ %a = udiv i32 %x, 100
+ %b = udiv i32 %u, 101
+ %c = urem i32 %u, 101
+ %d = urem i32 %x, 100
+ %r1 = sub i32 %a, %d
+ %r2 = sub i32 %b, %c
+ %r = sub i32 %r1, %r2
+ ret i32 %r
+}
+
+; Check that signed div is expanded into an unsigned div
+;
+; CHECK-LABEL: func_sdiv_emulation:
+; CHECK: [[y_mask:s[0-9]+]] = sshra.s32 [[y:s[0-9]+]], $0x1f
+; CHECK: [[x_mask:s[0-9]+]] = sshra.s32 [[x:s[0-9]+]], $0x1f
+; CHECK: [[y_inv:s[0-9]+]] = sxor.u32 [[y_mask]], [[y]]
+; CHECK: [[x_inv:s[0-9]+]] = sxor.u32 [[x_mask]], [[x]]
+; CHECK: [[y_abs:s[0-9]+]] = ssub.s32 [[y_inv]], [[y_mask]]
+; CHECK: [[x_abs:s[0-9]+]] = ssub.s32 [[x_inv]], [[x_mask]]
+; CHECK: (drf) = sdiv.u32 [[x_abs]], [[y_abs]]
+; CHECK: _ = {{[sv]}}delay
+; CHECK: [[result_mask:s[0-9]+]] = sxor.u32 [[y_mask]], [[x_mask]]
+; CHECK: [[result_abs:s[0-9]+]] = spop (drf)
+; CHECK: [[result_inv:s[0-9]+]] = sxor.u32 [[result_mask]], [[result_abs]]
+; CHECK: s{{[0-9]+}} = ssub.s32 [[result_inv]], [[result_mask]]
+define i32 @func_sdiv_emulation(i32 %x, i32 %y) {
+ %d = sdiv i32 %x, %y
+ ret i32 %d
+}
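+
+; The expansion above is the usual sign-magnitude trick (sketch with illustrative
+; names; the operands are routed through the unsigned divider):
+;   mask_x = x >> 31               ; arithmetic shift: all-ones iff x < 0
+;   mask_y = y >> 31
+;   abs_x  = (x ^ mask_x) - mask_x ; two's-complement absolute value
+;   abs_y  = (y ^ mask_y) - mask_y
+;   q      = abs_x udiv abs_y      ; the single sdiv.u32 push/pop
+;   sign   = mask_x ^ mask_y       ; quotient is negative iff the signs differ
+;   result = (q ^ sign) - sign
+; For the srem tests below only the dividend's mask is applied to the popped result,
+; since the remainder takes the sign of x.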
+
+; Check that signed div with immediate is expanded into an unsigned div
+;
+; CHECK-LABEL: func_sdiv_emulation_imm:
+; CHECK: [[x_mask:s[0-9]+]] = sshra.s32 [[x:s[0-9]+]], $0x1f
+; CHECK: [[x_inv:s[0-9]+]] = sxor.u32 [[x_mask]], [[x]]
+; CHECK: [[x_abs:s[0-9]+]] = ssub.s32 [[x_inv]], [[x_mask]]
+; CHECK: (drf) = sdiv.u32 [[x_abs]], $0x100400;
+; CHECK: _ = {{[sv]}}delay
+; CHECK: [[result_abs:s[0-9]+]] = spop (drf)
+; CHECK: [[result_inv:s[0-9]+]] = sxor.u32 [[x_mask]], [[result_abs]]
+; CHECK: s{{[0-9]+}} = ssub.s32 [[result_inv]], [[x_mask]]
+define i32 @func_sdiv_emulation_imm(i32 %x) {
+ %d = sdiv i32 %x, 1049600
+ ret i32 %d
+}
+
+; Check that signed rem is expanded into an unsigned rem
+;
+; CHECK-LABEL: func_srem_emulation:
+; CHECK: [[y_mask:s[0-9]+]] = sshra.s32 [[y:s[0-9]+]], $0x1f
+; CHECK: [[x_mask:s[0-9]+]] = sshra.s32 [[x:s[0-9]+]], $0x1f
+; CHECK: [[y_inv:s[0-9]+]] = sxor.u32 [[y_mask]], [[y]]
+; CHECK: [[x_inv:s[0-9]+]] = sxor.u32 [[x_mask]], [[x]]
+; CHECK: [[y_abs:s[0-9]+]] = ssub.s32 [[y_inv]], [[y_mask]]
+; CHECK: [[x_abs:s[0-9]+]] = ssub.s32 [[x_inv]], [[x_mask]]
+; CHECK: (drf) = srem.u32 [[x_abs]], [[y_abs]]
+; CHECK: _ = {{[sv]}}delay
+; CHECK: [[result_abs:s[0-9]+]] = spop (drf)
+; CHECK: [[result_inv:s[0-9]+]] = sxor.u32 [[x_mask]], [[result_abs]]
+; CHECK: s{{[0-9]+}} = ssub.s32 [[result_inv]], [[x_mask]]
+define i32 @func_srem_emulation(i32 %x, i32 %y) {
+ %r = srem i32 %x, %y
+ ret i32 %r
+}
+
+; Check that signed rem with immediate is expanded into an unsigned rem
+;
+; CHECK-LABEL: func_srem_emulation_imm:
+; CHECK: [[x_mask:s[0-9]+]] = sshra.s32 [[x:s[0-9]+]], $0x1f
+; CHECK: [[x_inv:s[0-9]+]] = sxor.u32 [[x_mask]], [[x]]
+; CHECK: [[x_abs:s[0-9]+]] = ssub.s32 [[x_inv]], [[x_mask]]
+; CHECK: (drf) = srem.u32 [[x_abs]], $0x100400;
+; CHECK: _ = {{[sv]}}delay
+; CHECK: [[result_abs:s[0-9]+]] = spop (drf)
+; CHECK: [[result_inv:s[0-9]+]] = sxor.u32 [[x_mask]], [[result_abs]]
+; CHECK: s{{[0-9]+}} = ssub.s32 [[result_inv]], [[x_mask]]
+define i32 @func_srem_emulation_imm(i32 %x) {
+ %r = srem i32 %x, 1049600
+ ret i32 %r
+}
+
+; Check that signed div and signed rem are expanded and combined into a single
+; unsigned divrem instruction.
+;
+; CHECK-LABEL: func_sdiv_srem_emulation:
+; CHECK: [[y_mask:s[0-9]+]] = sshra.s32 [[y:s[0-9]+]], $0x1f
+; CHECK: [[x_mask:s[0-9]+]] = sshra.s32 [[x:s[0-9]+]], $0x1f
+; CHECK: [[y_inv:s[0-9]+]] = sxor.u32 [[y_mask]], [[y]]
+; CHECK: [[x_inv:s[0-9]+]] = sxor.u32 [[x_mask]], [[x]]
+; CHECK: [[y_abs:s[0-9]+]] = ssub.s32 [[y_inv]], [[y_mask]]
+; CHECK: [[x_abs:s[0-9]+]] = ssub.s32 [[x_inv]], [[x_mask]]
+; CHECK: (drf) = sdivrem.u32 [[x_abs]], [[y_abs]]
+; CHECK: _ = {{[sv]}}delay
+; CHECK: [[div_result_abs:s[0-9]+]] = spop (drf)
+; CHECK: [[result_mask:s[0-9]+]] = sxor.u32 [[y_mask]], [[x_mask]]
+; CHECK: [[rem_result_abs:s[0-9]+]] = spop (drf)
+; CHECK: [[div_result_inv:s[0-9]+]] = sxor.u32 [[result_mask]], [[div_result_abs]]
+; CHECK: [[rem_result_inv:s[0-9]+]] = sxor.u32 [[x_mask]], [[rem_result_abs]]
+; CHECK: [[div_result:s[0-9]+]] = ssub.s32 [[div_result_inv]], [[result_mask]]
+; CHECK: [[rem_result:s[0-9]+]] = ssub.s32 [[rem_result_inv]], [[x_mask]]
+; CHECK: s{{[0-9]+}} = sadd.s32
+define i32 @func_sdiv_srem_emulation(i32 %x, i32 %y) {
+ %d = sdiv i32 %x, %y
+ %r = srem i32 %x, %y
+ %res = add i32 %d, %r
+ ret i32 %res
+}
+
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-vf" }
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/dma_host_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/dma_host_sc.ll
new file mode 100644
index 0000000..b2647fa
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/dma_host_sc.ll
@@ -0,0 +1,197 @@
+; RUN: opt -S -O2 -mcpu=sparsecore-tec-vf < %s \
+; RUN: | llc -mcpu=sparsecore-tec-vf -asm-verbose=false \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare void @llvm.tpu.dma.iova.to.hbm.sc.simple(i32 addrspace(211)*, i32 addrspace(216)*, i32 addrspace(203)*, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.dma.hbm.to.iova.sc.simple(i32 addrspace(211)*, i32 addrspace(203)*, i32 addrspace(216)*, i32, i32, i32) argmemonly nounwind
+declare i32 addrspace(216)* @llvm.tpu.allocate.iova(i32, i32)
+
+@flag = addrspace(204) global i32 0, align 4
+@rflag = addrspace(211) global i32 0, align 4
+
+; CHECK-LABEL: dma_iova_to_hbm:
+; CHECK: s[[shi:[0-9]+]] = sshra.s32 s[[sin:[0-9]+]], $0x14
+; CHECK-NEXT: s[[slo:[0-9]+]] = sshll.u32 s[[sin]], $0xc
+; CHECK-NEXT: { [hbm:s1], [sflag:rflag] = dma.local [iova:s[[shi]],s[[slo]]], $0x4
+define void @dma_iova_to_hbm(i32 addrspace(216)* %src, i32 addrspace(203)* %dst) {
+ call void @llvm.tpu.dma.iova.to.hbm.sc.simple(i32 addrspace(211)* @rflag, i32 addrspace(216)* %src, i32 addrspace(203)* %dst, i32 0, i32 4, i32 0)
+ ret void
+}
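+
+; A worked example of the address split above (illustrative; the page-size
+; interpretation is inferred from the shift amounts): the incoming IOVA register is
+; treated as a 4 KiB page index, so the byte address is s * 0x1000, delivered as a
+; hi/lo register pair:
+;   lo = s << 12   ; low 32 bits of s * 4096
+;   hi = s >> 20   ; bits of s * 4096 above bit 31
+; e.g. s = 0x00100000 gives hi = 0x1, lo = 0x0, i.e. byte address 0x100000000.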
+
+; CHECK-LABEL: dma_iova_to_hbm_offset_i:
+; CHECK: [[ADDR_LO:s[0-9]+]] = sshll.u32 [[ADDR:s[0-9]+]], $0xc
+; CHECK-NEXT: [[CARRY:s[0-9]+]] = simm.s32 $0x1
+; CHECK-NEXT: [[HAS_CARRY:p[0-9]+]] = sc.u32 [[ADDR_LO]], $0x2000
+; CHECK-NEXT: [[ADDR_HI:s[0-9]+]] = sshra.s32 [[ADDR]], $0x14
+; CHECK-NEXT: [[CARRY]] = simm.s32 @![[HAS_CARRY]] $0x0
+; CHECK-NEXT: [[LO:s[0-9]+]] = sadd.s32 $0x2000, [[ADDR_LO]]
+; CHECK-NEXT: [[HI:s[0-9]+]] = sadd.s32 [[CARRY]], [[ADDR_HI]]
+; CHECK-NEXT: [hbm:s1], [sflag:rflag] = dma.local [iova:[[HI]],[[LO]]], $0x4
+define void @dma_iova_to_hbm_offset_i(i32 addrspace(216)* %src, i32 addrspace(203)* %dst) {
+ call void @llvm.tpu.dma.iova.to.hbm.sc.simple(i32 addrspace(211)* @rflag, i32 addrspace(216)* %src, i32 addrspace(203)* %dst, i32 8192, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: dma_iova_to_hbm_offset_r:
+; CHECK: [[ADDR_LO:s[0-9]+]] = sshll.u32 [[ADDR:s[0-9]+]], $0xc
+; CHECK-NEXT: [[CARRY:s[0-9]+]] = simm.s32 $0x1
+; CHECK-NEXT: [[HAS_CARRY:p[0-9]+]] = sc.u32 [[ADDR_LO]], [[OFF:s[0-9]+]]
+; CHECK-NEXT: [[ADDR_HI:s[0-9]+]] = sshra.s32 [[ADDR]], $0x14
+; CHECK-NEXT: [[CARRY]] = simm.s32 @![[HAS_CARRY]] $0x0
+; CHECK-NEXT: [[LO:s[0-9]+]] = sadd.s32 [[OFF]], [[ADDR_LO]]
+; CHECK-NEXT: [[HI:s[0-9]+]] = sadd.s32 [[CARRY]], [[ADDR_HI]]
+; CHECK-NEXT: [hbm:s1], [sflag:rflag] = dma.local [iova:[[HI]],[[LO]]], $0x4
+define void @dma_iova_to_hbm_offset_r(i32 addrspace(216)* %src, i32 addrspace(203)* %dst, i32 %off) {
+ call void @llvm.tpu.dma.iova.to.hbm.sc.simple(i32 addrspace(211)* @rflag, i32 addrspace(216)* %src, i32 addrspace(203)* %dst, i32 %off, i32 4, i32 0)
+ ret void
+}
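+
+; The offset variants above add a 32-bit byte offset to the split address with an
+; explicit carry into the high half (illustrative sketch of what the sc.u32/simm
+; pair computes):
+;   lo' = lo + off
+;   hi' = hi + (1 if the 32-bit add of lo and off overflowed, else 0)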
+
+; CHECK-LABEL: dma_hbm_to_iova:
+; CHECK: s[[shi:[0-9]+]] = sshra.s32 s[[sin:[0-9]+]], $0x14
+; CHECK-NEXT: s[[slo:[0-9]+]] = sshll.u32 s[[sin]], $0xc
+; CHECK-NEXT: { [iova:s[[shi]],s[[slo]]], [sflag:rflag] = dma.local [hbm:s0], $0x4
+define void @dma_hbm_to_iova(i32 addrspace(203)* %src, i32 addrspace(216)* %dst) {
+ call void @llvm.tpu.dma.hbm.to.iova.sc.simple(i32 addrspace(211)* @rflag, i32 addrspace(203)* %src, i32 addrspace(216)* %dst, i32 0, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: dma_hbm_to_iova_offset_i:
+; CHECK: [[ADDR_LO:s[0-9]+]] = sshll.u32 [[ADDR:s[0-9]+]], $0xc
+; CHECK-NEXT: [[CARRY:s[0-9]+]] = simm.s32 $0x1
+; CHECK-NEXT: [[HAS_CARRY:p[0-9]+]] = sc.u32 [[ADDR_LO]], $0x2000
+; CHECK-NEXT: [[ADDR_HI:s[0-9]+]] = sshra.s32 [[ADDR]], $0x14
+; CHECK-NEXT: [[CARRY]] = simm.s32 @![[HAS_CARRY]] $0x0
+; CHECK-NEXT: [[LO:s[0-9]+]] = sadd.s32 $0x2000, [[ADDR_LO]]
+; CHECK-NEXT: [[HI:s[0-9]+]] = sadd.s32 [[CARRY]], [[ADDR_HI]]
+; CHECK-NEXT: [iova:[[HI]],[[LO]]], [sflag:rflag] = dma.local [hbm:s0], $0x4
+define void @dma_hbm_to_iova_offset_i(i32 addrspace(203)* %src, i32 addrspace(216)* %dst) {
+ call void @llvm.tpu.dma.hbm.to.iova.sc.simple(i32 addrspace(211)* @rflag, i32 addrspace(203)* %src, i32 addrspace(216)* %dst, i32 8192, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: dma_hbm_to_iova_offset_r:
+; CHECK: [[ADDR_LO:s[0-9]+]] = sshll.u32 [[ADDR:s[0-9]+]], $0xc
+; CHECK-NEXT: [[CARRY:s[0-9]+]] = simm.s32 $0x1
+; CHECK-NEXT: [[HAS_CARRY:p[0-9]+]] = sc.u32 [[ADDR_LO]], [[OFF:s[0-9]+]]
+; CHECK-NEXT: [[ADDR_HI:s[0-9]+]] = sshra.s32 [[ADDR]], $0x14
+; CHECK-NEXT: [[CARRY]] = simm.s32 @![[HAS_CARRY]] $0x0
+; CHECK-NEXT: [[LO:s[0-9]+]] = sadd.s32 [[OFF]], [[ADDR_LO]]
+; CHECK-NEXT: [[HI:s[0-9]+]] = sadd.s32 [[CARRY]], [[ADDR_HI]]
+; CHECK-NEXT: [iova:[[HI]],[[LO]]], [sflag:rflag] = dma.local [hbm:s0], $0x4
+define void @dma_hbm_to_iova_offset_r(i32 addrspace(203)* %src, i32 addrspace(216)* %dst, i32 %off) {
+ call void @llvm.tpu.dma.hbm.to.iova.sc.simple(i32 addrspace(211)* @rflag, i32 addrspace(203)* %src, i32 addrspace(216)* %dst, i32 %off, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: dma_iova_to_hbm_allocate:
+; CHECK: s1 = simm.s32 $0x0
+; CHECK-NEXT: s2 = simm.s32 $0x1000
+; CHECK-NEXT: [hbm:s0], [sflag:rflag] = dma.local [iova:s1,s2], $0x4
+define void @dma_iova_to_hbm_allocate(i32 addrspace(203)* %dst) {
+ %a = call i32 addrspace(216)* @llvm.tpu.allocate.iova(i32 4096, i32 1)
+ call void @llvm.tpu.dma.iova.to.hbm.sc.simple(i32 addrspace(211)* @rflag, i32 addrspace(216)* %a, i32 addrspace(203)* %dst, i32 0, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: dma_hbm_to_iova_allocate:
+; CHECK: s1 = simm.s32 $0x0
+; CHECK-NEXT: s2 = simm.s32 $0x1000
+; CHECK-NEXT: [iova:s1,s2], [sflag:rflag] = dma.local [hbm:s0], $0x4
+define void @dma_hbm_to_iova_allocate(i32 addrspace(203)* %src) {
+ %a = call i32 addrspace(216)* @llvm.tpu.allocate.iova(i32 4096, i32 1)
+ call void @llvm.tpu.dma.hbm.to.iova.sc.simple(i32 addrspace(211)* @rflag, i32 addrspace(203)* %src, i32 addrspace(216)* %a, i32 0, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: dma_iova_to_hbm_allocate_gep:
+; CHECK: s1 = simm.s32 $0x0
+; CHECK-NEXT: s2 = simm.s32 $0x2000
+; CHECK-NEXT: [hbm:s0], [sflag:rflag] = dma.local [iova:s1,s2], $0x4
+define void @dma_iova_to_hbm_allocate_gep(i32 addrspace(203)* %dst) {
+ %a = call i32 addrspace(216)* @llvm.tpu.allocate.iova(i32 4096, i32 1)
+ %g = getelementptr i32, i32 addrspace(216)* %a, i32 1024
+ call void @llvm.tpu.dma.iova.to.hbm.sc.simple(i32 addrspace(211)* @rflag, i32 addrspace(216)* %g, i32 addrspace(203)* %dst, i32 0, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: dma_hbm_to_iova_allocate_gep:
+; CHECK: s1 = simm.s32 $0x0
+; CHECK-NEXT: s2 = simm.s32 $0x2000
+; CHECK-NEXT: [iova:s1,s2], [sflag:rflag] = dma.local [hbm:s0], $0x4
+define void @dma_hbm_to_iova_allocate_gep(i32 addrspace(203)* %src) {
+ %a = call i32 addrspace(216)* @llvm.tpu.allocate.iova(i32 4096, i32 1)
+ %g = getelementptr i32, i32 addrspace(216)* %a, i32 1024
+ call void @llvm.tpu.dma.hbm.to.iova.sc.simple(i32 addrspace(211)* @rflag, i32 addrspace(203)* %src, i32 addrspace(216)* %g, i32 0, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: dma_iova_to_hbm_allocate_gep2:
+; CHECK: s1 = simm.s32 $0x0
+; CHECK-NEXT: s2 = simm.s32 $0x10000000
+; CHECK-NEXT: [hbm:s0], [sflag:rflag] = dma.local [iova:s1,s2], $0x4
+define void @dma_iova_to_hbm_allocate_gep2(i32 addrspace(203)* %dst) {
+ %a = call i32 addrspace(216)* @llvm.tpu.allocate.iova(i32 4096, i32 1)
+ %g = getelementptr i32, i32 addrspace(216)* %a, i32 67107840
+ call void @llvm.tpu.dma.iova.to.hbm.sc.simple(i32 addrspace(211)* @rflag, i32 addrspace(216)* %g, i32 addrspace(203)* %dst, i32 0, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: dma_hbm_to_iova_allocate_gep2:
+; CHECK: s1 = simm.s32 $0x0
+; CHECK-NEXT: s2 = simm.s32 $0x10000000
+; CHECK-NEXT: [iova:s1,s2], [sflag:rflag] = dma.local [hbm:s0], $0x4
+define void @dma_hbm_to_iova_allocate_gep2(i32 addrspace(203)* %src) {
+ %a = call i32 addrspace(216)* @llvm.tpu.allocate.iova(i32 4096, i32 1)
+ %g = getelementptr i32, i32 addrspace(216)* %a, i32 67107840
+ call void @llvm.tpu.dma.hbm.to.iova.sc.simple(i32 addrspace(211)* @rflag, i32 addrspace(203)* %src, i32 addrspace(216)* %g, i32 0, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: dma_iova_to_hbm_allocate_gep3:
+; CHECK: s1 = simm.s32 $0x1
+; CHECK-NEXT: s2 = simm.s32 $0x0
+; CHECK-NEXT: [hbm:s0], [sflag:rflag] = dma.local [iova:s1,s2], $0x4
+define void @dma_iova_to_hbm_allocate_gep3(i32 addrspace(203)* %dst) {
+ %a = call i32 addrspace(216)* @llvm.tpu.allocate.iova(i32 4096, i32 1)
+ %g = getelementptr i32, i32 addrspace(216)* %a, i32 1073740800
+ call void @llvm.tpu.dma.iova.to.hbm.sc.simple(i32 addrspace(211)* @rflag, i32 addrspace(216)* %g, i32 addrspace(203)* %dst, i32 0, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: dma_hbm_to_iova_allocate_gep3:
+; CHECK: s1 = simm.s32 $0x1
+; CHECK-NEXT: s2 = simm.s32 $0x0
+; CHECK-NEXT: [iova:s1,s2], [sflag:rflag] = dma.local [hbm:s0], $0x4
+define void @dma_hbm_to_iova_allocate_gep3(i32 addrspace(203)* %src) {
+ %a = call i32 addrspace(216)* @llvm.tpu.allocate.iova(i32 4096, i32 1)
+ %g = getelementptr i32, i32 addrspace(216)* %a, i32 1073740800
+ call void @llvm.tpu.dma.hbm.to.iova.sc.simple(i32 addrspace(211)* @rflag, i32 addrspace(203)* %src, i32 addrspace(216)* %g, i32 0, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: dma_iova_to_hbm_allocate_gep4:
+; CHECK: s1 = simm.s32 $0x1
+; CHECK-NEXT: s2 = simm.s32 $0x1000
+; CHECK-NEXT: [hbm:s0], [sflag:rflag] = dma.local [iova:s1,s2], $0x4
+define void @dma_iova_to_hbm_allocate_gep4(i32 addrspace(203)* %dst) {
+ %a = call i32 addrspace(216)* @llvm.tpu.allocate.iova(i32 4096, i32 1)
+ %g = getelementptr i32, i32 addrspace(216)* %a, i32 1073741824
+ call void @llvm.tpu.dma.iova.to.hbm.sc.simple(i32 addrspace(211)* @rflag, i32 addrspace(216)* %g, i32 addrspace(203)* %dst, i32 0, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: dma_hbm_to_iova_allocate_gep4:
+; CHECK: s1 = simm.s32 $0x1
+; CHECK-NEXT: s2 = simm.s32 $0x1000
+; CHECK-NEXT: [iova:s1,s2], [sflag:rflag] = dma.local [hbm:s0], $0x4
+define void @dma_hbm_to_iova_allocate_gep4(i32 addrspace(203)* %src) {
+ %a = call i32 addrspace(216)* @llvm.tpu.allocate.iova(i32 4096, i32 1)
+ %g = getelementptr i32, i32 addrspace(216)* %a, i32 1073741824
+ call void @llvm.tpu.dma.hbm.to.iova.sc.simple(i32 addrspace(211)* @rflag, i32 addrspace(203)* %src, i32 addrspace(216)* %g, i32 0, i32 4, i32 0)
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/dma_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/dma_sc.ll
new file mode 100644
index 0000000..db1acd8
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/dma_sc.ll
@@ -0,0 +1,168 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare void @llvm.tpu.dma.hbm.to.hbm.sc.simple(i32 addrspace(211)*, i32 addrspace(203)*, i32 addrspace(203)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.dma.spmem.to.hbm.sc.simple(i32 addrspace(211)*, i32 addrspace(202)*, i32 addrspace(203)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.dma.timem.to.hbm.sc.simple(i32 addrspace(211)*, i32 addrspace(214)*, i32 addrspace(203)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.dma.hbm.to.simem.sc.simple(i32 addrspace(211)*, i32 addrspace(203)*, i32 addrspace(215)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.dma.hbm.to.timem.sc.simple(i32 addrspace(211)*, i32 addrspace(203)*, i32 addrspace(214)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.dma.hbm.to.hbm.sc.general(i32 addrspace(211)*, i32, i32 addrspace(203)*, i32 addrspace(213)*, i32, i32 addrspace(204)*, i32, i32 addrspace(208)*, i32, i32, i32)
+declare void @llvm.tpu.dma.smem.to.smem.sc.general(i32 addrspace(211)*, i32, i32*, i32 addrspace(212)*, i32, i32 addrspace(204)*, i32, i32 addrspace(208)*, i32, i32, i32)
+declare void @llvm.tpu.dma.hbm.to.smem.sc.general(i32 addrspace(211)*, i32, i32 addrspace(213)*, i32*, i32, i32 addrspace(204)*, i32, i32 addrspace(208)*, i32, i32, i32)
+declare void @llvm.tpu.dma.hbm.to.timem.sc.general(i32 addrspace(211)*, i32, i32 addrspace(213)*, i32 addrspace(214)*, i32, i32 addrspace(204)*, i32, i32 addrspace(208)*, i32, i32, i32)
+declare void @llvm.tpu.dma.hbm.to.tilespmem.sc.general(i32 addrspace(211)*, i32, i32 addrspace(213)*, i32 addrspace(201)*, i32, i32 addrspace(204)*, i32, i32 addrspace(208)*, i32, i32, i32)
+declare void @llvm.tpu.dma.hbm.to.spmem.sc.general(i32 addrspace(211)*, i32, i32 addrspace(213)*, i32 addrspace(202)*, i32, i32 addrspace(204)*, i32, i32 addrspace(208)*, i32, i32, i32)
+declare void @llvm.tpu.dma.smem.to.hbm.sc.general(i32 addrspace(211)*, i32, i32*, i32 addrspace(213)*, i32, i32 addrspace(204)*, i32, i32 addrspace(208)*, i32, i32, i32)
+declare void @llvm.tpu.dma.timem.to.hbm.sc.general(i32 addrspace(211)*, i32, i32 addrspace(214)*, i32 addrspace(213)*, i32, i32 addrspace(204)*, i32, i32 addrspace(208)*, i32, i32, i32)
+declare void @llvm.tpu.dma.tilespmem.to.hbm.sc.general(i32 addrspace(211)*, i32, i32 addrspace(201)*, i32 addrspace(213)*, i32, i32 addrspace(204)*, i32, i32 addrspace(208)*, i32, i32, i32)
+declare void @llvm.tpu.dma.spmem.to.hbm.sc.general(i32 addrspace(211)*, i32, i32 addrspace(202)*, i32 addrspace(213)*, i32, i32 addrspace(204)*, i32, i32 addrspace(208)*, i32, i32, i32)
+declare void @llvm.tpu.dma.spmem.to.spmem.sc.general(i32 addrspace(211)*, i32, i32 addrspace(202)*, i32 addrspace(202)*, i32, i32 addrspace(204)*, i32, i32 addrspace(208)*, i32, i32, i32)
+declare void @llvm.tpu.dma.tilespmem.to.spmem.sc.general(i32 addrspace(211)*, i32, i32 addrspace(201)*, i32 addrspace(202)*, i32, i32 addrspace(204)*, i32, i32 addrspace(208)*, i32, i32, i32)
+declare void @llvm.tpu.dma.spmem.to.tilespmem.sc.general(i32 addrspace(211)*, i32, i32 addrspace(202)*, i32 addrspace(201)*, i32, i32 addrspace(204)*, i32, i32 addrspace(208)*, i32, i32, i32)
+declare void @llvm.tpu.dma.descriptor(i32 *) nounwind
+declare i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32)
+
+@flag = addrspace(204) global i32 0, align 4
+@rflag = addrspace(211) global i32 0, align 4
+
+; CHECK-LABEL: dma_hbm_to_hbm:
+; CHECK: [hbm:s1], [sflag:rflag] = dma.local [hbm:s0], $0x4
+define void @dma_hbm_to_hbm(i32 addrspace(203)* %src, i32 addrspace(203)* %dst) {
+ call void @llvm.tpu.dma.hbm.to.hbm.sc.simple(i32 addrspace(211)* @rflag, i32 addrspace(203)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: dma_spmem_to_hbm:
+; CHECK: s[[s:[0-9]+]] = sshrl.u32 s0, $0x3
+; CHECK-NEXT: [hbm:s1], [sflag:rflag] = dma.local [spmem:s[[s]]], $0x4
+define void @dma_spmem_to_hbm(i32 addrspace(202)* %src, i32 addrspace(203)* %dst) {
+ call void @llvm.tpu.dma.spmem.to.hbm.sc.simple(i32 addrspace(211)* @rflag, i32 addrspace(202)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: dma_hbm_to_simem:
+; CHECK: [simem:s1], [sflag:rflag] = dma.local [hbm:s0], $0x4
+define void @dma_hbm_to_simem(i32 addrspace(203)* %src, i32 addrspace(215)* %dst) {
+ call void @llvm.tpu.dma.hbm.to.simem.sc.simple(i32 addrspace(211)* @rflag, i32 addrspace(203)* %src, i32 addrspace(215)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: dma_hbm_to_timem:
+; CHECK: [timem:s1], [sflag:rflag] = dma.local [hbm:s0], $0x4
+define void @dma_hbm_to_timem(i32 addrspace(203)* %src, i32 addrspace(214)* %dst) {
+ call void @llvm.tpu.dma.hbm.to.timem.sc.simple(i32 addrspace(211)* @rflag, i32 addrspace(203)* %src, i32 addrspace(214)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: dma_timem_to_hbm:
+; CHECK: [hbm:s1], [sflag:rflag] = dma.local [timem:s0], $0x4
+define void @dma_timem_to_hbm(i32 addrspace(214)* %src, i32 addrspace(203)* %dst) {
+ call void @llvm.tpu.dma.timem.to.hbm.sc.simple(i32 addrspace(211)* @rflag, i32 addrspace(214)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: dma_desc:
+; CHECK: _ = dma.desc [smem:s0]
+define void @dma_desc(i32* %desc) {
+ call void @llvm.tpu.dma.descriptor(i32* %desc)
+ ret void
+}
+
+; CHECK-LABEL: dma_desc_imm:
+; CHECK: _ = dma.desc [smem:$0x1]
+define void @dma_desc_imm() {
+ call void @llvm.tpu.dma.descriptor(i32* nonnull inttoptr (i32 1 to i32*))
+ ret void
+}
+
+; CHECK-LABEL: dma_general_hbm_to_hbm:
+; CHECK: [hbm:s1], [sflag:s{{[0-9]+}}] = dma.general [hbm:s0], [sflag:s{{[0-9]+}}], length:s2, [dreg:$0x0], stride_count:$0x4, ici_dest:s{{[0-9]+}}
+define void @dma_general_hbm_to_hbm(i32 addrspace(203)* %src, i32 addrspace(213)* %dst, i32 %size, i32 %dstcid, i32 %srccid) {
+ %desc = tail call i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32 0)
+ call void @llvm.tpu.dma.hbm.to.hbm.sc.general(i32 addrspace(211)* @rflag, i32 %dstcid, i32 addrspace(203)* %src, i32 addrspace(213)* %dst, i32 %size, i32 addrspace(204)* @flag, i32 %srccid, i32 addrspace(208)* %desc, i32 4, i32 0, i32 0)
+ ret void
+}
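+
+; Rough operand mapping for the dma.general form, inferred from the asm above (the
+; exact flag/chip-id placement is not spelled out in this test): the source and
+; destination pointers become the two memory operands, the i32 size becomes length:,
+; the descriptor pointer becomes [dreg:...], the trailing immediate 4 becomes
+; stride_count:, and the remaining flag and chip-id arguments feed the sflag and
+; ici_dest operands.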
+
+; CHECK-LABEL: dma_general_hbm_to_hbm_imm_len:
+; CHECK: [smem:s1], [sflag:s3] = dma.general [smem:s0], [sflag:s2], length:$0x4, [dreg:$0x0], stride_count:$0x4, ici_dest:s{{[0-9]+}}
+define void @dma_general_hbm_to_hbm_imm_len(i32* %src, i32 addrspace(212)* %dst, i32 addrspace(204)* %flag, i32 addrspace(211)* %rflag, i32 %dstcid, i32 %srccid) {
+ %desc = tail call i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32 0)
+ call void @llvm.tpu.dma.smem.to.smem.sc.general(i32 addrspace(211)* %rflag, i32 %dstcid, i32* %src, i32 addrspace(212)* %dst, i32 4, i32 addrspace(204)* %flag, i32 %srccid, i32 addrspace(208)* %desc, i32 4, i32 0, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: dma_general_hbm_to_hbm_imm_desc:
+; CHECK: [hbm:s1], [sflag:s3] = dma.general [hbm:s0], [sflag:s2], length:s4, [dreg:$0x0], stride_count:$0x4, ici_dest:s{{[0-9]+}}
+define void @dma_general_hbm_to_hbm_imm_desc(i32 addrspace(203)* %src, i32 addrspace(213)* %dst, i32 addrspace(204)* %flag, i32 addrspace(211)* %rflag, i32 %size, i32 %dstcid, i32 %srccid) {
+ %desc = tail call i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32 0)
+ call void @llvm.tpu.dma.hbm.to.hbm.sc.general(i32 addrspace(211)* %rflag, i32 %dstcid, i32 addrspace(203)* %src, i32 addrspace(213)* %dst, i32 %size, i32 addrspace(204)* %flag, i32 %srccid, i32 addrspace(208)* %desc, i32 4, i32 0, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: dma_general_hbm_to_hbm_imm_desc_len:
+; CHECK: [smem:s1], [sflag:s3] = dma.general [smem:s0], [sflag:s2], length:$0x4, [dreg:$0x0], stride_count:$0x4, ici_dest:s{{[0-9]+}}
+define void @dma_general_hbm_to_hbm_imm_desc_len(i32* %src, i32 addrspace(212)* %dst, i32 addrspace(204)* %flag, i32 addrspace(211)* %rflag, i32 %size, i32 %dstcid, i32 %srccid) {
+ %desc = tail call i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32 0)
+ call void @llvm.tpu.dma.smem.to.smem.sc.general(i32 addrspace(211)* %rflag, i32 %dstcid, i32* %src, i32 addrspace(212)* %dst, i32 4, i32 addrspace(204)* %flag, i32 %srccid, i32 addrspace(208)* %desc, i32 4, i32 0, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: dma_general_hbm_to_smem:
+; CHECK: [smem:s1], [sflag:s{{[0-9]+}}] = dma.general [hbm:s0], [sflag:s{{[0-9]+}}], length:s2, [dreg:$0x0], stride_count:$0x4, ici_dest:s{{[0-9]+}}
+define void @dma_general_hbm_to_smem(i32 addrspace(213)* %src, i32* %dst, i32 %size, i32 %dstcid, i32 %srccid) {
+ %desc = tail call i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32 0)
+ call void @llvm.tpu.dma.hbm.to.smem.sc.general(i32 addrspace(211)* @rflag, i32 %dstcid, i32 addrspace(213)* %src, i32* %dst, i32 %size, i32 addrspace(204)* @flag, i32 %srccid, i32 addrspace(208)* %desc, i32 4, i32 0, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: dma_general_hbm_to_timem:
+; CHECK: [timem:s1], [sflag:s{{[0-9]+}}] = dma.general [hbm:s0], [sflag:s{{[0-9]+}}], length:s2, [dreg:$0x0], stride_count:$0x4, ici_dest:s{{[0-9]+}}
+define void @dma_general_hbm_to_timem(i32 addrspace(213)* %src, i32 addrspace(214)* %dst, i32 %size, i32 %dstcid, i32 %srccid) {
+ %desc = tail call i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32 0)
+ call void @llvm.tpu.dma.hbm.to.timem.sc.general(i32 addrspace(211)* @rflag, i32 %dstcid, i32 addrspace(213)* %src, i32 addrspace(214)* %dst, i32 %size, i32 addrspace(204)* @flag, i32 %srccid, i32 addrspace(208)* %desc, i32 4, i32 0, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: dma_general_hbm_to_spmem:
+; CHECK: [spmem:s1], [sflag:s{{[0-9]+}}] = dma.general [hbm:s0], [sflag:s{{[0-9]+}}], length:s2, [dreg:$0x0], stride_count:$0x4, ici_dest:s{{[0-9]+}}
+define void @dma_general_hbm_to_spmem(i32 addrspace(213)* %src, i32 addrspace(202)* %dst, i32 %size, i32 %dstcid, i32 %srccid) {
+ %desc = tail call i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32 0)
+ call void @llvm.tpu.dma.hbm.to.spmem.sc.general(i32 addrspace(211)* @rflag, i32 %dstcid, i32 addrspace(213)* %src, i32 addrspace(202)* %dst, i32 %size, i32 addrspace(204)* @flag, i32 %srccid, i32 addrspace(208)* %desc, i32 4, i32 0, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: dma_general_smem_to_hbm:
+; CHECK: [hbm:s1], [sflag:s{{[0-9]+}}] = dma.general [smem:s0], [sflag:s{{[0-9]+}}], length:s2, [dreg:$0x0], stride_count:$0x4, ici_dest:s{{[0-9]+}}
+define void @dma_general_smem_to_hbm(i32* %src, i32 addrspace(213)* %dst, i32 %size, i32 %dstcid, i32 %srccid) {
+ %desc = tail call i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32 0)
+ call void @llvm.tpu.dma.smem.to.hbm.sc.general(i32 addrspace(211)* @rflag, i32 %dstcid, i32* %src, i32 addrspace(213)* %dst, i32 %size, i32 addrspace(204)* @flag, i32 %srccid, i32 addrspace(208)* %desc, i32 4, i32 0, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: dma_general_timem_to_hbm:
+; CHECK: [hbm:s1], [sflag:s{{[0-9]+}}] = dma.general [timem:s0], [sflag:s{{[0-9]+}}], length:s2, [dreg:$0x0], stride_count:$0x4, ici_dest:s{{[0-9]+}}
+define void @dma_general_timem_to_hbm(i32 addrspace(214)* %src, i32 addrspace(213)* %dst, i32 %size, i32 %dstcid, i32 %srccid) {
+ %desc = tail call i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32 0)
+ call void @llvm.tpu.dma.timem.to.hbm.sc.general(i32 addrspace(211)* @rflag, i32 %dstcid, i32 addrspace(214)* %src, i32 addrspace(213)* %dst, i32 %size, i32 addrspace(204)* @flag, i32 %srccid, i32 addrspace(208)* %desc, i32 4, i32 0, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: dma_general_spmem_to_hbm:
+; CHECK: [hbm:s1], [sflag:s{{[0-9]+}}] = dma.general [spmem:s0], [sflag:s{{[0-9]+}}], length:s2, [dreg:$0x0], stride_count:$0x4, ici_dest:s{{[0-9]+}}
+define void @dma_general_spmem_to_hbm(i32 addrspace(202)* %src, i32 addrspace(213)* %dst, i32 %size, i32 %dstcid, i32 %srccid) {
+ %desc = tail call i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32 0)
+ call void @llvm.tpu.dma.spmem.to.hbm.sc.general(i32 addrspace(211)* @rflag, i32 %dstcid, i32 addrspace(202)* %src, i32 addrspace(213)* %dst, i32 %size, i32 addrspace(204)* @flag, i32 %srccid, i32 addrspace(208)* %desc, i32 4, i32 0, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: dma_general_spmem_to_spmem:
+; CHECK: [spmem:s1], [sflag:s{{[0-9]+}}] = dma.general [spmem:s0], [sflag:s{{[0-9]+}}], length:s2, [dreg:$0x0], stride_count:$0x4, ici_dest:s{{[0-9]+}}
+define void @dma_general_spmem_to_spmem(i32 addrspace(202)* %src, i32 addrspace(202)* %dst, i32 %size, i32 %dstcid, i32 %srccid) {
+ %desc = tail call i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32 0)
+ call void @llvm.tpu.dma.spmem.to.spmem.sc.general(i32 addrspace(211)* @rflag, i32 %dstcid, i32 addrspace(202)* %src, i32 addrspace(202)* %dst, i32 %size, i32 addrspace(204)* @flag, i32 %srccid, i32 addrspace(208)* %desc, i32 4, i32 0, i32 0)
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/dma_tc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/dma_tc.ll
new file mode 100644
index 0000000..525a4a3
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/dma_tc.ll
@@ -0,0 +1,58 @@
+; RUN: llc < %s -mcpu=tensorcore-jf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare void @llvm.tpu.dma.vmem.to.hbm(i32 addrspace(204)*, <1024 x i32> addrspace(205)*, i32 addrspace(203)*, i32) argmemonly nounwind
+declare void @llvm.tpu.dma.hbm.to.vmem(i32 addrspace(204)*, i32 addrspace(203)*, <1024 x i32> addrspace(205)*, i32) argmemonly nounwind
+declare void @llvm.tpu.dma.hbm.to.hib(i32 addrspace(204)*, <1024 x i32> addrspace(203)*, i32) argmemonly nounwind
+declare void @llvm.tpu.dma.hbm.to.vmem.hib.update(i32 addrspace(204)*, i32 addrspace(203)*, <1024 x i32> addrspace(205)*, i32) argmemonly nounwind
+
+@flag = addrspace(204) global i32 0, align 4
+
+; CHECK-LABEL: dma_vmem_to_hbm:
+; CHECK: [hbm:s1], [sflag:flag] = dma.local [vmem:s0], $0x4
+define void @dma_vmem_to_hbm(<1024 x i32> addrspace(205)* %src, i32 addrspace(203)* %dst) {
+ call void @llvm.tpu.dma.vmem.to.hbm(i32 addrspace(204)* @flag, <1024 x i32> addrspace(205)* %src, i32 addrspace(203)* %dst, i32 4)
+ ret void
+}
+
+; CHECK-LABEL: dma_hbm_to_hib:
+; CHECK: [hib], [sflag:flag] = dma.local [hbm:s0], $0x4
+define void @dma_hbm_to_hib(<1024 x i32> addrspace(203)* %src) {
+ call void @llvm.tpu.dma.hbm.to.hib(i32 addrspace(204)* @flag, <1024 x i32> addrspace(203)* %src, i32 4)
+ ret void
+}
+
+; CHECK-LABEL: dma_hbm_to_vmem_hib_update:
+; CHECK: [vmem:s1], [sflag:flag] = dma.local.hibupdate [hbm:s0], $0x4
+define void @dma_hbm_to_vmem_hib_update(i32 addrspace(203)* %src, <1024 x i32> addrspace(205)* %dst, i32 addrspace(204)* %flag) {
+ call void @llvm.tpu.dma.hbm.to.vmem.hib.update(i32 addrspace(204)* @flag, i32 addrspace(203)* %src, <1024 x i32> addrspace(205)* %dst, i32 4)
+ ret void
+}
+
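+; Tests that back-to-back local DMAs are issued with the required cadence: each
+; dma.local bundle is followed by a one-cycle vdelay before the next one.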
+; CHECK-LABEL: dma_cadence
+; CHECK: dma.local
+; CHECK-NEXT: vdelay $0x1
+; CHECK-NEXT: dma.local
+; CHECK-NEXT: vdelay $0x1
+; CHECK-NEXT: dma.local
+; CHECK-NEXT: shalt
+define void @dma_cadence(i32 addrspace(203)* %src, <1024 x i32> addrspace(205)* %dst, i32 addrspace(204)* %flag) {
+ call void @llvm.tpu.dma.hbm.to.vmem(i32 addrspace(204)* @flag, i32 addrspace(203)* %src, <1024 x i32> addrspace(205)* %dst, i32 4)
+ call void @llvm.tpu.dma.hbm.to.vmem(i32 addrspace(204)* @flag, i32 addrspace(203)* %src, <1024 x i32> addrspace(205)* %dst, i32 4)
+ call void @llvm.tpu.dma.hbm.to.vmem(i32 addrspace(204)* @flag, i32 addrspace(203)* %src, <1024 x i32> addrspace(205)* %dst, i32 4)
+ ret void
+}
+
+; Test that the store and DMA are in different bundles.
+; CHECK-LABEL: dma_after_store:
+; CHECK: vst
+; CHECK: }
+; CHECK: dma.local
+define void @dma_after_store(<1024 x i32> %a, <1024 x i32> addrspace(205)* %src, i32 addrspace(203)* %dst) {
+ store <1024 x i32> %a, <1024 x i32> addrspace(205)* %src
+ call void @llvm.tpu.dma.vmem.to.hbm(i32 addrspace(204)* @flag, <1024 x i32> addrspace(205)* %src, i32 addrspace(203)* %dst, i32 4)
+ ret void
+}
\ No newline at end of file
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/early-if-predicator.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/early-if-predicator.ll
new file mode 100644
index 0000000..3306707
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/early-if-predicator.ll
@@ -0,0 +1,181 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+; Test that the early if-predicator pass predicates instructions as expected.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; Check that the store is predicated
+; CHECK-LABEL: ifprednophi:
+; CHECK: [[vi:v[0-9]+]] = vimm.s32 @!p{{[0-9]+}} $0x2
+; CHECK: [tilespmem:s{{[0-9]+}}+$0x0] = vst @!p{{[0-9]+}} [[vi]];
+define void @ifprednophi(i1 %cmp, <8 x i32> addrspace(201)* %ptr0, <8 x i32> addrspace(201)* %ptr1) {
+entry:
+ %0 = load <8 x i32>, <8 x i32> addrspace(201)* %ptr0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %v0 = insertelement <8 x i32> undef, i32 2, i32 0
+ %i2 = shufflevector <8 x i32> %v0, <8 x i32> undef, <8 x i32> zeroinitializer ; create vector of all 2
+ store <8 x i32> %i2, <8 x i32> addrspace(201)* %ptr1
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+; Check that we support predication emulation of scalar select
+; CHECK-LABEL: canpredicatesel_scalar:
+; CHECK: p[[pi:[0-9]+]] = por !p{{[0-9]+}}, p{{[0-9]+}}
+; CHECK: s[[si:[0-9]+]] = smov.u32 @!p[[pi]] s{{[0-9]+}}
+; CHECK: [smem:s0] = sst @!p{{[0-9]+}} s[[si]]
+define i1 @canpredicatesel_scalar(i1 %p, i1 %s, i32* %x, i32 %y0, i32 %y1) {
+ br i1 %p, label %next, label %one
+one:
+ %y = select i1 %s, i32 %y0, i32 %y1
+ store i32 %y, i32* %x
+ br label %next
+next:
+ ret i1 %p
+}
+
+; Check that we support predication emulation of vector select
+; CHECK-LABEL: canpredicatesel_vector:
+; CHECK: p[[pi:[0-9]+]] = por !p1, p0
+; CHECK: v[[si:[0-9]+]] = vpsel p[[pi]], v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK: [tilespmem:s0+$0x0] = vst @!p0 v[[si]]
+define i1 @canpredicatesel_vector(i1 %p, i1 %s, <8 x i32> addrspace(201)* %x, <8 x i32> %y0, <8 x i32> %y1) {
+ br i1 %p, label %next, label %one
+one:
+ %y = select i1 %s, <8 x i32> %y0, <8 x i32> %y1
+ store <8 x i32> %y, <8 x i32> addrspace(201)* %x
+ br label %next
+next:
+ ret i1 %p
+}
+
+; Check that we support predication emulation of i1 select
+; CHECK-LABEL: canpredicatesel_pred:
+; CHECK: p[[pi0:[0-9]+]] = por !p{{[0-9]+}}, p[[pi3:[0-9]+]]
+; CHECK: p[[pi1:[0-9]+]] = por @!p[[pi0]] p{{[0-9]+}}, p{{[0-9]+}}
+; CHECK: s1 = simm.s32 @!p[[pi3]] $0x1
+; CHECK: p[[pi2:[0-9]+]] = por !p[[pi1]], p{{[0-9]+}}
+; CHECK: s1 = simm.s32 @p[[pi2]] $0x0
+; CHECK: [smem:s0] = sst @!p[[pi3]] s1
+define i1 @canpredicatesel_pred(i1 %p, i1 %s, i32* %x, i1 %y0, i1 %y1) {
+ br i1 %p, label %next, label %one
+one:
+ %y = select i1 %s, i1 %y0, i1 %y1
+ %z = zext i1 %y to i32
+ store i32 %z, i32* %x
+ br label %next
+next:
+ ret i1 %p
+}
+
+; Check that we support predication emulation of mask select
+; CHECK-LABEL: canpredicatesel_mask:
+; CHECK: p[[pi0:[0-9]+]] = por !p{{[0-9]+}}, p[[pi1:[0-9]+]]
+; CHECK: v[[vi:[0-9]+]] = vimm.s32 @!p[[pi1]] $0x0
+; CHECK: vm[[si:[0-9]+]] = vmmov @!p[[pi0]] vm{{[0-9]+}}
+; CHECK: v[[vi:[0-9]+]] = vsel @!p[[pi1]] vm[[si]], $0x1, v[[vi]]
+; CHECK: [tilespmem:s0+$0x0] = vst @!p[[pi1]] v[[vi]]
+define i1 @canpredicatesel_mask(i1 %p, i1 %s, <8 x i32> addrspace(201)* %x, <8 x i1> %y0, <8 x i1> %y1) {
+ br i1 %p, label %next, label %one
+one:
+ %y = select i1 %s, <8 x i1> %y0, <8 x i1> %y1
+ %z = zext <8 x i1> %y to <8 x i32>
+ store <8 x i32> %z, <8 x i32> addrspace(201)* %x
+ br label %next
+next:
+ ret i1 %p
+}
+
+; Check that the store and shift are predicated
+; CHECK-LABEL: ifpredphi:
+; CHECK-DAG: v{{[0-9]+}} = vshll.u32 @!p{{[0-9]+}} v{{[0-9]+}}, $0x2
+; CHECK-DAG: [[vi:v[0-9]+]] = vimm.s32 @!p{{[0-9]+}} $0x2
+; CHECK: [tilespmem:s{{[0-9]+}}+$0x0] = vst @!p{{[0-9]+}} [[vi]];
+define <8 x i32> @ifpredphi(i1 %cmp, <8 x i32> addrspace(201)* %ptr0, <8 x i32> addrspace(201)* %ptr1) {
+entry:
+ %0 = load <8 x i32>, <8 x i32> addrspace(201)* %ptr0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %v0 = insertelement <8 x i32> undef, i32 2, i32 0
+ %i2 = shufflevector <8 x i32> %v0, <8 x i32> undef, <8 x i32> zeroinitializer ; create vector of all 2
+ store <8 x i32> %i2, <8 x i32> addrspace(201)* %ptr1
+ %shl = shl <8 x i32> %0, %i2
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ %p.0 = phi <8 x i32> [ %shl, %if.then ], [ %0, %entry ]
+ ret <8 x i32> %p.0
+}
+
+; Check that both stores are predicated
+; CHECK-LABEL: ififpredphi:
+; CHECK: v[[vi:[0-9]+]] = vimm.s32 @!p{{[0-9]+}} $0x2;
+; CHECK: [tilespmem:s{{[0-9]+}}+$0x0] = vst @!p{{[0-9]+}} v[[vi]];
+; CHECK: v[[vi2:[0-9]+]] = vimm.s32 @!p{{[0-9]+}} $0x3;
+; CHECK: [tilespmem:s{{[0-9]+}}+$0x0] = vst @!p{{[0-9]+}} v[[vi2]];
+define <8 x i32> @ififpredphi(i1 %cmpout, i1 %cmpin, <8 x i32> addrspace(201)* %ptr0, <8 x i32> addrspace(201)* %ptr1, <8 x i32> addrspace(201)* %ptr2, <8 x i32> addrspace(201)* %ptr3) {
+entry:
+ %0 = load <8 x i32>, <8 x i32> addrspace(201)* %ptr0
+ %1 = load <8 x i32>, <8 x i32> addrspace(201)* %ptr1
+ br i1 %cmpout, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %v0 = insertelement <8 x i32> undef, i32 2, i32 0
+ %i2 = shufflevector <8 x i32> %v0, <8 x i32> undef, <8 x i32> zeroinitializer
+ store <8 x i32> %i2, <8 x i32> addrspace(201)* %ptr2
+ %shl = shl <8 x i32> %0, %i2
+ br i1 %cmpin, label %if.then2, label %if.end
+
+if.then2:                                         ; preds = %if.then
+ %v1 = insertelement <8 x i32> undef, i32 3, i32 0
+ %i3 = shufflevector <8 x i32> %v1, <8 x i32> undef, <8 x i32> zeroinitializer
+ store <8 x i32> %i3, <8 x i32> addrspace(201)* %ptr3
+ %shl2 = shl <8 x i32> %1, %i3
+ br label %if.end
+
+if.end:                                           ; preds = %if.then2, %if.then, %entry
+ %p.0 = phi <8 x i32> [ %shl, %if.then ], [ %shl2, %if.then2 ], [ %0, %entry ]
+ ret <8 x i32> %p.0
+}
+
+; Check that both stores are predicated
+; CHECK-LABEL: ifcomplexpredphi:
+; CHECK: v[[vi:[0-9]+]] = vimm.s32 @!p{{[0-9]+}} $0x2
+; CHECK: [tilespmem:s{{[0-9]+}}+$0x0] = vst @!p{{[0-9]+}} v[[vi]]
+; CHECK: v[[vi2:[0-9]+]] = vimm.s32 @!p{{[0-9]+}} $0x3
+; CHECK: [tilespmem:s{{[0-9]+}}+$0x0] = vst @!p{{[0-9]+}} v[[vi2]]
+define <8 x i32> @ifcomplexpredphi(i1 %cmpout, i1 %cmpin, <8 x i32> addrspace(201)* %ptr0, <8 x i32> addrspace(201)* %ptr1,
+ <8 x i32> addrspace(201)* %ptr2, <8 x i32> addrspace(201)* %ptr3, i1 %cmpl) {
+entry:
+ %0 = load <8 x i32>, <8 x i32> addrspace(201)* %ptr0
+ br label %loop
+
+loop:
+ %1 = load <8 x i32>, <8 x i32> addrspace(201)* %ptr1
+ br i1 %cmpout, label %if.then, label %if.end
+
+if.then:
+ %v0 = insertelement <8 x i32> undef, i32 2, i32 0
+ %i2 = shufflevector <8 x i32> %v0, <8 x i32> undef, <8 x i32> zeroinitializer
+ store <8 x i32> %i2, <8 x i32> addrspace(201)* %ptr2
+ %shl = shl <8 x i32> %0, %i2
+ br i1 %cmpin, label %if.then2, label %if.end
+
+if.then2:
+ %v1 = insertelement <8 x i32> undef, i32 3, i32 0
+ %i3 = shufflevector <8 x i32> %v1, <8 x i32> undef, <8 x i32> zeroinitializer
+ store <8 x i32> %i3, <8 x i32> addrspace(201)* %ptr3
+ %shl2 = shl <8 x i32> %1, %i3
+ br i1 %cmpl, label %loop, label %if.end
+
+if.end:
+ %p.0 = phi <8 x i32> [ %shl, %if.then ], [ %shl2, %if.then2 ], [ %0, %loop ]
+ ret <8 x i32> %p.0
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/embedded_masks_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/embedded_masks_sc.ll
new file mode 100644
index 0000000..67b5f8a
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/embedded_masks_sc.ll
@@ -0,0 +1,353 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -tpu-enable-embedded-masks=true | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.dupcnti(<8 x i1>, <8 x i32>) readnone nounwind
+declare { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.uniquei(<8 x i1>, <8 x i32>) readnone nounwind
+declare { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1>, <8 x i32>, <8 x i32>) readnone nounwind
+declare { <8 x float>, <8 x float>, <8 x i1> } @llvm.tpu.sort.ascdf.v8f32(<8 x i1>, <8 x float>, <8 x float>) readnone nounwind
+declare { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.dscdi.v8i32(<8 x i1>, <8 x i32>, <8 x i32>) readnone nounwind
+declare { <8 x float>, <8 x float>, <8 x i1> } @llvm.tpu.sort.dscdf.v8f32(<8 x i1>, <8 x float>, <8 x float>) readnone nounwind
+declare { <8 x i32>, <8 x i1> } @llvm.tpu.add.scan1xNi(<8 x i1>, <8 x i32>) readnone nounwind
+declare { <8 x float>, <8 x i1> } @llvm.tpu.add.scan1xNf(<8 x i1>, <8 x float>) readnone nounwind
+declare { <8 x i32>, <8 x i1> } @llvm.tpu.add.seg.scan1xNi(<8 x i1>, <8 x i32>) readnone nounwind
+
+; Tests that we propagate supported fixed hardware masks into instructions.
+; This lit test does not cover all opcodes and masks.
+
+; CHECK-LABEL: vdupcnt_0xff:
+; CHECK: { (xrf1) = vdupcnt.msk.u32 $0xff, v0;
+define <8 x i32> @vdupcnt_0xff(<8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.dupcnti(<8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i32> %v)
+ %co = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 1
+ %mo = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x i32> %co, <8 x i32> %v
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vunique_0xff:
+; CHECK: { (xrf1) = vunique.msk.u32 $0xff, v0;
+define <8 x i32> @vunique_0xff(<8 x i32> %v) {
+ %a = call {<8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.uniquei(<8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i32> %v)
+ %co = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 1
+ %mo = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x i32> %co, <8 x i32> %v
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vsort.ascd.u32_0xff:
+; CHECK: { (xrf1) = vsort.ascd.msk.u32 $0xff, v0, v1;
+define <8 x i32> @vsort.ascd.u32_0xff(<8 x i32> %k, <8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i32> %k, <8 x i32> %v)
+ %ko = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 0
+ %vo = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 1
+ %mo = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x i32> %ko, <8 x i32> %vo
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vsort.ascd.f32_0xff:
+; CHECK: { (xrf1) = vsort.ascd.msk.f32 $0xff, v0, v1;
+define <8 x float> @vsort.ascd.f32_0xff(<8 x float> %k, <8 x float> %v) {
+ %a = call { <8 x float>, <8 x float>, <8 x i1> } @llvm.tpu.sort.ascdf.v8f32(<8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x float> %k, <8 x float> %v)
+ %ko = extractvalue { <8 x float>, <8 x float>, <8 x i1> } %a, 0
+ %vo = extractvalue { <8 x float>, <8 x float>, <8 x i1> } %a, 1
+ %mo = extractvalue { <8 x float>, <8 x float>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x float> %ko, <8 x float> %vo
+ ret <8 x float> %b
+}
+
+; CHECK-LABEL: vsort.dscd.u32_0xff:
+; CHECK: { (xrf1) = vsort.dscd.msk.u32 $0xff, v0, v1;
+define <8 x i32> @vsort.dscd.u32_0xff(<8 x i32> %k, <8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.dscdi.v8i32(<8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i32> %k, <8 x i32> %v)
+ %ko = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 0
+ %vo = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 1
+ %mo = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x i32> %ko, <8 x i32> %vo
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vsort.dscd.f32_0xff:
+; CHECK: { (xrf1) = vsort.dscd.msk.f32 $0xff, v0, v1;
+define <8 x float> @vsort.dscd.f32_0xff(<8 x float> %k, <8 x float> %v) {
+ %a = call { <8 x float>, <8 x float>, <8 x i1> } @llvm.tpu.sort.dscdf.v8f32(<8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x float> %k, <8 x float> %v)
+ %ko = extractvalue { <8 x float>, <8 x float>, <8 x i1> } %a, 0
+ %vo = extractvalue { <8 x float>, <8 x float>, <8 x i1> } %a, 1
+ %mo = extractvalue { <8 x float>, <8 x float>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x float> %ko, <8 x float> %vo
+ ret <8 x float> %b
+}
+
+; CHECK-LABEL: vadd.scan.s32_0xff:
+; CHECK: (xrf0) = vadd.scan.msk.s32 $0xff, v0;
+define <8 x i32> @vadd.scan.s32_0xff(<8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i1> } @llvm.tpu.add.scan1xNi(<8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i32> %v)
+ %po = extractvalue { <8 x i32>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x i32>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x i32> %po, <8 x i32> %v
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vadd.scan.f32_0xff:
+; CHECK: (xrf0) = vadd.scan.msk.f32 $0xff, v0;
+define <8 x float> @vadd.scan.f32_0xff(<8 x float> %v) {
+ %a = call { <8 x float>, <8 x i1> } @llvm.tpu.add.scan1xNf(<8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x float> %v)
+ %po = extractvalue { <8 x float>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x float>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x float> %po, <8 x float> %v
+ ret <8 x float> %b
+}
+
+; CHECK-LABEL: vadd.seg.scan.s32_0xff:
+; CHECK: (xrf0) = vadd.seg.scan.s32 $0xff, v0;
+define <8 x i32> @vadd.seg.scan.s32_0xff(<8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i1> } @llvm.tpu.add.seg.scan1xNi(<8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i32> %v)
+ %po = extractvalue { <8 x i32>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x i32>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x i32> %po, <8 x i32> %v
+ ret <8 x i32> %b
+}
+
+declare void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, <8 x i32>)
+declare void @llvm.tpu.vst.msk.idx.p201v8f32.v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>, <8 x float>)
+declare void @llvm.tpu.vst.msk.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>)
+declare <8 x i32> @llvm.tpu.vld.msk.v8i32.p201v8i32(<8 x i1>, <8 x i32> addrspace(201)*)
+declare <8 x i32> @llvm.tpu.vld.msk.idx.v8i32.p201v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>)
+declare <8 x float> @llvm.tpu.vld.msk.idx.v8f32.p201v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>)
+declare void @llvm.tpu.vst.cb.msk.idx.v8i32(<8 x i1>, x86_mmx, i32, <8 x i32>, <8 x i32>)
+declare void @llvm.tpu.vst.cb.msk.idx.v8f32(<8 x i1>, x86_mmx, i32, <8 x i32>, <8 x float>)
+
+@garr = addrspace(201) global <8 x i32> zeroinitializer
+@garrf = addrspace(201) global <8 x float> zeroinitializer
+
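+; Tests that full and partial masks on masked loads and stores are encoded as
+; immediates.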
+; CHECK-LABEL: vst_msk_idxi_0xff:
+; CHECK: [tilespmem:v0+s0+$0x0] = vst.idx.msk $0xff, v1
+define void @vst_msk_idxi_0xff(<8 x i32> %offs, <8 x i32> %v) {
+ tail call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>,
+ <8 x i32> addrspace(201)* @garr,
+ <8 x i32> %offs,
+ <8 x i32> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vst_msk_idxf_0xff:
+; CHECK: [tilespmem:v0+s0+$0x0] = vst.idx.msk $0xff, v1
+define void @vst_msk_idxf_0xff(<8 x i32> %offs, <8 x float> %v) {
+ tail call void @llvm.tpu.vst.msk.idx.p201v8f32.v8f32(<8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>,
+ <8 x float> addrspace(201)* @garrf,
+ <8 x i32> %offs,
+ <8 x float> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vst_msk_0xff:
+; CHECK: [tilespmem:s0+$0x0] = vst.msk $0xff, v0
+define void @vst_msk_0xff(<8 x i32> %v) {
+ tail call void @llvm.tpu.vst.msk.p201v8i32.v8i32(<8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>,
+ <8 x i32> addrspace(201)* @garr,
+ <8 x i32> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vld_msk_0xff:
+; CHECK: v0 = vld.msk [tilespmem:s0+$0x0], $0xff
+define <8 x i32> @vld_msk_0xff() {
+ %v = tail call <8 x i32> @llvm.tpu.vld.msk.v8i32.p201v8i32(<8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>,
+ <8 x i32> addrspace(201)* @garr)
+ ret <8 x i32> %v
+}
+
+; CHECK-LABEL: vld_msk_idxi_0xff:
+; CHECK: v0 = vld.idx.msk [tilespmem:v0+s0+$0x0], $0xff
+define <8 x i32> @vld_msk_idxi_0xff(<8 x i32> %vo) {
+ %v = tail call <8 x i32> @llvm.tpu.vld.msk.idx.v8i32.p201v8i32(<8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>,
+ <8 x i32> addrspace(201)* @garr, <8 x i32> %vo)
+ ret <8 x i32> %v
+}
+
+; CHECK-LABEL: vld_msk_idxf_0xff:
+; CHECK: v0 = vld.idx.msk [tilespmem:v0+s0+$0x0], $0xff
+define <8 x float> @vld_msk_idxf_0xff(<8 x i32> %vo) {
+ %v = tail call <8 x float> @llvm.tpu.vld.msk.idx.v8f32.p201v8f32(<8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>,
+ <8 x float> addrspace(201)* @garrf, <8 x i32> %vo)
+ ret <8 x float> %v
+}
+
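+; Same tests with a partial mask covering only the two lowest lanes ($0x3).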
+; CHECK-LABEL: vdupcnt_0x3:
+; CHECK: { (xrf1) = vdupcnt.msk.u32 $0x3, v0;
+define <8 x i32> @vdupcnt_0x3(<8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.dupcnti(<8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i32> %v)
+ %co = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 1
+ %mo = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x i32> %co, <8 x i32> %v
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vunique_0x3:
+; CHECK: { (xrf1) = vunique.msk.u32 $0x3, v0;
+define <8 x i32> @vunique_0x3(<8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.uniquei(<8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i32> %v)
+ %co = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 1
+ %mo = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x i32> %co, <8 x i32> %v
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vsort.ascd.u32_0x3:
+; CHECK: { (xrf1) = vsort.ascd.msk.u32 $0x3, v0, v1;
+define <8 x i32> @vsort.ascd.u32_0x3(<8 x i32> %k, <8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i32> %k, <8 x i32> %v)
+ %ko = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 0
+ %vo = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 1
+ %mo = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x i32> %ko, <8 x i32> %vo
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vsort.ascd.f32_0x3:
+; CHECK: { (xrf1) = vsort.ascd.msk.f32 $0x3, v0, v1;
+define <8 x float> @vsort.ascd.f32_0x3(<8 x float> %k, <8 x float> %v) {
+ %a = call { <8 x float>, <8 x float>, <8 x i1> } @llvm.tpu.sort.ascdf.v8f32(<8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> %k, <8 x float> %v)
+ %ko = extractvalue { <8 x float>, <8 x float>, <8 x i1> } %a, 0
+ %vo = extractvalue { <8 x float>, <8 x float>, <8 x i1> } %a, 1
+ %mo = extractvalue { <8 x float>, <8 x float>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x float> %ko, <8 x float> %vo
+ ret <8 x float> %b
+}
+
+; CHECK-LABEL: vsort.dscd.u32_0x3:
+; CHECK: { (xrf1) = vsort.dscd.msk.u32 $0x3, v0, v1;
+define <8 x i32> @vsort.dscd.u32_0x3(<8 x i32> %k, <8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.dscdi.v8i32(<8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i32> %k, <8 x i32> %v)
+ %ko = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 0
+ %vo = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 1
+ %mo = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x i32> %ko, <8 x i32> %vo
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vsort.dscd.f32_0x3:
+; CHECK: { (xrf1) = vsort.dscd.msk.f32 $0x3, v0, v1;
+define <8 x float> @vsort.dscd.f32_0x3(<8 x float> %k, <8 x float> %v) {
+ %a = call { <8 x float>, <8 x float>, <8 x i1> } @llvm.tpu.sort.dscdf.v8f32(<8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> %k, <8 x float> %v)
+ %ko = extractvalue { <8 x float>, <8 x float>, <8 x i1> } %a, 0
+ %vo = extractvalue { <8 x float>, <8 x float>, <8 x i1> } %a, 1
+ %mo = extractvalue { <8 x float>, <8 x float>, <8 x i1> } %a, 2
+ %b = select <8 x i1> %mo, <8 x float> %ko, <8 x float> %vo
+ ret <8 x float> %b
+}
+
+; CHECK-LABEL: vadd.scan.s32_0x3:
+; CHECK: (xrf0) = vadd.scan.msk.s32 $0x3, v0;
+define <8 x i32> @vadd.scan.s32_0x3(<8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i1> } @llvm.tpu.add.scan1xNi(<8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i32> %v)
+ %po = extractvalue { <8 x i32>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x i32>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x i32> %po, <8 x i32> %v
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vadd.seg.scan.s32_0x3:
+; CHECK: (xrf0) = vadd.seg.scan.s32 $0x3, v0;
+define <8 x i32> @vadd.seg.scan.s32_0x3(<8 x i32> %v) {
+ %a = call { <8 x i32>, <8 x i1> } @llvm.tpu.add.seg.scan1xNi(<8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i32> %v)
+ %po = extractvalue { <8 x i32>, <8 x i1> } %a, 0
+ %mo = extractvalue { <8 x i32>, <8 x i1> } %a, 1
+ %b = select <8 x i1> %mo, <8 x i32> %po, <8 x i32> %v
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vst_msk_idxi_0x3:
+; CHECK: [tilespmem:v0+s0+$0x0] = vst.idx.msk $0x3, v1
+define void @vst_msk_idxi_0x3(<8 x i32> %offs, <8 x i32> %v) {
+ tail call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>,
+ <8 x i32> addrspace(201)* @garr,
+ <8 x i32> %offs,
+ <8 x i32> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vst_msk_idxf_0x3:
+; CHECK: [tilespmem:v0+s0+$0x0] = vst.idx.msk $0x3, v1
+define void @vst_msk_idxf_0x3(<8 x i32> %offs, <8 x float> %v) {
+ tail call void @llvm.tpu.vst.msk.idx.p201v8f32.v8f32(<8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>,
+ <8 x float> addrspace(201)* @garrf,
+ <8 x i32> %offs,
+ <8 x float> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vst_msk_0x3:
+; CHECK: [tilespmem:s0+$0x0] = vst.msk $0x3, v0
+define void @vst_msk_0x3(<8 x i32> %v) {
+ tail call void @llvm.tpu.vst.msk.p201v8i32.v8i32(<8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>,
+ <8 x i32> addrspace(201)* @garr,
+ <8 x i32> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vld_msk_0x3:
+; CHECK: v0 = vld.msk [tilespmem:s0+$0x0], $0x3
+define <8 x i32> @vld_msk_0x3() {
+ %v = tail call <8 x i32> @llvm.tpu.vld.msk.v8i32.p201v8i32(<8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>,
+ <8 x i32> addrspace(201)* @garr)
+ ret <8 x i32> %v
+}
+
+; CHECK-LABEL: vld_msk_idxi_0x3:
+; CHECK: v0 = vld.idx.msk [tilespmem:v0+s0+$0x0], $0x3
+define <8 x i32> @vld_msk_idxi_0x3(<8 x i32> %vo) {
+ %v = tail call <8 x i32> @llvm.tpu.vld.msk.idx.v8i32.p201v8i32(<8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>,
+ <8 x i32> addrspace(201)* @garr, <8 x i32> %vo)
+ ret <8 x i32> %v
+}
+
+; CHECK-LABEL: vld_msk_idxf_0x3:
+; CHECK: v0 = vld.idx.msk [tilespmem:v0+s0+$0x0], $0x3
+define <8 x float> @vld_msk_idxf_0x3(<8 x i32> %vo) {
+ %v = tail call <8 x float> @llvm.tpu.vld.msk.idx.v8f32.p201v8f32(<8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>,
+ <8 x float> addrspace(201)* @garrf, <8 x i32> %vo)
+ ret <8 x float> %v
+}
+
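+; Test that a constant splat not used as a mask is still materialized with
+; vimm when embedded masks are enabled.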
+; CHECK-LABEL: get_splat_no_embed_i32:
+; CHECK: v0 = vimm.s32 $0x1
+define <8 x i32> @get_splat_no_embed_i32() {
+ %splatinsert = insertelement <8 x i32> undef, i32 1, i32 0
+ %splat = shufflevector <8 x i32> %splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
+ ret <8 x i32> %splat
+}
+
+; CHECK-LABEL: vst_cb_msk_idxi_0xff:
+; CHECK: [tilespmem:v0+s0+$0x0 cbreg:$0x0] = vst.idx.cb.msk $0xff, v1
+define void @vst_cb_msk_idxi_0xff(<8 x i32> %offs, <8 x i32> %v) {
+ tail call void @llvm.tpu.vst.cb.msk.idx.v8i32(<8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>,
+ x86_mmx undef,
+ i32 0,
+ <8 x i32> %offs,
+ <8 x i32> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vst_cb_msk_idxf_0xff:
+; CHECK: [tilespmem:v0+s0+$0x0 cbreg:$0x0] = vst.idx.cb.msk $0xff, v1
+define void @vst_cb_msk_idxf_0xff(<8 x i32> %offs, <8 x float> %v) {
+ tail call void @llvm.tpu.vst.cb.msk.idx.v8f32(<8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>,
+ x86_mmx undef,
+ i32 0,
+ <8 x i32> %offs,
+ <8 x float> %v)
+
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/eup.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/eup.ll
new file mode 100644
index 0000000..cc404de
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/eup.ll
@@ -0,0 +1,138 @@
+; RUN: llc < %s -mcpu=tensorcore-pf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+; Test EUP intrinsics code generation
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 @llvm.tpu.rsqrt.v1024f32(<1024 x float>)
+declare i32 @llvm.tpu.pow2.v1024f32(<1024 x float>)
+declare i32 @llvm.tpu.log2.v1024f32(<1024 x float>)
+declare i32 @llvm.tpu.tanh.v1024f32(<1024 x float>)
+declare i32 @llvm.tpu.rcp.v1024f32(<1024 x float>)
+declare i32 @llvm.tpu.eup.push.v1024f32(<1024 x float>)
+
+declare <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32)
+
+; CHECK-LABEL: rsqrt:
+; CHECK: (erf) = vrsqrt.f32 v0
+; CHECK: _ = vdelay $0x6
+; CHECK: v0 = vpop (erf)
+define <1024 x float> @rsqrt(<1024 x float> %v) {
+ %f = call i32 @llvm.tpu.rsqrt.v1024f32(<1024 x float> %v)
+ %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f)
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: pow2:
+; CHECK: (erf) = vpow2.f32 v0
+; CHECK: _ = vdelay $0x6
+; CHECK: v0 = vpop (erf)
+define <1024 x float> @pow2(<1024 x float> %v) {
+ %f = call i32 @llvm.tpu.pow2.v1024f32(<1024 x float> %v)
+ %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f)
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: log2:
+; CHECK: (erf) = vlog2.f32 v0
+; CHECK: _ = vdelay $0x6
+; CHECK: v0 = vpop (erf)
+define <1024 x float> @log2(<1024 x float> %v) {
+ %f = call i32 @llvm.tpu.log2.v1024f32(<1024 x float> %v)
+ %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f)
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: tanh:
+; CHECK: (erf) = vtanh.f32 v0
+; CHECK: _ = vdelay $0x6
+; CHECK: v0 = vpop (erf)
+define <1024 x float> @tanh(<1024 x float> %v) {
+ %f = call i32 @llvm.tpu.tanh.v1024f32(<1024 x float> %v)
+ %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f)
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: rcp:
+; CHECK: (erf) = vrcp.f32 v0
+; CHECK: _ = vdelay $0x6
+; CHECK: v0 = vpop (erf)
+define <1024 x float> @rcp(<1024 x float> %v) {
+ %f = call i32 @llvm.tpu.rcp.v1024f32(<1024 x float> %v)
+ %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f)
+ ret <1024 x float> %res
+}
+
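+; Test that fdiv is expanded into an EUP reciprocal followed by a multiply.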
+; CHECK-LABEL: fdivrcpimm:
+; CHECK: (erf) = vrcp.f32 v0
+; CHECK: _ = vdelay $0x6
+; CHECK: v0 = vpop (erf)
+; CHECK: v0 = vmul.f32 $1.0, v0
+define <1024 x float> @fdivrcpimm(<1024 x float> %v) {
+ %splatinsert = insertelement <1024 x float> undef, float 1., i32 0
+ %splat = shufflevector <1024 x float> %splatinsert, <1024 x float> undef, <1024 x i32> zeroinitializer
+ %res = fdiv <1024 x float> %splat, %v
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: fdivrcpvar:
+; CHECK: (erf) = vrcp.f32 v0
+; CHECK: _ = vdelay $0x6
+; CHECK: v0 = vpop (erf)
+; CHECK: v0 = vmul.f32 s0, v0
+define <1024 x float> @fdivrcpvar(float %a, <1024 x float> %v) {
+ %splatinsert = insertelement <1024 x float> undef, float %a, i32 0
+ %splat = shufflevector <1024 x float> %splatinsert, <1024 x float> undef, <1024 x i32> zeroinitializer
+ %res = fdiv <1024 x float> %splat, %v
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: fdivrcpvec:
+; CHECK: (erf) = vrcp.f32 v1
+; CHECK: _ = vdelay $0x6
+; CHECK: v1 = vpop (erf)
+; CHECK: v0 = vmul.f32 v0, v1
+define <1024 x float> @fdivrcpvec(<1024 x float> %vx, <1024 x float> %vy) {
+ %res = fdiv <1024 x float> %vx, %vy
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: push:
+; CHECK: (erf) = vpush v0
+; CHECK: _ = vdelay $0x6
+; CHECK: v0 = vpop (erf)
+define <1024 x float> @push(<1024 x float> %v) {
+ %f = call i32 @llvm.tpu.eup.push.v1024f32(<1024 x float> %v)
+ %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f)
+ ret <1024 x float> %res
+}
+
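+; Test that independent EUP operations are pipelined: the pushes are issued
+; back to back before their results are popped.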
+; CHECK-LABEL: eup_scheduling:
+; CHECK: (erf) = vrcp.f32
+; CHECK: _ = vdelay $0x1
+; CHECK: (erf) = vtanh.f32
+; CHECK: _ = vdelay $0x1
+; CHECK: (erf) = vlog2.f32
+; CHECK: _ = vdelay $0x1
+; CHECK: (erf) = vpow2.f32
+; CHECK: _ = vdelay $0x3
+; CHECK: v{{[0-9]+}} = vpop (erf)
+; CHECK: v{{[0-9]+}} = vpop (erf)
+; CHECK: v{{[0-9]+}} = vpop (erf)
+; CHECK: v{{[0-9]+}} = vpop (erf)
+define <1024 x float> @eup_scheduling(<1024 x float> %v) {
+ %f1 = call i32 @llvm.tpu.rcp.v1024f32(<1024 x float> %v)
+ %res1 = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f1)
+ %f2 = call i32 @llvm.tpu.tanh.v1024f32(<1024 x float> %v)
+ %res2 = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f2)
+ %f3 = call i32 @llvm.tpu.log2.v1024f32(<1024 x float> %v)
+ %res3 = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f3)
+ %f4 = call i32 @llvm.tpu.pow2.v1024f32(<1024 x float> %v)
+ %res4 = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f4)
+ %res5 = fadd <1024 x float> %res1, %res2
+ %res6 = fadd <1024 x float> %res3, %res4
+ %res7 = fadd <1024 x float> %res6, %res5
+ ret <1024 x float> %res7
+}
\ No newline at end of file
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/eup_bf16_gl_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/eup_bf16_gl_sc.ll
new file mode 100644
index 0000000..5a79c48
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/eup_bf16_gl_sc.ll
@@ -0,0 +1,99 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-gl -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+; Test bfloat EUP intrinsics code generation
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <16 x bfloat> @llvm.tpu.rsqrt.macro.v16bf16(<16 x bfloat>)
+declare <16 x bfloat> @llvm.tpu.pow2.macro.v16bf16(<16 x bfloat>)
+declare <16 x bfloat> @llvm.tpu.log2.macro.v16bf16(<16 x bfloat>)
+declare <16 x bfloat> @llvm.tpu.tanh.macro.v16bf16(<16 x bfloat>)
+declare <16 x bfloat> @llvm.tpu.sigshft.macro.v16bf16(<16 x bfloat>)
+declare <16 x bfloat> @llvm.tpu.rcp.macro.v16bf16(<16 x bfloat>)
+declare <16 x bfloat> @llvm.tpu.sin.macro.v16bf16(<16 x bfloat>)
+declare <16 x bfloat> @llvm.tpu.cos.macro.v16bf16(<16 x bfloat>)
+declare <16 x bfloat> @llvm.tpu.erf.macro.v16bf16(<16 x bfloat>)
+
+; CHECK-LABEL: rsqrt:
+; CHECK: (erf) = vrsqrt.bf16 v0
+; CHECK: _ = sdelay $0x6
+; CHECK: v0 = vpop (erf)
+define <16 x bfloat> @rsqrt(<16 x bfloat> %v) {
+ %res = call <16 x bfloat> @llvm.tpu.rsqrt.macro.v16bf16(<16 x bfloat> %v)
+ ret <16 x bfloat> %res
+}
+
+; CHECK-LABEL: pow2:
+; CHECK: (erf) = vpow2.bf16 v0
+; CHECK: _ = sdelay $0x6
+; CHECK: v0 = vpop (erf)
+define <16 x bfloat> @pow2(<16 x bfloat> %v) {
+ %res = call <16 x bfloat> @llvm.tpu.pow2.macro.v16bf16(<16 x bfloat> %v)
+ ret <16 x bfloat> %res
+}
+
+; CHECK-LABEL: log2:
+; CHECK: (erf) = vlog2.bf16 v0
+; CHECK: _ = sdelay $0x6
+; CHECK: v0 = vpop (erf)
+define <16 x bfloat> @log2(<16 x bfloat> %v) {
+ %res = call <16 x bfloat> @llvm.tpu.log2.macro.v16bf16(<16 x bfloat> %v)
+ ret <16 x bfloat> %res
+}
+
+; CHECK-LABEL: tanh:
+; CHECK: (erf) = vtanh.bf16 v0
+; CHECK: _ = sdelay $0x6
+; CHECK: v0 = vpop (erf)
+define <16 x bfloat> @tanh(<16 x bfloat> %v) {
+ %res = call <16 x bfloat> @llvm.tpu.tanh.macro.v16bf16(<16 x bfloat> %v)
+ ret <16 x bfloat> %res
+}
+
+; CHECK-LABEL: sigshft:
+; CHECK: (erf) = vsigshft.bf16 v0
+; CHECK: _ = sdelay $0x6
+; CHECK: v0 = vpop (erf)
+define <16 x bfloat> @sigshft(<16 x bfloat> %v) {
+ %res = call <16 x bfloat> @llvm.tpu.sigshft.macro.v16bf16(<16 x bfloat> %v)
+ ret <16 x bfloat> %res
+}
+
+; CHECK-LABEL: rcp:
+; CHECK: (erf) = vrcp.bf16 v0
+; CHECK: _ = sdelay $0x6
+; CHECK: v0 = vpop (erf)
+define <16 x bfloat> @rcp(<16 x bfloat> %v) {
+ %res = call <16 x bfloat> @llvm.tpu.rcp.macro.v16bf16(<16 x bfloat> %v)
+ ret <16 x bfloat> %res
+}
+
+; CHECK-LABEL: sin:
+; CHECK: (erf) = vsin.bf16 v0
+; CHECK: _ = sdelay $0x6
+; CHECK: v0 = vpop (erf)
+define <16 x bfloat> @sin(<16 x bfloat> %v) {
+ %res = call <16 x bfloat> @llvm.tpu.sin.macro.v16bf16(<16 x bfloat> %v)
+ ret <16 x bfloat> %res
+}
+
+; CHECK-LABEL: cos:
+; CHECK: (erf) = vcos.bf16 v0
+; CHECK: _ = sdelay $0x6
+; CHECK: v0 = vpop (erf)
+define <16 x bfloat> @cos(<16 x bfloat> %v) {
+ %res = call <16 x bfloat> @llvm.tpu.cos.macro.v16bf16(<16 x bfloat> %v)
+ ret <16 x bfloat> %res
+}
+
+; CHECK-LABEL: erf:
+; CHECK: (erf) = verf.bf16 v0
+; CHECK: _ = sdelay $0x6
+; CHECK: v0 = vpop (erf)
+define <16 x bfloat> @erf(<16 x bfloat> %v) {
+ %res = call <16 x bfloat> @llvm.tpu.erf.macro.v16bf16(<16 x bfloat> %v)
+ ret <16 x bfloat> %res
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/eup_gl_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/eup_gl_sc.ll
new file mode 100644
index 0000000..d189353
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/eup_gl_sc.ll
@@ -0,0 +1,89 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-gl -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+; Test EUP intrinsics code generation
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <8 x float> @llvm.tpu.sin.macro.v8f32(<8 x float>)
+declare <8 x float> @llvm.tpu.cos.macro.v8f32(<8 x float>)
+declare <8 x float> @llvm.tpu.erf.macro.v8f32(<8 x float>)
+declare <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float>)
+declare <8 x float> @llvm.tpu.pow2.macro.v8f32(<8 x float>)
+declare <8 x float> @llvm.tpu.log2.macro.v8f32(<8 x float>)
+declare <8 x float> @llvm.tpu.tanh.macro.v8f32(<8 x float>)
+declare <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float>)
+
+; CHECK-LABEL: sin:
+; CHECK: (erf) = vsin.f32 v0
+; CHECK: _ = sdelay $0x5
+; CHECK: v0 = vpop (erf)
+define <8 x float> @sin(<8 x float> %v) {
+ %res = call <8 x float> @llvm.tpu.sin.macro.v8f32(<8 x float> %v)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: cos:
+; CHECK: (erf) = vcos.f32 v0
+; CHECK: _ = sdelay $0x5
+; CHECK: v0 = vpop (erf)
+define <8 x float> @cos(<8 x float> %v) {
+ %res = call <8 x float> @llvm.tpu.cos.macro.v8f32(<8 x float> %v)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: erf:
+; CHECK: (erf) = verf.f32 v0
+; CHECK: _ = sdelay $0x5
+; CHECK: v0 = vpop (erf)
+define <8 x float> @erf(<8 x float> %v) {
+ %res = call <8 x float> @llvm.tpu.erf.macro.v8f32(<8 x float> %v)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: tanh:
+; CHECK: (erf) = vtanh.f32 v0
+; CHECK: _ = sdelay $0x5
+; CHECK: v0 = vpop (erf)
+define <8 x float> @tanh(<8 x float> %v) {
+ %res = call <8 x float> @llvm.tpu.tanh.macro.v8f32(<8 x float> %v)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: rsqrt:
+; CHECK: (erf) = vrsqrt.f32 v0
+; CHECK: _ = sdelay $0x5
+; CHECK: v0 = vpop (erf)
+define <8 x float> @rsqrt(<8 x float> %v) {
+ %res = call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %v)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: pow2:
+; CHECK: (erf) = vpow2.f32 v0
+; CHECK: _ = sdelay $0x5
+; CHECK: v0 = vpop (erf)
+define <8 x float> @pow2(<8 x float> %v) {
+ %res = call <8 x float> @llvm.tpu.pow2.macro.v8f32(<8 x float> %v)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: log2:
+; CHECK: (erf) = vlog2.f32 v0
+; CHECK: _ = sdelay $0x5
+; CHECK: v0 = vpop (erf)
+define <8 x float> @log2(<8 x float> %v) {
+ %res = call <8 x float> @llvm.tpu.log2.macro.v8f32(<8 x float> %v)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: rcp:
+; CHECK: (erf) = vrcp.f32 v0
+; CHECK: _ = sdelay $0x5
+; CHECK: v0 = vpop (erf)
+define <8 x float> @rcp(<8 x float> %v) {
+ %res = call <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float> %v)
+ ret <8 x float> %res
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/eup_vf.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/eup_vf.ll
new file mode 100644
index 0000000..641ece7
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/eup_vf.ll
@@ -0,0 +1,87 @@
+; RUN: llc < %s -mcpu=tensorcore-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+; Test EUP intrinsics code generation
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 @llvm.tpu.rsqrt.v1024f32(<1024 x float>)
+declare i32 @llvm.tpu.pow2.v1024f32(<1024 x float>)
+declare i32 @llvm.tpu.log2.v1024f32(<1024 x float>)
+declare i32 @llvm.tpu.tanh.v1024f32(<1024 x float>)
+declare i32 @llvm.tpu.sigshft.v1024f32(<1024 x float>)
+declare i32 @llvm.tpu.rcp.v1024f32(<1024 x float>)
+declare i32 @llvm.tpu.eup.push.v1024f32(<1024 x float>)
+
+declare <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32)
+
+; CHECK-LABEL: rsqrt:
+; CHECK: (erf) = vrsqrt.f32 v0
+; CHECK: _ = vdelay $0x6
+; CHECK: v0 = vpop (erf)
+define <1024 x float> @rsqrt(<1024 x float> %v) {
+ %f = call i32 @llvm.tpu.rsqrt.v1024f32(<1024 x float> %v)
+ %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f)
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: pow2:
+; CHECK: (erf) = vpow2.f32 v0
+; CHECK: _ = vdelay $0x6
+; CHECK: v0 = vpop (erf)
+define <1024 x float> @pow2(<1024 x float> %v) {
+ %f = call i32 @llvm.tpu.pow2.v1024f32(<1024 x float> %v)
+ %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f)
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: log2:
+; CHECK: (erf) = vlog2.f32 v0
+; CHECK: _ = vdelay $0x6
+; CHECK: v0 = vpop (erf)
+define <1024 x float> @log2(<1024 x float> %v) {
+ %f = call i32 @llvm.tpu.log2.v1024f32(<1024 x float> %v)
+ %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f)
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: tanh:
+; CHECK: (erf) = vtanh.f32 v0
+; CHECK: _ = vdelay $0x6
+; CHECK: v0 = vpop (erf)
+define <1024 x float> @tanh(<1024 x float> %v) {
+ %f = call i32 @llvm.tpu.tanh.v1024f32(<1024 x float> %v)
+ %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f)
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: sigshft:
+; CHECK: (erf) = vsigshft.f32 v0
+; CHECK: _ = vdelay $0x6
+; CHECK: v0 = vpop (erf)
+define <1024 x float> @sigshft(<1024 x float> %v) {
+ %f = call i32 @llvm.tpu.sigshft.v1024f32(<1024 x float> %v)
+ %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f)
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: rcp:
+; CHECK: (erf) = vrcp.f32 v0
+; CHECK: _ = vdelay $0x6
+; CHECK: v0 = vpop (erf)
+define <1024 x float> @rcp(<1024 x float> %v) {
+ %f = call i32 @llvm.tpu.rcp.v1024f32(<1024 x float> %v)
+ %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f)
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: push:
+; CHECK: (erf) = vpush v0
+; CHECK: _ = vdelay $0x6
+; CHECK: v0 = vpop (erf)
+define <1024 x float> @push(<1024 x float> %v) {
+ %f = call i32 @llvm.tpu.eup.push.v1024f32(<1024 x float> %v)
+ %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f)
+ ret <1024 x float> %res
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/eup_vf_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/eup_vf_sc.ll
new file mode 100644
index 0000000..6adf7d4
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/eup_vf_sc.ll
@@ -0,0 +1,59 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+; Test EUP intrinsics code generation
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float>)
+declare <8 x float> @llvm.tpu.pow2.macro.v8f32(<8 x float>)
+declare <8 x float> @llvm.tpu.log2.macro.v8f32(<8 x float>)
+declare <8 x float> @llvm.tpu.tanh.macro.v8f32(<8 x float>)
+declare <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float>)
+
+; CHECK-LABEL: tanh:
+; CHECK: (erf) = vtanh.f32 v0
+; CHECK: _ = sdelay $0x5
+; CHECK: v0 = vpop (erf)
+define <8 x float> @tanh(<8 x float> %v) {
+ %res = call <8 x float> @llvm.tpu.tanh.macro.v8f32(<8 x float> %v)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: rsqrt:
+; CHECK: (erf) = vrsqrt.f32 v0
+; CHECK: _ = sdelay $0x5
+; CHECK: v0 = vpop (erf)
+define <8 x float> @rsqrt(<8 x float> %v) {
+ %res = call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %v)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: pow2:
+; CHECK: (erf) = vpow2.f32 v0
+; CHECK: _ = sdelay $0x5
+; CHECK: v0 = vpop (erf)
+define <8 x float> @pow2(<8 x float> %v) {
+ %res = call <8 x float> @llvm.tpu.pow2.macro.v8f32(<8 x float> %v)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: log2:
+; CHECK: (erf) = vlog2.f32 v0
+; CHECK: _ = sdelay $0x5
+; CHECK: v0 = vpop (erf)
+define <8 x float> @log2(<8 x float> %v) {
+ %res = call <8 x float> @llvm.tpu.log2.macro.v8f32(<8 x float> %v)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: rcp:
+; CHECK: (erf) = vrcp.f32 v0
+; CHECK: _ = sdelay $0x5
+; CHECK: v0 = vpop (erf)
+define <8 x float> @rcp(<8 x float> %v) {
+ %res = call <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float> %v)
+ ret <8 x float> %res
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/event.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/event.ll
new file mode 100644
index 0000000..d3d7d2b
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/event.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu-unknown-unknown"
+
+declare void @llvm.tpu.event(i8*, ...) nounwind
+
+@.str = private unnamed_addr constant [13 x i8] c"hello, world\00", align 1
+
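+; Test that events lower to the event instruction, with and without an extra
+; scalar operand.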
+; CHECK-LABEL: event:
+; CHECK: s0 = simm.s32 $0x2a
+; CHECK-NEXT: _ = event "hello, world", s0
+; CHECK-NEXT: _ = event "hello, world";
+define void @event() {
+ tail call void (i8*, ...) @llvm.tpu.event(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0), i32 42)
+ tail call void (i8*, ...) @llvm.tpu.event(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0))
+ ret void
+}
+
+; Make sure that inserting an event doesn't cause the load and store to get
+; bundled together. We had a bug where event was marked as a non-memory
+; scheduling barrier, causing the DAG to have an edge of latency 0 between
+; the store and the event and an edge of latency 0 between the event and the
+; load, so all the instructions were merged into the same bundle.
+; CHECK-LABEL: event_sch
+; CHECK: { [smem:s1] = sst s0 }
+; CHECK-NEXT: { _ = event "hello, world" }
+; CHECK-NEXT: { s0 = sld [smem:s2+$0x0];
+define i32 @event_sch(i32 %a, i32* %addr, i32* %addr1, <8 x i32> %b) {
+ store i32 %a, i32* %addr
+ ;call void @llvm.tpu.barrier(i32 0, i32 15)
+ call void (i8*, ...) @llvm.tpu.event(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0))
+ %c = load i32, i32* %addr1
+ ret i32 %c
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/fifo_jf_df_vxpose_early_allocation.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/fifo_jf_df_vxpose_early_allocation.ll
new file mode 100644
index 0000000..220caf2
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/fifo_jf_df_vxpose_early_allocation.ll
@@ -0,0 +1,285 @@
+; RUN: llc %s -o - -tpu-use-original-order-sched -mcpu=tensorcore-jf -enable-tpu-xlu-opt=false | FileCheck %s --check-prefixes=CHECK,CHECK-JFDF
+; RUN: llc %s -o - -tpu-use-original-order-sched -mcpu=tensorcore-df -enable-tpu-xlu-opt=false | FileCheck %s --check-prefixes=CHECK,CHECK-JFDF
+; RUN: llc %s -o - -tpu-use-original-order-sched -mcpu=tensorcore-pf -enable-tpu-xlu-opt=false | FileCheck %s --check-prefixes=CHECK,CHECK-PF
+; REQUIRES: tpu
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu--"
+
+; Test that when we have a chain of vxpose instructions followed by pops, the FifoOverflowMutation
+; makes sure that the vpop that gets connected to the vxpose that would overflow the fifo
+; doesn't end up in the same bundle on JF and DF, because the allocation happens before
+; any pop takes effect (and that would overflow the fifo).
+define void @fifo_early_vxpose_alloc(<1024 x i32> %v) local_unnamed_addr #0 {
+entry:
+; CHECK-LABEL: fifo_early_vxpose_alloc
+; CHECK: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK: vxpose.0.end v{{[0-9]+}}, $0x80
+; CHECK: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK: vxpose.0 v{{[0-9]+}}, $0x80
+
+
+; CHECK-JFDF: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK-JFDF-NOT: vxpose
+; CHECK-JFDF: vpop
+; CHECK-JFDF: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK-JFDF-NOT: vxpose
+; CHECK-JFDF: vpop
+; CHECK-JFDF: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK-JFDF-NOT: vxpose
+; CHECK-JFDF: vpop
+; CHECK-JFDF: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK-JFDF-NOT: vxpose
+; CHECK-JFDF: vpop
+; CHECK-JFDF: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK-JFDF-NOT: vxpose
+; CHECK-JFDF: vpop
+; CHECK-JFDF: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK-JFDF-NOT: vxpose
+; CHECK-JFDF: vpop
+; CHECK-JFDF: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK-JFDF-NOT: vxpose
+; CHECK-JFDF: vpop
+; CHECK-JFDF: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK-JFDF-NOT: vxpose
+; CHECK-JFDF: vpop
+; CHECK-JFDF: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK-JFDF-NOT: vxpose
+; CHECK-JFDF: vpop
+; CHECK-JFDF: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK-JFDF-NOT: vxpose
+; CHECK-JFDF: vpop
+; CHECK-JFDF: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK-JFDF-NOT: vxpose
+; CHECK-JFDF: vpop
+; CHECK-JFDF: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK-JFDF-NOT: vxpose
+; CHECK-JFDF: vpop
+; CHECK-JFDF: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK-JFDF-NOT: vxpose
+; CHECK-JFDF: vpop
+; CHECK-JFDF: vxpose.0.end v{{[0-9]+}}, $0x80
+; CHECK-JFDF-NOT: vxpose
+; CHECK-JFDF: vpop
+; CHECK-JFDF: vxpose.0 v{{[0-9]+}}, $0x80
+; CHECK-JFDF-NOT: vxpose
+; CHECK-JFDF: v{{[0-9]+}} = vpop (trf0)
+; CHECK-JFDF: vxpose.0 v{{[0-9]+}}, $0x80
+
+; CHECK-PF: vpop (trf0)
+; CHECK-PF: vxpose.0 v0, $0x80
+; CHECK-PF-NOT: vxpose
+; CHECK-PF: vpop (trf0)
+; CHECK-PF: vxpose.0 v0, $0x80
+; CHECK-PF-NOT: vxpose
+; CHECK-PF: vpop (trf0)
+; CHECK-PF: vxpose.0 v0, $0x80
+; CHECK-PF-NOT: vxpose
+; CHECK-PF: vpop (trf0)
+; CHECK-PF: vxpose.0 v0, $0x80
+; CHECK-PF-NOT: vxpose
+; CHECK-PF: vpop (trf0)
+; CHECK-PF: vxpose.0 v0, $0x80
+; CHECK-PF-NOT: vxpose
+; CHECK-PF: vpop (trf0)
+; CHECK-PF: vxpose.0 v0, $0x80
+; CHECK-PF-NOT: vxpose
+; CHECK-PF: vpop (trf0)
+; CHECK-PF: vxpose.0 v0, $0x80
+; CHECK-PF-NOT: vxpose
+; CHECK-PF: vpop (trf0)
+; CHECK-PF: vxpose.0 v0, $0x80
+; CHECK-PF-NOT: vxpose
+; CHECK-PF: vpop (trf0)
+; CHECK-PF: vxpose.0 v0, $0x80
+; CHECK-PF-NOT: vxpose
+; CHECK-PF: vpop (trf0)
+; CHECK-PF: vxpose.0 v0, $0x80
+; CHECK-PF-NOT: vxpose
+; CHECK-PF: vpop (trf0)
+; CHECK-PF: vxpose.0 v0, $0x80
+; CHECK-PF-NOT: vxpose
+; CHECK-PF: vpop (trf0)
+; CHECK-PF: vxpose.0 v0, $0x80
+; CHECK-PF-NOT: vxpose
+; CHECK-PF: vpop (trf0)
+; CHECK-PF: vxpose.0 v0, $0x80
+; CHECK-PF-NOT: vxpose
+; CHECK-PF: vpop (trf0)
+; CHECK-PF: vxpose.0.end v0, $0x80
+
+ %t1 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 undef)
+ %t2 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t1)
+ %t3 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t2)
+ %t4 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t3)
+ %t5 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t4)
+ %t6 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t5)
+ %t7 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t6)
+ %t8 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t7)
+ %t9 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t8)
+ %t10 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t9)
+ %t11 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t10)
+ %t12 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t11)
+ %t13 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t12)
+ %t14 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t13)
+ %t15 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t14)
+ %t16 = tail call i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t15)
+ %t17 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 undef)
+ %t18 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t17)
+ %t19 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t18)
+ %t20 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t19)
+ %t21 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t20)
+ %t22 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t21)
+ %t23 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t22)
+ %t24 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t23)
+ %t25 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t24)
+ %t26 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t25)
+ %t27 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t26)
+ %t28 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t27)
+ %t29 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t28)
+ %t30 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t29)
+ %t31 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t30)
+ %t32 = tail call i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t31)
+ %t33 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 undef)
+ %t34 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t33)
+ %t35 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t34)
+ %t36 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t35)
+ %t37 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t36)
+ %t38 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t37)
+ %t39 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t38)
+ %t40 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t39)
+ %t41 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t40)
+ %t42 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t41)
+ %t43 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t42)
+ %t44 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t43)
+ %t45 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t44)
+ %t46 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t45)
+ %t47 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t46)
+ %t48 = tail call i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 %t47)
+ %p1 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p2 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p3 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p4 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p5 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p6 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p7 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p8 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p9 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p10 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p11 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p12 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p13 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p14 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p15 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p16 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+
+ %p17 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p18 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p19 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p20 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p21 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p22 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p23 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p24 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p25 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p26 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p27 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p28 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p29 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p30 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p31 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p32 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+
+
+ %p33 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p34 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p35 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p36 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p37 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p38 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p39 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p40 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p41 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p42 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p43 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p44 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p45 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p46 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p47 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p48 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+
+ ret void
+
+}
+
+; Function Attrs: nounwind readnone
+declare <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32) #1
+
+; Function Attrs: inaccessiblememonly nounwind
+declare i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32>, i32 immarg, i32 immarg, i32, i32) #4
+
+; Function Attrs: inaccessiblememonly nounwind
+declare i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32>, i32 immarg, i32 immarg, i32, i32) #4
+
+; Function Attrs: inaccessiblememonly nounwind
+declare <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32, i32) #4
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { argmemonly nounwind }
+attributes #3 = { argmemonly nounwind readonly }
+attributes #4 = { inaccessiblememonly nounwind }
+attributes #5 = { argmemonly nounwind willreturn }
+
+!smem.spill.start = !{!0}
+!smem.spill.limit = !{!1}
+!vmem.spill.start = !{!2}
+!vmem.spill.limit = !{!3}
+
+!0 = !{i32 624}
+!1 = !{i32 4078}
+!2 = !{i32 15176}
+!3 = !{i32 32768}
+!4 = !{!5}
+!5 = distinct !{!5, !6, !"alloc"}
+!6 = distinct !{!6, !"fusion.45"}
+!7 = !{!8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19}
+!8 = distinct !{!8, !6, !"alloc"}
+!9 = distinct !{!9, !6, !"alloc"}
+!10 = distinct !{!10, !6, !"alloc"}
+!11 = distinct !{!11, !6, !"alloc"}
+!12 = distinct !{!12, !6, !"alloc"}
+!13 = distinct !{!13, !6, !"alloc"}
+!14 = distinct !{!14, !6, !"alloc"}
+!15 = distinct !{!15, !6, !"alloc"}
+!16 = distinct !{!16, !6, !"alloc"}
+!17 = distinct !{!17, !6, !"alloc"}
+!18 = distinct !{!18, !6, !"alloc"}
+!19 = distinct !{!19, !6, !"alloc"}
+!20 = !{!11}
+!21 = !{!8, !9, !5, !10, !12, !13, !14, !15, !16, !17, !18, !19}
+!22 = !{!8}
+!23 = !{!9, !5, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19}
+!24 = !{!12}
+!25 = !{!8, !9, !5, !10, !11, !13, !14, !15, !16, !17, !18, !19}
+!26 = !{!9}
+!27 = !{!8, !5, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19}
+!28 = !{!10}
+!29 = !{!8, !9, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19}
+!30 = !{!17}
+!31 = !{!8, !9, !5, !10, !11, !12, !13, !14, !15, !16, !18, !19}
+!32 = !{!19}
+!33 = !{!8, !9, !5, !10, !11, !12, !13, !14, !15, !16, !17, !18}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/fifo_mutation_handle_multiple_dags.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/fifo_mutation_handle_multiple_dags.ll
new file mode 100644
index 0000000..f3832a4
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/fifo_mutation_handle_multiple_dags.ll
@@ -0,0 +1,99 @@
+; RUN: llc -o - -mcpu=tensorcore-jf -tpu-use-original-order-sched -tpu-original-order-test-overflow-fifo-mut -stop-after=tpu-machine-scheduler-fast %s -disable-cgp | FileCheck %s
+; RUN: llc -o - -mcpu=tensorcore-pf -tpu-use-original-order-sched -tpu-original-order-test-overflow-fifo-mut -stop-after=tpu-machine-scheduler-fast %s -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 @llvm.tpu.vmatpush.f32(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vdwg(i32, i32) nounwind
+declare i32 @llvm.tpu.vmatmul.f32(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatmul.packed.f32(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare <1024 x float> @llvm.tpu.vmatres.f32(i32, i32) nounwind
+declare void @myfun()
+
+; Check that FIFO-usage information is passed between DAGs in the same basic
+; block, so that FIFO overflow is modeled correctly. If that information were
+; not passed between the DAGs below, the MATMULs and the MATPOPs would end up
+; completely disjoint (all the matmuls first and all the matpops after). A
+; small occupancy sketch follows this test.
+; CHECK-LABEL: matmul_fifo_multiple_dags
+; CHECK: MXU0MATMUL
+; CHECK: MXU0MATMUL
+; CHECK: MXU0MATMUL
+; CHECK: MXU0MATMUL
+; CHECK: MXU0MATMUL
+; CHECK: MXU0MATMUL
+; CHECK: MXU0MATMUL
+; CHECK: MXU0MATMUL
+; CHECK: MXU0MATMUL
+; CHECK: MXU0MATMUL
+; CHECK: MXU0MATMUL
+; CHECK: MXU0MATMUL
+; CHECK: MXU0MATMUL
+; CHECK: MXU0MATMUL
+; CHECK: MXU0MATMUL
+; CHECK: CALL
+; CHECK: MXU0MATMUL
+; CHECK: MXU0MATMUL
+; CHECK: MXU0MATPOP
+; CHECK: MXU0MATMUL
+; CHECK: MXU0MATPOP
+; CHECK: MXU0MATPOP
+; CHECK: MXU0MATPOP
+; CHECK: MXU0MATPOP
+; CHECK: MXU0MATPOP
+; CHECK: MXU0MATPOP
+; CHECK: MXU0MATPOP
+; CHECK: MXU0MATPOP
+; CHECK: MXU0MATPOP
+; CHECK: MXU0MATPOP
+; CHECK: MXU0MATPOP
+; CHECK: MXU0MATPOP
+; CHECK: MXU0MATPOP
+; CHECK: MXU0MATPOP
+; CHECK: MXU0MATPOP
+; CHECK: MXU0MATPOP
+; CHECK: MXU0MATPOP
+
+define void @matmul_fifo_multiple_dags(<1024 x float> %v0, <1024 x float> %v1) {
+ %v = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ %matmul = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v0, <1024 x i1> %mask, i32 0, i32 undef)
+ %matmul2 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 undef)
+ %matmul3 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 undef)
+ %matmul4 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 undef)
+ %matmul5 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 undef)
+ %matmul6 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 undef)
+ %matmul7 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 undef)
+ %matmul8 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 undef)
+ %matmul9 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 undef)
+ %matmul10 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 undef)
+ %matmul11 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 undef)
+ %matmul12 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 undef)
+ %matmul13 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 undef)
+ %matmul14 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 undef)
+ %matmul15 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 undef)
+ call void @myfun()
+ %matmul16 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 undef)
+ %matmul17 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 undef)
+ %matmul18 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 undef)
+ %matres = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul)
+ %matres2 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul2)
+ %matres3 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul3)
+ %matres4 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul4)
+ %matres5 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul5)
+ %matres6 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul6)
+ %matres7 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul7)
+ %matres8 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul8)
+ %matres9 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul9)
+ %matres10 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul10)
+ %matres11 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul11)
+ %matres12 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul12)
+ %matres13 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul13)
+ %matres14 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul14)
+ %matres15 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul15)
+ %matres16 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul16)
+ %matres17 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul17)
+ %matres18 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul18)
+ ret void
+}
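The test above relies on FIFO occupancy being handed from one scheduling DAG to the next when a call splits the basic block. The following is a minimal, self-contained C++ sketch of that occupancy-carrying idea, not the actual LLVM pass; the type name FifoModel and the capacity value 4 are illustrative assumptions.

#include <cstdio>

struct FifoModel {
  int capacity;
  int occupancy = 0;
  // A MATMUL pushes one result; returns false if the FIFO would overflow.
  bool push() {
    if (occupancy == capacity) return false;
    ++occupancy;
    return true;
  }
  // A MATPOP/MATRES drains one result.
  void pop() {
    if (occupancy > 0) --occupancy;
  }
};

int main() {
  FifoModel fifo{/*capacity=*/4};  // capacity chosen only for illustration

  // "DAG 1": three matmuls issued before a call splits the basic block.
  for (int i = 0; i < 3; ++i) fifo.push();

  // "DAG 2" must start from occupancy 3, not from an empty FIFO.
  FifoModel dag2 = fifo;  // state carried across the DAG boundary
  std::printf("4th push fits: %d\n", dag2.push());  // 1: one slot left
  std::printf("5th push fits: %d\n", dag2.push());  // 0: would overflow
  dag2.pop();                                       // a MATPOP frees a slot
  std::printf("retry fits:    %d\n", dag2.push());  // 1: fits again
}

If the second DAG instead reset occupancy to zero, it would happily place every push before every pop, which is exactly the disjoint schedule the FileCheck pattern above rejects.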
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/fifo_mutation_transpose_test.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/fifo_mutation_transpose_test.ll
new file mode 100644
index 0000000..b7794b5
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/fifo_mutation_transpose_test.ll
@@ -0,0 +1,470 @@
+; RUN: llc %s -o - -mcpu=tensorcore-jf -tpu-use-original-order-sched -tpu-original-order-test-overflow-fifo-mut -stop-after=tpu-machine-scheduler-fast -enable-tpu-xlu-opt=false | FileCheck %s --check-prefixes=CHECK,CHECK-JF
+; RUN: llc %s -o - -mcpu=tensorcore-pf -tpu-use-original-order-sched -tpu-original-order-test-overflow-fifo-mut -stop-after=tpu-machine-scheduler-fast -enable-tpu-xlu-opt=false | FileCheck %s --check-prefixes=CHECK,CHECK-PF
+; REQUIRES: tpu
+source_filename = ""
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu--"
+
+define void @pushpop_transpose_test() local_unnamed_addr #0 {
+; CHECK-LABEL: pushpop_transpose_test
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE_END
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE
+; CHECK-JF: XLU0B0TRANSPOSE_END
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+; CHECK-JF: XLU0Pop
+
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE_END
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE
+; CHECK-PF: XLU0B0TRANSPOSE_END
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+; CHECK-PF: XLU0Pop
+
+entry:
+
+ %base_val = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 0)
+ %gep0 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 0
+ %l1 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep0, align 4096
+ %gep1 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 1
+ %l2 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep1, align 4096
+ %gep2 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 2
+ %l3 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep2, align 4096
+ %gep3 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 3
+ %l4 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep3, align 4096
+ %gep4 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 4
+ %l5 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep4, align 4096
+ %gep5 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 5
+ %l6 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep5, align 4096
+ %gep6 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 6
+ %l7 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep6, align 4096
+ %gep7 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 7
+ %l8 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep7, align 4096
+ %gep8 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 8
+ %l9 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep8, align 4096
+ %gep9 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 9
+ %l10 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep9, align 4096
+ %gep10 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 10
+ %l11 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep10, align 4096
+ %gep11 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 11
+ %l12 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep11, align 4096
+ %gep12 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 12
+ %l13 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep12, align 4096
+ %gep13 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 13
+ %l14 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep13, align 4096
+ %gep14 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 14
+ %l15 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep14, align 4096
+ %gep15 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 15
+ %l16 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep15, align 4096
+
+
+ %gep16 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 16
+ %l17 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep16, align 4096
+ %gep17 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 17
+ %l18 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep17, align 4096
+ %gep18 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 18
+ %l19 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep18, align 4096
+ %gep19 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 19
+ %l20 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep19, align 4096
+ %gep20 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 20
+ %l21 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep20, align 4096
+ %gep21 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 21
+ %l22 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep21, align 4096
+ %gep22 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 22
+ %l23 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep22, align 4096
+ %gep23 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 23
+ %l24 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep23, align 4096
+ %gep24 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 24
+ %l25 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep24, align 4096
+ %gep25 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 25
+ %l26 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep25, align 4096
+ %gep26 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 26
+ %l27 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep26, align 4096
+ %gep27 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 27
+ %l28 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep27, align 4096
+ %gep28 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 28
+ %l29 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep28, align 4096
+ %gep29 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 29
+ %l30 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep29, align 4096
+ %gep30 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 30
+ %l31 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep30, align 4096
+ %gep31 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 31
+ %l32 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep31, align 4096
+
+ %gep32 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 32
+ %l33 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep32, align 4096
+ %gep33 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 33
+ %l34 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep33, align 4096
+ %gep34 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 34
+ %l35 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep34, align 4096
+ %gep35 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 35
+ %l36 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep35, align 4096
+ %gep36 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 36
+ %l37 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep36, align 4096
+ %gep37 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 37
+ %l38 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep37, align 4096
+ %gep38 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 38
+ %l39 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep38, align 4096
+ %gep39 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 39
+ %l40 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep39, align 4096
+ %gep40 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 40
+ %l41 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep40, align 4096
+ %gep41 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 41
+ %l42 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep41, align 4096
+ %gep42 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 42
+ %l43 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep42, align 4096
+ %gep43 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 43
+ %l44 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep43, align 4096
+ %gep44 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 44
+ %l45 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep44, align 4096
+ %gep45 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 45
+ %l46 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep45, align 4096
+ %gep46 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 46
+ %l47 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep46, align 4096
+ %gep47 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %base_val, i32 47
+ %l48 = load <1024 x i32>, <1024 x i32> addrspace(205)* %gep47, align 4096
+
+
+
+ %t1 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l1, i32 128, i32 128, i32 0, i32 undef)
+ %t2 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l2, i32 128, i32 128, i32 0, i32 %t1)
+ %t3 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l3, i32 128, i32 128, i32 0, i32 %t2)
+ %t4 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l4, i32 128, i32 128, i32 0, i32 %t3)
+ %t5 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l5, i32 128, i32 128, i32 0, i32 %t4)
+ %t6 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l6, i32 128, i32 128, i32 0, i32 %t5)
+ %t7 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l7, i32 128, i32 128, i32 0, i32 %t6)
+ %t8 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l8, i32 128, i32 128, i32 0, i32 %t7)
+ %t9 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l9, i32 128, i32 128, i32 0, i32 %t8)
+ %t10 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l10, i32 128, i32 128, i32 0, i32 %t9)
+ %t11 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l11, i32 128, i32 128, i32 0, i32 %t10)
+ %t12 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l12, i32 128, i32 128, i32 0, i32 %t11)
+ %t13 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l13, i32 128, i32 128, i32 0, i32 %t12)
+ %t14 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l14, i32 128, i32 128, i32 0, i32 %t13)
+ %t15 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l15, i32 128, i32 128, i32 0, i32 %t14)
+ %t16 = tail call i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32> %l16, i32 128, i32 128, i32 0, i32 %t15)
+ %t17 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l17, i32 128, i32 128, i32 0, i32 undef)
+ %t18 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l18, i32 128, i32 128, i32 0, i32 %t17)
+ %t19 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l19, i32 128, i32 128, i32 0, i32 %t18)
+ %t20 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l20, i32 128, i32 128, i32 0, i32 %t19)
+ %t21 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l21, i32 128, i32 128, i32 0, i32 %t20)
+ %t22 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l22, i32 128, i32 128, i32 0, i32 %t21)
+ %t23 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l23, i32 128, i32 128, i32 0, i32 %t22)
+ %t24 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l24, i32 128, i32 128, i32 0, i32 %t23)
+ %t25 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l25, i32 128, i32 128, i32 0, i32 %t24)
+ %t26 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l26, i32 128, i32 128, i32 0, i32 %t25)
+ %t27 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l27, i32 128, i32 128, i32 0, i32 %t26)
+ %t28 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l28, i32 128, i32 128, i32 0, i32 %t27)
+ %t29 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l29, i32 128, i32 128, i32 0, i32 %t28)
+ %t30 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l30, i32 128, i32 128, i32 0, i32 %t29)
+ %t31 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l31, i32 128, i32 128, i32 0, i32 %t30)
+ %t32 = tail call i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32> %l32, i32 128, i32 128, i32 0, i32 %t31)
+ %t33 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l33, i32 128, i32 128, i32 0, i32 undef)
+ %t34 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l34, i32 128, i32 128, i32 0, i32 %t33)
+ %t35 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l35, i32 128, i32 128, i32 0, i32 %t34)
+ %t36 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l36, i32 128, i32 128, i32 0, i32 %t35)
+ %t37 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l37, i32 128, i32 128, i32 0, i32 %t36)
+ %t38 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l38, i32 128, i32 128, i32 0, i32 %t37)
+ %t39 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l39, i32 128, i32 128, i32 0, i32 %t38)
+ %t40 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l40, i32 128, i32 128, i32 0, i32 %t39)
+ %t41 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l41, i32 128, i32 128, i32 0, i32 %t40)
+ %t42 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l42, i32 128, i32 128, i32 0, i32 %t41)
+ %t43 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l43, i32 128, i32 128, i32 0, i32 %t42)
+ %t44 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l44, i32 128, i32 128, i32 0, i32 %t43)
+ %t45 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l45, i32 128, i32 128, i32 0, i32 %t44)
+ %t46 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l46, i32 128, i32 128, i32 0, i32 %t45)
+ %t47 = tail call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %l47, i32 128, i32 128, i32 0, i32 %t46)
+ %t48 = tail call i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32> %l48, i32 128, i32 128, i32 0, i32 %t47)
+ %p1 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p2 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p3 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p4 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p5 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p6 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p7 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p8 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p9 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p10 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p11 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p12 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p13 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p14 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p15 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+ %p16 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t16)
+
+ %p17 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p18 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p19 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p20 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p21 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p22 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p23 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p24 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p25 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p26 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p27 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p28 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p29 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p30 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p31 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+ %p32 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t32)
+
+
+ %p33 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p34 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p35 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p36 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p37 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p38 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p39 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p40 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p41 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p42 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p43 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p44 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p45 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p46 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p47 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+ %p48 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %t48)
+
+ ret void
+
+}
+
+; Function Attrs: nounwind readnone
+declare <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32) #1
+
+; Function Attrs: inaccessiblememonly nounwind
+declare i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32>, i32 immarg, i32 immarg, i32, i32) #4
+
+; Function Attrs: inaccessiblememonly nounwind
+declare i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32>, i32 immarg, i32 immarg, i32, i32) #4
+
+; Function Attrs: inaccessiblememonly nounwind
+declare <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32, i32) #4
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { argmemonly nounwind }
+attributes #3 = { argmemonly nounwind readonly }
+attributes #4 = { inaccessiblememonly nounwind }
+attributes #5 = { argmemonly nounwind willreturn }
+
+!smem.spill.start = !{!0}
+!smem.spill.limit = !{!1}
+!vmem.spill.start = !{!2}
+!vmem.spill.limit = !{!3}
+
+!0 = !{i32 624}
+!1 = !{i32 4078}
+!2 = !{i32 15176}
+!3 = !{i32 32768}
+!4 = !{!5}
+!5 = distinct !{!5, !6, !"alloc"}
+!6 = distinct !{!6, !"fusion.45"}
+!7 = !{!8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19}
+!8 = distinct !{!8, !6, !"alloc"}
+!9 = distinct !{!9, !6, !"alloc"}
+!10 = distinct !{!10, !6, !"alloc"}
+!11 = distinct !{!11, !6, !"alloc"}
+!12 = distinct !{!12, !6, !"alloc"}
+!13 = distinct !{!13, !6, !"alloc"}
+!14 = distinct !{!14, !6, !"alloc"}
+!15 = distinct !{!15, !6, !"alloc"}
+!16 = distinct !{!16, !6, !"alloc"}
+!17 = distinct !{!17, !6, !"alloc"}
+!18 = distinct !{!18, !6, !"alloc"}
+!19 = distinct !{!19, !6, !"alloc"}
+!20 = !{!11}
+!21 = !{!8, !9, !5, !10, !12, !13, !14, !15, !16, !17, !18, !19}
+!22 = !{!8}
+!23 = !{!9, !5, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19}
+!24 = !{!12}
+!25 = !{!8, !9, !5, !10, !11, !13, !14, !15, !16, !17, !18, !19}
+!26 = !{!9}
+!27 = !{!8, !5, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19}
+!28 = !{!10}
+!29 = !{!8, !9, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19}
+!30 = !{!17}
+!31 = !{!8, !9, !5, !10, !11, !12, !13, !14, !15, !16, !18, !19}
+!32 = !{!19}
+!33 = !{!8, !9, !5, !10, !11, !12, !13, !14, !15, !16, !17, !18}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/fifo_overflow.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/fifo_overflow.ll
new file mode 100644
index 0000000..b54a57e
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/fifo_overflow.ll
@@ -0,0 +1,81 @@
+; RUN: llc < %s -mcpu=tensorcore-pf -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 @llvm.tpu.vmatpush.f32(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vdwg(i32, i32) nounwind
+declare i32 @llvm.tpu.vmatmul.f32(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare <1024 x float> @llvm.tpu.vmatres.f32(i32, i32) nounwind
+
+; Make sure we don't overflow the MRF FIFO. Here the matmul needs to be
+; scheduled early so that we can start pushing the next matrix, but the matres
+; can be scheduled much later. We need to be sure it doesn't get scheduled
+; after the 18th matmul (push %17, as we start from %0). A FIFO-occupancy
+; sketch follows this test.
+; CHECK-LABEL: matmul:
+; CHECK: // pop %0, push %17
+ define <1024 x float> @matmul(<1024 x float> %v, <1024 x float> %v0, <1024 x float> %v1) {
+ %v2 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v2, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ %d = call i32 @llvm.tpu.vdwg(i32 0, i32 undef)
+ %matmul = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v0, <1024 x i1> %mask, i32 0, i32 %d)
+ %matres = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul)
+ %matmul2 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 %d)
+ %matres2 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul2)
+ %matmul3 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 %d)
+ %matres3 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul3)
+ %matmul4 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 %d)
+ %matres4 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul4)
+ %matmul5 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 %d)
+ %matres5 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul5)
+ %matmul6 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 %d)
+ %matres6 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul6)
+ %matmul7 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 %d)
+ %matres7 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul7)
+ %matmul8 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 %d)
+ %matres8 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul8)
+ %matmul9 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 %d)
+ %matres9 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul9)
+ %matmul10 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 %d)
+ %matres10 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul10)
+ %matmul11 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 %d)
+ %matres11 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul11)
+ %matmul12 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 %d)
+ %matres12 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul12)
+ %matmul13 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 %d)
+ %matres13 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul13)
+ %matmul14 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 %d)
+ %matres14 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul14)
+ %matmul15 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 %d)
+ %matres15 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul15)
+ %matmul16 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 %d)
+ %matres16 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul16)
+ %matmul17 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 %d)
+ %matres17 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul17)
+ %push = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %d1 = call i32 @llvm.tpu.vdwg(i32 0, i32 %push)
+ %matmul18 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 %d1)
+ %matres18 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul18)
+ %push1 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push2 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push3 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push4 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push5 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push6 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push7 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push8 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push9 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push10 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push11 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push12 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push13 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push14 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push15 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push16 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %d2 = call i32 @llvm.tpu.vdwg(i32 0, i32 %push16)
+ %matmul19 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 %d2)
+ %matres19 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul19)
+ %push17 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ ret <1024 x float> %matres
+}
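The comment in fifo_overflow.ll above argues that the pending matres must be popped before so many further matmuls are issued that the result FIFO overflows. Below is a small hedged sketch of that legality check; the function name FitsInFifo and the capacity of 4 are assumptions for illustration, not the real MRF depth.

#include <cstdio>
#include <string>
#include <vector>

// Returns true if the instruction sequence never exceeds the FIFO capacity.
bool FitsInFifo(const std::vector<std::string>& seq, int capacity) {
  int occupancy = 0;
  for (const std::string& op : seq) {
    if (op == "push") {
      if (++occupancy > capacity) return false;  // overflow at this point
    } else if (op == "pop") {
      --occupancy;
    }
  }
  return true;
}

int main() {
  const int kCapacity = 4;  // illustrative only

  // Pop scheduled early enough: never more than 4 results outstanding.
  std::vector<std::string> ok = {"push", "push", "push", "pop",
                                 "push", "push", "pop", "pop", "pop"};
  // Pop delayed too far: a 5th result is pushed while 4 are still pending.
  std::vector<std::string> bad = {"push", "push", "push", "push",
                                  "push", "pop", "pop", "pop", "pop", "pop"};

  std::printf("ok fits:  %d\n", FitsInFifo(ok, kCapacity));   // prints 1
  std::printf("bad fits: %d\n", FitsInFifo(bad, kCapacity));  // prints 0
}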
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/fifo_overflow_conservative_loopempty_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/fifo_overflow_conservative_loopempty_sc.ll
new file mode 100644
index 0000000..8b317c2
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/fifo_overflow_conservative_loopempty_sc.ll
@@ -0,0 +1,55 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -disable-cgp -max-ii=0 \
+; RUN: -tpu-no-push-pop-reordering | FileCheck %s --check-prefix CHECK-CONSERVATIVE
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -disable-cgp -max-ii=0 \
+; RUN: | FileCheck %s --check-prefix CHECK-LOOPEMPTY
+
+; REQUIRES: tpu
+
+; Tests that a push is not hoisted above a pop in loops that are not
+; software pipelined on SparseCore. A small scheduling sketch follows this
+; test.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <8 x float> @llvm.tpu.pow2.macro.v8f32(<8 x float>)
+
+; CHECK-LABEL: fifo_func:
+; CHECK-LABEL: .LBB0_1:
+; CHECK-CONSERVATIVE: (erf) = vpow2
+; CHECK-CONSERVATIVE: vpop (erf)
+; CHECK-CONSERVATIVE: (erf) = vpow2
+; CHECK-CONSERVATIVE: vpop (erf)
+; CHECK-CONSERVATIVE: (erf) = vpow2
+; CHECK-CONSERVATIVE: vpop (erf)
+; CHECK-CONSERVATIVE: (erf) = vpow2
+; CHECK-CONSERVATIVE: vpop (erf)
+; CHECK-LOOPEMPTY: (erf) = vpow2
+; CHECK-LOOPEMPTY: (erf) = vpow2
+; CHECK-LOOPEMPTY: (erf) = vpow2
+; CHECK-LOOPEMPTY: (erf) = vpow2
+; CHECK-LOOPEMPTY: vpop (erf)
+; CHECK-LOOPEMPTY: vpop (erf)
+; CHECK-LOOPEMPTY: vpop (erf)
+; CHECK-LOOPEMPTY: vpop (erf)
+
+define <8 x float> @fifo_func(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2,
+ <8 x float> %v3, <8 x float> addrspace(205)* %ptr) {
+ entry:
+ br label %loop_simple
+
+loop_simple:
+ %ind = phi i32 [ 0, %entry ], [ %ind.1, %loop_simple ]
+ %res = call <8 x float> @llvm.tpu.pow2.macro.v8f32(<8 x float> %v0)
+ %res1 = call <8 x float> @llvm.tpu.pow2.macro.v8f32(<8 x float> %v1)
+ %res2 = call <8 x float> @llvm.tpu.pow2.macro.v8f32(<8 x float> %v2)
+ %res3 = call <8 x float> @llvm.tpu.pow2.macro.v8f32(<8 x float> %v3)
+ %res4 = fadd <8 x float> %res, %res1
+ %res5 = fadd <8 x float> %res4, %res2
+ %res6 = fadd <8 x float> %res5, %res3
+ %ind.1 = add nuw nsw i32 %ind, 1
+ %cnd = icmp eq i32 %ind.1, 100
+ br i1 %cnd, label %loop.cleanup, label %loop_simple
+
+loop.cleanup:
+ ret <8 x float> %res6
+}
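The two RUN configurations above differ only in whether push/pop reordering is allowed inside a loop body that is not software pipelined. The sketch below, under stated assumptions (ScheduleLoopBody and kErfDepth are invented names, and the FIFO depth is only illustrative), contrasts the conservative push-pop pairing with the grouped ordering the loop-empty case permits.

#include <cstdio>
#include <vector>

enum class Op { kPush, kPop };

// Builds one loop-body schedule for `n` push/pop pairs.
std::vector<Op> ScheduleLoopBody(int n, bool allow_reordering, int fifo_depth) {
  std::vector<Op> body;
  if (!allow_reordering || n > fifo_depth) {
    // Conservative: never hoist a push above an earlier pop.
    for (int i = 0; i < n; ++i) {
      body.push_back(Op::kPush);
      body.push_back(Op::kPop);
    }
  } else {
    // "Loop empty" case: all pushes first, then all pops, still within depth,
    // because the loop body starts and ends with an empty FIFO.
    for (int i = 0; i < n; ++i) body.push_back(Op::kPush);
    for (int i = 0; i < n; ++i) body.push_back(Op::kPop);
  }
  return body;
}

int main() {
  const int kErfDepth = 4;  // illustrative FIFO depth assumption
  for (bool reorder : {false, true}) {
    std::printf("reorder=%d: ", reorder ? 1 : 0);
    for (Op op : ScheduleLoopBody(4, reorder, kErfDepth))
      std::printf("%s ", op == Op::kPush ? "vpow2" : "vpop");
    std::printf("\n");
  }
}

The reorder=0 line mirrors the CHECK-CONSERVATIVE pattern (vpow2/vpop pairs), while the reorder=1 line mirrors CHECK-LOOPEMPTY (four pushes followed by four pops).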
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/fifo_pipelining.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/fifo_pipelining.ll
new file mode 100644
index 0000000..73d699a
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/fifo_pipelining.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -mcpu=tensorcore-jf -tpu-use-swing-modulo-sched -disable-cgp \
+; RUN: -tpu-enable-vliw-prep-postiv=false -tpu-enable-vliw-prep-post-addrinc=false \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+; Test EUP intrinsics code generation
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 @llvm.tpu.pow2.v1024f32(<1024 x float>)
+
+declare <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32)
+
+; Test that the FIFO instructions are bundled efficiently by the bundle packer,
+; based on FIFO analysis of the pipelined kernel.
+; Also test that the cross-block latency is correct: there should be no delay
+; before the first pop instruction. A steady-state sketch follows this test.
+; CHECK-LABEL: pow2_pipelining:
+; CHECK-LABEL: .LBB0_1:
+; CHECK-NOT: vdelay
+; CHECK: push %4, pop phi(%0 or %4)
+; CHECK: push %5, pop phi(%1 or %5)
+; CHECK: push %6, pop phi(%2 or %6)
+; CHECK: push %7, pop phi(%3 or %7)
+define void @pow2_pipelining(<1024 x float> %v0, <1024 x float> addrspace(205)* %ptr) {
+ entry:
+ br label %loop_simple
+
+loop_simple:
+ %ind = phi i32 [ 0, %entry ], [ %ind.1, %loop_simple ]
+ %f = call i32 @llvm.tpu.pow2.v1024f32(<1024 x float> %v0)
+ %f1 = call i32 @llvm.tpu.pow2.v1024f32(<1024 x float> %v0)
+ %f2 = call i32 @llvm.tpu.pow2.v1024f32(<1024 x float> %v0)
+ %f3 = call i32 @llvm.tpu.pow2.v1024f32(<1024 x float> %v0)
+ %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f)
+ %res1 = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f1)
+ %res2 = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f2)
+ %res3 = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f3)
+ %ind.1 = add nuw nsw i32 %ind, 1
+ %cnd = icmp eq i32 %ind.1, 100
+ br i1 %cnd, label %loop.cleanup, label %loop_simple
+
+loop.cleanup:
+ ret void
+}
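The CHECK lines above expect the pipelined kernel to bundle each push with a pop of a value produced one stage earlier. The following sketch models that steady state with a plain std::deque; it is an illustration of the occupancy argument, not the bundle packer itself, and the in-flight count of 4 simply mirrors the four pow2 results in the test.

#include <cstdio>
#include <deque>

int main() {
  const int kInFlight = 4;  // four pow2 results in flight, as in the test above
  std::deque<int> fifo;     // models the EUP result FIFO

  // Prologue: fill the pipeline with the first kInFlight pushes (no pops yet).
  for (int i = 0; i < kInFlight; ++i) fifo.push_back(i);

  // Kernel: every iteration bundles one push with one pop of an older value
  // (the "push %N, pop phi(...)" pairs in the CHECK lines), so the occupancy
  // never grows and no vdelay is needed before the first pop.
  for (int iter = kInFlight; iter < 12; ++iter) {
    int popped = fifo.front();
    fifo.pop_front();
    fifo.push_back(iter);
    std::printf("iter %d: pop value from iter %d, occupancy %zu\n",
                iter, popped, fifo.size());
  }

  // Epilogue: drain what is still in flight.
  while (!fifo.empty()) fifo.pop_front();
}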
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/fifo_scheduling.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/fifo_scheduling.ll
new file mode 100644
index 0000000..4ad3d9f
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/fifo_scheduling.ll
@@ -0,0 +1,73 @@
+; The fifo scheduling pass doesn't consider overflow and may generate illegal IR.
+; TODO(thomasraoux): Fix it when trying to turn on FIFO scheduler by default.
+; RUN: llc < %s -mcpu=tensorcore-pf -disable-cgp -asm-verbose=false -tpu-use-fifo-sched=true | FileCheck %s -check-prefix=PF
+; RUN: llc < %s -mcpu=tensorcore-jf -disable-cgp -asm-verbose=false -tpu-use-fifo-sched=true | FileCheck %s -check-prefix=JF
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 @llvm.tpu.pow2.v1024f32(<1024 x float>)
+declare <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32)
+
+; Test that the FIFO instructions get re-ordered correctly. The third push/pop
+; pair should move into second position for optimal scheduling.
+; CHECK-LABEL: fifo_scheduling:
+; PF: vpow2.f32
+; PF: vpow2.f32
+; PF: vpop
+; PF: vpow2.f32
+; PF: vpop
+; PF: vpop
+define void @fifo_scheduling(<1024 x float> %v0, <1024 x float> %v1, <1024 x float> addrspace(205)* noalias %ptr0, <1024 x float> addrspace(205)* noalias %ptr1) {
+ entry:
+ %f = call i32 @llvm.tpu.pow2.v1024f32(<1024 x float> %v0)
+ %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f)
+ %f1 = call i32 @llvm.tpu.pow2.v1024f32(<1024 x float> %res)
+ %res1 = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f1)
+ %f2 = call i32 @llvm.tpu.pow2.v1024f32(<1024 x float> %v1)
+ %res2 = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f2)
+ store <1024 x float> %res1, <1024 x float> addrspace(205)* %ptr0
+ store <1024 x float> %res2, <1024 x float> addrspace(205)* %ptr1
+ ret void
+}
+
+; Test that we don't overflow the FIFO. Values are used out of order on purpose
+; so that the scheduler re-orders them. The 5th push happens at the same time
+; as the first pop, so we never go over the FIFO limit of 4. An occupancy
+; sketch follows this file.
+; CHECK-LABEL: fifo_scheduling2:
+; JF: vpow2.f32
+; JF: vpow2.f32
+; JF: vpow2.f32
+; JF: vpow2.f32
+; JF: { [[V:v[0-9]+]] = vpop (erf);
+; JF-NEXT: (erf) = vpow2.f32 v0 }
+; JF-NEXT: { [[x1:v[0-9]+]] = vpop (erf);
+; JF-NEXT: v{{[0-9]+}} = vadd.f32 v1, [[V]]
+; JF-NEXT: { [[x2:v[0-9]+]] = vpop (erf);
+; JF-NEXT: v{{[0-9]+}} = vadd.f32 [[x1]]
+; JF-NEXT: { [[x3:v[0-9]+]] = vpop (erf);
+; JF-NEXT: v{{[0-9]+}} = vadd.f32 [[x2]]
+; JF-NEXT: { [[x4:v[0-9]+]] = vpop (erf);
+; JF-NEXT: v{{[0-9]+}} = vadd.f32 [[x3]]
+; JF-NEXT: { v{{[0-9]+}} = vadd.f32 [[x4]]
+define void @fifo_scheduling2(<1024 x float> %v0, <1024 x float> %acc, <1024 x float> addrspace(205)* noalias %ptr) {
+ entry:
+ %f = call i32 @llvm.tpu.pow2.v1024f32(<1024 x float> %v0)
+ %res = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f)
+ %f1 = call i32 @llvm.tpu.pow2.v1024f32(<1024 x float> %v0)
+ %res1 = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f1)
+ %f2 = call i32 @llvm.tpu.pow2.v1024f32(<1024 x float> %v0)
+ %res2 = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f2)
+ %f3 = call i32 @llvm.tpu.pow2.v1024f32(<1024 x float> %v0)
+ %res3 = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f3)
+ %f4 = call i32 @llvm.tpu.pow2.v1024f32(<1024 x float> %v0)
+ %res4 = call <1024 x float> @llvm.tpu.eup.pop.v1024f32(i32 %f4)
+ %acc1 = fadd <1024 x float> %acc, %res4
+ %acc2 = fadd <1024 x float> %acc1, %res3
+ %acc3 = fadd <1024 x float> %acc2, %res2
+ %acc4 = fadd <1024 x float> %acc3, %res1
+ %acc5 = fadd <1024 x float> %acc4, %res
+ store <1024 x float> %acc5, <1024 x float> addrspace(205)* %ptr
+ ret void
+}
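The second test above hinges on the observation that the 5th push can share a cycle with the 1st pop, so an ERF depth of 4 is never exceeded. Here is a small sketch of that peak-occupancy computation; applying pops before pushes within a bundle is an assumption made for illustration, as are the names Bundle and PeakOccupancy.

#include <algorithm>
#include <cstdio>
#include <vector>

struct Bundle {
  int pops = 0;    // vpop count in this bundle
  int pushes = 0;  // vpow2 (push) count in this bundle
};

// Returns the peak FIFO occupancy over a bundle sequence, applying pops
// before pushes within a bundle (the assumption that makes push 5 legal).
int PeakOccupancy(const std::vector<Bundle>& bundles) {
  int occupancy = 0, peak = 0;
  for (const Bundle& b : bundles) {
    occupancy -= b.pops;
    occupancy += b.pushes;
    peak = std::max(peak, occupancy);
  }
  return peak;
}

int main() {
  // Four pushes, then a bundle that pairs "pop oldest / push newest", then
  // the remaining pops drain the FIFO.
  std::vector<Bundle> schedule = {
      {0, 1}, {0, 1}, {0, 1}, {0, 1},  // pushes 1-4
      {1, 1},                          // pop 1 + push 5 in one bundle
      {1, 0}, {1, 0}, {1, 0}, {1, 0},  // remaining pops
  };
  std::printf("peak occupancy: %d\n", PeakOccupancy(schedule));  // prints 4
}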
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/fifo_void.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/fifo_void.ll
new file mode 100644
index 0000000..02ee10f
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/fifo_void.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare {<8 x i32>, <8 x i32>, <8 x i1>} @llvm.tpu.sort.ascdi.v8i32(<8 x i1>, <8 x i32>, <8 x i32>) readnone nounwind
+declare {<8 x float>, <8 x i1>} @llvm.tpu.deprecated.segreduce.addf(<8 x i32>, <8 x float>) readnone nounwind
+declare <8 x i32> @llvm.tpu.vld.msk.idx.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>)
+declare <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32) nounwind
+
+; Tests that the xrf0 and xrf1 pops can be reordered because the FIFO void
+; mutation removes dependencies caused by void registers. A small
+; dependence-filter sketch follows this test.
+
+; CHECK-LABEL: fifo_void_order:
+; CHECK: vsort.ascd.msk.u32
+; CHECK: vpop (xrf1)
+; CHECK: vsegadd.xlane
+; CHECK: vld.idx
+; CHECK: vpop (xrf0)
+; CHECK: shalt
+
+define <8 x i32> @fifo_void_order(<8 x i32> %s, <8 x i1> %m, <8 x float> %vf, <8 x i32> %vi) {
+ %a = call { <8 x float>, <8 x i1>} @llvm.tpu.deprecated.segreduce.addf(<8 x i32> %s, <8 x float> %vf)
+ %b = call { <8 x i32>, <8 x i32>, <8 x i1>} @llvm.tpu.sort.ascdi.v8i32(<8 x i1> %m, <8 x i32> %s, <8 x i32> %vi)
+ %v2 = extractvalue { <8 x float>, <8 x i1>} %a, 0
+ %v3 = bitcast <8 x float> %v2 to <8 x i32>
+ %v1 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1>} %b, 0
+ %addr = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 0)
+ %v5 = call <8 x i32> @llvm.tpu.vld.msk.idx.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %addr, <8 x i32> %v1)
+ %v4 = add <8 x i32> %v3, %vi
+ %vres = add <8 x i32> %v4, %v5
+ ret <8 x i32> %vres
+}
+
+!0 = distinct !{!0, !1, !3, !4, !5, !6}
+!1 = !{!"llvm.loop.parallel_accesses", !2}
+!2 = distinct !{}
+!3 = !{!"llvm.loop.unroll.disable"}
+!4 = !{!"llvm.loop.vectorize.width", i32 1}
+!5 = !{!"llvm.loop.interleave.count", i32 1}
+!6 = !{!"llvm.loop.vectorize.enable", i1 true}
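The test above checks that pops from different FIFOs (xrf0 and xrf1) may be interleaved once void-register dependencies are dropped. The sketch below is a hedged illustration of that dependence filter, not the actual fifo-void mutation; the FifoOp type and NeedsOrderingEdge are invented names.

#include <cstdio>
#include <string>

struct FifoOp {
  std::string name;  // e.g. "vpop (xrf0)"
  std::string fifo;  // which FIFO the op pushes to / pops from
};

// Keep an ordering edge only if both ops use the same FIFO; an edge that
// exists only because both ops touch a shared "void" register carries no
// real resource dependence and can be dropped.
bool NeedsOrderingEdge(const FifoOp& a, const FifoOp& b) {
  return a.fifo == b.fifo;
}

int main() {
  FifoOp seg_pop = {"vpop (xrf0)", "xrf0"};   // result of vsegadd.xlane
  FifoOp sort_pop = {"vpop (xrf1)", "xrf1"};  // result of vsort.ascd
  std::printf("edge between %s and %s: %s\n", seg_pop.name.c_str(),
              sort_pop.name.c_str(),
              NeedsOrderingEdge(seg_pop, sort_pop) ? "kept" : "dropped");
  // With the edge dropped, the xrf1 pop may be scheduled before the xrf0 pop,
  // which is the order the CHECK lines above accept.
}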
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/float_exp_bf16_gl_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/float_exp_bf16_gl_sc.ll
new file mode 100644
index 0000000..6fa5269
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/float_exp_bf16_gl_sc.ll
@@ -0,0 +1,61 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-gl -asm-verbose=false \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "googletpu"
+
+@a = external addrspace(201) global <32 x i8>, align 32
+@b = external addrspace(201) global <32 x i8>, align 32
+@c = external addrspace(201) global <32 x i8>, align 32
+@d = external addrspace(201) global <32 x i8>, align 32
+@e = external addrspace(201) global <16 x bfloat>, align 32
+
+declare <16 x bfloat> @llvm.tpu.unpack.c.l.u8.bf16(<32 x i8>)
+declare <32 x i8> @llvm.tpu.pack.c.bf16.u8(<16 x bfloat>, <16 x bfloat>)
+declare void @llvm.tpu.vst.msk.v32i8(<8 x i1>, <32 x i8> addrspace(201)*, <32 x i8>)
+declare <8 x i1> @llvm.tpu.16i1.to.8i1(<16 x i1>)
+
+; CHECK-LABEL: basic_falu_experimental:
+; CHECK: v[[v0:[0-9]+]] = vunpack.c.l.u8.bf16 v{{[0-9]+}}
+; CHECK: v[[v1:[0-9]+]] = vunpack.c.l.u8.bf16 v{{[0-9]+}}
+; CHECK: v[[v3:[0-9]+]] = vadd.bf16 v[[v1]], v[[v0]]
+; CHECK: v[[v4:[0-9]+]] = vpack.c.bf16.u8 v[[v3]], v[[v3]]
+; CHECK: [tilespmem:c] = vst v[[v4]]
+define void @basic_falu_experimental() #0 {
+entry:
+ %0 = load <32 x i8>, <32 x i8> addrspace(201)* @a, align 32
+ %1 = tail call <16 x bfloat> @llvm.tpu.unpack.c.l.u8.bf16(<32 x i8> %0)
+ %2 = load <32 x i8>, <32 x i8> addrspace(201)* @b, align 32
+ %3 = tail call <16 x bfloat> @llvm.tpu.unpack.c.l.u8.bf16(<32 x i8> %2)
+ %a = fadd <16 x bfloat> %1, %3
+ %r = tail call <32 x i8> @llvm.tpu.pack.c.bf16.u8(<16 x bfloat> %a, <16 x bfloat> %a)
+ store <32 x i8> %r, <32 x i8> addrspace(201)* @c, align 32
+ ret void
+}
+
+; CHECK-LABEL: basic_fcmp_experimental:
+; CHECK: v[[v0:[0-9]+]] = vunpack.c.l.u8.bf16 v{{[0-9]+}}
+; CHECK: v[[v1:[0-9]+]] = vunpack.c.l.u8.bf16 v{{[0-9]+}}
+; CHECK: v[[v2:[0-9]+]] = vunpack.c.l.u8.bf16 v{{[0-9]+}}
+; CHECK: v[[v3:[0-9]+]] = vpack.c.bf16.u8 v[[v0]], v[[v0]]
+; CHECK: vm[[vm0:[0-9]+]] = vne.bf16 v[[v1]], v[[v2]]
+; CHECK: [tilespmem:s0+$0x0] = vst.msk vm[[vm0]], v[[v3]]
+define void @basic_fcmp_experimental() #0 {
+entry:
+ %0 = load <32 x i8>, <32 x i8> addrspace(201)* @a, align 32
+ %1 = tail call <16 x bfloat> @llvm.tpu.unpack.c.l.u8.bf16(<32 x i8> %0)
+ %2 = load <32 x i8>, <32 x i8> addrspace(201)* @b, align 32
+ %3 = tail call <16 x bfloat> @llvm.tpu.unpack.c.l.u8.bf16(<32 x i8> %2)
+ %4 = load <32 x i8>, <32 x i8> addrspace(201)* @c, align 32
+ %5 = tail call <16 x bfloat> @llvm.tpu.unpack.c.l.u8.bf16(<32 x i8> %4)
+ %m16 = fcmp une <16 x bfloat> %1, %3
+ %m8 = tail call <8 x i1> @llvm.tpu.16i1.to.8i1(<16 x i1> %m16)
+ %r = tail call <32 x i8> @llvm.tpu.pack.c.bf16.u8(<16 x bfloat> %5, <16 x bfloat> %5)
+ tail call void @llvm.tpu.vst.msk.v32i8(<8 x i1> %m8, <32 x i8> addrspace(201)* @d, <32 x i8> %r)
+ ret void
+}
+
+attributes #0 = { "implicit-section-name"=".text.tile_execute" "target-cpu"="sparsecore-tec-gl" }
+attributes #1 = { "implicit-section-name"=".text.tile_access" "target-cpu"="sparsecore-tac-gl" }
+attributes #2 = { "implicit-section-name"=".text.scs" "target-cpu"="sparsecore-scs-gl" }
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/float_imm.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/float_imm.ll
new file mode 100644
index 0000000..82a47d1
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/float_imm.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; CHECK-LABEL: add_pos_inf
+; CHECK: = sadd.f32 $Inf,
+define float @add_pos_inf(float %x) {
+entry:
+ %f = fadd float %x, 0x7FF0000000000000
+ ret float %f
+}
+
+; CHECK-LABEL: add_neg_inf
+; CHECK: = sadd.f32 $-Inf,
+define float @add_neg_inf(float %x) {
+entry:
+ %f = fadd float %x, 0xFFF0000000000000
+ ret float %f
+}
+
+; CHECK-LABEL: add_big_float
+; CHECK: = sadd.f32 $-3.4028230607370965E+38
+define float @add_big_float(float %x) {
+entry:
+ %f = fadd float %x, -3.4028230607370965E+38
+ ret float %f
+}
+
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/ftrl_mlo_pathological_super_pass_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/ftrl_mlo_pathological_super_pass_sc.ll
new file mode 100644
index 0000000..6f649f9
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/ftrl_mlo_pathological_super_pass_sc.ll
@@ -0,0 +1,1671 @@
+; RUN: opt -S -O2 -mcpu=sparsecore-tec-vf < %s \
+; RUN: | llc -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -tpu-pipeliner-strategy=swingslack -tpu-fixed-vregs=32-63 \
+; RUN: -tpu-enable-pipeliner-super-pass -tpu-pipeliner-annotate-for-testing \
+; RUN: -enable-pre-spill -debug-only=tpu-loop-analysis -tpu-enable-loop-analysis \
+; RUN: 2>&1 | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; CHECK: Post-RA pipelined loop bb.8 (from bb.8): II=36
+
+declare i8* @malloc(i64)
+
+declare void @free(i8*)
+
+define void @execute3_lowered() #0 !dbg !14 {
+ %1 = call i32 addrspace(208)* @llvm.tpu.allocate.dreg(i32 0, i32 0)
+ br label %2
+
+2: ; preds = %0
+ br label %3
+
+3: ; preds = %2
+ ret void
+}
+
+define void @access2_lowered() #1 !dbg !18 {
+ %1 = call i32 addrspace(208)* @llvm.tpu.allocate.dreg(i32 0, i32 0)
+ br label %2
+
+2: ; preds = %0
+ br label %3
+
+3: ; preds = %2
+ ret void
+}
+
+define void @execute1_lowered() #0 !dbg !19 {
+ %1 = call i32 addrspace(208)* @llvm.tpu.allocate.dreg(i32 11, i32 0), !dbg !21
+ %2 = getelementptr i32, i32 addrspace(208)* %1, i32 0, !dbg !21
+ %3 = load i32, i32 addrspace(208)* %2, align 4, !dbg !21
+ %4 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %3), !dbg !21
+ %5 = getelementptr i32, i32 addrspace(208)* %1, i32 1, !dbg !21
+ %6 = load i32, i32 addrspace(208)* %5, align 4, !dbg !21
+ %7 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %6), !dbg !21
+ %8 = getelementptr i32, i32 addrspace(208)* %1, i32 2, !dbg !21
+ %9 = load i32, i32 addrspace(208)* %8, align 4, !dbg !21
+ %10 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %9), !dbg !21
+ %11 = getelementptr i32, i32 addrspace(208)* %1, i32 3, !dbg !21
+ %12 = load i32, i32 addrspace(208)* %11, align 4, !dbg !21
+ %13 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %12), !dbg !21
+ %14 = getelementptr i32, i32 addrspace(208)* %1, i32 4, !dbg !21
+ %15 = load i32, i32 addrspace(208)* %14, align 4, !dbg !21
+ %16 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %15), !dbg !21
+ %17 = getelementptr i32, i32 addrspace(208)* %1, i32 5, !dbg !21
+ %18 = load i32, i32 addrspace(208)* %17, align 4, !dbg !21
+ %19 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %18), !dbg !21
+ %20 = getelementptr i32, i32 addrspace(208)* %1, i32 6, !dbg !21
+ %21 = load i32, i32 addrspace(208)* %20, align 4, !dbg !21
+ %22 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %21), !dbg !21
+ %23 = getelementptr i32, i32 addrspace(208)* %1, i32 7, !dbg !21
+ %24 = load i32, i32 addrspace(208)* %23, align 4, !dbg !21
+ %25 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %24), !dbg !21
+ %26 = getelementptr i32, i32 addrspace(208)* %1, i32 8, !dbg !21
+ %27 = load i32, i32 addrspace(208)* %26, align 4, !dbg !21
+ %28 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %27), !dbg !21
+ %29 = getelementptr i32, i32 addrspace(208)* %1, i32 9, !dbg !21
+ %30 = load i32, i32 addrspace(208)* %29, align 4, !dbg !21
+ %31 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %30), !dbg !21
+ %32 = getelementptr i32, i32 addrspace(208)* %1, i32 10, !dbg !21
+ %33 = load i32, i32 addrspace(208)* %32, align 4, !dbg !21
+ %34 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %33), !dbg !21
+ br label %35, !dbg !21
+
+35: ; preds = %0
+ %36 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(203)* %4, 0, !dbg !21
+ %37 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %36, float addrspace(203)* %4, 1, !dbg !21
+ %38 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %37, i32 0, 2, !dbg !21
+ %39 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %38, i32 2048, 3, 0, !dbg !21
+ %40 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %39, i32 8, 4, 0, !dbg !21
+ %41 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %40, i32 8, 3, 1, !dbg !21
+ %42 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %41, i32 1, 4, 1, !dbg !21
+ %43 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(203)* %7, 0, !dbg !21
+ %44 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %43, float addrspace(203)* %7, 1, !dbg !21
+ %45 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %44, i32 0, 2, !dbg !21
+ %46 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %45, i32 2048, 3, 0, !dbg !21
+ %47 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %46, i32 8, 4, 0, !dbg !21
+ %48 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %47, i32 8, 3, 1, !dbg !21
+ %49 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %48, i32 1, 4, 1, !dbg !21
+ %50 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(203)* %10, 0, !dbg !21
+ %51 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %50, float addrspace(203)* %10, 1, !dbg !21
+ %52 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %51, i32 0, 2, !dbg !21
+ %53 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %52, i32 2048, 3, 0, !dbg !21
+ %54 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %53, i32 8, 4, 0, !dbg !21
+ %55 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %54, i32 8, 3, 1, !dbg !21
+ %56 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %55, i32 1, 4, 1, !dbg !21
+ %57 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(203)* %13, 0, !dbg !21
+ %58 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %57, float addrspace(203)* %13, 1, !dbg !21
+ %59 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %58, i32 0, 2, !dbg !21
+ %60 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %59, i32 2048, 3, 0, !dbg !21
+ %61 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %60, i32 8, 4, 0, !dbg !21
+ %62 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %61, i32 8, 3, 1, !dbg !21
+ %63 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %62, i32 1, 4, 1, !dbg !21
+ %64 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } undef, float addrspace(203)* %16, 0, !dbg !21
+ %65 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %64, float addrspace(203)* %16, 1, !dbg !21
+ %66 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %65, i32 0, 2, !dbg !21
+ %67 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %66, i32 8, 3, 0, !dbg !21
+ %68 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %67, i32 1, 4, 0, !dbg !21
+ %69 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } undef, float addrspace(203)* %19, 0, !dbg !21
+ %70 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %69, float addrspace(203)* %19, 1, !dbg !21
+ %71 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %70, i32 0, 2, !dbg !21
+ %72 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %71, i32 8, 3, 0, !dbg !21
+ %73 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %72, i32 1, 4, 0, !dbg !21
+ %74 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } undef, float addrspace(203)* %22, 0, !dbg !21
+ %75 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %74, float addrspace(203)* %22, 1, !dbg !21
+ %76 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %75, i32 0, 2, !dbg !21
+ %77 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %76, i32 8, 3, 0, !dbg !21
+ %78 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %77, i32 1, 4, 0, !dbg !21
+ %79 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } undef, float addrspace(203)* %25, 0, !dbg !21
+ %80 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %79, float addrspace(203)* %25, 1, !dbg !21
+ %81 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %80, i32 0, 2, !dbg !21
+ %82 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %81, i32 8, 3, 0, !dbg !21
+ %83 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %82, i32 1, 4, 0, !dbg !21
+ %84 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(203)* %28, 0, !dbg !21
+ %85 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %84, float addrspace(203)* %28, 1, !dbg !21
+ %86 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %85, i32 0, 2, !dbg !21
+ %87 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %86, i32 2048, 3, 0, !dbg !21
+ %88 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %87, i32 8, 4, 0, !dbg !21
+ %89 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %88, i32 8, 3, 1, !dbg !21
+ %90 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %89, i32 1, 4, 1, !dbg !21
+ %91 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(203)* %31, 0, !dbg !21
+ %92 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %91, float addrspace(203)* %31, 1, !dbg !21
+ %93 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %92, i32 0, 2, !dbg !21
+ %94 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %93, i32 2048, 3, 0, !dbg !21
+ %95 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %94, i32 8, 4, 0, !dbg !21
+ %96 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %95, i32 8, 3, 1, !dbg !21
+ %97 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %96, i32 1, 4, 1, !dbg !21
+ %98 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(203)* %34, 0, !dbg !21
+ %99 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %98, float addrspace(203)* %34, 1, !dbg !21
+ %100 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %99, i32 0, 2, !dbg !21
+ %101 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %100, i32 2048, 3, 0, !dbg !21
+ %102 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %101, i32 8, 4, 0, !dbg !21
+ %103 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %102, i32 8, 3, 1, !dbg !21
+ %104 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %103, i32 1, 4, 1, !dbg !21
+ br label %105, !dbg !21
+
+105: ; preds = %35
+ %106 = call i32 @llvm.tpu.tileid(), !dbg !21
+ %107 = mul i32 %106, 32, !dbg !21
+ %108 = call i32 addrspace(201)* @llvm.tpu.allocate.tilespmem(i32 lshr (i32 add (i32 ptrtoint (float addrspace(201)* getelementptr (float, float addrspace(201)* null, i32 512) to i32), i32 3), i32 2), i32 0), !dbg !21
+ %b108 = bitcast i32 addrspace(201)* %108 to <8 x i32> addrspace(201)*
+ %109 = bitcast <8 x i32> addrspace(201)* %b108 to float addrspace(201)*, !dbg !21
+ %110 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } undef, float addrspace(201)* %109, 0, !dbg !21
+ %111 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %110, float addrspace(201)* %109, 1, !dbg !21
+ %112 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %111, i32 0, 2, !dbg !21
+ %113 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %112, i32 2, 3, 0, !dbg !21
+ %114 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %113, i32 32, 3, 1, !dbg !21
+ %115 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %114, i32 8, 3, 2, !dbg !21
+ %116 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %115, i32 256, 4, 0, !dbg !21
+ %117 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %116, i32 8, 4, 1, !dbg !21
+ %118 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %117, i32 1, 4, 2, !dbg !21
+ %119 = call i32 addrspace(201)* @llvm.tpu.allocate.tilespmem(i32 lshr (i32 add (i32 ptrtoint (float addrspace(201)* getelementptr (float, float addrspace(201)* null, i32 512) to i32), i32 3), i32 2), i32 512), !dbg !21
+ %b119 = bitcast i32 addrspace(201)* %119 to <8 x i32> addrspace(201)*
+ %120 = bitcast <8 x i32> addrspace(201)* %b119 to float addrspace(201)*, !dbg !21
+ %121 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } undef, float addrspace(201)* %120, 0, !dbg !21
+ %122 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %121, float addrspace(201)* %120, 1, !dbg !21
+ %123 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %122, i32 0, 2, !dbg !21
+ %124 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %123, i32 2, 3, 0, !dbg !21
+ %125 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %124, i32 32, 3, 1, !dbg !21
+ %126 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %125, i32 8, 3, 2, !dbg !21
+ %127 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %126, i32 256, 4, 0, !dbg !21
+ %128 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %127, i32 8, 4, 1, !dbg !21
+ %129 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %128, i32 1, 4, 2, !dbg !21
+ %130 = call i32 addrspace(201)* @llvm.tpu.allocate.tilespmem(i32 lshr (i32 add (i32 ptrtoint (float addrspace(201)* getelementptr (float, float addrspace(201)* null, i32 512) to i32), i32 3), i32 2), i32 1024), !dbg !21
+ %b130 = bitcast i32 addrspace(201)* %130 to <8 x i32> addrspace(201)*
+ %131 = bitcast <8 x i32> addrspace(201)* %b130 to float addrspace(201)*, !dbg !21
+ %132 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } undef, float addrspace(201)* %131, 0, !dbg !21
+ %133 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %132, float addrspace(201)* %131, 1, !dbg !21
+ %134 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %133, i32 0, 2, !dbg !21
+ %135 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %134, i32 2, 3, 0, !dbg !21
+ %136 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %135, i32 32, 3, 1, !dbg !21
+ %137 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %136, i32 8, 3, 2, !dbg !21
+ %138 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %137, i32 256, 4, 0, !dbg !21
+ %139 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %138, i32 8, 4, 1, !dbg !21
+ %140 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %139, i32 1, 4, 2, !dbg !21
+ %141 = call i32 addrspace(201)* @llvm.tpu.allocate.tilespmem(i32 lshr (i32 add (i32 ptrtoint (float addrspace(201)* getelementptr (float, float addrspace(201)* null, i32 512) to i32), i32 3), i32 2), i32 1536), !dbg !21
+ %b141 = bitcast i32 addrspace(201)* %141 to <8 x i32> addrspace(201)*
+ %142 = bitcast <8 x i32> addrspace(201)* %b141 to float addrspace(201)*, !dbg !21
+ %143 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } undef, float addrspace(201)* %142, 0, !dbg !21
+ %144 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %143, float addrspace(201)* %142, 1, !dbg !21
+ %145 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %144, i32 0, 2, !dbg !21
+ %146 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %145, i32 2, 3, 0, !dbg !21
+ %147 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %146, i32 32, 3, 1, !dbg !21
+ %148 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %147, i32 8, 3, 2, !dbg !21
+ %149 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %148, i32 256, 4, 0, !dbg !21
+ %150 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %149, i32 8, 4, 1, !dbg !21
+ %151 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %150, i32 1, 4, 2, !dbg !21
+ %152 = call i32 addrspace(201)* @llvm.tpu.allocate.tilespmem(i32 lshr (i32 add (i32 ptrtoint (float addrspace(201)* getelementptr (float, float addrspace(201)* null, i32 1) to i32), i32 3), i32 2), i32 2048), !dbg !21
+ %b152 = bitcast i32 addrspace(201)* %152 to <8 x i32> addrspace(201)*
+ %153 = bitcast <8 x i32> addrspace(201)* %b152 to float addrspace(201)*, !dbg !21
+ %154 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } undef, float addrspace(201)* %153, 0, !dbg !21
+ %155 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } %154, float addrspace(201)* %153, 1, !dbg !21
+ %156 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } %155, i32 0, 2, !dbg !21
+ %157 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } %156, i32 1, 3, 0, !dbg !21
+ %158 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } %157, i32 1, 4, 0, !dbg !21
+ %159 = call i32 addrspace(201)* @llvm.tpu.allocate.tilespmem(i32 lshr (i32 add (i32 ptrtoint (float addrspace(201)* getelementptr (float, float addrspace(201)* null, i32 1) to i32), i32 3), i32 2), i32 2049), !dbg !21
+ %b159 = bitcast i32 addrspace(201)* %159 to <8 x i32> addrspace(201)*
+ %160 = bitcast <8 x i32> addrspace(201)* %b159 to float addrspace(201)*, !dbg !21
+ %161 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } undef, float addrspace(201)* %160, 0, !dbg !21
+ %162 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } %161, float addrspace(201)* %160, 1, !dbg !21
+ %163 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } %162, i32 0, 2, !dbg !21
+ %164 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } %163, i32 1, 3, 0, !dbg !21
+ %165 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } %164, i32 1, 4, 0, !dbg !21
+ %166 = call i32 addrspace(201)* @llvm.tpu.allocate.tilespmem(i32 lshr (i32 add (i32 ptrtoint (float addrspace(201)* getelementptr (float, float addrspace(201)* null, i32 1) to i32), i32 3), i32 2), i32 2050), !dbg !21
+ %b166 = bitcast i32 addrspace(201)* %166 to <8 x i32> addrspace(201)*
+ %167 = bitcast <8 x i32> addrspace(201)* %b166 to float addrspace(201)*, !dbg !21
+ %168 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } undef, float addrspace(201)* %167, 0, !dbg !21
+ %169 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } %168, float addrspace(201)* %167, 1, !dbg !21
+ %170 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } %169, i32 0, 2, !dbg !21
+ %171 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } %170, i32 1, 3, 0, !dbg !21
+ %172 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } %171, i32 1, 4, 0, !dbg !21
+ %173 = call i32 addrspace(201)* @llvm.tpu.allocate.tilespmem(i32 lshr (i32 add (i32 ptrtoint (float addrspace(201)* getelementptr (float, float addrspace(201)* null, i32 1) to i32), i32 3), i32 2), i32 2051), !dbg !21
+ %b173 = bitcast i32 addrspace(201)* %173 to <8 x i32> addrspace(201)*
+ %174 = bitcast <8 x i32> addrspace(201)* %b173 to float addrspace(201)*, !dbg !21
+ %175 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } undef, float addrspace(201)* %174, 0, !dbg !21
+ %176 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } %175, float addrspace(201)* %174, 1, !dbg !21
+ %177 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } %176, i32 0, 2, !dbg !21
+ %178 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } %177, i32 1, 3, 0, !dbg !21
+ %179 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } %178, i32 1, 4, 0, !dbg !21
+ %180 = call i32 addrspace(201)* @llvm.tpu.allocate.tilespmem(i32 lshr (i32 add (i32 ptrtoint (float addrspace(201)* getelementptr (float, float addrspace(201)* null, i32 512) to i32), i32 3), i32 2), i32 2052), !dbg !21
+ %b180 = bitcast i32 addrspace(201)* %180 to <8 x i32> addrspace(201)*
+ %181 = bitcast <8 x i32> addrspace(201)* %b180 to float addrspace(201)*, !dbg !21
+ %182 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } undef, float addrspace(201)* %181, 0, !dbg !21
+ %183 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %182, float addrspace(201)* %181, 1, !dbg !21
+ %184 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %183, i32 0, 2, !dbg !21
+ %185 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %184, i32 2, 3, 0, !dbg !21
+ %186 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %185, i32 32, 3, 1, !dbg !21
+ %187 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %186, i32 8, 3, 2, !dbg !21
+ %188 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %187, i32 256, 4, 0, !dbg !21
+ %189 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %188, i32 8, 4, 1, !dbg !21
+ %190 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %189, i32 1, 4, 2, !dbg !21
+ %191 = call i32 addrspace(201)* @llvm.tpu.allocate.tilespmem(i32 lshr (i32 add (i32 ptrtoint (float addrspace(201)* getelementptr (float, float addrspace(201)* null, i32 512) to i32), i32 3), i32 2), i32 2564), !dbg !21
+ %b191 = bitcast i32 addrspace(201)* %191 to <8 x i32> addrspace(201)*
+ %192 = bitcast <8 x i32> addrspace(201)* %b191 to float addrspace(201)*, !dbg !21
+ %193 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } undef, float addrspace(201)* %192, 0, !dbg !21
+ %194 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %193, float addrspace(201)* %192, 1, !dbg !21
+ %195 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %194, i32 0, 2, !dbg !21
+ %196 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %195, i32 2, 3, 0, !dbg !21
+ %197 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %196, i32 32, 3, 1, !dbg !21
+ %198 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %197, i32 8, 3, 2, !dbg !21
+ %199 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %198, i32 256, 4, 0, !dbg !21
+ %200 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %199, i32 8, 4, 1, !dbg !21
+ %201 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %200, i32 1, 4, 2, !dbg !21
+ %202 = call i32 addrspace(201)* @llvm.tpu.allocate.tilespmem(i32 lshr (i32 add (i32 ptrtoint (float addrspace(201)* getelementptr (float, float addrspace(201)* null, i32 512) to i32), i32 3), i32 2), i32 3076), !dbg !21
+ %b202 = bitcast i32 addrspace(201)* %202 to <8 x i32> addrspace(201)*
+ %203 = bitcast <8 x i32> addrspace(201)* %b202 to float addrspace(201)*, !dbg !21
+ %204 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } undef, float addrspace(201)* %203, 0, !dbg !21
+ %205 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %204, float addrspace(201)* %203, 1, !dbg !21
+ %206 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %205, i32 0, 2, !dbg !21
+ %207 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %206, i32 2, 3, 0, !dbg !21
+ %208 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %207, i32 32, 3, 1, !dbg !21
+ %209 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %208, i32 8, 3, 2, !dbg !21
+ %210 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %209, i32 256, 4, 0, !dbg !21
+ %211 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %210, i32 8, 4, 1, !dbg !21
+ %212 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %211, i32 1, 4, 2, !dbg !21
+ %213 = call i32 addrspace(204)* @llvm.tpu.allocate.sflag(i32 lshr (i32 add (i32 ptrtoint (i32 addrspace(204)* getelementptr (i32, i32 addrspace(204)* null, i32 2) to i32), i32 3), i32 2), i32 0), !dbg !21
+ %214 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } undef, i32 addrspace(204)* %213, 0, !dbg !21
+ %215 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %214, i32 addrspace(204)* %213, 1, !dbg !21
+ %216 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %215, i32 0, 2, !dbg !21
+ %217 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %216, i32 2, 3, 0, !dbg !21
+ %218 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %217, i32 1, 3, 1, !dbg !21
+ %219 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %218, i32 1, 4, 0, !dbg !21
+ %220 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %219, i32 1, 4, 1, !dbg !21
+ %221 = call i32 addrspace(204)* @llvm.tpu.allocate.sflag(i32 lshr (i32 add (i32 ptrtoint (i32 addrspace(204)* getelementptr (i32, i32 addrspace(204)* null, i32 2) to i32), i32 3), i32 2), i32 2), !dbg !21
+ %222 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } undef, i32 addrspace(204)* %221, 0, !dbg !21
+ %223 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %222, i32 addrspace(204)* %221, 1, !dbg !21
+ %224 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %223, i32 0, 2, !dbg !21
+ %225 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %224, i32 2, 3, 0, !dbg !21
+ %226 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %225, i32 1, 3, 1, !dbg !21
+ %227 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %226, i32 1, 4, 0, !dbg !21
+ %228 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %227, i32 1, 4, 1, !dbg !21
+ %229 = sub i32 2048, %107, !dbg !21
+ %230 = udiv i32 %229, 32, !dbg !21
+ %231 = mul i32 %230, 32, !dbg !21
+ %232 = icmp ugt i32 %229, %231, !dbg !21
+ %233 = select i1 %232, i32 1, i32 0, !dbg !21
+ %234 = add i32 %230, %233, !dbg !21
+ %235 = add i32 %234, 2, !dbg !21
+ br label %236, !dbg !21
+
+236: ; preds = %1042, %105
+ %237 = phi i32 [ %1050, %1042 ], [ 0, %105 ]
+ %238 = phi i32 [ %1049, %1042 ], [ %107, %105 ]
+ %239 = phi i32 [ %1045, %1042 ], [ 0, %105 ]
+ %240 = phi i32 [ %238, %1042 ], [ 0, %105 ]
+ %241 = phi i32 [ %239, %1042 ], [ 0, %105 ]
+ %242 = icmp slt i32 %237, %235, !dbg !21
+ br i1 %242, label %243, label %1051, !dbg !21, !llvm.loop !23
+
+243: ; preds = %236
+ %244 = icmp sge i32 %237, 0, !dbg !21
+ %245 = icmp slt i32 %237, %234, !dbg !21
+ %246 = and i1 %244, %245, !dbg !21
+ br i1 %246, label %247, label %449, !dbg !21
+
+247: ; preds = %243
+ %248 = add i32 %237, 1, !dbg !21
+ %249 = urem i32 %248, 2, !dbg !21
+ %250 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %220, 0, !dbg !21
+ %251 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } undef, i32 addrspace(204)* %250, 0, !dbg !21
+ %252 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %220, 1, !dbg !21
+ %253 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %251, i32 addrspace(204)* %252, 1, !dbg !21
+ %254 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %220, 4, 0, !dbg !21
+ %255 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %220, 4, 1, !dbg !21
+ %256 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %220, 2, !dbg !21
+ %257 = mul i32 %249, %254, !dbg !21
+ %258 = add i32 %256, %257, !dbg !21
+ %259 = mul i32 0, %255, !dbg !21
+ %260 = add i32 %258, %259, !dbg !21
+ %261 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %253, i32 %260, 2, !dbg !21
+ %262 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %261, i32 1, 3, 0, !dbg !21
+ %263 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %262, i32 1, 4, 0, !dbg !21
+ %264 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %118, 0, !dbg !21
+ %265 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(201)* %264, 0, !dbg !21
+ %266 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %118, 1, !dbg !21
+ %267 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %265, float addrspace(201)* %266, 1, !dbg !21
+ %268 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %118, 4, 0, !dbg !21
+ %269 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %118, 4, 1, !dbg !21
+ %270 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %118, 4, 2, !dbg !21
+ %271 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %118, 2, !dbg !21
+ %272 = mul i32 %249, %268, !dbg !21
+ %273 = add i32 %271, %272, !dbg !21
+ %274 = mul i32 0, %269, !dbg !21
+ %275 = add i32 %273, %274, !dbg !21
+ %276 = mul i32 0, %270, !dbg !21
+ %277 = add i32 %275, %276, !dbg !21
+ %278 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %267, i32 %277, 2, !dbg !21
+ %279 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %278, i32 8, 3, 1, !dbg !21
+ %280 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %279, i32 1, 4, 1, !dbg !21
+ %281 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %280, i32 32, 3, 0, !dbg !21
+ %282 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %281, i32 8, 4, 0, !dbg !21
+ %283 = extractvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %42, 1, !dbg !21
+ %284 = mul i32 %238, 8, !dbg !21
+ %285 = add i32 %284, %239, !dbg !21
+ %286 = getelementptr float, float addrspace(203)* %283, i32 %285, !dbg !21
+ %287 = bitcast float addrspace(203)* %286 to i32 addrspace(203)*, !dbg !21
+ %288 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %282, 1, !dbg !21
+ %289 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %282, 2, !dbg !21
+ %290 = add i32 %289, 0, !dbg !21
+ %291 = add i32 %290, 0, !dbg !21
+ %292 = getelementptr float, float addrspace(201)* %288, i32 %291, !dbg !21
+ %293 = bitcast float addrspace(201)* %292 to i32 addrspace(201)*, !dbg !21
+ %294 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %263, 1, !dbg !21
+ %295 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %263, 2, !dbg !21
+ %296 = add i32 %295, 0, !dbg !21
+ %297 = getelementptr i32, i32 addrspace(204)* %294, i32 %296, !dbg !21
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %297, i32 0, i32 addrspace(203)* %287, i32 addrspace(201)* %293, i32 256, i32 0), !dbg !21
+ %298 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %129, 0, !dbg !21
+ %299 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(201)* %298, 0, !dbg !21
+ %300 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %129, 1, !dbg !21
+ %301 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %299, float addrspace(201)* %300, 1, !dbg !21
+ %302 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %129, 4, 0, !dbg !21
+ %303 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %129, 4, 1, !dbg !21
+ %304 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %129, 4, 2, !dbg !21
+ %305 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %129, 2, !dbg !21
+ %306 = mul i32 %249, %302, !dbg !21
+ %307 = add i32 %305, %306, !dbg !21
+ %308 = mul i32 0, %303, !dbg !21
+ %309 = add i32 %307, %308, !dbg !21
+ %310 = mul i32 0, %304, !dbg !21
+ %311 = add i32 %309, %310, !dbg !21
+ %312 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %301, i32 %311, 2, !dbg !21
+ %313 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %312, i32 8, 3, 1, !dbg !21
+ %314 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %313, i32 1, 4, 1, !dbg !21
+ %315 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %314, i32 32, 3, 0, !dbg !21
+ %316 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %315, i32 8, 4, 0, !dbg !21
+ %317 = extractvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %49, 1, !dbg !21
+ %318 = mul i32 %238, 8, !dbg !21
+ %319 = add i32 %318, %239, !dbg !21
+ %320 = getelementptr float, float addrspace(203)* %317, i32 %319, !dbg !21
+ %321 = bitcast float addrspace(203)* %320 to i32 addrspace(203)*, !dbg !21
+ %322 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %316, 1, !dbg !21
+ %323 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %316, 2, !dbg !21
+ %324 = add i32 %323, 0, !dbg !21
+ %325 = add i32 %324, 0, !dbg !21
+ %326 = getelementptr float, float addrspace(201)* %322, i32 %325, !dbg !21
+ %327 = bitcast float addrspace(201)* %326 to i32 addrspace(201)*, !dbg !21
+ %328 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %263, 1, !dbg !21
+ %329 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %263, 2, !dbg !21
+ %330 = add i32 %329, 0, !dbg !21
+ %331 = getelementptr i32, i32 addrspace(204)* %328, i32 %330, !dbg !21
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %331, i32 0, i32 addrspace(203)* %321, i32 addrspace(201)* %327, i32 256, i32 0), !dbg !21
+ %332 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %140, 0, !dbg !21
+ %333 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(201)* %332, 0, !dbg !21
+ %334 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %140, 1, !dbg !21
+ %335 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %333, float addrspace(201)* %334, 1, !dbg !21
+ %336 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %140, 4, 0, !dbg !21
+ %337 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %140, 4, 1, !dbg !21
+ %338 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %140, 4, 2, !dbg !21
+ %339 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %140, 2, !dbg !21
+ %340 = mul i32 %249, %336, !dbg !21
+ %341 = add i32 %339, %340, !dbg !21
+ %342 = mul i32 0, %337, !dbg !21
+ %343 = add i32 %341, %342, !dbg !21
+ %344 = mul i32 0, %338, !dbg !21
+ %345 = add i32 %343, %344, !dbg !21
+ %346 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %335, i32 %345, 2, !dbg !21
+ %347 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %346, i32 8, 3, 1, !dbg !21
+ %348 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %347, i32 1, 4, 1, !dbg !21
+ %349 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %348, i32 32, 3, 0, !dbg !21
+ %350 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %349, i32 8, 4, 0, !dbg !21
+ %351 = extractvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %56, 1, !dbg !21
+ %352 = mul i32 %238, 8, !dbg !21
+ %353 = add i32 %352, %239, !dbg !21
+ %354 = getelementptr float, float addrspace(203)* %351, i32 %353, !dbg !21
+ %355 = bitcast float addrspace(203)* %354 to i32 addrspace(203)*, !dbg !21
+ %356 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %350, 1, !dbg !21
+ %357 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %350, 2, !dbg !21
+ %358 = add i32 %357, 0, !dbg !21
+ %359 = add i32 %358, 0, !dbg !21
+ %360 = getelementptr float, float addrspace(201)* %356, i32 %359, !dbg !21
+ %361 = bitcast float addrspace(201)* %360 to i32 addrspace(201)*, !dbg !21
+ %362 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %263, 1, !dbg !21
+ %363 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %263, 2, !dbg !21
+ %364 = add i32 %363, 0, !dbg !21
+ %365 = getelementptr i32, i32 addrspace(204)* %362, i32 %364, !dbg !21
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %365, i32 0, i32 addrspace(203)* %355, i32 addrspace(201)* %361, i32 256, i32 0), !dbg !21
+ %366 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %151, 0, !dbg !21
+ %367 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(201)* %366, 0, !dbg !21
+ %368 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %151, 1, !dbg !21
+ %369 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %367, float addrspace(201)* %368, 1, !dbg !21
+ %370 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %151, 4, 0, !dbg !21
+ %371 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %151, 4, 1, !dbg !21
+ %372 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %151, 4, 2, !dbg !21
+ %373 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %151, 2, !dbg !21
+ %374 = mul i32 %249, %370, !dbg !21
+ %375 = add i32 %373, %374, !dbg !21
+ %376 = mul i32 0, %371, !dbg !21
+ %377 = add i32 %375, %376, !dbg !21
+ %378 = mul i32 0, %372, !dbg !21
+ %379 = add i32 %377, %378, !dbg !21
+ %380 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %369, i32 %379, 2, !dbg !21
+ %381 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %380, i32 8, 3, 1, !dbg !21
+ %382 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %381, i32 1, 4, 1, !dbg !21
+ %383 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %382, i32 32, 3, 0, !dbg !21
+ %384 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %383, i32 8, 4, 0, !dbg !21
+ %385 = extractvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %63, 1, !dbg !21
+ %386 = mul i32 %238, 8, !dbg !21
+ %387 = add i32 %386, %239, !dbg !21
+ %388 = getelementptr float, float addrspace(203)* %385, i32 %387, !dbg !21
+ %389 = bitcast float addrspace(203)* %388 to i32 addrspace(203)*, !dbg !21
+ %390 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %384, 1, !dbg !21
+ %391 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %384, 2, !dbg !21
+ %392 = add i32 %391, 0, !dbg !21
+ %393 = add i32 %392, 0, !dbg !21
+ %394 = getelementptr float, float addrspace(201)* %390, i32 %393, !dbg !21
+ %395 = bitcast float addrspace(201)* %394 to i32 addrspace(201)*, !dbg !21
+ %396 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %263, 1, !dbg !21
+ %397 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %263, 2, !dbg !21
+ %398 = add i32 %397, 0, !dbg !21
+ %399 = getelementptr i32, i32 addrspace(204)* %396, i32 %398, !dbg !21
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %399, i32 0, i32 addrspace(203)* %389, i32 addrspace(201)* %395, i32 256, i32 0), !dbg !21
+ %400 = icmp eq i32 %237, 0, !dbg !21
+ br i1 %400, label %401, label %412, !dbg !21
+
+401: ; preds = %247
+ %402 = extractvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %68, 1, !dbg !21
+ %403 = getelementptr float, float addrspace(203)* %402, i32 0, !dbg !21
+ %404 = bitcast float addrspace(203)* %403 to i32 addrspace(203)*, !dbg !21
+ %405 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } %158, 1, !dbg !21
+ %406 = getelementptr float, float addrspace(201)* %405, i32 0, !dbg !21
+ %407 = bitcast float addrspace(201)* %406 to i32 addrspace(201)*, !dbg !21
+ %408 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %263, 1, !dbg !21
+ %409 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %263, 2, !dbg !21
+ %410 = add i32 %409, 0, !dbg !21
+ %411 = getelementptr i32, i32 addrspace(204)* %408, i32 %410, !dbg !21
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %411, i32 0, i32 addrspace(203)* %404, i32 addrspace(201)* %407, i32 1, i32 0), !dbg !21
+ br label %412, !dbg !21
+
+412: ; preds = %401, %247
+ br i1 %400, label %413, label %424, !dbg !21
+
+413: ; preds = %412
+ %414 = extractvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %73, 1, !dbg !21
+ %415 = getelementptr float, float addrspace(203)* %414, i32 0, !dbg !21
+ %416 = bitcast float addrspace(203)* %415 to i32 addrspace(203)*, !dbg !21
+ %417 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } %165, 1, !dbg !21
+ %418 = getelementptr float, float addrspace(201)* %417, i32 0, !dbg !21
+ %419 = bitcast float addrspace(201)* %418 to i32 addrspace(201)*, !dbg !21
+ %420 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %263, 1, !dbg !21
+ %421 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %263, 2, !dbg !21
+ %422 = add i32 %421, 0, !dbg !21
+ %423 = getelementptr i32, i32 addrspace(204)* %420, i32 %422, !dbg !21
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %423, i32 0, i32 addrspace(203)* %416, i32 addrspace(201)* %419, i32 1, i32 0), !dbg !21
+ br label %424, !dbg !21
+
+424: ; preds = %413, %412
+ br i1 %400, label %425, label %436, !dbg !21
+
+425: ; preds = %424
+ %426 = extractvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %78, 1, !dbg !21
+ %427 = getelementptr float, float addrspace(203)* %426, i32 0, !dbg !21
+ %428 = bitcast float addrspace(203)* %427 to i32 addrspace(203)*, !dbg !21
+ %429 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } %172, 1, !dbg !21
+ %430 = getelementptr float, float addrspace(201)* %429, i32 0, !dbg !21
+ %431 = bitcast float addrspace(201)* %430 to i32 addrspace(201)*, !dbg !21
+ %432 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %263, 1, !dbg !21
+ %433 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %263, 2, !dbg !21
+ %434 = add i32 %433, 0, !dbg !21
+ %435 = getelementptr i32, i32 addrspace(204)* %432, i32 %434, !dbg !21
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %435, i32 0, i32 addrspace(203)* %428, i32 addrspace(201)* %431, i32 1, i32 0), !dbg !21
+ br label %436, !dbg !21
+
+436: ; preds = %425, %424
+ br i1 %400, label %437, label %448, !dbg !21
+
+437: ; preds = %436
+ %438 = extractvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %83, 1, !dbg !21
+ %439 = getelementptr float, float addrspace(203)* %438, i32 0, !dbg !21
+ %440 = bitcast float addrspace(203)* %439 to i32 addrspace(203)*, !dbg !21
+ %441 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } %179, 1, !dbg !21
+ %442 = getelementptr float, float addrspace(201)* %441, i32 0, !dbg !21
+ %443 = bitcast float addrspace(201)* %442 to i32 addrspace(201)*, !dbg !21
+ %444 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %263, 1, !dbg !21
+ %445 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %263, 2, !dbg !21
+ %446 = add i32 %445, 0, !dbg !21
+ %447 = getelementptr i32, i32 addrspace(204)* %444, i32 %446, !dbg !21
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %447, i32 0, i32 addrspace(203)* %440, i32 addrspace(201)* %443, i32 1, i32 0), !dbg !21
+ br label %448, !dbg !21
+
+448: ; preds = %437, %436
+ br label %449, !dbg !21
+
+449: ; preds = %448, %243
+ %450 = icmp sge i32 %237, 1, !dbg !21
+ %451 = add i32 %234, 1, !dbg !21
+ %452 = icmp slt i32 %237, %451, !dbg !21
+ %453 = and i1 %450, %452, !dbg !21
+ br i1 %453, label %454, label %1019, !dbg !21
+
+454: ; preds = %449
+ %455 = urem i32 %237, 2, !dbg !21
+ %456 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %220, 0, !dbg !21
+ %457 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } undef, i32 addrspace(204)* %456, 0, !dbg !21
+ %458 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %220, 1, !dbg !21
+ %459 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %457, i32 addrspace(204)* %458, 1, !dbg !21
+ %460 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %220, 4, 0, !dbg !21
+ %461 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %220, 4, 1, !dbg !21
+ %462 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %220, 2, !dbg !21
+ %463 = mul i32 %455, %460, !dbg !21
+ %464 = add i32 %462, %463, !dbg !21
+ %465 = mul i32 0, %461, !dbg !21
+ %466 = add i32 %464, %465, !dbg !21
+ %467 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %459, i32 %466, 2, !dbg !21
+ %468 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %467, i32 1, 3, 0, !dbg !21
+ %469 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %468, i32 1, 4, 0, !dbg !21
+ %470 = icmp eq i32 %237, 1, !dbg !21
+ br i1 %470, label %471, label %476, !dbg !21
+
+471: ; preds = %454
+ %472 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %469, 1, !dbg !21
+ %473 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %469, 2, !dbg !21
+ %474 = add i32 %473, 0, !dbg !21
+ %475 = getelementptr i32, i32 addrspace(204)* %472, i32 %474, !dbg !21
+ call void @llvm.tpu.waitge(i32 addrspace(204)* %475, i32 1028), !dbg !21
+ call void @llvm.tpu.syncadd(i32 addrspace(204)* %475, i32 -1028), !dbg !21
+ br label %481, !dbg !21
+
+476: ; preds = %454
+ %477 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %469, 1, !dbg !21
+ %478 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %469, 2, !dbg !21
+ %479 = add i32 %478, 0, !dbg !21
+ %480 = getelementptr i32, i32 addrspace(204)* %477, i32 %479, !dbg !21
+ call void @llvm.tpu.waitge(i32 addrspace(204)* %480, i32 1024), !dbg !21
+ call void @llvm.tpu.syncadd(i32 addrspace(204)* %480, i32 -1024), !dbg !21
+ br label %481, !dbg !21
+
+481: ; preds = %471, %476
+ %482 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %118, 0, !dbg !21
+ %483 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(201)* %482, 0, !dbg !21
+ %484 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %118, 1, !dbg !21
+ %485 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %483, float addrspace(201)* %484, 1, !dbg !21
+ %486 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %118, 4, 0, !dbg !21
+ %487 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %118, 4, 1, !dbg !21
+ %488 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %118, 4, 2, !dbg !21
+ %489 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %118, 2, !dbg !21
+ %490 = mul i32 %455, %486, !dbg !21
+ %491 = add i32 %489, %490, !dbg !21
+ %492 = mul i32 0, %487, !dbg !21
+ %493 = add i32 %491, %492, !dbg !21
+ %494 = mul i32 0, %488, !dbg !21
+ %495 = add i32 %493, %494, !dbg !21
+ %496 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %485, i32 %495, 2, !dbg !21
+ %497 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %496, i32 8, 3, 1, !dbg !21
+ %498 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %497, i32 1, 4, 1, !dbg !21
+ %499 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %498, i32 32, 3, 0, !dbg !21
+ %500 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %499, i32 8, 4, 0, !dbg !21
+ %501 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %129, 0, !dbg !21
+ %502 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(201)* %501, 0, !dbg !21
+ %503 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %129, 1, !dbg !21
+ %504 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %502, float addrspace(201)* %503, 1, !dbg !21
+ %505 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %129, 4, 0, !dbg !21
+ %506 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %129, 4, 1, !dbg !21
+ %507 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %129, 4, 2, !dbg !21
+ %508 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %129, 2, !dbg !21
+ %509 = mul i32 %455, %505, !dbg !21
+ %510 = add i32 %508, %509, !dbg !21
+ %511 = mul i32 0, %506, !dbg !21
+ %512 = add i32 %510, %511, !dbg !21
+ %513 = mul i32 0, %507, !dbg !21
+ %514 = add i32 %512, %513, !dbg !21
+ %515 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %504, i32 %514, 2, !dbg !21
+ %516 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %515, i32 8, 3, 1, !dbg !21
+ %517 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %516, i32 1, 4, 1, !dbg !21
+ %518 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %517, i32 32, 3, 0, !dbg !21
+ %519 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %518, i32 8, 4, 0, !dbg !21
+ %520 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %140, 0, !dbg !21
+ %521 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(201)* %520, 0, !dbg !21
+ %522 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %140, 1, !dbg !21
+ %523 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %521, float addrspace(201)* %522, 1, !dbg !21
+ %524 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %140, 4, 0, !dbg !21
+ %525 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %140, 4, 1, !dbg !21
+ %526 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %140, 4, 2, !dbg !21
+ %527 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %140, 2, !dbg !21
+ %528 = mul i32 %455, %524, !dbg !21
+ %529 = add i32 %527, %528, !dbg !21
+ %530 = mul i32 0, %525, !dbg !21
+ %531 = add i32 %529, %530, !dbg !21
+ %532 = mul i32 0, %526, !dbg !21
+ %533 = add i32 %531, %532, !dbg !21
+ %534 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %523, i32 %533, 2, !dbg !21
+ %535 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %534, i32 8, 3, 1, !dbg !21
+ %536 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %535, i32 1, 4, 1, !dbg !21
+ %537 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %536, i32 32, 3, 0, !dbg !21
+ %538 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %537, i32 8, 4, 0, !dbg !21
+ %539 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %151, 0, !dbg !21
+ %540 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(201)* %539, 0, !dbg !21
+ %541 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %151, 1, !dbg !21
+ %542 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %540, float addrspace(201)* %541, 1, !dbg !21
+ %543 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %151, 4, 0, !dbg !21
+ %544 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %151, 4, 1, !dbg !21
+ %545 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %151, 4, 2, !dbg !21
+ %546 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %151, 2, !dbg !21
+ %547 = mul i32 %455, %543, !dbg !21
+ %548 = add i32 %546, %547, !dbg !21
+ %549 = mul i32 0, %544, !dbg !21
+ %550 = add i32 %548, %549, !dbg !21
+ %551 = mul i32 0, %545, !dbg !21
+ %552 = add i32 %550, %551, !dbg !21
+ %553 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %542, i32 %552, 2, !dbg !21
+ %554 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %553, i32 8, 3, 1, !dbg !21
+ %555 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %554, i32 1, 4, 1, !dbg !21
+ %556 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %555, i32 32, 3, 0, !dbg !21
+ %557 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %556, i32 8, 4, 0, !dbg !21
+ %558 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %190, 0, !dbg !21
+ %559 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(201)* %558, 0, !dbg !21
+ %560 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %190, 1, !dbg !21
+ %561 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %559, float addrspace(201)* %560, 1, !dbg !21
+ %562 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %190, 4, 0, !dbg !21
+ %563 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %190, 4, 1, !dbg !21
+ %564 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %190, 4, 2, !dbg !21
+ %565 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %190, 2, !dbg !21
+ %566 = mul i32 %455, %562, !dbg !21
+ %567 = add i32 %565, %566, !dbg !21
+ %568 = mul i32 0, %563, !dbg !21
+ %569 = add i32 %567, %568, !dbg !21
+ %570 = mul i32 0, %564, !dbg !21
+ %571 = add i32 %569, %570, !dbg !21
+ %572 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %561, i32 %571, 2, !dbg !21
+ %573 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %572, i32 8, 3, 1, !dbg !21
+ %574 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %573, i32 1, 4, 1, !dbg !21
+ %575 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %574, i32 32, 3, 0, !dbg !21
+ %576 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %575, i32 8, 4, 0, !dbg !21
+ %577 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %201, 0, !dbg !21
+ %578 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(201)* %577, 0, !dbg !21
+ %579 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %201, 1, !dbg !21
+ %580 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %578, float addrspace(201)* %579, 1, !dbg !21
+ %581 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %201, 4, 0, !dbg !21
+ %582 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %201, 4, 1, !dbg !21
+ %583 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %201, 4, 2, !dbg !21
+ %584 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %201, 2, !dbg !21
+ %585 = mul i32 %455, %581, !dbg !21
+ %586 = add i32 %584, %585, !dbg !21
+ %587 = mul i32 0, %582, !dbg !21
+ %588 = add i32 %586, %587, !dbg !21
+ %589 = mul i32 0, %583, !dbg !21
+ %590 = add i32 %588, %589, !dbg !21
+ %591 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %580, i32 %590, 2, !dbg !21
+ %592 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %591, i32 8, 3, 1, !dbg !21
+ %593 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %592, i32 1, 4, 1, !dbg !21
+ %594 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %593, i32 32, 3, 0, !dbg !21
+ %595 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %594, i32 8, 4, 0, !dbg !21
+ %596 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %212, 0, !dbg !21
+ %597 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(201)* %596, 0, !dbg !21
+ %598 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %212, 1, !dbg !21
+ %599 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %597, float addrspace(201)* %598, 1, !dbg !21
+ %600 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %212, 4, 0, !dbg !21
+ %601 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %212, 4, 1, !dbg !21
+ %602 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %212, 4, 2, !dbg !21
+ %603 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [3 x i32], [3 x i32] } %212, 2, !dbg !21
+ %604 = mul i32 %455, %600, !dbg !21
+ %605 = add i32 %603, %604, !dbg !21
+ %606 = mul i32 0, %601, !dbg !21
+ %607 = add i32 %605, %606, !dbg !21
+ %608 = mul i32 0, %602, !dbg !21
+ %609 = add i32 %607, %608, !dbg !21
+ %610 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %599, i32 %609, 2, !dbg !21
+ %611 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %610, i32 8, 3, 1, !dbg !21
+ %612 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %611, i32 1, 4, 1, !dbg !21
+ %613 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %612, i32 32, 3, 0, !dbg !21
+ %614 = insertvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %613, i32 8, 4, 0, !dbg !21
+ %615 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } %158, 1, !dbg !21
+ %616 = getelementptr float, float addrspace(201)* %615, i32 0, !dbg !21
+ %617 = load float, float addrspace(201)* %616, align 4, !dbg !21
+ %618 = insertelement <8 x float> undef, float %617, i32 0, !dbg !21
+ %619 = shufflevector <8 x float> %618, <8 x float> undef, <8 x i32> zeroinitializer, !dbg !21
+ %620 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } %165, 1, !dbg !21
+ %621 = getelementptr float, float addrspace(201)* %620, i32 0, !dbg !21
+ %622 = load float, float addrspace(201)* %621, align 4, !dbg !21
+ %623 = insertelement <8 x float> undef, float %622, i32 0, !dbg !21
+ %624 = shufflevector <8 x float> %623, <8 x float> undef, <8 x i32> zeroinitializer, !dbg !21
+ %625 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } %172, 1, !dbg !21
+ %626 = getelementptr float, float addrspace(201)* %625, i32 0, !dbg !21
+ %627 = load float, float addrspace(201)* %626, align 4, !dbg !21
+ %628 = insertelement <8 x float> undef, float %627, i32 0, !dbg !21
+ %629 = shufflevector <8 x float> %628, <8 x float> undef, <8 x i32> zeroinitializer, !dbg !21
+ %630 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [1 x i32], [1 x i32] } %179, 1, !dbg !21
+ %631 = getelementptr float, float addrspace(201)* %630, i32 0, !dbg !21
+ %632 = load float, float addrspace(201)* %631, align 4, !dbg !21
+ %633 = insertelement <8 x float> undef, float %632, i32 0, !dbg !21
+ %634 = shufflevector <8 x float> %633, <8 x float> undef, <8 x i32> zeroinitializer, !dbg !21
+ br label %635, !dbg !21
+
+635: ; preds = %638, %481
+ %636 = phi i32 [ %958, %638 ], [ 0, %481 ]
+ %637 = icmp slt i32 %636, 32, !dbg !21
+ br i1 %637, label %638, label %959, !dbg !21, !llvm.loop !25
+
+638: ; preds = %635
+ %639 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %519, 1, !dbg !21
+ %640 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %519, 2, !dbg !21
+ %641 = mul i32 %636, 8, !dbg !21
+ %642 = add i32 %640, %641, !dbg !21
+ %643 = add i32 %642, 0, !dbg !21
+ %644 = getelementptr float, float addrspace(201)* %639, i32 %643, !dbg !21
+ %645 = bitcast float addrspace(201)* %644 to <8 x float> addrspace(201)*, !dbg !21
+ %646 = load <8 x float>, <8 x float> addrspace(201)* %645, align 32, !dbg !21, !alias.scope !28, !noalias !31, !llvm.access.group !27
+ %647 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %538, 1, !dbg !21
+ %648 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %538, 2, !dbg !21
+ %649 = mul i32 %636, 8, !dbg !21
+ %650 = add i32 %648, %649, !dbg !21
+ %651 = add i32 %650, 0, !dbg !21
+ %652 = getelementptr float, float addrspace(201)* %647, i32 %651, !dbg !21
+ %653 = bitcast float addrspace(201)* %652 to <8 x float> addrspace(201)*, !dbg !21
+ %654 = load <8 x float>, <8 x float> addrspace(201)* %653, align 32, !dbg !21, !alias.scope !28, !noalias !31, !llvm.access.group !27
+ %655 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %500, 1, !dbg !21
+ %656 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %500, 2, !dbg !21
+ %657 = mul i32 %636, 8, !dbg !21
+ %658 = add i32 %656, %657, !dbg !21
+ %659 = add i32 %658, 0, !dbg !21
+ %660 = getelementptr float, float addrspace(201)* %655, i32 %659, !dbg !21
+ %661 = bitcast float addrspace(201)* %660 to <8 x float> addrspace(201)*, !dbg !21
+ %662 = load <8 x float>, <8 x float> addrspace(201)* %661, align 32, !dbg !21, !alias.scope !28, !noalias !31, !llvm.access.group !27
+ %663 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %557, 1, !dbg !21
+ %664 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %557, 2, !dbg !21
+ %665 = mul i32 %636, 8, !dbg !21
+ %666 = add i32 %664, %665, !dbg !21
+ %667 = add i32 %666, 0, !dbg !21
+ %668 = getelementptr float, float addrspace(201)* %663, i32 %667, !dbg !21
+ %669 = bitcast float addrspace(201)* %668 to <8 x float> addrspace(201)*, !dbg !21
+ %670 = load <8 x float>, <8 x float> addrspace(201)* %669, align 32, !dbg !21, !alias.scope !28, !noalias !31, !llvm.access.group !27
+ %671 = fneg <8 x float> %629, !dbg !21
+ %672 = fadd <8 x float> %670, %646, !dbg !21
+ %673 = fmul <8 x float> %646, %646, !dbg !21
+ %674 = fadd <8 x float> %654, %673, !dbg !21
+ %675 = call <8 x float> @llvm.tpu.log2.macro.v8f32(<8 x float> %674), !dbg !21
+ %676 = fmul <8 x float> %675, <float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000>, !dbg !21
+ %677 = fneg <8 x float> %624, !dbg !21
+ %678 = fmul <8 x float> %676, %677, !dbg !21
+ %679 = fmul <8 x float> %678, <float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000>, !dbg !21
+ %680 = call <8 x float> @llvm.tpu.pow2.macro.v8f32(<8 x float> %679), !dbg !21
+ %681 = call <8 x float> @llvm.tpu.log2.macro.v8f32(<8 x float> %654), !dbg !21
+ %682 = fmul <8 x float> %681, <float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000>, !dbg !21
+ %683 = fmul <8 x float> %677, %682, !dbg !21
+ %684 = fmul <8 x float> %683, <float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000>, !dbg !21
+ %685 = call <8 x float> @llvm.tpu.pow2.macro.v8f32(<8 x float> %684), !dbg !21
+ %686 = fsub <8 x float> %680, %685, !dbg !21
+ %687 = fdiv <8 x float> %662, %619, !dbg !21
+ %688 = fmul <8 x float> %686, %687, !dbg !21
+ %689 = fsub <8 x float> %672, %688, !dbg !21
+ %690 = call <8 x float> @llvm.minimum.v8f32(<8 x float> %689, <8 x float> %629), !dbg !21
+ %691 = call <8 x float> @llvm.maximum.v8f32(<8 x float> %690, <8 x float> %671), !dbg !21
+ %692 = fsub <8 x float> %691, %689, !dbg !21
+ %693 = fdiv <8 x float> %680, %619, !dbg !21
+ %694 = fmul <8 x float> %634, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, !dbg !21
+ %695 = fadd <8 x float> %693, %694, !dbg !21
+ %696 = fdiv <8 x float> %692, %695, !dbg !21
+ %697 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %576, 1, !dbg !21
+ %698 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %576, 2, !dbg !21
+ %699 = mul i32 %636, 8, !dbg !21
+ %700 = add i32 %698, %699, !dbg !21
+ %701 = add i32 %700, 0, !dbg !21
+ %702 = getelementptr float, float addrspace(201)* %697, i32 %701, !dbg !21
+ %703 = bitcast float addrspace(201)* %702 to <8 x float> addrspace(201)*, !dbg !21
+ store <8 x float> %696, <8 x float> addrspace(201)* %703, align 32, !dbg !21, !alias.scope !28, !noalias !31, !llvm.access.group !27
+ %704 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %595, 1, !dbg !21
+ %705 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %595, 2, !dbg !21
+ %706 = mul i32 %636, 8, !dbg !21
+ %707 = add i32 %705, %706, !dbg !21
+ %708 = add i32 %707, 0, !dbg !21
+ %709 = getelementptr float, float addrspace(201)* %704, i32 %708, !dbg !21
+ %710 = bitcast float addrspace(201)* %709 to <8 x float> addrspace(201)*, !dbg !21
+ store <8 x float> %674, <8 x float> addrspace(201)* %710, align 32, !dbg !21, !alias.scope !28, !noalias !31, !llvm.access.group !27
+ %711 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %614, 1, !dbg !21
+ %712 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %614, 2, !dbg !21
+ %713 = mul i32 %636, 8, !dbg !21
+ %714 = add i32 %712, %713, !dbg !21
+ %715 = add i32 %714, 0, !dbg !21
+ %716 = getelementptr float, float addrspace(201)* %711, i32 %715, !dbg !21
+ %717 = bitcast float addrspace(201)* %716 to <8 x float> addrspace(201)*, !dbg !21
+ store <8 x float> %689, <8 x float> addrspace(201)* %717, align 32, !dbg !21, !alias.scope !28, !noalias !31, !llvm.access.group !27
+ %718 = add i32 %636, 1, !dbg !21
+ %719 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %519, 1, !dbg !21
+ %720 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %519, 2, !dbg !21
+ %721 = mul i32 %718, 8, !dbg !21
+ %722 = add i32 %720, %721, !dbg !21
+ %723 = add i32 %722, 0, !dbg !21
+ %724 = getelementptr float, float addrspace(201)* %719, i32 %723, !dbg !21
+ %725 = bitcast float addrspace(201)* %724 to <8 x float> addrspace(201)*, !dbg !21
+ %726 = load <8 x float>, <8 x float> addrspace(201)* %725, align 32, !dbg !21, !alias.scope !35, !noalias !36, !llvm.access.group !27
+ %727 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %538, 1, !dbg !21
+ %728 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %538, 2, !dbg !21
+ %729 = mul i32 %718, 8, !dbg !21
+ %730 = add i32 %728, %729, !dbg !21
+ %731 = add i32 %730, 0, !dbg !21
+ %732 = getelementptr float, float addrspace(201)* %727, i32 %731, !dbg !21
+ %733 = bitcast float addrspace(201)* %732 to <8 x float> addrspace(201)*, !dbg !21
+ %734 = load <8 x float>, <8 x float> addrspace(201)* %733, align 32, !dbg !21, !alias.scope !35, !noalias !36, !llvm.access.group !27
+ %735 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %500, 1, !dbg !21
+ %736 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %500, 2, !dbg !21
+ %737 = mul i32 %718, 8, !dbg !21
+ %738 = add i32 %736, %737, !dbg !21
+ %739 = add i32 %738, 0, !dbg !21
+ %740 = getelementptr float, float addrspace(201)* %735, i32 %739, !dbg !21
+ %741 = bitcast float addrspace(201)* %740 to <8 x float> addrspace(201)*, !dbg !21
+ %742 = load <8 x float>, <8 x float> addrspace(201)* %741, align 32, !dbg !21, !alias.scope !35, !noalias !36, !llvm.access.group !27
+ %743 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %557, 1, !dbg !21
+ %744 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %557, 2, !dbg !21
+ %745 = mul i32 %718, 8, !dbg !21
+ %746 = add i32 %744, %745, !dbg !21
+ %747 = add i32 %746, 0, !dbg !21
+ %748 = getelementptr float, float addrspace(201)* %743, i32 %747, !dbg !21
+ %749 = bitcast float addrspace(201)* %748 to <8 x float> addrspace(201)*, !dbg !21
+ %750 = load <8 x float>, <8 x float> addrspace(201)* %749, align 32, !dbg !21, !alias.scope !35, !noalias !36, !llvm.access.group !27
+ %751 = fneg <8 x float> %629, !dbg !21
+ %752 = fadd <8 x float> %750, %726, !dbg !21
+ %753 = fmul <8 x float> %726, %726, !dbg !21
+ %754 = fadd <8 x float> %734, %753, !dbg !21
+ %755 = call <8 x float> @llvm.tpu.log2.macro.v8f32(<8 x float> %754), !dbg !21
+ %756 = fmul <8 x float> %755, <float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000>, !dbg !21
+ %757 = fneg <8 x float> %624, !dbg !21
+ %758 = fmul <8 x float> %756, %757, !dbg !21
+ %759 = fmul <8 x float> %758, <float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000>, !dbg !21
+ %760 = call <8 x float> @llvm.tpu.pow2.macro.v8f32(<8 x float> %759), !dbg !21
+ %761 = call <8 x float> @llvm.tpu.log2.macro.v8f32(<8 x float> %734), !dbg !21
+ %762 = fmul <8 x float> %761, <float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000>, !dbg !21
+ %763 = fmul <8 x float> %757, %762, !dbg !21
+ %764 = fmul <8 x float> %763, <float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000>, !dbg !21
+ %765 = call <8 x float> @llvm.tpu.pow2.macro.v8f32(<8 x float> %764), !dbg !21
+ %766 = fsub <8 x float> %760, %765, !dbg !21
+ %767 = fdiv <8 x float> %742, %619, !dbg !21
+ %768 = fmul <8 x float> %766, %767, !dbg !21
+ %769 = fsub <8 x float> %752, %768, !dbg !21
+ %770 = call <8 x float> @llvm.minimum.v8f32(<8 x float> %769, <8 x float> %629), !dbg !21
+ %771 = call <8 x float> @llvm.maximum.v8f32(<8 x float> %770, <8 x float> %751), !dbg !21
+ %772 = fsub <8 x float> %771, %769, !dbg !21
+ %773 = fdiv <8 x float> %760, %619, !dbg !21
+ %774 = fmul <8 x float> %634, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, !dbg !21
+ %775 = fadd <8 x float> %773, %774, !dbg !21
+ %776 = fdiv <8 x float> %772, %775, !dbg !21
+ %777 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %576, 1, !dbg !21
+ %778 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %576, 2, !dbg !21
+ %779 = mul i32 %718, 8, !dbg !21
+ %780 = add i32 %778, %779, !dbg !21
+ %781 = add i32 %780, 0, !dbg !21
+ %782 = getelementptr float, float addrspace(201)* %777, i32 %781, !dbg !21
+ %783 = bitcast float addrspace(201)* %782 to <8 x float> addrspace(201)*, !dbg !21
+ store <8 x float> %776, <8 x float> addrspace(201)* %783, align 32, !dbg !21, !alias.scope !35, !noalias !36, !llvm.access.group !27
+ %784 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %595, 1, !dbg !21
+ %785 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %595, 2, !dbg !21
+ %786 = mul i32 %718, 8, !dbg !21
+ %787 = add i32 %785, %786, !dbg !21
+ %788 = add i32 %787, 0, !dbg !21
+ %789 = getelementptr float, float addrspace(201)* %784, i32 %788, !dbg !21
+ %790 = bitcast float addrspace(201)* %789 to <8 x float> addrspace(201)*, !dbg !21
+ store <8 x float> %754, <8 x float> addrspace(201)* %790, align 32, !dbg !21, !alias.scope !35, !noalias !36, !llvm.access.group !27
+ %791 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %614, 1, !dbg !21
+ %792 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %614, 2, !dbg !21
+ %793 = mul i32 %718, 8, !dbg !21
+ %794 = add i32 %792, %793, !dbg !21
+ %795 = add i32 %794, 0, !dbg !21
+ %796 = getelementptr float, float addrspace(201)* %791, i32 %795, !dbg !21
+ %797 = bitcast float addrspace(201)* %796 to <8 x float> addrspace(201)*, !dbg !21
+ store <8 x float> %769, <8 x float> addrspace(201)* %797, align 32, !dbg !21, !alias.scope !35, !noalias !36, !llvm.access.group !27
+ %798 = add i32 %636, 2, !dbg !21
+ %799 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %519, 1, !dbg !21
+ %800 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %519, 2, !dbg !21
+ %801 = mul i32 %798, 8, !dbg !21
+ %802 = add i32 %800, %801, !dbg !21
+ %803 = add i32 %802, 0, !dbg !21
+ %804 = getelementptr float, float addrspace(201)* %799, i32 %803, !dbg !21
+ %805 = bitcast float addrspace(201)* %804 to <8 x float> addrspace(201)*, !dbg !21
+ %806 = load <8 x float>, <8 x float> addrspace(201)* %805, align 32, !dbg !21, !alias.scope !37, !noalias !38, !llvm.access.group !27
+ %807 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %538, 1, !dbg !21
+ %808 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %538, 2, !dbg !21
+ %809 = mul i32 %798, 8, !dbg !21
+ %810 = add i32 %808, %809, !dbg !21
+ %811 = add i32 %810, 0, !dbg !21
+ %812 = getelementptr float, float addrspace(201)* %807, i32 %811, !dbg !21
+ %813 = bitcast float addrspace(201)* %812 to <8 x float> addrspace(201)*, !dbg !21
+ %814 = load <8 x float>, <8 x float> addrspace(201)* %813, align 32, !dbg !21, !alias.scope !37, !noalias !38, !llvm.access.group !27
+ %815 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %500, 1, !dbg !21
+ %816 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %500, 2, !dbg !21
+ %817 = mul i32 %798, 8, !dbg !21
+ %818 = add i32 %816, %817, !dbg !21
+ %819 = add i32 %818, 0, !dbg !21
+ %820 = getelementptr float, float addrspace(201)* %815, i32 %819, !dbg !21
+ %821 = bitcast float addrspace(201)* %820 to <8 x float> addrspace(201)*, !dbg !21
+ %822 = load <8 x float>, <8 x float> addrspace(201)* %821, align 32, !dbg !21, !alias.scope !37, !noalias !38, !llvm.access.group !27
+ %823 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %557, 1, !dbg !21
+ %824 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %557, 2, !dbg !21
+ %825 = mul i32 %798, 8, !dbg !21
+ %826 = add i32 %824, %825, !dbg !21
+ %827 = add i32 %826, 0, !dbg !21
+ %828 = getelementptr float, float addrspace(201)* %823, i32 %827, !dbg !21
+ %829 = bitcast float addrspace(201)* %828 to <8 x float> addrspace(201)*, !dbg !21
+ %830 = load <8 x float>, <8 x float> addrspace(201)* %829, align 32, !dbg !21, !alias.scope !37, !noalias !38, !llvm.access.group !27
+ %831 = fneg <8 x float> %629, !dbg !21
+ %832 = fadd <8 x float> %830, %806, !dbg !21
+ %833 = fmul <8 x float> %806, %806, !dbg !21
+ %834 = fadd <8 x float> %814, %833, !dbg !21
+ %835 = call <8 x float> @llvm.tpu.log2.macro.v8f32(<8 x float> %834), !dbg !21
+ %836 = fmul <8 x float> %835, <float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000>, !dbg !21
+ %837 = fneg <8 x float> %624, !dbg !21
+ %838 = fmul <8 x float> %836, %837, !dbg !21
+ %839 = fmul <8 x float> %838, <float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000>, !dbg !21
+ %840 = call <8 x float> @llvm.tpu.pow2.macro.v8f32(<8 x float> %839), !dbg !21
+ %841 = call <8 x float> @llvm.tpu.log2.macro.v8f32(<8 x float> %814), !dbg !21
+ %842 = fmul <8 x float> %841, <float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000>, !dbg !21
+ %843 = fmul <8 x float> %837, %842, !dbg !21
+ %844 = fmul <8 x float> %843, <float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000>, !dbg !21
+ %845 = call <8 x float> @llvm.tpu.pow2.macro.v8f32(<8 x float> %844), !dbg !21
+ %846 = fsub <8 x float> %840, %845, !dbg !21
+ %847 = fdiv <8 x float> %822, %619, !dbg !21
+ %848 = fmul <8 x float> %846, %847, !dbg !21
+ %849 = fsub <8 x float> %832, %848, !dbg !21
+ %850 = call <8 x float> @llvm.minimum.v8f32(<8 x float> %849, <8 x float> %629), !dbg !21
+ %851 = call <8 x float> @llvm.maximum.v8f32(<8 x float> %850, <8 x float> %831), !dbg !21
+ %852 = fsub <8 x float> %851, %849, !dbg !21
+ %853 = fdiv <8 x float> %840, %619, !dbg !21
+ %854 = fmul <8 x float> %634, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, !dbg !21
+ %855 = fadd <8 x float> %853, %854, !dbg !21
+ %856 = fdiv <8 x float> %852, %855, !dbg !21
+ %857 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %576, 1, !dbg !21
+ %858 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %576, 2, !dbg !21
+ %859 = mul i32 %798, 8, !dbg !21
+ %860 = add i32 %858, %859, !dbg !21
+ %861 = add i32 %860, 0, !dbg !21
+ %862 = getelementptr float, float addrspace(201)* %857, i32 %861, !dbg !21
+ %863 = bitcast float addrspace(201)* %862 to <8 x float> addrspace(201)*, !dbg !21
+ store <8 x float> %856, <8 x float> addrspace(201)* %863, align 32, !dbg !21, !alias.scope !37, !noalias !38, !llvm.access.group !27
+ %864 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %595, 1, !dbg !21
+ %865 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %595, 2, !dbg !21
+ %866 = mul i32 %798, 8, !dbg !21
+ %867 = add i32 %865, %866, !dbg !21
+ %868 = add i32 %867, 0, !dbg !21
+ %869 = getelementptr float, float addrspace(201)* %864, i32 %868, !dbg !21
+ %870 = bitcast float addrspace(201)* %869 to <8 x float> addrspace(201)*, !dbg !21
+ store <8 x float> %834, <8 x float> addrspace(201)* %870, align 32, !dbg !21, !alias.scope !37, !noalias !38, !llvm.access.group !27
+ %871 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %614, 1, !dbg !21
+ %872 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %614, 2, !dbg !21
+ %873 = mul i32 %798, 8, !dbg !21
+ %874 = add i32 %872, %873, !dbg !21
+ %875 = add i32 %874, 0, !dbg !21
+ %876 = getelementptr float, float addrspace(201)* %871, i32 %875, !dbg !21
+ %877 = bitcast float addrspace(201)* %876 to <8 x float> addrspace(201)*, !dbg !21
+ store <8 x float> %849, <8 x float> addrspace(201)* %877, align 32, !dbg !21, !alias.scope !37, !noalias !38, !llvm.access.group !27
+ %878 = add i32 %636, 3, !dbg !21
+ %879 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %519, 1, !dbg !21
+ %880 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %519, 2, !dbg !21
+ %881 = mul i32 %878, 8, !dbg !21
+ %882 = add i32 %880, %881, !dbg !21
+ %883 = add i32 %882, 0, !dbg !21
+ %884 = getelementptr float, float addrspace(201)* %879, i32 %883, !dbg !21
+ %885 = bitcast float addrspace(201)* %884 to <8 x float> addrspace(201)*, !dbg !21
+ %886 = load <8 x float>, <8 x float> addrspace(201)* %885, align 32, !dbg !21, !alias.scope !39, !noalias !40, !llvm.access.group !27
+ %887 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %538, 1, !dbg !21
+ %888 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %538, 2, !dbg !21
+ %889 = mul i32 %878, 8, !dbg !21
+ %890 = add i32 %888, %889, !dbg !21
+ %891 = add i32 %890, 0, !dbg !21
+ %892 = getelementptr float, float addrspace(201)* %887, i32 %891, !dbg !21
+ %893 = bitcast float addrspace(201)* %892 to <8 x float> addrspace(201)*, !dbg !21
+ %894 = load <8 x float>, <8 x float> addrspace(201)* %893, align 32, !dbg !21, !alias.scope !39, !noalias !40, !llvm.access.group !27
+ %895 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %500, 1, !dbg !21
+ %896 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %500, 2, !dbg !21
+ %897 = mul i32 %878, 8, !dbg !21
+ %898 = add i32 %896, %897, !dbg !21
+ %899 = add i32 %898, 0, !dbg !21
+ %900 = getelementptr float, float addrspace(201)* %895, i32 %899, !dbg !21
+ %901 = bitcast float addrspace(201)* %900 to <8 x float> addrspace(201)*, !dbg !21
+ %902 = load <8 x float>, <8 x float> addrspace(201)* %901, align 32, !dbg !21, !alias.scope !39, !noalias !40, !llvm.access.group !27
+ %903 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %557, 1, !dbg !21
+ %904 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %557, 2, !dbg !21
+ %905 = mul i32 %878, 8, !dbg !21
+ %906 = add i32 %904, %905, !dbg !21
+ %907 = add i32 %906, 0, !dbg !21
+ %908 = getelementptr float, float addrspace(201)* %903, i32 %907, !dbg !21
+ %909 = bitcast float addrspace(201)* %908 to <8 x float> addrspace(201)*, !dbg !21
+ %910 = load <8 x float>, <8 x float> addrspace(201)* %909, align 32, !dbg !21, !alias.scope !39, !noalias !40, !llvm.access.group !27
+ %911 = fneg <8 x float> %629, !dbg !21
+ %912 = fadd <8 x float> %910, %886, !dbg !21
+ %913 = fmul <8 x float> %886, %886, !dbg !21
+ %914 = fadd <8 x float> %894, %913, !dbg !21
+ %915 = call <8 x float> @llvm.tpu.log2.macro.v8f32(<8 x float> %914), !dbg !21
+ %916 = fmul <8 x float> %915, <float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000>, !dbg !21
+ %917 = fneg <8 x float> %624, !dbg !21
+ %918 = fmul <8 x float> %916, %917, !dbg !21
+ %919 = fmul <8 x float> %918, <float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000>, !dbg !21
+ %920 = call <8 x float> @llvm.tpu.pow2.macro.v8f32(<8 x float> %919), !dbg !21
+ %921 = call <8 x float> @llvm.tpu.log2.macro.v8f32(<8 x float> %894), !dbg !21
+ %922 = fmul <8 x float> %921, <float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000>, !dbg !21
+ %923 = fmul <8 x float> %917, %922, !dbg !21
+ %924 = fmul <8 x float> %923, <float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000>, !dbg !21
+ %925 = call <8 x float> @llvm.tpu.pow2.macro.v8f32(<8 x float> %924), !dbg !21
+ %926 = fsub <8 x float> %920, %925, !dbg !21
+ %927 = fdiv <8 x float> %902, %619, !dbg !21
+ %928 = fmul <8 x float> %926, %927, !dbg !21
+ %929 = fsub <8 x float> %912, %928, !dbg !21
+ %930 = call <8 x float> @llvm.minimum.v8f32(<8 x float> %929, <8 x float> %629), !dbg !21
+ %931 = call <8 x float> @llvm.maximum.v8f32(<8 x float> %930, <8 x float> %911), !dbg !21
+ %932 = fsub <8 x float> %931, %929, !dbg !21
+ %933 = fdiv <8 x float> %920, %619, !dbg !21
+ %934 = fmul <8 x float> %634, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, !dbg !21
+ %935 = fadd <8 x float> %933, %934, !dbg !21
+ %936 = fdiv <8 x float> %932, %935, !dbg !21
+ %937 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %576, 1, !dbg !21
+ %938 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %576, 2, !dbg !21
+ %939 = mul i32 %878, 8, !dbg !21
+ %940 = add i32 %938, %939, !dbg !21
+ %941 = add i32 %940, 0, !dbg !21
+ %942 = getelementptr float, float addrspace(201)* %937, i32 %941, !dbg !21
+ %943 = bitcast float addrspace(201)* %942 to <8 x float> addrspace(201)*, !dbg !21
+ store <8 x float> %936, <8 x float> addrspace(201)* %943, align 32, !dbg !21, !alias.scope !39, !noalias !40, !llvm.access.group !27
+ %944 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %595, 1, !dbg !21
+ %945 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %595, 2, !dbg !21
+ %946 = mul i32 %878, 8, !dbg !21
+ %947 = add i32 %945, %946, !dbg !21
+ %948 = add i32 %947, 0, !dbg !21
+ %949 = getelementptr float, float addrspace(201)* %944, i32 %948, !dbg !21
+ %950 = bitcast float addrspace(201)* %949 to <8 x float> addrspace(201)*, !dbg !21
+ store <8 x float> %914, <8 x float> addrspace(201)* %950, align 32, !dbg !21, !alias.scope !39, !noalias !40, !llvm.access.group !27
+ %951 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %614, 1, !dbg !21
+ %952 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %614, 2, !dbg !21
+ %953 = mul i32 %878, 8, !dbg !21
+ %954 = add i32 %952, %953, !dbg !21
+ %955 = add i32 %954, 0, !dbg !21
+ %956 = getelementptr float, float addrspace(201)* %951, i32 %955, !dbg !21
+ %957 = bitcast float addrspace(201)* %956 to <8 x float> addrspace(201)*, !dbg !21
+ store <8 x float> %929, <8 x float> addrspace(201)* %957, align 32, !dbg !21, !alias.scope !39, !noalias !40, !llvm.access.group !27
+ %958 = add i32 %636, 4, !dbg !21
+ br label %635, !dbg !21, !llvm.loop !25
+
+959: ; preds = %635
+ %960 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %228, 0, !dbg !21
+ %961 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } undef, i32 addrspace(204)* %960, 0, !dbg !21
+ %962 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %228, 1, !dbg !21
+ %963 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %961, i32 addrspace(204)* %962, 1, !dbg !21
+ %964 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %228, 4, 0, !dbg !21
+ %965 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %228, 4, 1, !dbg !21
+ %966 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %228, 2, !dbg !21
+ %967 = mul i32 %455, %964, !dbg !21
+ %968 = add i32 %966, %967, !dbg !21
+ %969 = mul i32 0, %965, !dbg !21
+ %970 = add i32 %968, %969, !dbg !21
+ %971 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %963, i32 %970, 2, !dbg !21
+ %972 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %971, i32 1, 3, 0, !dbg !21
+ %973 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %972, i32 1, 4, 0, !dbg !21
+ %974 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %576, 1, !dbg !21
+ %975 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %576, 2, !dbg !21
+ %976 = add i32 %975, 0, !dbg !21
+ %977 = add i32 %976, 0, !dbg !21
+ %978 = getelementptr float, float addrspace(201)* %974, i32 %977, !dbg !21
+ %979 = bitcast float addrspace(201)* %978 to i32 addrspace(201)*, !dbg !21
+ %980 = extractvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %90, 1, !dbg !21
+ %981 = mul i32 %240, 8, !dbg !21
+ %982 = add i32 %981, %241, !dbg !21
+ %983 = getelementptr float, float addrspace(203)* %980, i32 %982, !dbg !21
+ %984 = bitcast float addrspace(203)* %983 to i32 addrspace(203)*, !dbg !21
+ %985 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %973, 1, !dbg !21
+ %986 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %973, 2, !dbg !21
+ %987 = add i32 %986, 0, !dbg !21
+ %988 = getelementptr i32, i32 addrspace(204)* %985, i32 %987, !dbg !21
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %988, i32 0, i32 addrspace(201)* %979, i32 addrspace(203)* %984, i32 256, i32 0), !dbg !21
+ %989 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %595, 1, !dbg !21
+ %990 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %595, 2, !dbg !21
+ %991 = add i32 %990, 0, !dbg !21
+ %992 = add i32 %991, 0, !dbg !21
+ %993 = getelementptr float, float addrspace(201)* %989, i32 %992, !dbg !21
+ %994 = bitcast float addrspace(201)* %993 to i32 addrspace(201)*, !dbg !21
+ %995 = extractvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %97, 1, !dbg !21
+ %996 = mul i32 %240, 8, !dbg !21
+ %997 = add i32 %996, %241, !dbg !21
+ %998 = getelementptr float, float addrspace(203)* %995, i32 %997, !dbg !21
+ %999 = bitcast float addrspace(203)* %998 to i32 addrspace(203)*, !dbg !21
+ %1000 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %973, 1, !dbg !21
+ %1001 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %973, 2, !dbg !21
+ %1002 = add i32 %1001, 0, !dbg !21
+ %1003 = getelementptr i32, i32 addrspace(204)* %1000, i32 %1002, !dbg !21
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %1003, i32 0, i32 addrspace(201)* %994, i32 addrspace(203)* %999, i32 256, i32 0), !dbg !21
+ %1004 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %614, 1, !dbg !21
+ %1005 = extractvalue { float addrspace(201)*, float addrspace(201)*, i32, [2 x i32], [2 x i32] } %614, 2, !dbg !21
+ %1006 = add i32 %1005, 0, !dbg !21
+ %1007 = add i32 %1006, 0, !dbg !21
+ %1008 = getelementptr float, float addrspace(201)* %1004, i32 %1007, !dbg !21
+ %1009 = bitcast float addrspace(201)* %1008 to i32 addrspace(201)*, !dbg !21
+ %1010 = extractvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %104, 1, !dbg !21
+ %1011 = mul i32 %240, 8, !dbg !21
+ %1012 = add i32 %1011, %241, !dbg !21
+ %1013 = getelementptr float, float addrspace(203)* %1010, i32 %1012, !dbg !21
+ %1014 = bitcast float addrspace(203)* %1013 to i32 addrspace(203)*, !dbg !21
+ %1015 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %973, 1, !dbg !21
+ %1016 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %973, 2, !dbg !21
+ %1017 = add i32 %1016, 0, !dbg !21
+ %1018 = getelementptr i32, i32 addrspace(204)* %1015, i32 %1017, !dbg !21
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %1018, i32 0, i32 addrspace(201)* %1009, i32 addrspace(203)* %1014, i32 256, i32 0), !dbg !21
+ br label %1019, !dbg !21
+
+1019: ; preds = %959, %449
+ %1020 = icmp sge i32 %237, 2, !dbg !21
+ br i1 %1020, label %1021, label %1042, !dbg !21
+
+1021: ; preds = %1019
+ %1022 = add i32 %237, 1, !dbg !21
+ %1023 = urem i32 %1022, 2, !dbg !21
+ %1024 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %228, 0, !dbg !21
+ %1025 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } undef, i32 addrspace(204)* %1024, 0, !dbg !21
+ %1026 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %228, 1, !dbg !21
+ %1027 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %1025, i32 addrspace(204)* %1026, 1, !dbg !21
+ %1028 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %228, 4, 0, !dbg !21
+ %1029 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %228, 4, 1, !dbg !21
+ %1030 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [2 x i32], [2 x i32] } %228, 2, !dbg !21
+ %1031 = mul i32 %1023, %1028, !dbg !21
+ %1032 = add i32 %1030, %1031, !dbg !21
+ %1033 = mul i32 0, %1029, !dbg !21
+ %1034 = add i32 %1032, %1033, !dbg !21
+ %1035 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %1027, i32 %1034, 2, !dbg !21
+ %1036 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %1035, i32 1, 3, 0, !dbg !21
+ %1037 = insertvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %1036, i32 1, 4, 0, !dbg !21
+ %1038 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %1037, 1, !dbg !21
+ %1039 = extractvalue { i32 addrspace(204)*, i32 addrspace(204)*, i32, [1 x i32], [1 x i32] } %1037, 2, !dbg !21
+ %1040 = add i32 %1039, 0, !dbg !21
+ %1041 = getelementptr i32, i32 addrspace(204)* %1038, i32 %1040, !dbg !21
+ call void @llvm.tpu.waitge(i32 addrspace(204)* %1041, i32 768), !dbg !21
+ call void @llvm.tpu.syncadd(i32 addrspace(204)* %1041, i32 -768), !dbg !21
+ br label %1042, !dbg !21
+
+1042: ; preds = %1021, %1019
+ %1043 = add i32 %239, 8, !dbg !21
+ %1044 = icmp sge i32 %1043, 8, !dbg !21
+ %1045 = select i1 %1044, i32 0, i32 %1043, !dbg !21
+ %1046 = add i32 %238, 32, !dbg !21
+ %1047 = select i1 %1044, i32 %1046, i32 %238, !dbg !21
+ %1048 = icmp sge i32 %1047, 2048, !dbg !21
+ %1049 = select i1 %1048, i32 %107, i32 %1047, !dbg !21
+ %1050 = add i32 %237, 1, !dbg !21
+ br label %236, !dbg !21, !llvm.loop !23
+
+1051: ; preds = %236
+ fence seq_cst, !dbg !21
+ call void @llvm.tpu.barrier(i32 1, i32 1), !dbg !21
+ ret void, !dbg !21
+}
+
+define void @access0_lowered() #1 !dbg !41 {
+ %1 = call i32 addrspace(208)* @llvm.tpu.allocate.dreg(i32 11, i32 0), !dbg !42
+ %2 = getelementptr i32, i32 addrspace(208)* %1, i32 0, !dbg !42
+ %3 = load i32, i32 addrspace(208)* %2, align 4, !dbg !42
+ %4 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %3), !dbg !42
+ %5 = getelementptr i32, i32 addrspace(208)* %1, i32 1, !dbg !42
+ %6 = load i32, i32 addrspace(208)* %5, align 4, !dbg !42
+ %7 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %6), !dbg !42
+ %8 = getelementptr i32, i32 addrspace(208)* %1, i32 2, !dbg !42
+ %9 = load i32, i32 addrspace(208)* %8, align 4, !dbg !42
+ %10 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %9), !dbg !42
+ %11 = getelementptr i32, i32 addrspace(208)* %1, i32 3, !dbg !42
+ %12 = load i32, i32 addrspace(208)* %11, align 4, !dbg !42
+ %13 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %12), !dbg !42
+ %14 = getelementptr i32, i32 addrspace(208)* %1, i32 4, !dbg !42
+ %15 = load i32, i32 addrspace(208)* %14, align 4, !dbg !42
+ %16 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %15), !dbg !42
+ %17 = getelementptr i32, i32 addrspace(208)* %1, i32 5, !dbg !42
+ %18 = load i32, i32 addrspace(208)* %17, align 4, !dbg !42
+ %19 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %18), !dbg !42
+ %20 = getelementptr i32, i32 addrspace(208)* %1, i32 6, !dbg !42
+ %21 = load i32, i32 addrspace(208)* %20, align 4, !dbg !42
+ %22 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %21), !dbg !42
+ %23 = getelementptr i32, i32 addrspace(208)* %1, i32 7, !dbg !42
+ %24 = load i32, i32 addrspace(208)* %23, align 4, !dbg !42
+ %25 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %24), !dbg !42
+ %26 = getelementptr i32, i32 addrspace(208)* %1, i32 8, !dbg !42
+ %27 = load i32, i32 addrspace(208)* %26, align 4, !dbg !42
+ %28 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %27), !dbg !42
+ %29 = getelementptr i32, i32 addrspace(208)* %1, i32 9, !dbg !42
+ %30 = load i32, i32 addrspace(208)* %29, align 4, !dbg !42
+ %31 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %30), !dbg !42
+ %32 = getelementptr i32, i32 addrspace(208)* %1, i32 10, !dbg !42
+ %33 = load i32, i32 addrspace(208)* %32, align 4, !dbg !42
+ %34 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %33), !dbg !42
+ br label %35, !dbg !42
+
+35: ; preds = %0
+ %36 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(203)* %4, 0, !dbg !42
+ %37 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %36, float addrspace(203)* %4, 1, !dbg !42
+ %38 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %37, i32 0, 2, !dbg !42
+ %39 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %38, i32 2048, 3, 0, !dbg !42
+ %40 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %39, i32 8, 4, 0, !dbg !42
+ %41 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %40, i32 8, 3, 1, !dbg !42
+ %42 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %41, i32 1, 4, 1, !dbg !42
+ %43 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(203)* %7, 0, !dbg !42
+ %44 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %43, float addrspace(203)* %7, 1, !dbg !42
+ %45 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %44, i32 0, 2, !dbg !42
+ %46 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %45, i32 2048, 3, 0, !dbg !42
+ %47 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %46, i32 8, 4, 0, !dbg !42
+ %48 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %47, i32 8, 3, 1, !dbg !42
+ %49 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %48, i32 1, 4, 1, !dbg !42
+ %50 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(203)* %10, 0, !dbg !42
+ %51 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %50, float addrspace(203)* %10, 1, !dbg !42
+ %52 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %51, i32 0, 2, !dbg !42
+ %53 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %52, i32 2048, 3, 0, !dbg !42
+ %54 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %53, i32 8, 4, 0, !dbg !42
+ %55 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %54, i32 8, 3, 1, !dbg !42
+ %56 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %55, i32 1, 4, 1, !dbg !42
+ %57 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(203)* %13, 0, !dbg !42
+ %58 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %57, float addrspace(203)* %13, 1, !dbg !42
+ %59 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %58, i32 0, 2, !dbg !42
+ %60 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %59, i32 2048, 3, 0, !dbg !42
+ %61 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %60, i32 8, 4, 0, !dbg !42
+ %62 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %61, i32 8, 3, 1, !dbg !42
+ %63 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %62, i32 1, 4, 1, !dbg !42
+ %64 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } undef, float addrspace(203)* %16, 0, !dbg !42
+ %65 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %64, float addrspace(203)* %16, 1, !dbg !42
+ %66 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %65, i32 0, 2, !dbg !42
+ %67 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %66, i32 8, 3, 0, !dbg !42
+ %68 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %67, i32 1, 4, 0, !dbg !42
+ %69 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } undef, float addrspace(203)* %19, 0, !dbg !42
+ %70 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %69, float addrspace(203)* %19, 1, !dbg !42
+ %71 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %70, i32 0, 2, !dbg !42
+ %72 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %71, i32 8, 3, 0, !dbg !42
+ %73 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %72, i32 1, 4, 0, !dbg !42
+ %74 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } undef, float addrspace(203)* %22, 0, !dbg !42
+ %75 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %74, float addrspace(203)* %22, 1, !dbg !42
+ %76 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %75, i32 0, 2, !dbg !42
+ %77 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %76, i32 8, 3, 0, !dbg !42
+ %78 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %77, i32 1, 4, 0, !dbg !42
+ %79 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } undef, float addrspace(203)* %25, 0, !dbg !42
+ %80 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %79, float addrspace(203)* %25, 1, !dbg !42
+ %81 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %80, i32 0, 2, !dbg !42
+ %82 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %81, i32 8, 3, 0, !dbg !42
+ %83 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %82, i32 1, 4, 0, !dbg !42
+ %84 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(203)* %28, 0, !dbg !42
+ %85 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %84, float addrspace(203)* %28, 1, !dbg !42
+ %86 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %85, i32 0, 2, !dbg !42
+ %87 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %86, i32 2048, 3, 0, !dbg !42
+ %88 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %87, i32 8, 4, 0, !dbg !42
+ %89 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %88, i32 8, 3, 1, !dbg !42
+ %90 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %89, i32 1, 4, 1, !dbg !42
+ %91 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(203)* %31, 0, !dbg !42
+ %92 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %91, float addrspace(203)* %31, 1, !dbg !42
+ %93 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %92, i32 0, 2, !dbg !42
+ %94 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %93, i32 2048, 3, 0, !dbg !42
+ %95 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %94, i32 8, 4, 0, !dbg !42
+ %96 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %95, i32 8, 3, 1, !dbg !42
+ %97 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %96, i32 1, 4, 1, !dbg !42
+ %98 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(203)* %34, 0, !dbg !42
+ %99 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %98, float addrspace(203)* %34, 1, !dbg !42
+ %100 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %99, i32 0, 2, !dbg !42
+ %101 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %100, i32 2048, 3, 0, !dbg !42
+ %102 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %101, i32 8, 4, 0, !dbg !42
+ %103 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %102, i32 8, 3, 1, !dbg !42
+ %104 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %103, i32 1, 4, 1, !dbg !42
+ br label %105, !dbg !42
+
+105: ; preds = %35
+ ret void, !dbg !42
+}
+
+define void @main_lowered() #2 !dbg !44 {
+ %1 = call i32* @llvm.tpu.allocate.smem(i32 8, i32 16351)
+ %2 = call i32* @llvm.tpu.allocate.smem(i32 1, i32 16364)
+ %3 = load i32, i32* %2, align 4
+ %4 = call i32* @llvm.tpu.allocate.smem(i32 16, i32 9)
+ %5 = call i32* @llvm.tpu.alloca.smem(i32 16)
+ %6 = getelementptr i32, i32* %4, i32 0
+ store i32 %3, i32* %6, align 4
+ %7 = call i32 addrspace(204)* @llvm.tpu.allocate.sflag(i32 1, i32 5)
+ %8 = call i32 addrspace(204)* @llvm.tpu.alloca.sflag(i32 1)
+ %9 = getelementptr i32, i32* %4, i32 0
+ %10 = load i32, i32* %9, align 4
+ %11 = call i32 addrspace(203)* @llvm.tpu.inttoptr.p203i32(i32 %10)
+ %12 = getelementptr i32, i32* %4, i32 8
+ %13 = call i32 addrspace(211)* @llvm.tpu.addrspacecast.scs(i32 addrspace(204)* %7)
+ call void @llvm.tpu.dma.hbm.to.smem.sc.simple(i32 addrspace(211)* %13, i32 addrspace(203)* %11, i32* %12, i32 1, i32 0)
+ call void @llvm.tpu.waitge(i32 addrspace(204)* %7, i32 1)
+ call void @llvm.tpu.syncadd(i32 addrspace(204)* %7, i32 -1)
+ %14 = getelementptr i32, i32* %1, i32 7
+ %15 = load i32, i32* %14, align 4
+ %16 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %15)
+ %17 = getelementptr i32, i32* %1, i32 6
+ %18 = load i32, i32* %17, align 4
+ %19 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %18)
+ %20 = getelementptr i32, i32* %1, i32 5
+ %21 = load i32, i32* %20, align 4
+ %22 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %21)
+ %23 = getelementptr i32, i32* %1, i32 4
+ %24 = load i32, i32* %23, align 4
+ %25 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %24)
+ %26 = getelementptr i32, i32* %1, i32 3
+ %27 = load i32, i32* %26, align 4
+ %28 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %27)
+ %29 = getelementptr i32, i32* %1, i32 2
+ %30 = load i32, i32* %29, align 4
+ %31 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %30)
+ %32 = getelementptr i32, i32* %1, i32 1
+ %33 = load i32, i32* %32, align 4
+ %34 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %33)
+ %35 = getelementptr i32, i32* %1, i32 0
+ %36 = load i32, i32* %35, align 4
+ %37 = call float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32 %36)
+ %38 = getelementptr i32, i32* %4, i32 8
+ %39 = load i32, i32* %38, align 4
+ %40 = call i8 addrspace(203)* @llvm.tpu.inttoptr.p203i8(i32 %39)
+ %41 = getelementptr i32, i32* %4, i32 9
+ %42 = load i32, i32* %41, align 4
+ %43 = call i8 addrspace(203)* @llvm.tpu.inttoptr.p203i8(i32 %42)
+ %44 = getelementptr i32, i32* %4, i32 10
+ %45 = load i32, i32* %44, align 4
+ %46 = call i8 addrspace(203)* @llvm.tpu.inttoptr.p203i8(i32 %45)
+ %47 = getelementptr i32, i32* %4, i32 0
+ %48 = load i32, i32* %47, align 4
+ %49 = call i8 addrspace(203)* @llvm.tpu.inttoptr.p203i8(i32 %48)
+ br label %50
+
+50: ; preds = %0
+ %51 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(203)* %16, 0
+ %52 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %51, float addrspace(203)* %16, 1
+ %53 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %52, i32 0, 2
+ %54 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %53, i32 2048, 3, 0
+ %55 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %54, i32 8, 4, 0
+ %56 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %55, i32 8, 3, 1
+ %57 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %56, i32 1, 4, 1
+ %58 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(203)* %19, 0
+ %59 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %58, float addrspace(203)* %19, 1
+ %60 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %59, i32 0, 2
+ %61 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %60, i32 2048, 3, 0
+ %62 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %61, i32 8, 4, 0
+ %63 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %62, i32 8, 3, 1
+ %64 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %63, i32 1, 4, 1
+ %65 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(203)* %22, 0
+ %66 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %65, float addrspace(203)* %22, 1
+ %67 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %66, i32 0, 2
+ %68 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %67, i32 2048, 3, 0
+ %69 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %68, i32 8, 4, 0
+ %70 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %69, i32 8, 3, 1
+ %71 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %70, i32 1, 4, 1
+ %72 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(203)* %25, 0
+ %73 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %72, float addrspace(203)* %25, 1
+ %74 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %73, i32 0, 2
+ %75 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %74, i32 2048, 3, 0
+ %76 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %75, i32 8, 4, 0
+ %77 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %76, i32 8, 3, 1
+ %78 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %77, i32 1, 4, 1
+ %79 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } undef, float addrspace(203)* %28, 0
+ %80 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %79, float addrspace(203)* %28, 1
+ %81 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %80, i32 0, 2
+ %82 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %81, i32 8, 3, 0
+ %83 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %82, i32 1, 4, 0
+ %84 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } undef, float addrspace(203)* %31, 0
+ %85 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %84, float addrspace(203)* %31, 1
+ %86 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %85, i32 0, 2
+ %87 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %86, i32 8, 3, 0
+ %88 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %87, i32 1, 4, 0
+ %89 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } undef, float addrspace(203)* %34, 0
+ %90 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %89, float addrspace(203)* %34, 1
+ %91 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %90, i32 0, 2
+ %92 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %91, i32 8, 3, 0
+ %93 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %92, i32 1, 4, 0
+ %94 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } undef, float addrspace(203)* %37, 0
+ %95 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %94, float addrspace(203)* %37, 1
+ %96 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %95, i32 0, 2
+ %97 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %96, i32 8, 3, 0
+ %98 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %97, i32 1, 4, 0
+ %99 = insertvalue { i8 addrspace(203)*, i8 addrspace(203)*, i32, [1 x i32], [1 x i32] } undef, i8 addrspace(203)* %40, 0
+ %100 = insertvalue { i8 addrspace(203)*, i8 addrspace(203)*, i32, [1 x i32], [1 x i32] } %99, i8 addrspace(203)* %40, 1
+ %101 = insertvalue { i8 addrspace(203)*, i8 addrspace(203)*, i32, [1 x i32], [1 x i32] } %100, i32 0, 2
+ %102 = insertvalue { i8 addrspace(203)*, i8 addrspace(203)*, i32, [1 x i32], [1 x i32] } %101, i32 65536, 3, 0
+ %103 = insertvalue { i8 addrspace(203)*, i8 addrspace(203)*, i32, [1 x i32], [1 x i32] } %102, i32 1, 4, 0
+ %104 = insertvalue { i8 addrspace(203)*, i8 addrspace(203)*, i32, [1 x i32], [1 x i32] } undef, i8 addrspace(203)* %43, 0
+ %105 = insertvalue { i8 addrspace(203)*, i8 addrspace(203)*, i32, [1 x i32], [1 x i32] } %104, i8 addrspace(203)* %43, 1
+ %106 = insertvalue { i8 addrspace(203)*, i8 addrspace(203)*, i32, [1 x i32], [1 x i32] } %105, i32 0, 2
+ %107 = insertvalue { i8 addrspace(203)*, i8 addrspace(203)*, i32, [1 x i32], [1 x i32] } %106, i32 65536, 3, 0
+ %108 = insertvalue { i8 addrspace(203)*, i8 addrspace(203)*, i32, [1 x i32], [1 x i32] } %107, i32 1, 4, 0
+ %109 = insertvalue { i8 addrspace(203)*, i8 addrspace(203)*, i32, [1 x i32], [1 x i32] } undef, i8 addrspace(203)* %46, 0
+ %110 = insertvalue { i8 addrspace(203)*, i8 addrspace(203)*, i32, [1 x i32], [1 x i32] } %109, i8 addrspace(203)* %46, 1
+ %111 = insertvalue { i8 addrspace(203)*, i8 addrspace(203)*, i32, [1 x i32], [1 x i32] } %110, i32 0, 2
+ %112 = insertvalue { i8 addrspace(203)*, i8 addrspace(203)*, i32, [1 x i32], [1 x i32] } %111, i32 65536, 3, 0
+ %113 = insertvalue { i8 addrspace(203)*, i8 addrspace(203)*, i32, [1 x i32], [1 x i32] } %112, i32 1, 4, 0
+ %114 = insertvalue { i8 addrspace(203)*, i8 addrspace(203)*, i32, [1 x i32], [1 x i32] } undef, i8 addrspace(203)* %49, 0
+ %115 = insertvalue { i8 addrspace(203)*, i8 addrspace(203)*, i32, [1 x i32], [1 x i32] } %114, i8 addrspace(203)* %49, 1
+ %116 = insertvalue { i8 addrspace(203)*, i8 addrspace(203)*, i32, [1 x i32], [1 x i32] } %115, i32 0, 2
+ %117 = insertvalue { i8 addrspace(203)*, i8 addrspace(203)*, i32, [1 x i32], [1 x i32] } %116, i32 4096, 3, 0
+ %118 = insertvalue { i8 addrspace(203)*, i8 addrspace(203)*, i32, [1 x i32], [1 x i32] } %117, i32 1, 4, 0
+ call void @llvm.tpu.sc.ssettm(i32 1)
+ call void @llvm.tpu.sc.strace(i32 -1879048193)
+ %119 = extractvalue { i8 addrspace(203)*, i8 addrspace(203)*, i32, [1 x i32], [1 x i32] } %103, 0, !dbg !45
+ %120 = bitcast i8 addrspace(203)* %119 to float addrspace(203)*, !dbg !45
+ %121 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(203)* %120, 0, !dbg !45
+ %122 = extractvalue { i8 addrspace(203)*, i8 addrspace(203)*, i32, [1 x i32], [1 x i32] } %103, 1, !dbg !45
+ %123 = getelementptr i8, i8 addrspace(203)* %122, i32 0, !dbg !45
+ %124 = bitcast i8 addrspace(203)* %123 to float addrspace(203)*, !dbg !45
+ %125 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %121, float addrspace(203)* %124, 1, !dbg !45
+ %126 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %125, i32 0, 2, !dbg !45
+ %127 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %126, i32 8, 3, 1, !dbg !45
+ %128 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %127, i32 1, 4, 1, !dbg !45
+ %129 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %128, i32 2048, 3, 0, !dbg !45
+ %130 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %129, i32 8, 4, 0, !dbg !45
+ %131 = extractvalue { i8 addrspace(203)*, i8 addrspace(203)*, i32, [1 x i32], [1 x i32] } %108, 0, !dbg !45
+ %132 = bitcast i8 addrspace(203)* %131 to float addrspace(203)*, !dbg !45
+ %133 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(203)* %132, 0, !dbg !45
+ %134 = extractvalue { i8 addrspace(203)*, i8 addrspace(203)*, i32, [1 x i32], [1 x i32] } %108, 1, !dbg !45
+ %135 = getelementptr i8, i8 addrspace(203)* %134, i32 0, !dbg !45
+ %136 = bitcast i8 addrspace(203)* %135 to float addrspace(203)*, !dbg !45
+ %137 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %133, float addrspace(203)* %136, 1, !dbg !45
+ %138 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %137, i32 0, 2, !dbg !45
+ %139 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %138, i32 8, 3, 1, !dbg !45
+ %140 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %139, i32 1, 4, 1, !dbg !45
+ %141 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %140, i32 2048, 3, 0, !dbg !45
+ %142 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %141, i32 8, 4, 0, !dbg !45
+ %143 = extractvalue { i8 addrspace(203)*, i8 addrspace(203)*, i32, [1 x i32], [1 x i32] } %113, 0, !dbg !45
+ %144 = bitcast i8 addrspace(203)* %143 to float addrspace(203)*, !dbg !45
+ %145 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } undef, float addrspace(203)* %144, 0, !dbg !45
+ %146 = extractvalue { i8 addrspace(203)*, i8 addrspace(203)*, i32, [1 x i32], [1 x i32] } %113, 1, !dbg !45
+ %147 = getelementptr i8, i8 addrspace(203)* %146, i32 0, !dbg !45
+ %148 = bitcast i8 addrspace(203)* %147 to float addrspace(203)*, !dbg !45
+ %149 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %145, float addrspace(203)* %148, 1, !dbg !45
+ %150 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %149, i32 0, 2, !dbg !45
+ %151 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %150, i32 8, 3, 1, !dbg !45
+ %152 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %151, i32 1, 4, 1, !dbg !45
+ %153 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %152, i32 2048, 3, 0, !dbg !45
+ %154 = insertvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %153, i32 8, 4, 0, !dbg !45
+ %155 = extractvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %57, 1, !dbg !45
+ %156 = extractvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %64, 1, !dbg !45
+ %157 = extractvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %71, 1, !dbg !45
+ %158 = extractvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %78, 1, !dbg !45
+ %159 = extractvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %83, 1, !dbg !45
+ %160 = extractvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %88, 1, !dbg !45
+ %161 = extractvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %93, 1, !dbg !45
+ %162 = extractvalue { float addrspace(203)*, float addrspace(203)*, i32, [1 x i32], [1 x i32] } %98, 1, !dbg !45
+ %163 = extractvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %130, 1, !dbg !45
+ %164 = extractvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %142, 1, !dbg !45
+ %165 = extractvalue { float addrspace(203)*, float addrspace(203)*, i32, [2 x i32], [2 x i32] } %154, 1, !dbg !45
+ %166 = call i32 addrspace(208)* @llvm.tpu.allocate.dreg(i32 13, i32 0), !dbg !45
+ %167 = getelementptr i32, i32 addrspace(208)* %166, i32 0, !dbg !45
+ store i32 ptrtoint (void ()* @access0_lowered to i32), i32 addrspace(208)* %167, align 4, !dbg !45
+ %168 = getelementptr i32, i32 addrspace(208)* %166, i32 1, !dbg !45
+ store i32 ptrtoint (void ()* @execute1_lowered to i32), i32 addrspace(208)* %168, align 4, !dbg !45
+ %169 = getelementptr i32, i32 addrspace(208)* %166, i32 2, !dbg !45
+ %170 = ptrtoint float addrspace(203)* %155 to i32, !dbg !45
+ store i32 %170, i32 addrspace(208)* %169, align 4, !dbg !45
+ %171 = getelementptr i32, i32 addrspace(208)* %166, i32 3, !dbg !45
+ %172 = ptrtoint float addrspace(203)* %156 to i32, !dbg !45
+ store i32 %172, i32 addrspace(208)* %171, align 4, !dbg !45
+ %173 = getelementptr i32, i32 addrspace(208)* %166, i32 4, !dbg !45
+ %174 = ptrtoint float addrspace(203)* %157 to i32, !dbg !45
+ store i32 %174, i32 addrspace(208)* %173, align 4, !dbg !45
+ %175 = getelementptr i32, i32 addrspace(208)* %166, i32 5, !dbg !45
+ %176 = ptrtoint float addrspace(203)* %158 to i32, !dbg !45
+ store i32 %176, i32 addrspace(208)* %175, align 4, !dbg !45
+ %177 = getelementptr i32, i32 addrspace(208)* %166, i32 6, !dbg !45
+ %178 = ptrtoint float addrspace(203)* %159 to i32, !dbg !45
+ store i32 %178, i32 addrspace(208)* %177, align 4, !dbg !45
+ %179 = getelementptr i32, i32 addrspace(208)* %166, i32 7, !dbg !45
+ %180 = ptrtoint float addrspace(203)* %160 to i32, !dbg !45
+ store i32 %180, i32 addrspace(208)* %179, align 4, !dbg !45
+ %181 = getelementptr i32, i32 addrspace(208)* %166, i32 8, !dbg !45
+ %182 = ptrtoint float addrspace(203)* %161 to i32, !dbg !45
+ store i32 %182, i32 addrspace(208)* %181, align 4, !dbg !45
+ %183 = getelementptr i32, i32 addrspace(208)* %166, i32 9, !dbg !45
+ %184 = ptrtoint float addrspace(203)* %162 to i32, !dbg !45
+ store i32 %184, i32 addrspace(208)* %183, align 4, !dbg !45
+ %185 = getelementptr i32, i32 addrspace(208)* %166, i32 10, !dbg !45
+ %186 = ptrtoint float addrspace(203)* %163 to i32, !dbg !45
+ store i32 %186, i32 addrspace(208)* %185, align 4, !dbg !45
+ %187 = getelementptr i32, i32 addrspace(208)* %166, i32 11, !dbg !45
+ %188 = ptrtoint float addrspace(203)* %164 to i32, !dbg !45
+ store i32 %188, i32 addrspace(208)* %187, align 4, !dbg !45
+ %189 = getelementptr i32, i32 addrspace(208)* %166, i32 12, !dbg !45
+ %190 = ptrtoint float addrspace(203)* %165 to i32, !dbg !45
+ store i32 %190, i32 addrspace(208)* %189, align 4, !dbg !45
+ call void @llvm.tpu.task.dispatch(i32 addrspace(208)* %166, i32 851969), !dbg !45
+ fence seq_cst
+ %191 = call i32 addrspace(208)* @llvm.tpu.allocate.dreg(i32 2, i32 0)
+ %192 = getelementptr i32, i32 addrspace(208)* %191, i32 0
+ store i32 ptrtoint (void ()* @access2_lowered to i32), i32 addrspace(208)* %192, align 4
+ %193 = getelementptr i32, i32 addrspace(208)* %191, i32 1
+ store i32 ptrtoint (void ()* @execute3_lowered to i32), i32 addrspace(208)* %193, align 4
+ call void @llvm.tpu.task.dispatch.clear.ibuf(i32 addrspace(208)* %191, i32 196607)
+ call void @llvm.tpu.sc.strace(i32 -1610612737)
+ call void @llvm.tpu.sc.ssettm(i32 2147483647)
+ ret void
+}
+
+; Function Attrs: nounwind
+declare i32 addrspace(208)* @llvm.tpu.allocate.dreg(i32, i32) #3
+
+; Function Attrs: nounwind readnone
+declare float addrspace(203)* @llvm.tpu.inttoptr.p203f32(i32) #4
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.tpu.tileid() #4
+
+; Function Attrs: nounwind
+declare i32 addrspace(201)* @llvm.tpu.allocate.tilespmem(i32, i32) #3
+
+; Function Attrs: nounwind
+declare i32 addrspace(204)* @llvm.tpu.allocate.sflag(i32, i32) #3
+
+; Function Attrs: nounwind
+declare void @llvm.tpu.barrier(i32, i32) #3
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* readonly, i32, i32 addrspace(203)* readonly, i32 addrspace(201)*, i32, i32) #5
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* readonly, i32, i32 addrspace(203)* readonly, i32 addrspace(201)*, i32, i32) #5
+
+; Function Attrs: nounwind
+declare void @llvm.tpu.waitge(i32 addrspace(204)*, i32) #3
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.tpu.syncadd(i32 addrspace(204)*, i32) #6
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* readonly, i32, i32 addrspace(201)*, i32 addrspace(203)* writeonly, i32, i32) #5
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare <8 x float> @llvm.tpu.log2.macro.v8f32(<8 x float>) #7
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare <8 x float> @llvm.tpu.pow2.macro.v8f32(<8 x float>) #7
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare <8 x float> @llvm.minimum.v8f32(<8 x float>, <8 x float>) #8
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare <8 x float> @llvm.maximum.v8f32(<8 x float>, <8 x float>) #8
+
+; Function Attrs: nounwind
+declare i32* @llvm.tpu.allocate.smem(i32, i32) #3
+
+; Function Attrs: nounwind
+declare i32* @llvm.tpu.alloca.smem(i32) #3
+
+; Function Attrs: nounwind
+declare i32 addrspace(204)* @llvm.tpu.alloca.sflag(i32) #3
+
+; Function Attrs: nounwind readnone
+declare i32 addrspace(203)* @llvm.tpu.inttoptr.p203i32(i32) #4
+
+; Function Attrs: nounwind readnone
+declare i32 addrspace(211)* @llvm.tpu.addrspacecast.scs(i32 addrspace(204)*) #4
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.tpu.dma.hbm.to.smem.sc.simple(i32 addrspace(211)*, i32 addrspace(203)*, i32*, i32, i32) #6
+
+; Function Attrs: nounwind readnone
+declare i8 addrspace(203)* @llvm.tpu.inttoptr.p203i8(i32) #4
+
+; Function Attrs: nounwind
+declare void @llvm.tpu.sc.ssettm(i32) #3
+
+; Function Attrs: nounwind
+declare void @llvm.tpu.sc.strace(i32) #3
+
+; Function Attrs: nounwind
+declare void @llvm.tpu.task.dispatch(i32 addrspace(208)*, i32) #3
+
+; Function Attrs: nounwind
+declare void @llvm.tpu.task.dispatch.clear.ibuf(i32 addrspace(208)*, i32) #3
+
+attributes #0 = { "implicit-section-name"=".text.tile_execute" "target-cpu"="sparsecore-tec-vf" }
+attributes #1 = { "implicit-section-name"=".text.tile_access" "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "implicit-section-name"=".text.scs" "target-cpu"="sparsecore-scs-vf" }
+attributes #3 = { nounwind }
+attributes #4 = { nounwind readnone }
+attributes #5 = { argmemonly nounwind willreturn }
+attributes #6 = { argmemonly nounwind }
+attributes #7 = { nounwind readnone speculatable willreturn }
+attributes #8 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+!smem.funcs.spill = !{!3, !4, !5, !6, !7}
+!smem.ranges.spill.start = !{!8, !8, !8, !8, !9}
+!smem.ranges.spill.limit = !{!10, !10, !10, !10, !11}
+!tilespmem.funcs.spill = !{!3, !4, !5, !6, !7}
+!tilespmem.ranges.spill.start = !{!8, !8, !12, !8, !8}
+!tilespmem.ranges.spill.limit = !{!13, !8, !13, !8, !8}
+!vmem.funcs.spill = !{!3, !4, !5, !6, !7}
+!vmem.ranges.spill.start = !{!8, !8, !8, !8, !8}
+!vmem.ranges.spill.limit = !{!8, !8, !8, !8, !8}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "mlir", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)
+!1 = !DIFile(filename: "LLVMDialectModule", directory: "/")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{void ()* @execute3_lowered}
+!4 = !{void ()* @access2_lowered}
+!5 = !{void ()* @execute1_lowered}
+!6 = !{void ()* @access0_lowered}
+!7 = !{void ()* @main_lowered}
+!8 = !{i32 0}
+!9 = !{i32 25}
+!10 = !{i32 2048}
+!11 = !{i32 16350}
+!12 = !{i32 3588}
+!13 = !{i32 131072}
+!14 = distinct !DISubprogram(name: "execute3_lowered", linkageName: "execute3_lowered", scope: null, file: !15, type: !16, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !17)
+!15 = !DIFile(filename: "<unknown>", directory: "/usr/local/google/_blaze_hgreving/68b472e6f83a7b1f60140f59281cdf52/execroot/google3/blaze-out/k8-fastbuild/bin/platforms/xla/sparse_core/execution_tests/ftrl_test_viperfish_iss_sparsecore.runfiles/google3")
+!16 = !DISubroutineType(types: !17)
+!17 = !{}
+!18 = distinct !DISubprogram(name: "access2_lowered", linkageName: "access2_lowered", scope: null, file: !15, type: !16, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !17)
+!19 = distinct !DISubprogram(name: "execute1_lowered", linkageName: "execute1_lowered", scope: null, file: !20, line: 1, type: !16, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !17)
+!20 = !DIFile(filename: "fusion", directory: "/usr/local/google/_blaze_hgreving/68b472e6f83a7b1f60140f59281cdf52/execroot/google3/blaze-out/k8-fastbuild/bin/platforms/xla/sparse_core/execution_tests/ftrl_test_viperfish_iss_sparsecore.runfiles/google3")
+!21 = !DILocation(line: 1, column: 1, scope: !22)
+!22 = !DILexicalBlockFile(scope: !19, file: !20, discriminator: 0)
+!23 = distinct !{!23, !24}
+!24 = !{!"llvm.loop.unroll.disable", i1 true}
+!25 = distinct !{!25, !26}
+!26 = !{!"llvm.loop.parallel_accesses", !27}
+!27 = distinct !{}
+!28 = !{!29}
+!29 = distinct !{!29, !30, !"Unrolled iteration #0"}
+!30 = distinct !{!30}
+!31 = !{!32, !33, !34}
+!32 = distinct !{!32, !30, !"Unrolled iteration #1"}
+!33 = distinct !{!33, !30, !"Unrolled iteration #2"}
+!34 = distinct !{!34, !30, !"Unrolled iteration #3"}
+!35 = !{!32}
+!36 = !{!29, !33, !34}
+!37 = !{!33}
+!38 = !{!29, !32, !34}
+!39 = !{!34}
+!40 = !{!29, !32, !33}
+!41 = distinct !DISubprogram(name: "access0_lowered", linkageName: "access0_lowered", scope: null, file: !20, line: 1, type: !16, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !17)
+!42 = !DILocation(line: 1, column: 1, scope: !43)
+!43 = !DILexicalBlockFile(scope: !41, file: !20, discriminator: 0)
+!44 = distinct !DISubprogram(name: "main_lowered", linkageName: "main_lowered", scope: null, file: !15, type: !16, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !17)
+!45 = !DILocation(line: 1, column: 1, scope: !46)
+!46 = !DILexicalBlockFile(scope: !44, file: !20, discriminator: 0)
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/ftrl_pathological_super_pass_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/ftrl_pathological_super_pass_sc.ll
new file mode 100644
index 0000000..7b3795b
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/ftrl_pathological_super_pass_sc.ll
@@ -0,0 +1,2481 @@
+; RUN: opt -S -O2 -mcpu=sparsecore-tec-vf < %s \
+; RUN: | llc -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -tpu-pipeliner-strategy=swingslack -tpu-fixed-vregs=32-63 \
+; RUN: -tpu-enable-pipeliner-super-pass -tpu-pipeliner-annotate-for-testing \
+; RUN: -enable-pre-spill -debug-only=tpu-loop-analysis -tpu-enable-loop-analysis \
+; RUN: -improve-prolog-epilog-aa=false 2>&1 | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; FIXME(hgreving): When running with scc, I am getting II=69. The exact
+; reason is unclear, but the optimizer's output is different. We should
+; find out exactly what differs, and why we can't mitigate the spills
+; here as well as in the scc case.
+; CHECK: Post-RA pipelined loop bb.2 (from bb.2): II=72
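+; (II is the software pipeliner's initiation interval, i.e. the number of
+; cycles between the start of successive loop iterations; lower is better.)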
+
+%"class.embeddings::PointerBase" = type { %"class.embeddings::MemorySpace", %"class.embeddings::BasicType", %"union.embeddings::PointerBase::AnyPtr" }
+%"class.embeddings::MemorySpace" = type { i32 }
+%"class.embeddings::BasicType" = type { i32 }
+%"union.embeddings::PointerBase::AnyPtr" = type { i32* }
+%"class.embeddings::TileSpmemVectorArray" = type { %"class.embeddings::ScratchpadArray" }
+%"class.embeddings::ScratchpadArray" = type { %"class.embeddings::BaseArray" }
+%"class.embeddings::BaseArray" = type { %"class.embeddings::PointerBase", i32 }
+%"class.embeddings::Ftrl" = type { float, float, float, float }
+%"class.embeddings::TileSpmemPointer" = type { %"class.embeddings::PointerBase" }
+%class.anon = type { i8 }
+%class.anon.0 = type { i8 }
+%class.anon.2 = type { i8 }
+%class.anon.4 = type { i8 }
+%class.anon.6 = type { i8 }
+%class.anon.8 = type { i8 }
+%class.anon.10 = type { i8 }
+%class.anon.12 = type { i8 }
+%"class.embeddings::SCTY_V8F32" = type { %"class.embeddings::BasicType" }
+%"class.embeddings::SCM_TileSpmem" = type { %"class.embeddings::MemorySpace" }
+
+$_ZN10embeddings14PointerFactoryIPU5AS201Dv8_fE6CreateES3_ = comdat any
+
+$_ZN10embeddings11PointerBaseC2ERKS0_ = comdat any
+
+$_ZN10embeddings20TileSpmemVectorArrayIfEC2ENS_11PointerBaseEi = comdat any
+
+$_ZN10embeddings4FtrlC2Effff = comdat any
+
+$_ZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ = comdat any
+
+$_ZN10embeddings11MemorySpaceC2ERKS0_ = comdat any
+
+$_ZN10embeddings9BasicTypeC2ERKS0_ = comdat any
+
+$_ZNK10embeddings9BaseArray8ElementsEv = comdat any
+
+$_ZN10embeddings20TileSpmemVectorArrayIfEclEi = comdat any
+
+$_ZNK10embeddings20TileSpmemVectorArrayIfEclEi = comdat any
+
+$_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE_clES6_f = comdat any
+
+$_ZN10embeddings4Ftrl21GetMinusLinearReducedEDv8_f = comdat any
+
+$_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE0_clES6_f = comdat any
+
+$_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE1_clES6_f = comdat any
+
+$_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE2_clES6_f = comdat any
+
+$_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE3_clES6_f = comdat any
+
+$_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE4_clES6_f = comdat any
+
+$_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE5_clES6_f = comdat any
+
+$_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE6_clES6_f = comdat any
+
+$_ZNK10embeddings20TileSpmemVectorArrayIfE4BaseEv = comdat any
+
+$_ZN10embeddings4CastINS_16TileSpmemPointerENS_11PointerBaseEEENS_15cast_retty_implIT_T0_E8ret_typeERKS5_ = comdat any
+
+$_ZNK10embeddings9BaseArray7BasePtrEv = comdat any
+
+$_ZNK10embeddings16TileSpmemPointer6RawPtrEv = comdat any
+
+$_ZN10embeddings4Ftrl14ClampSymmetricEDv8_ff = comdat any
+
+$_ZN10embeddings15ScratchpadArrayC2ENS_11PointerBaseEi = comdat any
+
+$_ZN10embeddings9BaseArrayC2ENS_11PointerBaseEi = comdat any
+
+$_ZN10embeddings11ToBasicTypeIDv8_fE10basic_typeEv = comdat any
+
+$_ZN10embeddings16TileSpmemPointerC2EPU5AS201vNS_9BasicTypeE = comdat any
+
+$_ZN10embeddings11PointerBaseC2EOS0_ = comdat any
+
+$_ZN10embeddings10SCTY_V8F32C2Ev = comdat any
+
+$_ZN10embeddings9BasicTypeC2ENS_19SparsecoreBasicTypeE = comdat any
+
+$_ZN10embeddings11PointerBaseC2ENS_9BasicTypeEPU5AS201v = comdat any
+
+$_ZN10embeddings13SCM_TileSpmemC2Ev = comdat any
+
+$_ZN10embeddings11PointerBase6AnyPtrC2EPU5AS201v = comdat any
+
+$_ZN10embeddings11MemorySpaceC2ENS_21SparsecoreMemorySpaceE = comdat any
+
+@__sc_scs_entry = dso_local alias i32, bitcast (void ()* @scs to i32*)
+@__sc_tile_access_entry = dso_local alias i32, bitcast (void ()* @tile_access to i32*)
+@__sc_tile_execute_entry = dso_local alias i32, bitcast (void ()* @tile_execute to i32*)
+
+; Function Attrs: mustprogress nounwind
+define dso_local void @tile_access() #0 section ".text.tile_access" {
+ ret void
+}
+
+; Function Attrs: mustprogress
+define dso_local void @tile_execute() #1 section ".text.tile_execute" {
+ %1 = alloca i32, align 4
+ %2 = alloca %"class.embeddings::PointerBase", align 4
+ %3 = alloca %"class.embeddings::PointerBase", align 4
+ %4 = alloca %"class.embeddings::PointerBase", align 4
+ %5 = alloca %"class.embeddings::PointerBase", align 4
+ %6 = alloca float, align 4
+ %7 = alloca float, align 4
+ %8 = alloca float, align 4
+ %9 = alloca float, align 4
+ %10 = alloca %"class.embeddings::TileSpmemVectorArray", align 4
+ %11 = alloca %"class.embeddings::PointerBase", align 4
+ %12 = alloca %"class.embeddings::TileSpmemVectorArray", align 4
+ %13 = alloca %"class.embeddings::PointerBase", align 4
+ %14 = alloca %"class.embeddings::TileSpmemVectorArray", align 4
+ %15 = alloca %"class.embeddings::PointerBase", align 4
+ %16 = alloca %"class.embeddings::TileSpmemVectorArray", align 4
+ %17 = alloca %"class.embeddings::PointerBase", align 4
+ %18 = alloca %"class.embeddings::Ftrl", align 4
+ %19 = bitcast i32* %1 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %19) #15
+ %20 = call noundef i32 @_ZN12_GLOBAL__N_19ParameterEi(i32 noundef 0) #19
+ store i32 %20, i32* %1, align 4, !tbaa !3
+ %21 = bitcast %"class.embeddings::PointerBase"* %2 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 12, i8* %21) #15
+ %22 = call noundef i32 @_ZN12_GLOBAL__N_19ParameterEi(i32 noundef 1) #19
+ %23 = inttoptr i32 %22 to <8 x float> addrspace(201)*
+ call void @_ZN10embeddings14PointerFactoryIPU5AS201Dv8_fE6CreateES3_(%"class.embeddings::PointerBase"* sret(%"class.embeddings::PointerBase") align 4 %2, <8 x float> addrspace(201)* noundef %23) #19
+ %24 = bitcast %"class.embeddings::PointerBase"* %3 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 12, i8* %24) #15
+ %25 = call noundef i32 @_ZN12_GLOBAL__N_19ParameterEi(i32 noundef 2) #19
+ %26 = inttoptr i32 %25 to <8 x float> addrspace(201)*
+ call void @_ZN10embeddings14PointerFactoryIPU5AS201Dv8_fE6CreateES3_(%"class.embeddings::PointerBase"* sret(%"class.embeddings::PointerBase") align 4 %3, <8 x float> addrspace(201)* noundef %26) #19
+ %27 = bitcast %"class.embeddings::PointerBase"* %4 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 12, i8* %27) #15
+ %28 = call noundef i32 @_ZN12_GLOBAL__N_19ParameterEi(i32 noundef 3) #19
+ %29 = inttoptr i32 %28 to <8 x float> addrspace(201)*
+ call void @_ZN10embeddings14PointerFactoryIPU5AS201Dv8_fE6CreateES3_(%"class.embeddings::PointerBase"* sret(%"class.embeddings::PointerBase") align 4 %4, <8 x float> addrspace(201)* noundef %29) #19
+ %30 = bitcast %"class.embeddings::PointerBase"* %5 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 12, i8* %30) #15
+ %31 = call noundef i32 @_ZN12_GLOBAL__N_19ParameterEi(i32 noundef 4) #19
+ %32 = inttoptr i32 %31 to <8 x float> addrspace(201)*
+ call void @_ZN10embeddings14PointerFactoryIPU5AS201Dv8_fE6CreateES3_(%"class.embeddings::PointerBase"* sret(%"class.embeddings::PointerBase") align 4 %5, <8 x float> addrspace(201)* noundef %32) #19
+ %33 = bitcast float* %6 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %33) #15
+ store float 2.000000e+00, float* %6, align 4, !tbaa !7
+ %34 = bitcast float* %7 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %34) #15
+ store float 5.000000e-01, float* %7, align 4, !tbaa !7
+ %35 = bitcast float* %8 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %35) #15
+ store float 1.000000e+00, float* %8, align 4, !tbaa !7
+ %36 = bitcast float* %9 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %36) #15
+ store float 1.000000e+00, float* %9, align 4, !tbaa !7
+ %37 = bitcast %"class.embeddings::TileSpmemVectorArray"* %10 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %37) #15
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %11, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %2) #19
+ %38 = load i32, i32* %1, align 4, !tbaa !3
+ %39 = sdiv i32 %38, 8
+ call void @_ZN10embeddings20TileSpmemVectorArrayIfEC2ENS_11PointerBaseEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %10, %"class.embeddings::PointerBase"* noundef %11, i32 noundef %39) #19
+ %40 = bitcast %"class.embeddings::TileSpmemVectorArray"* %12 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %40) #15
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %13, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %3) #19
+ %41 = load i32, i32* %1, align 4, !tbaa !3
+ %42 = sdiv i32 %41, 8
+ call void @_ZN10embeddings20TileSpmemVectorArrayIfEC2ENS_11PointerBaseEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %12, %"class.embeddings::PointerBase"* noundef %13, i32 noundef %42) #19
+ %43 = bitcast %"class.embeddings::TileSpmemVectorArray"* %14 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %43) #15
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %15, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %4) #19
+ %44 = load i32, i32* %1, align 4, !tbaa !3
+ %45 = sdiv i32 %44, 8
+ call void @_ZN10embeddings20TileSpmemVectorArrayIfEC2ENS_11PointerBaseEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %14, %"class.embeddings::PointerBase"* noundef %15, i32 noundef %45) #19
+ %46 = bitcast %"class.embeddings::TileSpmemVectorArray"* %16 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %46) #15
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %17, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %5) #19
+ %47 = load i32, i32* %1, align 4, !tbaa !3
+ %48 = sdiv i32 %47, 8
+ call void @_ZN10embeddings20TileSpmemVectorArrayIfEC2ENS_11PointerBaseEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %16, %"class.embeddings::PointerBase"* noundef %17, i32 noundef %48) #19
+ %49 = bitcast %"class.embeddings::Ftrl"* %18 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %49) #15
+ call void @_ZN10embeddings4FtrlC2Effff(%"class.embeddings::Ftrl"* noundef nonnull align 4 dereferenceable(16) %18, float noundef 2.000000e+00, float noundef 5.000000e-01, float noundef 1.000000e+00, float noundef 1.000000e+00) #19
+ call void @_ZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_(%"class.embeddings::Ftrl"* noundef nonnull align 4 dereferenceable(16) %18, %"class.embeddings::TileSpmemVectorArray"* noundef %10, %"class.embeddings::TileSpmemVectorArray"* noundef %12, %"class.embeddings::TileSpmemVectorArray"* noundef %14, %"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %16) #19
+ call void @_ZN12_GLOBAL__N_16ReturnEii(i32 noundef 1, i32 noundef 0) #19
+ %50 = bitcast %"class.embeddings::Ftrl"* %18 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %50) #15
+ %51 = bitcast %"class.embeddings::TileSpmemVectorArray"* %16 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %51) #15
+ %52 = bitcast %"class.embeddings::TileSpmemVectorArray"* %14 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %52) #15
+ %53 = bitcast %"class.embeddings::TileSpmemVectorArray"* %12 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %53) #15
+ %54 = bitcast %"class.embeddings::TileSpmemVectorArray"* %10 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %54) #15
+ %55 = bitcast float* %9 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %55) #15
+ %56 = bitcast float* %8 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %56) #15
+ %57 = bitcast float* %7 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %57) #15
+ %58 = bitcast float* %6 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %58) #15
+ %59 = bitcast %"class.embeddings::PointerBase"* %5 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 12, i8* %59) #15
+ %60 = bitcast %"class.embeddings::PointerBase"* %4 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 12, i8* %60) #15
+ %61 = bitcast %"class.embeddings::PointerBase"* %3 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 12, i8* %61) #15
+ %62 = bitcast %"class.embeddings::PointerBase"* %2 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 12, i8* %62) #15
+ %63 = bitcast i32* %1 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %63) #15
+ ret void
+}
+
+; Function Attrs: argmemonly nocallback nofree nosync nounwind willreturn
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #2
+
+; Function Attrs: mustprogress nounwind
+define internal noundef i32 @_ZN12_GLOBAL__N_19ParameterEi(i32 noundef %0) #3 {
+ %2 = alloca i32, align 4
+ %3 = alloca i32*, align 4
+ store i32 %0, i32* %2, align 4, !tbaa !3
+ %4 = bitcast i32** %3 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #15
+ %5 = load i32, i32* %2, align 4, !tbaa !3
+ %6 = add nsw i32 256, %5
+ %7 = inttoptr i32 %6 to i32*
+ store i32* %7, i32** %3, align 4, !tbaa !9
+ %8 = load i32*, i32** %3, align 4, !tbaa !9
+ %9 = load i32, i32* %8, align 4, !tbaa !3
+ %10 = bitcast i32** %3 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %10) #15
+ ret i32 %9
+}
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local void @_ZN10embeddings14PointerFactoryIPU5AS201Dv8_fE6CreateES3_(%"class.embeddings::PointerBase"* noalias sret(%"class.embeddings::PointerBase") align 4 %0, <8 x float> addrspace(201)* noundef %1) #4 comdat align 2 {
+ %3 = alloca i8*, align 4
+ %4 = alloca <8 x float> addrspace(201)*, align 4
+ %5 = alloca %"class.embeddings::TileSpmemPointer", align 4
+ %6 = alloca %"class.embeddings::BasicType", align 4
+ %7 = bitcast %"class.embeddings::PointerBase"* %0 to i8*
+ store i8* %7, i8** %3, align 4
+ store <8 x float> addrspace(201)* %1, <8 x float> addrspace(201)** %4, align 4, !tbaa !9
+ %8 = bitcast %"class.embeddings::TileSpmemPointer"* %5 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 12, i8* %8) #15
+ %9 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** %4, align 4, !tbaa !9
+ %10 = bitcast <8 x float> addrspace(201)* %9 to i8 addrspace(201)*
+ call void @_ZN10embeddings11ToBasicTypeIDv8_fE10basic_typeEv(%"class.embeddings::BasicType"* sret(%"class.embeddings::BasicType") align 4 %6) #20
+ call void @_ZN10embeddings16TileSpmemPointerC2EPU5AS201vNS_9BasicTypeE(%"class.embeddings::TileSpmemPointer"* noundef nonnull align 4 dereferenceable(12) %5, i8 addrspace(201)* noundef %10, %"class.embeddings::BasicType"* noundef %6) #20
+ %11 = bitcast %"class.embeddings::TileSpmemPointer"* %5 to %"class.embeddings::PointerBase"*
+ call void @_ZN10embeddings11PointerBaseC2EOS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %0, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %11) #20
+ %12 = bitcast %"class.embeddings::TileSpmemPointer"* %5 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 12, i8* %12) #15
+ ret void
+}
+
+; Function Attrs: inlinehint
+define linkonce_odr dso_local void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %0, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %1) unnamed_addr #5 comdat align 2 {
+ %3 = alloca %"class.embeddings::PointerBase"*, align 4
+ %4 = alloca %"class.embeddings::PointerBase"*, align 4
+ store %"class.embeddings::PointerBase"* %0, %"class.embeddings::PointerBase"** %3, align 4, !tbaa !9
+ store %"class.embeddings::PointerBase"* %1, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !9
+ %5 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %3, align 4
+ %6 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %5, i32 0, i32 0
+ %7 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !9
+ %8 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %7, i32 0, i32 0
+ call void @_ZN10embeddings11MemorySpaceC2ERKS0_(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %6, %"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %8) #20
+ %9 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %5, i32 0, i32 1
+ %10 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !9
+ %11 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %10, i32 0, i32 1
+ call void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %9, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %11) #20
+ %12 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %5, i32 0, i32 2
+ %13 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !9
+ %14 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %13, i32 0, i32 2
+ %15 = bitcast %"union.embeddings::PointerBase::AnyPtr"* %12 to i8*
+ %16 = bitcast %"union.embeddings::PointerBase::AnyPtr"* %14 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %15, i8* align 4 %16, i32 4, i1 false), !tbaa.struct !11
+ ret void
+}
+
+; Function Attrs: alwaysinline
+define linkonce_odr dso_local void @_ZN10embeddings20TileSpmemVectorArrayIfEC2ENS_11PointerBaseEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::PointerBase"* noundef %1, i32 noundef %2) unnamed_addr #6 comdat align 2 {
+ %4 = alloca %"class.embeddings::TileSpmemVectorArray"*, align 4
+ %5 = alloca i32, align 4
+ %6 = alloca %"class.embeddings::PointerBase", align 4
+ store %"class.embeddings::TileSpmemVectorArray"* %0, %"class.embeddings::TileSpmemVectorArray"** %4, align 4, !tbaa !9
+ store i32 %2, i32* %5, align 4, !tbaa !3
+ %7 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %4, align 4
+ %8 = bitcast %"class.embeddings::TileSpmemVectorArray"* %7 to %"class.embeddings::ScratchpadArray"*
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %6, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %1) #20
+ %9 = load i32, i32* %5, align 4, !tbaa !3
+ call void @_ZN10embeddings15ScratchpadArrayC2ENS_11PointerBaseEi(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %8, %"class.embeddings::PointerBase"* noundef %6, i32 noundef %9) #20
+ ret void
+}
+
+; Function Attrs: alwaysinline nounwind
+define linkonce_odr dso_local void @_ZN10embeddings4FtrlC2Effff(%"class.embeddings::Ftrl"* noundef nonnull align 4 dereferenceable(16) %0, float noundef %1, float noundef %2, float noundef %3, float noundef %4) unnamed_addr #7 comdat align 2 {
+ %6 = alloca %"class.embeddings::Ftrl"*, align 4
+ %7 = alloca float, align 4
+ %8 = alloca float, align 4
+ %9 = alloca float, align 4
+ %10 = alloca float, align 4
+ store %"class.embeddings::Ftrl"* %0, %"class.embeddings::Ftrl"** %6, align 4, !tbaa !9
+ store float %1, float* %7, align 4, !tbaa !7
+ store float %2, float* %8, align 4, !tbaa !7
+ store float %3, float* %9, align 4, !tbaa !7
+ store float %4, float* %10, align 4, !tbaa !7
+ %11 = load %"class.embeddings::Ftrl"*, %"class.embeddings::Ftrl"** %6, align 4
+ %12 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %11, i32 0, i32 0
+ %13 = load float, float* %7, align 4, !tbaa !7
+ store float %13, float* %12, align 4, !tbaa !12
+ %14 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %11, i32 0, i32 1
+ %15 = load float, float* %8, align 4, !tbaa !7
+ store float %15, float* %14, align 4, !tbaa !14
+ %16 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %11, i32 0, i32 2
+ %17 = load float, float* %9, align 4, !tbaa !7
+ store float %17, float* %16, align 4, !tbaa !15
+ %18 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %11, i32 0, i32 3
+ %19 = load float, float* %10, align 4, !tbaa !7
+ store float %19, float* %18, align 4, !tbaa !16
+ ret void
+}
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local void @_ZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_(%"class.embeddings::Ftrl"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::TileSpmemVectorArray"* noundef %1, %"class.embeddings::TileSpmemVectorArray"* noundef %2, %"class.embeddings::TileSpmemVectorArray"* noundef %3, %"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %4) #8 comdat align 2 {
+ %6 = alloca %"class.embeddings::Ftrl"*, align 4
+ %7 = alloca %"class.embeddings::TileSpmemVectorArray"*, align 4
+ %8 = alloca %"class.embeddings::TileSpmemVectorArray"*, align 4
+ %9 = alloca %"class.embeddings::TileSpmemVectorArray"*, align 4
+ %10 = alloca %"class.embeddings::TileSpmemVectorArray"*, align 4
+ %11 = alloca i32, align 4
+ %12 = alloca i32, align 4
+ %13 = alloca <8 x float>, align 32
+ %14 = alloca <8 x float>, align 32
+ %15 = alloca <8 x float>, align 32
+ %16 = alloca <8 x float>, align 32
+ %17 = alloca <8 x float>, align 32
+ %18 = alloca %class.anon, align 1
+ %19 = alloca <8 x float>, align 32
+ %20 = alloca <8 x float>, align 32
+ %21 = alloca <8 x float>, align 32
+ %22 = alloca <8 x float>, align 32
+ %23 = alloca i32, align 4
+ %24 = alloca <8 x float>, align 32
+ %25 = alloca <8 x float>, align 32
+ %26 = alloca <8 x float>, align 32
+ %27 = alloca <8 x float>, align 32
+ %28 = alloca <8 x float>, align 32
+ %29 = alloca %class.anon.0, align 1
+ %30 = alloca <8 x float>, align 32
+ %31 = alloca <8 x float>, align 32
+ %32 = alloca <8 x float>, align 32
+ %33 = alloca <8 x float>, align 32
+ %34 = alloca i32, align 4
+ %35 = alloca <8 x float>, align 32
+ %36 = alloca <8 x float>, align 32
+ %37 = alloca <8 x float>, align 32
+ %38 = alloca <8 x float>, align 32
+ %39 = alloca <8 x float>, align 32
+ %40 = alloca %class.anon.2, align 1
+ %41 = alloca <8 x float>, align 32
+ %42 = alloca <8 x float>, align 32
+ %43 = alloca <8 x float>, align 32
+ %44 = alloca <8 x float>, align 32
+ %45 = alloca i32, align 4
+ %46 = alloca <8 x float>, align 32
+ %47 = alloca <8 x float>, align 32
+ %48 = alloca <8 x float>, align 32
+ %49 = alloca <8 x float>, align 32
+ %50 = alloca <8 x float>, align 32
+ %51 = alloca %class.anon.4, align 1
+ %52 = alloca <8 x float>, align 32
+ %53 = alloca <8 x float>, align 32
+ %54 = alloca <8 x float>, align 32
+ %55 = alloca <8 x float>, align 32
+ %56 = alloca i32, align 4
+ %57 = alloca <8 x float>, align 32
+ %58 = alloca <8 x float>, align 32
+ %59 = alloca <8 x float>, align 32
+ %60 = alloca <8 x float>, align 32
+ %61 = alloca <8 x float>, align 32
+ %62 = alloca %class.anon.6, align 1
+ %63 = alloca <8 x float>, align 32
+ %64 = alloca <8 x float>, align 32
+ %65 = alloca <8 x float>, align 32
+ %66 = alloca <8 x float>, align 32
+ %67 = alloca i32, align 4
+ %68 = alloca <8 x float>, align 32
+ %69 = alloca <8 x float>, align 32
+ %70 = alloca <8 x float>, align 32
+ %71 = alloca <8 x float>, align 32
+ %72 = alloca <8 x float>, align 32
+ %73 = alloca %class.anon.8, align 1
+ %74 = alloca <8 x float>, align 32
+ %75 = alloca <8 x float>, align 32
+ %76 = alloca <8 x float>, align 32
+ %77 = alloca <8 x float>, align 32
+ %78 = alloca i32, align 4
+ %79 = alloca <8 x float>, align 32
+ %80 = alloca <8 x float>, align 32
+ %81 = alloca <8 x float>, align 32
+ %82 = alloca <8 x float>, align 32
+ %83 = alloca <8 x float>, align 32
+ %84 = alloca %class.anon.10, align 1
+ %85 = alloca <8 x float>, align 32
+ %86 = alloca <8 x float>, align 32
+ %87 = alloca <8 x float>, align 32
+ %88 = alloca <8 x float>, align 32
+ %89 = alloca i32, align 4
+ %90 = alloca <8 x float>, align 32
+ %91 = alloca <8 x float>, align 32
+ %92 = alloca <8 x float>, align 32
+ %93 = alloca <8 x float>, align 32
+ %94 = alloca <8 x float>, align 32
+ %95 = alloca %class.anon.12, align 1
+ %96 = alloca <8 x float>, align 32
+ %97 = alloca <8 x float>, align 32
+ %98 = alloca <8 x float>, align 32
+ %99 = alloca <8 x float>, align 32
+ store %"class.embeddings::Ftrl"* %0, %"class.embeddings::Ftrl"** %6, align 4, !tbaa !9
+ store %"class.embeddings::TileSpmemVectorArray"* %1, %"class.embeddings::TileSpmemVectorArray"** %7, align 4, !tbaa !9
+ store %"class.embeddings::TileSpmemVectorArray"* %2, %"class.embeddings::TileSpmemVectorArray"** %8, align 4, !tbaa !9
+ store %"class.embeddings::TileSpmemVectorArray"* %3, %"class.embeddings::TileSpmemVectorArray"** %9, align 4, !tbaa !9
+ store %"class.embeddings::TileSpmemVectorArray"* %4, %"class.embeddings::TileSpmemVectorArray"** %10, align 4, !tbaa !9
+ %100 = load %"class.embeddings::Ftrl"*, %"class.embeddings::Ftrl"** %6, align 4
+ %101 = bitcast i32* %11 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %101) #15
+ store i32 0, i32* %11, align 4, !tbaa !3
+ br label %102
+
+102: ; preds = %879, %5
+ %103 = load i32, i32* %11, align 4, !tbaa !3, !llvm.access.group !17
+ %104 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %7, align 4, !tbaa !9, !llvm.access.group !17
+ %105 = bitcast %"class.embeddings::TileSpmemVectorArray"* %104 to %"class.embeddings::BaseArray"*
+ %106 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %105) #20, !llvm.access.group !17
+ %107 = icmp slt i32 %103, %106
+ br i1 %107, label %110, label %108
+
+108: ; preds = %102
+ %109 = bitcast i32* %11 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %109) #15, !llvm.access.group !17
+ br label %882
+
+110: ; preds = %102
+ %111 = bitcast i32* %12 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %111) #15, !llvm.access.group !17
+ %112 = load i32, i32* %11, align 4, !tbaa !3, !llvm.access.group !17
+ %113 = add nsw i32 %112, 0
+ store i32 %113, i32* %12, align 4, !tbaa !3, !llvm.access.group !17
+ %114 = bitcast <8 x float>* %13 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %114) #15, !llvm.access.group !17
+ %115 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %7, align 4, !tbaa !9, !llvm.access.group !17
+ %116 = load i32, i32* %12, align 4, !tbaa !3, !llvm.access.group !17
+ %117 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %115, i32 noundef %116) #20, !llvm.access.group !17
+ %118 = load <8 x float>, <8 x float> addrspace(201)* %117, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %118, <8 x float>* %13, align 32, !tbaa !18, !llvm.access.group !17
+ %119 = bitcast <8 x float>* %14 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %119) #15, !llvm.access.group !17
+ %120 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %10, align 4, !tbaa !9, !llvm.access.group !17
+ %121 = load i32, i32* %12, align 4, !tbaa !3, !llvm.access.group !17
+ %122 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %120, i32 noundef %121) #20, !llvm.access.group !17
+ %123 = load <8 x float>, <8 x float> addrspace(201)* %122, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %123, <8 x float>* %14, align 32, !tbaa !18, !llvm.access.group !17
+ %124 = bitcast <8 x float>* %15 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %124) #15, !llvm.access.group !17
+ %125 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %8, align 4, !tbaa !9, !llvm.access.group !17
+ %126 = load i32, i32* %12, align 4, !tbaa !3, !llvm.access.group !17
+ %127 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %125, i32 noundef %126) #20, !llvm.access.group !17
+ %128 = load <8 x float>, <8 x float> addrspace(201)* %127, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %128, <8 x float>* %15, align 32, !tbaa !18, !llvm.access.group !17
+ %129 = bitcast <8 x float>* %16 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %129) #15, !llvm.access.group !17
+ %130 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %9, align 4, !tbaa !9, !llvm.access.group !17
+ %131 = load i32, i32* %12, align 4, !tbaa !3, !llvm.access.group !17
+ %132 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %130, i32 noundef %131) #20, !llvm.access.group !17
+ %133 = load <8 x float>, <8 x float> addrspace(201)* %132, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %133, <8 x float>* %16, align 32, !tbaa !18, !llvm.access.group !17
+ %134 = bitcast <8 x float>* %17 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %134) #15, !llvm.access.group !17
+ %135 = load <8 x float>, <8 x float>* %15, align 32, !tbaa !18, !llvm.access.group !17
+ %136 = load <8 x float>, <8 x float>* %14, align 32, !tbaa !18, !llvm.access.group !17
+ %137 = load <8 x float>, <8 x float>* %14, align 32, !tbaa !18, !llvm.access.group !17
+ %138 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %136, <8 x float> %137, <8 x float> %135)
+ store <8 x float> %138, <8 x float>* %17, align 32, !tbaa !18, !llvm.access.group !17
+ %139 = bitcast %class.anon* %18 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 1, i8* %139) #15, !llvm.access.group !17
+ %140 = bitcast <8 x float>* %19 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %140) #15, !llvm.access.group !17
+ %141 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 0
+ %142 = load float, float* %141, align 4, !tbaa !12, !llvm.access.group !17
+ %143 = insertelement <8 x float> poison, float %142, i32 0
+ %144 = shufflevector <8 x float> %143, <8 x float> poison, <8 x i32> zeroinitializer
+ %145 = call <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float> %144)
+ store <8 x float> %145, <8 x float>* %19, align 32, !tbaa !18, !llvm.access.group !17
+ %146 = bitcast <8 x float>* %20 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %146) #15, !llvm.access.group !17
+ %147 = load <8 x float>, <8 x float>* %16, align 32, !tbaa !18, !llvm.access.group !17
+ %148 = load <8 x float>, <8 x float>* %14, align 32, !tbaa !18, !llvm.access.group !17
+ %149 = fadd <8 x float> %147, %148
+ %150 = load <8 x float>, <8 x float>* %17, align 32, !tbaa !18, !llvm.access.group !17
+ %151 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 1
+ %152 = load float, float* %151, align 4, !tbaa !14, !llvm.access.group !17
+ %153 = call noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE_clES6_f(%class.anon* noundef nonnull align 1 dereferenceable(1) %18, <8 x float> noundef %150, float noundef %152) #20, !llvm.access.group !17
+ %154 = load <8 x float>, <8 x float>* %15, align 32, !tbaa !18, !llvm.access.group !17
+ %155 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 1
+ %156 = load float, float* %155, align 4, !tbaa !14, !llvm.access.group !17
+ %157 = call noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE_clES6_f(%class.anon* noundef nonnull align 1 dereferenceable(1) %18, <8 x float> noundef %154, float noundef %156) #20, !llvm.access.group !17
+ %158 = fsub <8 x float> %153, %157
+ %159 = load <8 x float>, <8 x float>* %13, align 32, !tbaa !18, !llvm.access.group !17
+ %160 = load <8 x float>, <8 x float>* %19, align 32, !tbaa !18, !llvm.access.group !17
+ %161 = fmul <8 x float> %159, %160
+ %162 = fneg <8 x float> %158
+ %163 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %162, <8 x float> %161, <8 x float> %149)
+ store <8 x float> %163, <8 x float>* %20, align 32, !tbaa !18, !llvm.access.group !17
+ %164 = bitcast <8 x float>* %21 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %164) #15, !llvm.access.group !17
+ %165 = load <8 x float>, <8 x float>* %17, align 32, !tbaa !18, !llvm.access.group !17
+ %166 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 1
+ %167 = load float, float* %166, align 4, !tbaa !14, !llvm.access.group !17
+ %168 = call noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE_clES6_f(%class.anon* noundef nonnull align 1 dereferenceable(1) %18, <8 x float> noundef %165, float noundef %167) #20, !llvm.access.group !17
+ %169 = load <8 x float>, <8 x float>* %19, align 32, !tbaa !18, !llvm.access.group !17
+ %170 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 3
+ %171 = load float, float* %170, align 4, !tbaa !16, !llvm.access.group !17
+ %172 = fpext float %171 to double
+ %173 = fmul double 2.000000e+00, %172
+ %174 = fptrunc double %173 to float
+ %175 = insertelement <8 x float> poison, float %174, i32 0
+ %176 = shufflevector <8 x float> %175, <8 x float> poison, <8 x i32> zeroinitializer
+ %177 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %168, <8 x float> %169, <8 x float> %176)
+ store <8 x float> %177, <8 x float>* %21, align 32, !tbaa !18, !llvm.access.group !17
+ %178 = bitcast <8 x float>* %22 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %178) #15, !llvm.access.group !17
+ %179 = load <8 x float>, <8 x float>* %20, align 32, !tbaa !18, !llvm.access.group !17
+ %180 = call noundef <8 x float> @_ZN10embeddings4Ftrl21GetMinusLinearReducedEDv8_f(%"class.embeddings::Ftrl"* noundef nonnull align 4 dereferenceable(16) %100, <8 x float> noundef %179) #20, !llvm.access.group !17
+ %181 = load <8 x float>, <8 x float>* %21, align 32, !tbaa !18, !llvm.access.group !17
+ %182 = call <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float> %181)
+ %183 = fmul <8 x float> %180, %182
+ store <8 x float> %183, <8 x float>* %22, align 32, !tbaa !18, !llvm.access.group !17
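+ ; Store the three updated vectors back into the tile-SPMEM arrays %7, %8 and %9 at index %12, then end the lifetimes of the temporaries.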
+ %184 = load <8 x float>, <8 x float>* %22, align 32, !tbaa !18, !llvm.access.group !17
+ %185 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %7, align 4, !tbaa !9, !llvm.access.group !17
+ %186 = load i32, i32* %12, align 4, !tbaa !3, !llvm.access.group !17
+ %187 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %185, i32 noundef %186) #20, !llvm.access.group !17
+ store <8 x float> %184, <8 x float> addrspace(201)* %187, align 32, !tbaa !18, !llvm.access.group !17
+ %188 = load <8 x float>, <8 x float>* %17, align 32, !tbaa !18, !llvm.access.group !17
+ %189 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %8, align 4, !tbaa !9, !llvm.access.group !17
+ %190 = load i32, i32* %12, align 4, !tbaa !3, !llvm.access.group !17
+ %191 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %189, i32 noundef %190) #20, !llvm.access.group !17
+ store <8 x float> %188, <8 x float> addrspace(201)* %191, align 32, !tbaa !18, !llvm.access.group !17
+ %192 = load <8 x float>, <8 x float>* %20, align 32, !tbaa !18, !llvm.access.group !17
+ %193 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %9, align 4, !tbaa !9, !llvm.access.group !17
+ %194 = load i32, i32* %12, align 4, !tbaa !3, !llvm.access.group !17
+ %195 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %193, i32 noundef %194) #20, !llvm.access.group !17
+ store <8 x float> %192, <8 x float> addrspace(201)* %195, align 32, !tbaa !18, !llvm.access.group !17
+ %196 = bitcast <8 x float>* %22 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %196) #15, !llvm.access.group !17
+ %197 = bitcast <8 x float>* %21 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %197) #15, !llvm.access.group !17
+ %198 = bitcast <8 x float>* %20 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %198) #15, !llvm.access.group !17
+ %199 = bitcast <8 x float>* %19 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %199) #15, !llvm.access.group !17
+ %200 = bitcast %class.anon* %18 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 1, i8* %200) #15, !llvm.access.group !17
+ %201 = bitcast <8 x float>* %17 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %201) #15, !llvm.access.group !17
+ %202 = bitcast <8 x float>* %16 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %202) #15, !llvm.access.group !17
+ %203 = bitcast <8 x float>* %15 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %203) #15, !llvm.access.group !17
+ %204 = bitcast <8 x float>* %14 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %204) #15, !llvm.access.group !17
+ %205 = bitcast <8 x float>* %13 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %205) #15, !llvm.access.group !17
+ %206 = bitcast i32* %12 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %206) #15, !llvm.access.group !17
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !17
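+ ; The llvm.tpu.loop.parallel marker separates what appear to be unrolled copies of the loop body; the block below repeats the same Ftrl update for element index %11 + 1.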
+ %207 = bitcast i32* %23 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %207) #15, !llvm.access.group !17
+ %208 = load i32, i32* %11, align 4, !tbaa !3, !llvm.access.group !17
+ %209 = add nsw i32 %208, 1
+ store i32 %209, i32* %23, align 4, !tbaa !3, !llvm.access.group !17
+ %210 = bitcast <8 x float>* %24 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %210) #15, !llvm.access.group !17
+ %211 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %7, align 4, !tbaa !9, !llvm.access.group !17
+ %212 = load i32, i32* %23, align 4, !tbaa !3, !llvm.access.group !17
+ %213 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %211, i32 noundef %212) #20, !llvm.access.group !17
+ %214 = load <8 x float>, <8 x float> addrspace(201)* %213, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %214, <8 x float>* %24, align 32, !tbaa !18, !llvm.access.group !17
+ %215 = bitcast <8 x float>* %25 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %215) #15, !llvm.access.group !17
+ %216 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %10, align 4, !tbaa !9, !llvm.access.group !17
+ %217 = load i32, i32* %23, align 4, !tbaa !3, !llvm.access.group !17
+ %218 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %216, i32 noundef %217) #20, !llvm.access.group !17
+ %219 = load <8 x float>, <8 x float> addrspace(201)* %218, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %219, <8 x float>* %25, align 32, !tbaa !18, !llvm.access.group !17
+ %220 = bitcast <8 x float>* %26 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %220) #15, !llvm.access.group !17
+ %221 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %8, align 4, !tbaa !9, !llvm.access.group !17
+ %222 = load i32, i32* %23, align 4, !tbaa !3, !llvm.access.group !17
+ %223 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %221, i32 noundef %222) #20, !llvm.access.group !17
+ %224 = load <8 x float>, <8 x float> addrspace(201)* %223, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %224, <8 x float>* %26, align 32, !tbaa !18, !llvm.access.group !17
+ %225 = bitcast <8 x float>* %27 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %225) #15, !llvm.access.group !17
+ %226 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %9, align 4, !tbaa !9, !llvm.access.group !17
+ %227 = load i32, i32* %23, align 4, !tbaa !3, !llvm.access.group !17
+ %228 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %226, i32 noundef %227) #20, !llvm.access.group !17
+ %229 = load <8 x float>, <8 x float> addrspace(201)* %228, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %229, <8 x float>* %27, align 32, !tbaa !18, !llvm.access.group !17
+ %230 = bitcast <8 x float>* %28 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %230) #15, !llvm.access.group !17
+ %231 = load <8 x float>, <8 x float>* %26, align 32, !tbaa !18, !llvm.access.group !17
+ %232 = load <8 x float>, <8 x float>* %25, align 32, !tbaa !18, !llvm.access.group !17
+ %233 = load <8 x float>, <8 x float>* %25, align 32, !tbaa !18, !llvm.access.group !17
+ %234 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %232, <8 x float> %233, <8 x float> %231)
+ store <8 x float> %234, <8 x float>* %28, align 32, !tbaa !18, !llvm.access.group !17
+ %235 = bitcast %class.anon.0* %29 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 1, i8* %235) #15, !llvm.access.group !17
+ %236 = bitcast <8 x float>* %30 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %236) #15, !llvm.access.group !17
+ %237 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 0
+ %238 = load float, float* %237, align 4, !tbaa !12, !llvm.access.group !17
+ %239 = insertelement <8 x float> poison, float %238, i32 0
+ %240 = shufflevector <8 x float> %239, <8 x float> poison, <8 x i32> zeroinitializer
+ %241 = call <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float> %240)
+ store <8 x float> %241, <8 x float>* %30, align 32, !tbaa !18, !llvm.access.group !17
+ %242 = bitcast <8 x float>* %31 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %242) #15, !llvm.access.group !17
+ %243 = load <8 x float>, <8 x float>* %27, align 32, !tbaa !18, !llvm.access.group !17
+ %244 = load <8 x float>, <8 x float>* %25, align 32, !tbaa !18, !llvm.access.group !17
+ %245 = fadd <8 x float> %243, %244
+ %246 = load <8 x float>, <8 x float>* %28, align 32, !tbaa !18, !llvm.access.group !17
+ %247 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 1
+ %248 = load float, float* %247, align 4, !tbaa !14, !llvm.access.group !17
+ %249 = call noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE0_clES6_f(%class.anon.0* noundef nonnull align 1 dereferenceable(1) %29, <8 x float> noundef %246, float noundef %248) #20, !llvm.access.group !17
+ %250 = load <8 x float>, <8 x float>* %26, align 32, !tbaa !18, !llvm.access.group !17
+ %251 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 1
+ %252 = load float, float* %251, align 4, !tbaa !14, !llvm.access.group !17
+ %253 = call noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE0_clES6_f(%class.anon.0* noundef nonnull align 1 dereferenceable(1) %29, <8 x float> noundef %250, float noundef %252) #20, !llvm.access.group !17
+ %254 = fsub <8 x float> %249, %253
+ %255 = load <8 x float>, <8 x float>* %24, align 32, !tbaa !18, !llvm.access.group !17
+ %256 = load <8 x float>, <8 x float>* %30, align 32, !tbaa !18, !llvm.access.group !17
+ %257 = fmul <8 x float> %255, %256
+ %258 = fneg <8 x float> %254
+ %259 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %258, <8 x float> %257, <8 x float> %245)
+ store <8 x float> %259, <8 x float>* %31, align 32, !tbaa !18, !llvm.access.group !17
+ %260 = bitcast <8 x float>* %32 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %260) #15, !llvm.access.group !17
+ %261 = load <8 x float>, <8 x float>* %28, align 32, !tbaa !18, !llvm.access.group !17
+ %262 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 1
+ %263 = load float, float* %262, align 4, !tbaa !14, !llvm.access.group !17
+ %264 = call noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE0_clES6_f(%class.anon.0* noundef nonnull align 1 dereferenceable(1) %29, <8 x float> noundef %261, float noundef %263) #20, !llvm.access.group !17
+ %265 = load <8 x float>, <8 x float>* %30, align 32, !tbaa !18, !llvm.access.group !17
+ %266 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 3
+ %267 = load float, float* %266, align 4, !tbaa !16, !llvm.access.group !17
+ %268 = fpext float %267 to double
+ %269 = fmul double 2.000000e+00, %268
+ %270 = fptrunc double %269 to float
+ %271 = insertelement <8 x float> poison, float %270, i32 0
+ %272 = shufflevector <8 x float> %271, <8 x float> poison, <8 x i32> zeroinitializer
+ %273 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %264, <8 x float> %265, <8 x float> %272)
+ store <8 x float> %273, <8 x float>* %32, align 32, !tbaa !18, !llvm.access.group !17
+ %274 = bitcast <8 x float>* %33 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %274) #15, !llvm.access.group !17
+ %275 = load <8 x float>, <8 x float>* %31, align 32, !tbaa !18, !llvm.access.group !17
+ %276 = call noundef <8 x float> @_ZN10embeddings4Ftrl21GetMinusLinearReducedEDv8_f(%"class.embeddings::Ftrl"* noundef nonnull align 4 dereferenceable(16) %100, <8 x float> noundef %275) #20, !llvm.access.group !17
+ %277 = load <8 x float>, <8 x float>* %32, align 32, !tbaa !18, !llvm.access.group !17
+ %278 = call <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float> %277)
+ %279 = fmul <8 x float> %276, %278
+ store <8 x float> %279, <8 x float>* %33, align 32, !tbaa !18, !llvm.access.group !17
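+ ; Store the three updated vectors back into the tile-SPMEM arrays %7, %8 and %9 at index %23, then end the lifetimes of the temporaries.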
+ %280 = load <8 x float>, <8 x float>* %33, align 32, !tbaa !18, !llvm.access.group !17
+ %281 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %7, align 4, !tbaa !9, !llvm.access.group !17
+ %282 = load i32, i32* %23, align 4, !tbaa !3, !llvm.access.group !17
+ %283 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %281, i32 noundef %282) #20, !llvm.access.group !17
+ store <8 x float> %280, <8 x float> addrspace(201)* %283, align 32, !tbaa !18, !llvm.access.group !17
+ %284 = load <8 x float>, <8 x float>* %28, align 32, !tbaa !18, !llvm.access.group !17
+ %285 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %8, align 4, !tbaa !9, !llvm.access.group !17
+ %286 = load i32, i32* %23, align 4, !tbaa !3, !llvm.access.group !17
+ %287 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %285, i32 noundef %286) #20, !llvm.access.group !17
+ store <8 x float> %284, <8 x float> addrspace(201)* %287, align 32, !tbaa !18, !llvm.access.group !17
+ %288 = load <8 x float>, <8 x float>* %31, align 32, !tbaa !18, !llvm.access.group !17
+ %289 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %9, align 4, !tbaa !9, !llvm.access.group !17
+ %290 = load i32, i32* %23, align 4, !tbaa !3, !llvm.access.group !17
+ %291 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %289, i32 noundef %290) #20, !llvm.access.group !17
+ store <8 x float> %288, <8 x float> addrspace(201)* %291, align 32, !tbaa !18, !llvm.access.group !17
+ %292 = bitcast <8 x float>* %33 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %292) #15, !llvm.access.group !17
+ %293 = bitcast <8 x float>* %32 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %293) #15, !llvm.access.group !17
+ %294 = bitcast <8 x float>* %31 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %294) #15, !llvm.access.group !17
+ %295 = bitcast <8 x float>* %30 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %295) #15, !llvm.access.group !17
+ %296 = bitcast %class.anon.0* %29 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 1, i8* %296) #15, !llvm.access.group !17
+ %297 = bitcast <8 x float>* %28 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %297) #15, !llvm.access.group !17
+ %298 = bitcast <8 x float>* %27 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %298) #15, !llvm.access.group !17
+ %299 = bitcast <8 x float>* %26 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %299) #15, !llvm.access.group !17
+ %300 = bitcast <8 x float>* %25 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %300) #15, !llvm.access.group !17
+ %301 = bitcast <8 x float>* %24 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %301) #15, !llvm.access.group !17
+ %302 = bitcast i32* %23 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %302) #15, !llvm.access.group !17
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !17
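+ ; Next unrolled copy: the same Ftrl update repeated for element index %11 + 2.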
+ %303 = bitcast i32* %34 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %303) #15, !llvm.access.group !17
+ %304 = load i32, i32* %11, align 4, !tbaa !3, !llvm.access.group !17
+ %305 = add nsw i32 %304, 2
+ store i32 %305, i32* %34, align 4, !tbaa !3, !llvm.access.group !17
+ %306 = bitcast <8 x float>* %35 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %306) #15, !llvm.access.group !17
+ %307 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %7, align 4, !tbaa !9, !llvm.access.group !17
+ %308 = load i32, i32* %34, align 4, !tbaa !3, !llvm.access.group !17
+ %309 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %307, i32 noundef %308) #20, !llvm.access.group !17
+ %310 = load <8 x float>, <8 x float> addrspace(201)* %309, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %310, <8 x float>* %35, align 32, !tbaa !18, !llvm.access.group !17
+ %311 = bitcast <8 x float>* %36 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %311) #15, !llvm.access.group !17
+ %312 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %10, align 4, !tbaa !9, !llvm.access.group !17
+ %313 = load i32, i32* %34, align 4, !tbaa !3, !llvm.access.group !17
+ %314 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %312, i32 noundef %313) #20, !llvm.access.group !17
+ %315 = load <8 x float>, <8 x float> addrspace(201)* %314, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %315, <8 x float>* %36, align 32, !tbaa !18, !llvm.access.group !17
+ %316 = bitcast <8 x float>* %37 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %316) #15, !llvm.access.group !17
+ %317 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %8, align 4, !tbaa !9, !llvm.access.group !17
+ %318 = load i32, i32* %34, align 4, !tbaa !3, !llvm.access.group !17
+ %319 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %317, i32 noundef %318) #20, !llvm.access.group !17
+ %320 = load <8 x float>, <8 x float> addrspace(201)* %319, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %320, <8 x float>* %37, align 32, !tbaa !18, !llvm.access.group !17
+ %321 = bitcast <8 x float>* %38 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %321) #15, !llvm.access.group !17
+ %322 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %9, align 4, !tbaa !9, !llvm.access.group !17
+ %323 = load i32, i32* %34, align 4, !tbaa !3, !llvm.access.group !17
+ %324 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %322, i32 noundef %323) #20, !llvm.access.group !17
+ %325 = load <8 x float>, <8 x float> addrspace(201)* %324, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %325, <8 x float>* %38, align 32, !tbaa !18, !llvm.access.group !17
+ %326 = bitcast <8 x float>* %39 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %326) #15, !llvm.access.group !17
+ %327 = load <8 x float>, <8 x float>* %37, align 32, !tbaa !18, !llvm.access.group !17
+ %328 = load <8 x float>, <8 x float>* %36, align 32, !tbaa !18, !llvm.access.group !17
+ %329 = load <8 x float>, <8 x float>* %36, align 32, !tbaa !18, !llvm.access.group !17
+ %330 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %328, <8 x float> %329, <8 x float> %327)
+ store <8 x float> %330, <8 x float>* %39, align 32, !tbaa !18, !llvm.access.group !17
+ %331 = bitcast %class.anon.2* %40 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 1, i8* %331) #15, !llvm.access.group !17
+ %332 = bitcast <8 x float>* %41 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %332) #15, !llvm.access.group !17
+ %333 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 0
+ %334 = load float, float* %333, align 4, !tbaa !12, !llvm.access.group !17
+ %335 = insertelement <8 x float> poison, float %334, i32 0
+ %336 = shufflevector <8 x float> %335, <8 x float> poison, <8 x i32> zeroinitializer
+ %337 = call <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float> %336)
+ store <8 x float> %337, <8 x float>* %41, align 32, !tbaa !18, !llvm.access.group !17
+ %338 = bitcast <8 x float>* %42 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %338) #15, !llvm.access.group !17
+ %339 = load <8 x float>, <8 x float>* %38, align 32, !tbaa !18, !llvm.access.group !17
+ %340 = load <8 x float>, <8 x float>* %36, align 32, !tbaa !18, !llvm.access.group !17
+ %341 = fadd <8 x float> %339, %340
+ %342 = load <8 x float>, <8 x float>* %39, align 32, !tbaa !18, !llvm.access.group !17
+ %343 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 1
+ %344 = load float, float* %343, align 4, !tbaa !14, !llvm.access.group !17
+ %345 = call noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE1_clES6_f(%class.anon.2* noundef nonnull align 1 dereferenceable(1) %40, <8 x float> noundef %342, float noundef %344) #20, !llvm.access.group !17
+ %346 = load <8 x float>, <8 x float>* %37, align 32, !tbaa !18, !llvm.access.group !17
+ %347 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 1
+ %348 = load float, float* %347, align 4, !tbaa !14, !llvm.access.group !17
+ %349 = call noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE1_clES6_f(%class.anon.2* noundef nonnull align 1 dereferenceable(1) %40, <8 x float> noundef %346, float noundef %348) #20, !llvm.access.group !17
+ %350 = fsub <8 x float> %345, %349
+ %351 = load <8 x float>, <8 x float>* %35, align 32, !tbaa !18, !llvm.access.group !17
+ %352 = load <8 x float>, <8 x float>* %41, align 32, !tbaa !18, !llvm.access.group !17
+ %353 = fmul <8 x float> %351, %352
+ %354 = fneg <8 x float> %350
+ %355 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %354, <8 x float> %353, <8 x float> %341)
+ store <8 x float> %355, <8 x float>* %42, align 32, !tbaa !18, !llvm.access.group !17
+ %356 = bitcast <8 x float>* %43 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %356) #15, !llvm.access.group !17
+ %357 = load <8 x float>, <8 x float>* %39, align 32, !tbaa !18, !llvm.access.group !17
+ %358 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 1
+ %359 = load float, float* %358, align 4, !tbaa !14, !llvm.access.group !17
+ %360 = call noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE1_clES6_f(%class.anon.2* noundef nonnull align 1 dereferenceable(1) %40, <8 x float> noundef %357, float noundef %359) #20, !llvm.access.group !17
+ %361 = load <8 x float>, <8 x float>* %41, align 32, !tbaa !18, !llvm.access.group !17
+ %362 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 3
+ %363 = load float, float* %362, align 4, !tbaa !16, !llvm.access.group !17
+ %364 = fpext float %363 to double
+ %365 = fmul double 2.000000e+00, %364
+ %366 = fptrunc double %365 to float
+ %367 = insertelement <8 x float> poison, float %366, i32 0
+ %368 = shufflevector <8 x float> %367, <8 x float> poison, <8 x i32> zeroinitializer
+ %369 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %360, <8 x float> %361, <8 x float> %368)
+ store <8 x float> %369, <8 x float>* %43, align 32, !tbaa !18, !llvm.access.group !17
+ %370 = bitcast <8 x float>* %44 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %370) #15, !llvm.access.group !17
+ %371 = load <8 x float>, <8 x float>* %42, align 32, !tbaa !18, !llvm.access.group !17
+ %372 = call noundef <8 x float> @_ZN10embeddings4Ftrl21GetMinusLinearReducedEDv8_f(%"class.embeddings::Ftrl"* noundef nonnull align 4 dereferenceable(16) %100, <8 x float> noundef %371) #20, !llvm.access.group !17
+ %373 = load <8 x float>, <8 x float>* %43, align 32, !tbaa !18, !llvm.access.group !17
+ %374 = call <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float> %373)
+ %375 = fmul <8 x float> %372, %374
+ store <8 x float> %375, <8 x float>* %44, align 32, !tbaa !18, !llvm.access.group !17
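+ ; Store the three updated vectors back into the tile-SPMEM arrays %7, %8 and %9 at index %34, then end the lifetimes of the temporaries.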
+ %376 = load <8 x float>, <8 x float>* %44, align 32, !tbaa !18, !llvm.access.group !17
+ %377 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %7, align 4, !tbaa !9, !llvm.access.group !17
+ %378 = load i32, i32* %34, align 4, !tbaa !3, !llvm.access.group !17
+ %379 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %377, i32 noundef %378) #20, !llvm.access.group !17
+ store <8 x float> %376, <8 x float> addrspace(201)* %379, align 32, !tbaa !18, !llvm.access.group !17
+ %380 = load <8 x float>, <8 x float>* %39, align 32, !tbaa !18, !llvm.access.group !17
+ %381 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %8, align 4, !tbaa !9, !llvm.access.group !17
+ %382 = load i32, i32* %34, align 4, !tbaa !3, !llvm.access.group !17
+ %383 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %381, i32 noundef %382) #20, !llvm.access.group !17
+ store <8 x float> %380, <8 x float> addrspace(201)* %383, align 32, !tbaa !18, !llvm.access.group !17
+ %384 = load <8 x float>, <8 x float>* %42, align 32, !tbaa !18, !llvm.access.group !17
+ %385 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %9, align 4, !tbaa !9, !llvm.access.group !17
+ %386 = load i32, i32* %34, align 4, !tbaa !3, !llvm.access.group !17
+ %387 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %385, i32 noundef %386) #20, !llvm.access.group !17
+ store <8 x float> %384, <8 x float> addrspace(201)* %387, align 32, !tbaa !18, !llvm.access.group !17
+ %388 = bitcast <8 x float>* %44 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %388) #15, !llvm.access.group !17
+ %389 = bitcast <8 x float>* %43 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %389) #15, !llvm.access.group !17
+ %390 = bitcast <8 x float>* %42 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %390) #15, !llvm.access.group !17
+ %391 = bitcast <8 x float>* %41 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %391) #15, !llvm.access.group !17
+ %392 = bitcast %class.anon.2* %40 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 1, i8* %392) #15, !llvm.access.group !17
+ %393 = bitcast <8 x float>* %39 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %393) #15, !llvm.access.group !17
+ %394 = bitcast <8 x float>* %38 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %394) #15, !llvm.access.group !17
+ %395 = bitcast <8 x float>* %37 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %395) #15, !llvm.access.group !17
+ %396 = bitcast <8 x float>* %36 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %396) #15, !llvm.access.group !17
+ %397 = bitcast <8 x float>* %35 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %397) #15, !llvm.access.group !17
+ %398 = bitcast i32* %34 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %398) #15, !llvm.access.group !17
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !17
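+ ; Next unrolled copy: the same Ftrl update repeated for element index %11 + 3.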
+ %399 = bitcast i32* %45 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %399) #15, !llvm.access.group !17
+ %400 = load i32, i32* %11, align 4, !tbaa !3, !llvm.access.group !17
+ %401 = add nsw i32 %400, 3
+ store i32 %401, i32* %45, align 4, !tbaa !3, !llvm.access.group !17
+ %402 = bitcast <8 x float>* %46 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %402) #15, !llvm.access.group !17
+ %403 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %7, align 4, !tbaa !9, !llvm.access.group !17
+ %404 = load i32, i32* %45, align 4, !tbaa !3, !llvm.access.group !17
+ %405 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %403, i32 noundef %404) #20, !llvm.access.group !17
+ %406 = load <8 x float>, <8 x float> addrspace(201)* %405, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %406, <8 x float>* %46, align 32, !tbaa !18, !llvm.access.group !17
+ %407 = bitcast <8 x float>* %47 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %407) #15, !llvm.access.group !17
+ %408 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %10, align 4, !tbaa !9, !llvm.access.group !17
+ %409 = load i32, i32* %45, align 4, !tbaa !3, !llvm.access.group !17
+ %410 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %408, i32 noundef %409) #20, !llvm.access.group !17
+ %411 = load <8 x float>, <8 x float> addrspace(201)* %410, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %411, <8 x float>* %47, align 32, !tbaa !18, !llvm.access.group !17
+ %412 = bitcast <8 x float>* %48 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %412) #15, !llvm.access.group !17
+ %413 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %8, align 4, !tbaa !9, !llvm.access.group !17
+ %414 = load i32, i32* %45, align 4, !tbaa !3, !llvm.access.group !17
+ %415 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %413, i32 noundef %414) #20, !llvm.access.group !17
+ %416 = load <8 x float>, <8 x float> addrspace(201)* %415, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %416, <8 x float>* %48, align 32, !tbaa !18, !llvm.access.group !17
+ %417 = bitcast <8 x float>* %49 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %417) #15, !llvm.access.group !17
+ %418 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %9, align 4, !tbaa !9, !llvm.access.group !17
+ %419 = load i32, i32* %45, align 4, !tbaa !3, !llvm.access.group !17
+ %420 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %418, i32 noundef %419) #20, !llvm.access.group !17
+ %421 = load <8 x float>, <8 x float> addrspace(201)* %420, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %421, <8 x float>* %49, align 32, !tbaa !18, !llvm.access.group !17
+ %422 = bitcast <8 x float>* %50 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %422) #15, !llvm.access.group !17
+ %423 = load <8 x float>, <8 x float>* %48, align 32, !tbaa !18, !llvm.access.group !17
+ %424 = load <8 x float>, <8 x float>* %47, align 32, !tbaa !18, !llvm.access.group !17
+ %425 = load <8 x float>, <8 x float>* %47, align 32, !tbaa !18, !llvm.access.group !17
+ %426 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %424, <8 x float> %425, <8 x float> %423)
+ store <8 x float> %426, <8 x float>* %50, align 32, !tbaa !18, !llvm.access.group !17
+ %427 = bitcast %class.anon.4* %51 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 1, i8* %427) #15, !llvm.access.group !17
+ %428 = bitcast <8 x float>* %52 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %428) #15, !llvm.access.group !17
+ %429 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 0
+ %430 = load float, float* %429, align 4, !tbaa !12, !llvm.access.group !17
+ %431 = insertelement <8 x float> poison, float %430, i32 0
+ %432 = shufflevector <8 x float> %431, <8 x float> poison, <8 x i32> zeroinitializer
+ %433 = call <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float> %432)
+ store <8 x float> %433, <8 x float>* %52, align 32, !tbaa !18, !llvm.access.group !17
+ %434 = bitcast <8 x float>* %53 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %434) #15, !llvm.access.group !17
+ %435 = load <8 x float>, <8 x float>* %49, align 32, !tbaa !18, !llvm.access.group !17
+ %436 = load <8 x float>, <8 x float>* %47, align 32, !tbaa !18, !llvm.access.group !17
+ %437 = fadd <8 x float> %435, %436
+ %438 = load <8 x float>, <8 x float>* %50, align 32, !tbaa !18, !llvm.access.group !17
+ %439 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 1
+ %440 = load float, float* %439, align 4, !tbaa !14, !llvm.access.group !17
+ %441 = call noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE2_clES6_f(%class.anon.4* noundef nonnull align 1 dereferenceable(1) %51, <8 x float> noundef %438, float noundef %440) #20, !llvm.access.group !17
+ %442 = load <8 x float>, <8 x float>* %48, align 32, !tbaa !18, !llvm.access.group !17
+ %443 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 1
+ %444 = load float, float* %443, align 4, !tbaa !14, !llvm.access.group !17
+ %445 = call noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE2_clES6_f(%class.anon.4* noundef nonnull align 1 dereferenceable(1) %51, <8 x float> noundef %442, float noundef %444) #20, !llvm.access.group !17
+ %446 = fsub <8 x float> %441, %445
+ %447 = load <8 x float>, <8 x float>* %46, align 32, !tbaa !18, !llvm.access.group !17
+ %448 = load <8 x float>, <8 x float>* %52, align 32, !tbaa !18, !llvm.access.group !17
+ %449 = fmul <8 x float> %447, %448
+ %450 = fneg <8 x float> %446
+ %451 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %450, <8 x float> %449, <8 x float> %437)
+ store <8 x float> %451, <8 x float>* %53, align 32, !tbaa !18, !llvm.access.group !17
+ %452 = bitcast <8 x float>* %54 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %452) #15, !llvm.access.group !17
+ %453 = load <8 x float>, <8 x float>* %50, align 32, !tbaa !18, !llvm.access.group !17
+ %454 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 1
+ %455 = load float, float* %454, align 4, !tbaa !14, !llvm.access.group !17
+ %456 = call noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE2_clES6_f(%class.anon.4* noundef nonnull align 1 dereferenceable(1) %51, <8 x float> noundef %453, float noundef %455) #20, !llvm.access.group !17
+ %457 = load <8 x float>, <8 x float>* %52, align 32, !tbaa !18, !llvm.access.group !17
+ %458 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 3
+ %459 = load float, float* %458, align 4, !tbaa !16, !llvm.access.group !17
+ %460 = fpext float %459 to double
+ %461 = fmul double 2.000000e+00, %460
+ %462 = fptrunc double %461 to float
+ %463 = insertelement <8 x float> poison, float %462, i32 0
+ %464 = shufflevector <8 x float> %463, <8 x float> poison, <8 x i32> zeroinitializer
+ %465 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %456, <8 x float> %457, <8 x float> %464)
+ store <8 x float> %465, <8 x float>* %54, align 32, !tbaa !18, !llvm.access.group !17
+ %466 = bitcast <8 x float>* %55 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %466) #15, !llvm.access.group !17
+ %467 = load <8 x float>, <8 x float>* %53, align 32, !tbaa !18, !llvm.access.group !17
+ %468 = call noundef <8 x float> @_ZN10embeddings4Ftrl21GetMinusLinearReducedEDv8_f(%"class.embeddings::Ftrl"* noundef nonnull align 4 dereferenceable(16) %100, <8 x float> noundef %467) #20, !llvm.access.group !17
+ %469 = load <8 x float>, <8 x float>* %54, align 32, !tbaa !18, !llvm.access.group !17
+ %470 = call <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float> %469)
+ %471 = fmul <8 x float> %468, %470
+ store <8 x float> %471, <8 x float>* %55, align 32, !tbaa !18, !llvm.access.group !17
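+ ; Store the three updated vectors back into the tile-SPMEM arrays %7, %8 and %9 at index %45, then end the lifetimes of the temporaries.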
+ %472 = load <8 x float>, <8 x float>* %55, align 32, !tbaa !18, !llvm.access.group !17
+ %473 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %7, align 4, !tbaa !9, !llvm.access.group !17
+ %474 = load i32, i32* %45, align 4, !tbaa !3, !llvm.access.group !17
+ %475 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %473, i32 noundef %474) #20, !llvm.access.group !17
+ store <8 x float> %472, <8 x float> addrspace(201)* %475, align 32, !tbaa !18, !llvm.access.group !17
+ %476 = load <8 x float>, <8 x float>* %50, align 32, !tbaa !18, !llvm.access.group !17
+ %477 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %8, align 4, !tbaa !9, !llvm.access.group !17
+ %478 = load i32, i32* %45, align 4, !tbaa !3, !llvm.access.group !17
+ %479 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %477, i32 noundef %478) #20, !llvm.access.group !17
+ store <8 x float> %476, <8 x float> addrspace(201)* %479, align 32, !tbaa !18, !llvm.access.group !17
+ %480 = load <8 x float>, <8 x float>* %53, align 32, !tbaa !18, !llvm.access.group !17
+ %481 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %9, align 4, !tbaa !9, !llvm.access.group !17
+ %482 = load i32, i32* %45, align 4, !tbaa !3, !llvm.access.group !17
+ %483 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %481, i32 noundef %482) #20, !llvm.access.group !17
+ store <8 x float> %480, <8 x float> addrspace(201)* %483, align 32, !tbaa !18, !llvm.access.group !17
+ %484 = bitcast <8 x float>* %55 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %484) #15, !llvm.access.group !17
+ %485 = bitcast <8 x float>* %54 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %485) #15, !llvm.access.group !17
+ %486 = bitcast <8 x float>* %53 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %486) #15, !llvm.access.group !17
+ %487 = bitcast <8 x float>* %52 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %487) #15, !llvm.access.group !17
+ %488 = bitcast %class.anon.4* %51 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 1, i8* %488) #15, !llvm.access.group !17
+ %489 = bitcast <8 x float>* %50 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %489) #15, !llvm.access.group !17
+ %490 = bitcast <8 x float>* %49 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %490) #15, !llvm.access.group !17
+ %491 = bitcast <8 x float>* %48 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %491) #15, !llvm.access.group !17
+ %492 = bitcast <8 x float>* %47 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %492) #15, !llvm.access.group !17
+ %493 = bitcast <8 x float>* %46 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %493) #15, !llvm.access.group !17
+ %494 = bitcast i32* %45 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %494) #15, !llvm.access.group !17
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !17
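+ ; Next unrolled copy: the same Ftrl update repeated for element index %11 + 4.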
+ %495 = bitcast i32* %56 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %495) #15, !llvm.access.group !17
+ %496 = load i32, i32* %11, align 4, !tbaa !3, !llvm.access.group !17
+ %497 = add nsw i32 %496, 4
+ store i32 %497, i32* %56, align 4, !tbaa !3, !llvm.access.group !17
+ %498 = bitcast <8 x float>* %57 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %498) #15, !llvm.access.group !17
+ %499 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %7, align 4, !tbaa !9, !llvm.access.group !17
+ %500 = load i32, i32* %56, align 4, !tbaa !3, !llvm.access.group !17
+ %501 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %499, i32 noundef %500) #20, !llvm.access.group !17
+ %502 = load <8 x float>, <8 x float> addrspace(201)* %501, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %502, <8 x float>* %57, align 32, !tbaa !18, !llvm.access.group !17
+ %503 = bitcast <8 x float>* %58 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %503) #15, !llvm.access.group !17
+ %504 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %10, align 4, !tbaa !9, !llvm.access.group !17
+ %505 = load i32, i32* %56, align 4, !tbaa !3, !llvm.access.group !17
+ %506 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %504, i32 noundef %505) #20, !llvm.access.group !17
+ %507 = load <8 x float>, <8 x float> addrspace(201)* %506, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %507, <8 x float>* %58, align 32, !tbaa !18, !llvm.access.group !17
+ %508 = bitcast <8 x float>* %59 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %508) #15, !llvm.access.group !17
+ %509 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %8, align 4, !tbaa !9, !llvm.access.group !17
+ %510 = load i32, i32* %56, align 4, !tbaa !3, !llvm.access.group !17
+ %511 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %509, i32 noundef %510) #20, !llvm.access.group !17
+ %512 = load <8 x float>, <8 x float> addrspace(201)* %511, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %512, <8 x float>* %59, align 32, !tbaa !18, !llvm.access.group !17
+ %513 = bitcast <8 x float>* %60 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %513) #15, !llvm.access.group !17
+ %514 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %9, align 4, !tbaa !9, !llvm.access.group !17
+ %515 = load i32, i32* %56, align 4, !tbaa !3, !llvm.access.group !17
+ %516 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %514, i32 noundef %515) #20, !llvm.access.group !17
+ %517 = load <8 x float>, <8 x float> addrspace(201)* %516, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %517, <8 x float>* %60, align 32, !tbaa !18, !llvm.access.group !17
+ %518 = bitcast <8 x float>* %61 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %518) #15, !llvm.access.group !17
+ %519 = load <8 x float>, <8 x float>* %59, align 32, !tbaa !18, !llvm.access.group !17
+ %520 = load <8 x float>, <8 x float>* %58, align 32, !tbaa !18, !llvm.access.group !17
+ %521 = load <8 x float>, <8 x float>* %58, align 32, !tbaa !18, !llvm.access.group !17
+ %522 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %520, <8 x float> %521, <8 x float> %519)
+ store <8 x float> %522, <8 x float>* %61, align 32, !tbaa !18, !llvm.access.group !17
+ %523 = bitcast %class.anon.6* %62 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 1, i8* %523) #15, !llvm.access.group !17
+ %524 = bitcast <8 x float>* %63 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %524) #15, !llvm.access.group !17
+ %525 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 0
+ %526 = load float, float* %525, align 4, !tbaa !12, !llvm.access.group !17
+ %527 = insertelement <8 x float> poison, float %526, i32 0
+ %528 = shufflevector <8 x float> %527, <8 x float> poison, <8 x i32> zeroinitializer
+ %529 = call <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float> %528)
+ store <8 x float> %529, <8 x float>* %63, align 32, !tbaa !18, !llvm.access.group !17
+ %530 = bitcast <8 x float>* %64 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %530) #15, !llvm.access.group !17
+ %531 = load <8 x float>, <8 x float>* %60, align 32, !tbaa !18, !llvm.access.group !17
+ %532 = load <8 x float>, <8 x float>* %58, align 32, !tbaa !18, !llvm.access.group !17
+ %533 = fadd <8 x float> %531, %532
+ %534 = load <8 x float>, <8 x float>* %61, align 32, !tbaa !18, !llvm.access.group !17
+ %535 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 1
+ %536 = load float, float* %535, align 4, !tbaa !14, !llvm.access.group !17
+ %537 = call noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE3_clES6_f(%class.anon.6* noundef nonnull align 1 dereferenceable(1) %62, <8 x float> noundef %534, float noundef %536) #20, !llvm.access.group !17
+ %538 = load <8 x float>, <8 x float>* %59, align 32, !tbaa !18, !llvm.access.group !17
+ %539 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 1
+ %540 = load float, float* %539, align 4, !tbaa !14, !llvm.access.group !17
+ %541 = call noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE3_clES6_f(%class.anon.6* noundef nonnull align 1 dereferenceable(1) %62, <8 x float> noundef %538, float noundef %540) #20, !llvm.access.group !17
+ %542 = fsub <8 x float> %537, %541
+ %543 = load <8 x float>, <8 x float>* %57, align 32, !tbaa !18, !llvm.access.group !17
+ %544 = load <8 x float>, <8 x float>* %63, align 32, !tbaa !18, !llvm.access.group !17
+ %545 = fmul <8 x float> %543, %544
+ %546 = fneg <8 x float> %542
+ %547 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %546, <8 x float> %545, <8 x float> %533)
+ store <8 x float> %547, <8 x float>* %64, align 32, !tbaa !18, !llvm.access.group !17
+ %548 = bitcast <8 x float>* %65 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %548) #15, !llvm.access.group !17
+ %549 = load <8 x float>, <8 x float>* %61, align 32, !tbaa !18, !llvm.access.group !17
+ %550 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 1
+ %551 = load float, float* %550, align 4, !tbaa !14, !llvm.access.group !17
+ %552 = call noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE3_clES6_f(%class.anon.6* noundef nonnull align 1 dereferenceable(1) %62, <8 x float> noundef %549, float noundef %551) #20, !llvm.access.group !17
+ %553 = load <8 x float>, <8 x float>* %63, align 32, !tbaa !18, !llvm.access.group !17
+ %554 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 3
+ %555 = load float, float* %554, align 4, !tbaa !16, !llvm.access.group !17
+ %556 = fpext float %555 to double
+ %557 = fmul double 2.000000e+00, %556
+ %558 = fptrunc double %557 to float
+ %559 = insertelement <8 x float> poison, float %558, i32 0
+ %560 = shufflevector <8 x float> %559, <8 x float> poison, <8 x i32> zeroinitializer
+ %561 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %552, <8 x float> %553, <8 x float> %560)
+ store <8 x float> %561, <8 x float>* %65, align 32, !tbaa !18, !llvm.access.group !17
+ %562 = bitcast <8 x float>* %66 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %562) #15, !llvm.access.group !17
+ %563 = load <8 x float>, <8 x float>* %64, align 32, !tbaa !18, !llvm.access.group !17
+ %564 = call noundef <8 x float> @_ZN10embeddings4Ftrl21GetMinusLinearReducedEDv8_f(%"class.embeddings::Ftrl"* noundef nonnull align 4 dereferenceable(16) %100, <8 x float> noundef %563) #20, !llvm.access.group !17
+ %565 = load <8 x float>, <8 x float>* %65, align 32, !tbaa !18, !llvm.access.group !17
+ %566 = call <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float> %565)
+ %567 = fmul <8 x float> %564, %566
+ store <8 x float> %567, <8 x float>* %66, align 32, !tbaa !18, !llvm.access.group !17
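+ ; Store the three updated vectors back into the tile-SPMEM arrays %7, %8 and %9 at index %56, then end the lifetimes of the temporaries.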
+ %568 = load <8 x float>, <8 x float>* %66, align 32, !tbaa !18, !llvm.access.group !17
+ %569 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %7, align 4, !tbaa !9, !llvm.access.group !17
+ %570 = load i32, i32* %56, align 4, !tbaa !3, !llvm.access.group !17
+ %571 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %569, i32 noundef %570) #20, !llvm.access.group !17
+ store <8 x float> %568, <8 x float> addrspace(201)* %571, align 32, !tbaa !18, !llvm.access.group !17
+ %572 = load <8 x float>, <8 x float>* %61, align 32, !tbaa !18, !llvm.access.group !17
+ %573 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %8, align 4, !tbaa !9, !llvm.access.group !17
+ %574 = load i32, i32* %56, align 4, !tbaa !3, !llvm.access.group !17
+ %575 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %573, i32 noundef %574) #20, !llvm.access.group !17
+ store <8 x float> %572, <8 x float> addrspace(201)* %575, align 32, !tbaa !18, !llvm.access.group !17
+ %576 = load <8 x float>, <8 x float>* %64, align 32, !tbaa !18, !llvm.access.group !17
+ %577 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %9, align 4, !tbaa !9, !llvm.access.group !17
+ %578 = load i32, i32* %56, align 4, !tbaa !3, !llvm.access.group !17
+ %579 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %577, i32 noundef %578) #20, !llvm.access.group !17
+ store <8 x float> %576, <8 x float> addrspace(201)* %579, align 32, !tbaa !18, !llvm.access.group !17
+ %580 = bitcast <8 x float>* %66 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %580) #15, !llvm.access.group !17
+ %581 = bitcast <8 x float>* %65 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %581) #15, !llvm.access.group !17
+ %582 = bitcast <8 x float>* %64 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %582) #15, !llvm.access.group !17
+ %583 = bitcast <8 x float>* %63 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %583) #15, !llvm.access.group !17
+ %584 = bitcast %class.anon.6* %62 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 1, i8* %584) #15, !llvm.access.group !17
+ %585 = bitcast <8 x float>* %61 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %585) #15, !llvm.access.group !17
+ %586 = bitcast <8 x float>* %60 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %586) #15, !llvm.access.group !17
+ %587 = bitcast <8 x float>* %59 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %587) #15, !llvm.access.group !17
+ %588 = bitcast <8 x float>* %58 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %588) #15, !llvm.access.group !17
+ %589 = bitcast <8 x float>* %57 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %589) #15, !llvm.access.group !17
+ %590 = bitcast i32* %56 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %590) #15, !llvm.access.group !17
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !17
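+ ; Next unrolled copy: the same Ftrl update repeated for element index %11 + 5.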
+ %591 = bitcast i32* %67 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %591) #15, !llvm.access.group !17
+ %592 = load i32, i32* %11, align 4, !tbaa !3, !llvm.access.group !17
+ %593 = add nsw i32 %592, 5
+ store i32 %593, i32* %67, align 4, !tbaa !3, !llvm.access.group !17
+ %594 = bitcast <8 x float>* %68 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %594) #15, !llvm.access.group !17
+ %595 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %7, align 4, !tbaa !9, !llvm.access.group !17
+ %596 = load i32, i32* %67, align 4, !tbaa !3, !llvm.access.group !17
+ %597 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %595, i32 noundef %596) #20, !llvm.access.group !17
+ %598 = load <8 x float>, <8 x float> addrspace(201)* %597, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %598, <8 x float>* %68, align 32, !tbaa !18, !llvm.access.group !17
+ %599 = bitcast <8 x float>* %69 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %599) #15, !llvm.access.group !17
+ %600 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %10, align 4, !tbaa !9, !llvm.access.group !17
+ %601 = load i32, i32* %67, align 4, !tbaa !3, !llvm.access.group !17
+ %602 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %600, i32 noundef %601) #20, !llvm.access.group !17
+ %603 = load <8 x float>, <8 x float> addrspace(201)* %602, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %603, <8 x float>* %69, align 32, !tbaa !18, !llvm.access.group !17
+ %604 = bitcast <8 x float>* %70 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %604) #15, !llvm.access.group !17
+ %605 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %8, align 4, !tbaa !9, !llvm.access.group !17
+ %606 = load i32, i32* %67, align 4, !tbaa !3, !llvm.access.group !17
+ %607 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %605, i32 noundef %606) #20, !llvm.access.group !17
+ %608 = load <8 x float>, <8 x float> addrspace(201)* %607, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %608, <8 x float>* %70, align 32, !tbaa !18, !llvm.access.group !17
+ %609 = bitcast <8 x float>* %71 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %609) #15, !llvm.access.group !17
+ %610 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %9, align 4, !tbaa !9, !llvm.access.group !17
+ %611 = load i32, i32* %67, align 4, !tbaa !3, !llvm.access.group !17
+ %612 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %610, i32 noundef %611) #20, !llvm.access.group !17
+ %613 = load <8 x float>, <8 x float> addrspace(201)* %612, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %613, <8 x float>* %71, align 32, !tbaa !18, !llvm.access.group !17
+ %614 = bitcast <8 x float>* %72 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %614) #15, !llvm.access.group !17
+ %615 = load <8 x float>, <8 x float>* %70, align 32, !tbaa !18, !llvm.access.group !17
+ %616 = load <8 x float>, <8 x float>* %69, align 32, !tbaa !18, !llvm.access.group !17
+ %617 = load <8 x float>, <8 x float>* %69, align 32, !tbaa !18, !llvm.access.group !17
+ %618 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %616, <8 x float> %617, <8 x float> %615)
+ store <8 x float> %618, <8 x float>* %72, align 32, !tbaa !18, !llvm.access.group !17
+ %619 = bitcast %class.anon.8* %73 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 1, i8* %619) #15, !llvm.access.group !17
+ %620 = bitcast <8 x float>* %74 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %620) #15, !llvm.access.group !17
+ %621 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 0
+ %622 = load float, float* %621, align 4, !tbaa !12, !llvm.access.group !17
+ %623 = insertelement <8 x float> poison, float %622, i32 0
+ %624 = shufflevector <8 x float> %623, <8 x float> poison, <8 x i32> zeroinitializer
+ %625 = call <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float> %624)
+ store <8 x float> %625, <8 x float>* %74, align 32, !tbaa !18, !llvm.access.group !17
+ %626 = bitcast <8 x float>* %75 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %626) #15, !llvm.access.group !17
+ %627 = load <8 x float>, <8 x float>* %71, align 32, !tbaa !18, !llvm.access.group !17
+ %628 = load <8 x float>, <8 x float>* %69, align 32, !tbaa !18, !llvm.access.group !17
+ %629 = fadd <8 x float> %627, %628
+ %630 = load <8 x float>, <8 x float>* %72, align 32, !tbaa !18, !llvm.access.group !17
+ %631 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 1
+ %632 = load float, float* %631, align 4, !tbaa !14, !llvm.access.group !17
+ %633 = call noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE4_clES6_f(%class.anon.8* noundef nonnull align 1 dereferenceable(1) %73, <8 x float> noundef %630, float noundef %632) #20, !llvm.access.group !17
+ %634 = load <8 x float>, <8 x float>* %70, align 32, !tbaa !18, !llvm.access.group !17
+ %635 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 1
+ %636 = load float, float* %635, align 4, !tbaa !14, !llvm.access.group !17
+ %637 = call noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE4_clES6_f(%class.anon.8* noundef nonnull align 1 dereferenceable(1) %73, <8 x float> noundef %634, float noundef %636) #20, !llvm.access.group !17
+ %638 = fsub <8 x float> %633, %637
+ %639 = load <8 x float>, <8 x float>* %68, align 32, !tbaa !18, !llvm.access.group !17
+ %640 = load <8 x float>, <8 x float>* %74, align 32, !tbaa !18, !llvm.access.group !17
+ %641 = fmul <8 x float> %639, %640
+ %642 = fneg <8 x float> %638
+ %643 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %642, <8 x float> %641, <8 x float> %629)
+ store <8 x float> %643, <8 x float>* %75, align 32, !tbaa !18, !llvm.access.group !17
+ %644 = bitcast <8 x float>* %76 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %644) #15, !llvm.access.group !17
+ %645 = load <8 x float>, <8 x float>* %72, align 32, !tbaa !18, !llvm.access.group !17
+ %646 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 1
+ %647 = load float, float* %646, align 4, !tbaa !14, !llvm.access.group !17
+ %648 = call noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE4_clES6_f(%class.anon.8* noundef nonnull align 1 dereferenceable(1) %73, <8 x float> noundef %645, float noundef %647) #20, !llvm.access.group !17
+ %649 = load <8 x float>, <8 x float>* %74, align 32, !tbaa !18, !llvm.access.group !17
+ %650 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 3
+ %651 = load float, float* %650, align 4, !tbaa !16, !llvm.access.group !17
+ %652 = fpext float %651 to double
+ %653 = fmul double 2.000000e+00, %652
+ %654 = fptrunc double %653 to float
+ %655 = insertelement <8 x float> poison, float %654, i32 0
+ %656 = shufflevector <8 x float> %655, <8 x float> poison, <8 x i32> zeroinitializer
+ %657 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %648, <8 x float> %649, <8 x float> %656)
+ store <8 x float> %657, <8 x float>* %76, align 32, !tbaa !18, !llvm.access.group !17
+ %658 = bitcast <8 x float>* %77 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %658) #15, !llvm.access.group !17
+ %659 = load <8 x float>, <8 x float>* %75, align 32, !tbaa !18, !llvm.access.group !17
+ %660 = call noundef <8 x float> @_ZN10embeddings4Ftrl21GetMinusLinearReducedEDv8_f(%"class.embeddings::Ftrl"* noundef nonnull align 4 dereferenceable(16) %100, <8 x float> noundef %659) #20, !llvm.access.group !17
+ %661 = load <8 x float>, <8 x float>* %76, align 32, !tbaa !18, !llvm.access.group !17
+ %662 = call <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float> %661)
+ %663 = fmul <8 x float> %660, %662
+ store <8 x float> %663, <8 x float>* %77, align 32, !tbaa !18, !llvm.access.group !17
+ %664 = load <8 x float>, <8 x float>* %77, align 32, !tbaa !18, !llvm.access.group !17
+ %665 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %7, align 4, !tbaa !9, !llvm.access.group !17
+ %666 = load i32, i32* %67, align 4, !tbaa !3, !llvm.access.group !17
+ %667 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %665, i32 noundef %666) #20, !llvm.access.group !17
+ store <8 x float> %664, <8 x float> addrspace(201)* %667, align 32, !tbaa !18, !llvm.access.group !17
+ %668 = load <8 x float>, <8 x float>* %72, align 32, !tbaa !18, !llvm.access.group !17
+ %669 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %8, align 4, !tbaa !9, !llvm.access.group !17
+ %670 = load i32, i32* %67, align 4, !tbaa !3, !llvm.access.group !17
+ %671 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %669, i32 noundef %670) #20, !llvm.access.group !17
+ store <8 x float> %668, <8 x float> addrspace(201)* %671, align 32, !tbaa !18, !llvm.access.group !17
+ %672 = load <8 x float>, <8 x float>* %75, align 32, !tbaa !18, !llvm.access.group !17
+ %673 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %9, align 4, !tbaa !9, !llvm.access.group !17
+ %674 = load i32, i32* %67, align 4, !tbaa !3, !llvm.access.group !17
+ %675 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %673, i32 noundef %674) #20, !llvm.access.group !17
+ store <8 x float> %672, <8 x float> addrspace(201)* %675, align 32, !tbaa !18, !llvm.access.group !17
+ %676 = bitcast <8 x float>* %77 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %676) #15, !llvm.access.group !17
+ %677 = bitcast <8 x float>* %76 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %677) #15, !llvm.access.group !17
+ %678 = bitcast <8 x float>* %75 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %678) #15, !llvm.access.group !17
+ %679 = bitcast <8 x float>* %74 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %679) #15, !llvm.access.group !17
+ %680 = bitcast %class.anon.8* %73 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 1, i8* %680) #15, !llvm.access.group !17
+ %681 = bitcast <8 x float>* %72 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %681) #15, !llvm.access.group !17
+ %682 = bitcast <8 x float>* %71 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %682) #15, !llvm.access.group !17
+ %683 = bitcast <8 x float>* %70 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %683) #15, !llvm.access.group !17
+ %684 = bitcast <8 x float>* %69 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %684) #15, !llvm.access.group !17
+ %685 = bitcast <8 x float>* %68 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %685) #15, !llvm.access.group !17
+ %686 = bitcast i32* %67 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %686) #15, !llvm.access.group !17
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !17
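+ ; Unrolled FTRL step for offset +6; identical in structure to the +5 block
+ ; above, using %78..%88 in place of %67..%77.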
+ %687 = bitcast i32* %78 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %687) #15, !llvm.access.group !17
+ %688 = load i32, i32* %11, align 4, !tbaa !3, !llvm.access.group !17
+ %689 = add nsw i32 %688, 6
+ store i32 %689, i32* %78, align 4, !tbaa !3, !llvm.access.group !17
+ %690 = bitcast <8 x float>* %79 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %690) #15, !llvm.access.group !17
+ %691 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %7, align 4, !tbaa !9, !llvm.access.group !17
+ %692 = load i32, i32* %78, align 4, !tbaa !3, !llvm.access.group !17
+ %693 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %691, i32 noundef %692) #20, !llvm.access.group !17
+ %694 = load <8 x float>, <8 x float> addrspace(201)* %693, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %694, <8 x float>* %79, align 32, !tbaa !18, !llvm.access.group !17
+ %695 = bitcast <8 x float>* %80 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %695) #15, !llvm.access.group !17
+ %696 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %10, align 4, !tbaa !9, !llvm.access.group !17
+ %697 = load i32, i32* %78, align 4, !tbaa !3, !llvm.access.group !17
+ %698 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %696, i32 noundef %697) #20, !llvm.access.group !17
+ %699 = load <8 x float>, <8 x float> addrspace(201)* %698, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %699, <8 x float>* %80, align 32, !tbaa !18, !llvm.access.group !17
+ %700 = bitcast <8 x float>* %81 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %700) #15, !llvm.access.group !17
+ %701 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %8, align 4, !tbaa !9, !llvm.access.group !17
+ %702 = load i32, i32* %78, align 4, !tbaa !3, !llvm.access.group !17
+ %703 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %701, i32 noundef %702) #20, !llvm.access.group !17
+ %704 = load <8 x float>, <8 x float> addrspace(201)* %703, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %704, <8 x float>* %81, align 32, !tbaa !18, !llvm.access.group !17
+ %705 = bitcast <8 x float>* %82 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %705) #15, !llvm.access.group !17
+ %706 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %9, align 4, !tbaa !9, !llvm.access.group !17
+ %707 = load i32, i32* %78, align 4, !tbaa !3, !llvm.access.group !17
+ %708 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %706, i32 noundef %707) #20, !llvm.access.group !17
+ %709 = load <8 x float>, <8 x float> addrspace(201)* %708, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %709, <8 x float>* %82, align 32, !tbaa !18, !llvm.access.group !17
+ %710 = bitcast <8 x float>* %83 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %710) #15, !llvm.access.group !17
+ %711 = load <8 x float>, <8 x float>* %81, align 32, !tbaa !18, !llvm.access.group !17
+ %712 = load <8 x float>, <8 x float>* %80, align 32, !tbaa !18, !llvm.access.group !17
+ %713 = load <8 x float>, <8 x float>* %80, align 32, !tbaa !18, !llvm.access.group !17
+ %714 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %712, <8 x float> %713, <8 x float> %711)
+ store <8 x float> %714, <8 x float>* %83, align 32, !tbaa !18, !llvm.access.group !17
+ %715 = bitcast %class.anon.10* %84 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 1, i8* %715) #15, !llvm.access.group !17
+ %716 = bitcast <8 x float>* %85 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %716) #15, !llvm.access.group !17
+ %717 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 0
+ %718 = load float, float* %717, align 4, !tbaa !12, !llvm.access.group !17
+ %719 = insertelement <8 x float> poison, float %718, i32 0
+ %720 = shufflevector <8 x float> %719, <8 x float> poison, <8 x i32> zeroinitializer
+ %721 = call <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float> %720)
+ store <8 x float> %721, <8 x float>* %85, align 32, !tbaa !18, !llvm.access.group !17
+ %722 = bitcast <8 x float>* %86 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %722) #15, !llvm.access.group !17
+ %723 = load <8 x float>, <8 x float>* %82, align 32, !tbaa !18, !llvm.access.group !17
+ %724 = load <8 x float>, <8 x float>* %80, align 32, !tbaa !18, !llvm.access.group !17
+ %725 = fadd <8 x float> %723, %724
+ %726 = load <8 x float>, <8 x float>* %83, align 32, !tbaa !18, !llvm.access.group !17
+ %727 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 1
+ %728 = load float, float* %727, align 4, !tbaa !14, !llvm.access.group !17
+ %729 = call noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE5_clES6_f(%class.anon.10* noundef nonnull align 1 dereferenceable(1) %84, <8 x float> noundef %726, float noundef %728) #20, !llvm.access.group !17
+ %730 = load <8 x float>, <8 x float>* %81, align 32, !tbaa !18, !llvm.access.group !17
+ %731 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 1
+ %732 = load float, float* %731, align 4, !tbaa !14, !llvm.access.group !17
+ %733 = call noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE5_clES6_f(%class.anon.10* noundef nonnull align 1 dereferenceable(1) %84, <8 x float> noundef %730, float noundef %732) #20, !llvm.access.group !17
+ %734 = fsub <8 x float> %729, %733
+ %735 = load <8 x float>, <8 x float>* %79, align 32, !tbaa !18, !llvm.access.group !17
+ %736 = load <8 x float>, <8 x float>* %85, align 32, !tbaa !18, !llvm.access.group !17
+ %737 = fmul <8 x float> %735, %736
+ %738 = fneg <8 x float> %734
+ %739 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %738, <8 x float> %737, <8 x float> %725)
+ store <8 x float> %739, <8 x float>* %86, align 32, !tbaa !18, !llvm.access.group !17
+ %740 = bitcast <8 x float>* %87 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %740) #15, !llvm.access.group !17
+ %741 = load <8 x float>, <8 x float>* %83, align 32, !tbaa !18, !llvm.access.group !17
+ %742 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 1
+ %743 = load float, float* %742, align 4, !tbaa !14, !llvm.access.group !17
+ %744 = call noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE5_clES6_f(%class.anon.10* noundef nonnull align 1 dereferenceable(1) %84, <8 x float> noundef %741, float noundef %743) #20, !llvm.access.group !17
+ %745 = load <8 x float>, <8 x float>* %85, align 32, !tbaa !18, !llvm.access.group !17
+ %746 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 3
+ %747 = load float, float* %746, align 4, !tbaa !16, !llvm.access.group !17
+ %748 = fpext float %747 to double
+ %749 = fmul double 2.000000e+00, %748
+ %750 = fptrunc double %749 to float
+ %751 = insertelement <8 x float> poison, float %750, i32 0
+ %752 = shufflevector <8 x float> %751, <8 x float> poison, <8 x i32> zeroinitializer
+ %753 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %744, <8 x float> %745, <8 x float> %752)
+ store <8 x float> %753, <8 x float>* %87, align 32, !tbaa !18, !llvm.access.group !17
+ %754 = bitcast <8 x float>* %88 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %754) #15, !llvm.access.group !17
+ %755 = load <8 x float>, <8 x float>* %86, align 32, !tbaa !18, !llvm.access.group !17
+ %756 = call noundef <8 x float> @_ZN10embeddings4Ftrl21GetMinusLinearReducedEDv8_f(%"class.embeddings::Ftrl"* noundef nonnull align 4 dereferenceable(16) %100, <8 x float> noundef %755) #20, !llvm.access.group !17
+ %757 = load <8 x float>, <8 x float>* %87, align 32, !tbaa !18, !llvm.access.group !17
+ %758 = call <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float> %757)
+ %759 = fmul <8 x float> %756, %758
+ store <8 x float> %759, <8 x float>* %88, align 32, !tbaa !18, !llvm.access.group !17
+ %760 = load <8 x float>, <8 x float>* %88, align 32, !tbaa !18, !llvm.access.group !17
+ %761 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %7, align 4, !tbaa !9, !llvm.access.group !17
+ %762 = load i32, i32* %78, align 4, !tbaa !3, !llvm.access.group !17
+ %763 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %761, i32 noundef %762) #20, !llvm.access.group !17
+ store <8 x float> %760, <8 x float> addrspace(201)* %763, align 32, !tbaa !18, !llvm.access.group !17
+ %764 = load <8 x float>, <8 x float>* %83, align 32, !tbaa !18, !llvm.access.group !17
+ %765 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %8, align 4, !tbaa !9, !llvm.access.group !17
+ %766 = load i32, i32* %78, align 4, !tbaa !3, !llvm.access.group !17
+ %767 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %765, i32 noundef %766) #20, !llvm.access.group !17
+ store <8 x float> %764, <8 x float> addrspace(201)* %767, align 32, !tbaa !18, !llvm.access.group !17
+ %768 = load <8 x float>, <8 x float>* %86, align 32, !tbaa !18, !llvm.access.group !17
+ %769 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %9, align 4, !tbaa !9, !llvm.access.group !17
+ %770 = load i32, i32* %78, align 4, !tbaa !3, !llvm.access.group !17
+ %771 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %769, i32 noundef %770) #20, !llvm.access.group !17
+ store <8 x float> %768, <8 x float> addrspace(201)* %771, align 32, !tbaa !18, !llvm.access.group !17
+ %772 = bitcast <8 x float>* %88 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %772) #15, !llvm.access.group !17
+ %773 = bitcast <8 x float>* %87 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %773) #15, !llvm.access.group !17
+ %774 = bitcast <8 x float>* %86 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %774) #15, !llvm.access.group !17
+ %775 = bitcast <8 x float>* %85 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %775) #15, !llvm.access.group !17
+ %776 = bitcast %class.anon.10* %84 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 1, i8* %776) #15, !llvm.access.group !17
+ %777 = bitcast <8 x float>* %83 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %777) #15, !llvm.access.group !17
+ %778 = bitcast <8 x float>* %82 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %778) #15, !llvm.access.group !17
+ %779 = bitcast <8 x float>* %81 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %779) #15, !llvm.access.group !17
+ %780 = bitcast <8 x float>* %80 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %780) #15, !llvm.access.group !17
+ %781 = bitcast <8 x float>* %79 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %781) #15, !llvm.access.group !17
+ %782 = bitcast i32* %78 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %782) #15, !llvm.access.group !17
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !17
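+ ; Unrolled FTRL step for offset +7, the last of the eight unrolled iterations;
+ ; same structure again, using %89..%99.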
+ %783 = bitcast i32* %89 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %783) #15, !llvm.access.group !17
+ %784 = load i32, i32* %11, align 4, !tbaa !3, !llvm.access.group !17
+ %785 = add nsw i32 %784, 7
+ store i32 %785, i32* %89, align 4, !tbaa !3, !llvm.access.group !17
+ %786 = bitcast <8 x float>* %90 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %786) #15, !llvm.access.group !17
+ %787 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %7, align 4, !tbaa !9, !llvm.access.group !17
+ %788 = load i32, i32* %89, align 4, !tbaa !3, !llvm.access.group !17
+ %789 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %787, i32 noundef %788) #20, !llvm.access.group !17
+ %790 = load <8 x float>, <8 x float> addrspace(201)* %789, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %790, <8 x float>* %90, align 32, !tbaa !18, !llvm.access.group !17
+ %791 = bitcast <8 x float>* %91 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %791) #15, !llvm.access.group !17
+ %792 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %10, align 4, !tbaa !9, !llvm.access.group !17
+ %793 = load i32, i32* %89, align 4, !tbaa !3, !llvm.access.group !17
+ %794 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %792, i32 noundef %793) #20, !llvm.access.group !17
+ %795 = load <8 x float>, <8 x float> addrspace(201)* %794, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %795, <8 x float>* %91, align 32, !tbaa !18, !llvm.access.group !17
+ %796 = bitcast <8 x float>* %92 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %796) #15, !llvm.access.group !17
+ %797 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %8, align 4, !tbaa !9, !llvm.access.group !17
+ %798 = load i32, i32* %89, align 4, !tbaa !3, !llvm.access.group !17
+ %799 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %797, i32 noundef %798) #20, !llvm.access.group !17
+ %800 = load <8 x float>, <8 x float> addrspace(201)* %799, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %800, <8 x float>* %92, align 32, !tbaa !18, !llvm.access.group !17
+ %801 = bitcast <8 x float>* %93 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %801) #15, !llvm.access.group !17
+ %802 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %9, align 4, !tbaa !9, !llvm.access.group !17
+ %803 = load i32, i32* %89, align 4, !tbaa !3, !llvm.access.group !17
+ %804 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %802, i32 noundef %803) #20, !llvm.access.group !17
+ %805 = load <8 x float>, <8 x float> addrspace(201)* %804, align 32, !tbaa !18, !llvm.access.group !17
+ store <8 x float> %805, <8 x float>* %93, align 32, !tbaa !18, !llvm.access.group !17
+ %806 = bitcast <8 x float>* %94 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %806) #15, !llvm.access.group !17
+ %807 = load <8 x float>, <8 x float>* %92, align 32, !tbaa !18, !llvm.access.group !17
+ %808 = load <8 x float>, <8 x float>* %91, align 32, !tbaa !18, !llvm.access.group !17
+ %809 = load <8 x float>, <8 x float>* %91, align 32, !tbaa !18, !llvm.access.group !17
+ %810 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %808, <8 x float> %809, <8 x float> %807)
+ store <8 x float> %810, <8 x float>* %94, align 32, !tbaa !18, !llvm.access.group !17
+ %811 = bitcast %class.anon.12* %95 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 1, i8* %811) #15, !llvm.access.group !17
+ %812 = bitcast <8 x float>* %96 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %812) #15, !llvm.access.group !17
+ %813 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 0
+ %814 = load float, float* %813, align 4, !tbaa !12, !llvm.access.group !17
+ %815 = insertelement <8 x float> poison, float %814, i32 0
+ %816 = shufflevector <8 x float> %815, <8 x float> poison, <8 x i32> zeroinitializer
+ %817 = call <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float> %816)
+ store <8 x float> %817, <8 x float>* %96, align 32, !tbaa !18, !llvm.access.group !17
+ %818 = bitcast <8 x float>* %97 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %818) #15, !llvm.access.group !17
+ %819 = load <8 x float>, <8 x float>* %93, align 32, !tbaa !18, !llvm.access.group !17
+ %820 = load <8 x float>, <8 x float>* %91, align 32, !tbaa !18, !llvm.access.group !17
+ %821 = fadd <8 x float> %819, %820
+ %822 = load <8 x float>, <8 x float>* %94, align 32, !tbaa !18, !llvm.access.group !17
+ %823 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 1
+ %824 = load float, float* %823, align 4, !tbaa !14, !llvm.access.group !17
+ %825 = call noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE6_clES6_f(%class.anon.12* noundef nonnull align 1 dereferenceable(1) %95, <8 x float> noundef %822, float noundef %824) #20, !llvm.access.group !17
+ %826 = load <8 x float>, <8 x float>* %92, align 32, !tbaa !18, !llvm.access.group !17
+ %827 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 1
+ %828 = load float, float* %827, align 4, !tbaa !14, !llvm.access.group !17
+ %829 = call noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE6_clES6_f(%class.anon.12* noundef nonnull align 1 dereferenceable(1) %95, <8 x float> noundef %826, float noundef %828) #20, !llvm.access.group !17
+ %830 = fsub <8 x float> %825, %829
+ %831 = load <8 x float>, <8 x float>* %90, align 32, !tbaa !18, !llvm.access.group !17
+ %832 = load <8 x float>, <8 x float>* %96, align 32, !tbaa !18, !llvm.access.group !17
+ %833 = fmul <8 x float> %831, %832
+ %834 = fneg <8 x float> %830
+ %835 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %834, <8 x float> %833, <8 x float> %821)
+ store <8 x float> %835, <8 x float>* %97, align 32, !tbaa !18, !llvm.access.group !17
+ %836 = bitcast <8 x float>* %98 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %836) #15, !llvm.access.group !17
+ %837 = load <8 x float>, <8 x float>* %94, align 32, !tbaa !18, !llvm.access.group !17
+ %838 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 1
+ %839 = load float, float* %838, align 4, !tbaa !14, !llvm.access.group !17
+ %840 = call noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE6_clES6_f(%class.anon.12* noundef nonnull align 1 dereferenceable(1) %95, <8 x float> noundef %837, float noundef %839) #20, !llvm.access.group !17
+ %841 = load <8 x float>, <8 x float>* %96, align 32, !tbaa !18, !llvm.access.group !17
+ %842 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %100, i32 0, i32 3
+ %843 = load float, float* %842, align 4, !tbaa !16, !llvm.access.group !17
+ %844 = fpext float %843 to double
+ %845 = fmul double 2.000000e+00, %844
+ %846 = fptrunc double %845 to float
+ %847 = insertelement <8 x float> poison, float %846, i32 0
+ %848 = shufflevector <8 x float> %847, <8 x float> poison, <8 x i32> zeroinitializer
+ %849 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %840, <8 x float> %841, <8 x float> %848)
+ store <8 x float> %849, <8 x float>* %98, align 32, !tbaa !18, !llvm.access.group !17
+ %850 = bitcast <8 x float>* %99 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %850) #15, !llvm.access.group !17
+ %851 = load <8 x float>, <8 x float>* %97, align 32, !tbaa !18, !llvm.access.group !17
+ %852 = call noundef <8 x float> @_ZN10embeddings4Ftrl21GetMinusLinearReducedEDv8_f(%"class.embeddings::Ftrl"* noundef nonnull align 4 dereferenceable(16) %100, <8 x float> noundef %851) #20, !llvm.access.group !17
+ %853 = load <8 x float>, <8 x float>* %98, align 32, !tbaa !18, !llvm.access.group !17
+ %854 = call <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float> %853)
+ %855 = fmul <8 x float> %852, %854
+ store <8 x float> %855, <8 x float>* %99, align 32, !tbaa !18, !llvm.access.group !17
+ %856 = load <8 x float>, <8 x float>* %99, align 32, !tbaa !18, !llvm.access.group !17
+ %857 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %7, align 4, !tbaa !9, !llvm.access.group !17
+ %858 = load i32, i32* %89, align 4, !tbaa !3, !llvm.access.group !17
+ %859 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %857, i32 noundef %858) #20, !llvm.access.group !17
+ store <8 x float> %856, <8 x float> addrspace(201)* %859, align 32, !tbaa !18, !llvm.access.group !17
+ %860 = load <8 x float>, <8 x float>* %94, align 32, !tbaa !18, !llvm.access.group !17
+ %861 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %8, align 4, !tbaa !9, !llvm.access.group !17
+ %862 = load i32, i32* %89, align 4, !tbaa !3, !llvm.access.group !17
+ %863 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %861, i32 noundef %862) #20, !llvm.access.group !17
+ store <8 x float> %860, <8 x float> addrspace(201)* %863, align 32, !tbaa !18, !llvm.access.group !17
+ %864 = load <8 x float>, <8 x float>* %97, align 32, !tbaa !18, !llvm.access.group !17
+ %865 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %9, align 4, !tbaa !9, !llvm.access.group !17
+ %866 = load i32, i32* %89, align 4, !tbaa !3, !llvm.access.group !17
+ %867 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %865, i32 noundef %866) #20, !llvm.access.group !17
+ store <8 x float> %864, <8 x float> addrspace(201)* %867, align 32, !tbaa !18, !llvm.access.group !17
+ %868 = bitcast <8 x float>* %99 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %868) #15, !llvm.access.group !17
+ %869 = bitcast <8 x float>* %98 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %869) #15, !llvm.access.group !17
+ %870 = bitcast <8 x float>* %97 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %870) #15, !llvm.access.group !17
+ %871 = bitcast <8 x float>* %96 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %871) #15, !llvm.access.group !17
+ %872 = bitcast %class.anon.12* %95 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 1, i8* %872) #15, !llvm.access.group !17
+ %873 = bitcast <8 x float>* %94 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %873) #15, !llvm.access.group !17
+ %874 = bitcast <8 x float>* %93 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %874) #15, !llvm.access.group !17
+ %875 = bitcast <8 x float>* %92 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %875) #15, !llvm.access.group !17
+ %876 = bitcast <8 x float>* %91 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %876) #15, !llvm.access.group !17
+ %877 = bitcast <8 x float>* %90 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %877) #15, !llvm.access.group !17
+ %878 = bitcast i32* %89 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %878) #15, !llvm.access.group !17
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !17
+ br label %879
+
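+; Loop latch: advance the induction variable %11 by 8 (one full unrolled pass)
+; and branch back to the loop header %102.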
+879: ; preds = %110
+ %880 = load i32, i32* %11, align 4, !tbaa !3, !llvm.access.group !17
+ %881 = add nsw i32 %880, 8
+ store i32 %881, i32* %11, align 4, !tbaa !3, !llvm.access.group !17
+ br label %102, !llvm.loop !19
+
+882: ; preds = %108
+ ret void
+}
+
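+; ReturnEii(value, offset): writes `value` through an inttoptr to the word at
+; absolute address 256 + offset. This looks like a fixed result slot read back
+; by the runtime, but that interpretation is an assumption; the IR only shows
+; the raw store.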
+; Function Attrs: mustprogress nounwind
+define internal void @_ZN12_GLOBAL__N_16ReturnEii(i32 noundef %0, i32 noundef %1) #3 {
+ %3 = alloca i32, align 4
+ %4 = alloca i32, align 4
+ %5 = alloca i32*, align 4
+ store i32 %0, i32* %3, align 4, !tbaa !3
+ store i32 %1, i32* %4, align 4, !tbaa !3
+ %6 = bitcast i32** %5 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #15
+ %7 = load i32, i32* %4, align 4, !tbaa !3
+ %8 = add nsw i32 256, %7
+ %9 = inttoptr i32 %8 to i32*
+ store i32* %9, i32** %5, align 4, !tbaa !9
+ %10 = load i32, i32* %3, align 4, !tbaa !3
+ %11 = load i32*, i32** %5, align 4, !tbaa !9
+ store i32 %10, i32* %11, align 4, !tbaa !3
+ %12 = bitcast i32** %5 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %12) #15
+ ret void
+}
+
+; Function Attrs: argmemonly nocallback nofree nosync nounwind willreturn
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #2
+
+; Function Attrs: mustprogress nounwind
+define dso_local void @scs() #9 section ".text.scs" {
+ ret void
+}
+
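+; Trivial copy constructors for the single-word MemorySpace and BasicType
+; wrappers below: each copies the one i32 field from the source object.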
+; Function Attrs: alwaysinline nounwind
+define linkonce_odr dso_local void @_ZN10embeddings11MemorySpaceC2ERKS0_(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %0, %"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %1) unnamed_addr #7 comdat align 2 {
+ %3 = alloca %"class.embeddings::MemorySpace"*, align 4
+ %4 = alloca %"class.embeddings::MemorySpace"*, align 4
+ store %"class.embeddings::MemorySpace"* %0, %"class.embeddings::MemorySpace"** %3, align 4, !tbaa !9
+ store %"class.embeddings::MemorySpace"* %1, %"class.embeddings::MemorySpace"** %4, align 4, !tbaa !9
+ %5 = load %"class.embeddings::MemorySpace"*, %"class.embeddings::MemorySpace"** %3, align 4
+ %6 = load %"class.embeddings::MemorySpace"*, %"class.embeddings::MemorySpace"** %4, align 4, !tbaa !9
+ %7 = getelementptr inbounds %"class.embeddings::MemorySpace", %"class.embeddings::MemorySpace"* %6, i32 0, i32 0
+ %8 = load i32, i32* %7, align 4, !tbaa !26
+ %9 = getelementptr inbounds %"class.embeddings::MemorySpace", %"class.embeddings::MemorySpace"* %5, i32 0, i32 0
+ store i32 %8, i32* %9, align 4, !tbaa !26
+ ret void
+}
+
+; Function Attrs: alwaysinline nounwind
+define linkonce_odr dso_local void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %0, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %1) unnamed_addr #7 comdat align 2 {
+ %3 = alloca %"class.embeddings::BasicType"*, align 4
+ %4 = alloca %"class.embeddings::BasicType"*, align 4
+ store %"class.embeddings::BasicType"* %0, %"class.embeddings::BasicType"** %3, align 4, !tbaa !9
+ store %"class.embeddings::BasicType"* %1, %"class.embeddings::BasicType"** %4, align 4, !tbaa !9
+ %5 = load %"class.embeddings::BasicType"*, %"class.embeddings::BasicType"** %3, align 4
+ %6 = load %"class.embeddings::BasicType"*, %"class.embeddings::BasicType"** %4, align 4, !tbaa !9
+ %7 = getelementptr inbounds %"class.embeddings::BasicType", %"class.embeddings::BasicType"* %6, i32 0, i32 0
+ %8 = load i32, i32* %7, align 4, !tbaa !29
+ %9 = getelementptr inbounds %"class.embeddings::BasicType", %"class.embeddings::BasicType"* %5, i32 0, i32 0
+ store i32 %8, i32* %9, align 4, !tbaa !29
+ ret void
+}
+
+; Function Attrs: argmemonly nofree nounwind willreturn
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg) #10
+
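+; BaseArray::Elements(): returns the element count stored at field index 1 of
+; BaseArray.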
+; Function Attrs: alwaysinline mustprogress nounwind
+define linkonce_odr dso_local noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %0) #11 comdat align 2 {
+ %2 = alloca %"class.embeddings::BaseArray"*, align 4
+ store %"class.embeddings::BaseArray"* %0, %"class.embeddings::BaseArray"** %2, align 4, !tbaa !9
+ %3 = load %"class.embeddings::BaseArray"*, %"class.embeddings::BaseArray"** %2, align 4
+ %4 = getelementptr inbounds %"class.embeddings::BaseArray", %"class.embeddings::BaseArray"* %3, i32 0, i32 1
+ %5 = load i32, i32* %4, align 4, !tbaa !32
+ ret i32 %5
+}
+
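+; TileSpmemVectorArray<float>::operator()(int i): returns Base() + i, a pointer
+; to the i-th <8 x float> vector in tile SPMEM (addrspace 201). The const
+; overload that follows is identical apart from its mangled name.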
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %0, i32 noundef %1) #4 comdat align 2 {
+ %3 = alloca %"class.embeddings::TileSpmemVectorArray"*, align 4
+ %4 = alloca i32, align 4
+ store %"class.embeddings::TileSpmemVectorArray"* %0, %"class.embeddings::TileSpmemVectorArray"** %3, align 4, !tbaa !9
+ store i32 %1, i32* %4, align 4, !tbaa !3
+ %5 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %3, align 4
+ %6 = call noundef <8 x float> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIfE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %5) #20
+ %7 = load i32, i32* %4, align 4, !tbaa !3
+ %8 = getelementptr inbounds <8 x float>, <8 x float> addrspace(201)* %6, i32 %7
+ ret <8 x float> addrspace(201)* %8
+}
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %0, i32 noundef %1) #4 comdat align 2 {
+ %3 = alloca %"class.embeddings::TileSpmemVectorArray"*, align 4
+ %4 = alloca i32, align 4
+ store %"class.embeddings::TileSpmemVectorArray"* %0, %"class.embeddings::TileSpmemVectorArray"** %3, align 4, !tbaa !9
+ store i32 %1, i32* %4, align 4, !tbaa !3
+ %5 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %3, align 4
+ %6 = call noundef <8 x float> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIfE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %5) #20
+ %7 = load i32, i32* %4, align 4, !tbaa !3
+ %8 = getelementptr inbounds <8 x float>, <8 x float> addrspace(201)* %6, i32 %7
+ ret <8 x float> addrspace(201)* %8
+}
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare <8 x float> @llvm.fmuladd.v8f32(<8 x float>, <8 x float>, <8 x float>) #12
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float>) #13
+
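+; Ftrl::Compute lambda (x, power) -> x^(-power): power == 0.5 takes the TPU
+; rsqrt macro path (rsqrt(x)); power == -0.5 computes x * rsqrt(x), i.e. sqrt(x);
+; the general case evaluates pow2(log2(x) * -power).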
+; Function Attrs: inlinehint mustprogress nounwind
+define linkonce_odr dso_local noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE_clES6_f(%class.anon* noundef nonnull align 1 dereferenceable(1) %0, <8 x float> noundef %1, float noundef %2) #14 comdat align 2 {
+ %4 = alloca <8 x float>, align 32
+ %5 = alloca %class.anon*, align 4
+ %6 = alloca <8 x float>, align 32
+ %7 = alloca float, align 4
+ store %class.anon* %0, %class.anon** %5, align 4, !tbaa !9
+ store <8 x float> %1, <8 x float>* %6, align 32, !tbaa !18
+ store float %2, float* %7, align 4, !tbaa !7
+ %8 = load %class.anon*, %class.anon** %5, align 4
+ %9 = load float, float* %7, align 4, !tbaa !7
+ %10 = fpext float %9 to double
+ %11 = fcmp oeq double %10, 5.000000e-01
+ br i1 %11, label %12, label %15
+
+12: ; preds = %3
+ %13 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %14 = call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %13)
+ store <8 x float> %14, <8 x float>* %4, align 32
+ br label %34
+
+15: ; preds = %3
+ %16 = load float, float* %7, align 4, !tbaa !7
+ %17 = fpext float %16 to double
+ %18 = fcmp oeq double %17, -5.000000e-01
+ br i1 %18, label %19, label %24
+
+19: ; preds = %15
+ %20 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %21 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %22 = call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %21)
+ %23 = fmul <8 x float> %20, %22
+ store <8 x float> %23, <8 x float>* %4, align 32
+ br label %34
+
+24: ; preds = %15
+ br label %25
+
+25: ; preds = %24
+ %26 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %27 = call <8 x float> @llvm.tpu.log2.macro.v8f32(<8 x float> %26)
+ %28 = load float, float* %7, align 4, !tbaa !7
+ %29 = fneg float %28
+ %30 = insertelement <8 x float> poison, float %29, i32 0
+ %31 = shufflevector <8 x float> %30, <8 x float> poison, <8 x i32> zeroinitializer
+ %32 = fmul <8 x float> %27, %31
+ %33 = call <8 x float> @llvm.tpu.pow2.macro.v8f32(<8 x float> %32)
+ store <8 x float> %33, <8 x float>* %4, align 32
+ br label %34
+
+34: ; preds = %25, %19, %12
+ %35 = load <8 x float>, <8 x float>* %4, align 32
+ ret <8 x float> %35
+}
+
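+; Ftrl::GetMinusLinearReduced(x): if the Ftrl field at index 2 is non-zero
+; (presumably an l1/clipping threshold, which is an inference), returns
+; ClampSymmetric(x, field2) - x; otherwise returns -x by flipping the sign bit.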
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local noundef <8 x float> @_ZN10embeddings4Ftrl21GetMinusLinearReducedEDv8_f(%"class.embeddings::Ftrl"* noundef nonnull align 4 dereferenceable(16) %0, <8 x float> noundef %1) #8 comdat align 2 {
+ %3 = alloca %"class.embeddings::Ftrl"*, align 4
+ %4 = alloca <8 x float>, align 32
+ %5 = alloca <8 x float>, align 32
+ store %"class.embeddings::Ftrl"* %0, %"class.embeddings::Ftrl"** %3, align 4, !tbaa !9
+ store <8 x float> %1, <8 x float>* %4, align 32, !tbaa !18
+ %6 = load %"class.embeddings::Ftrl"*, %"class.embeddings::Ftrl"** %3, align 4
+ %7 = bitcast <8 x float>* %5 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %7) #15
+ %8 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %6, i32 0, i32 2
+ %9 = load float, float* %8, align 4, !tbaa !15
+ %10 = fcmp une float %9, 0.000000e+00
+ br i1 %10, label %11, label %18
+
+11: ; preds = %2
+ %12 = load <8 x float>, <8 x float>* %4, align 32, !tbaa !18
+ %13 = getelementptr inbounds %"class.embeddings::Ftrl", %"class.embeddings::Ftrl"* %6, i32 0, i32 2
+ %14 = load float, float* %13, align 4, !tbaa !15
+ %15 = call noundef <8 x float> @_ZN10embeddings4Ftrl14ClampSymmetricEDv8_ff(%"class.embeddings::Ftrl"* noundef nonnull align 4 dereferenceable(16) %6, <8 x float> noundef %12, float noundef %14) #20
+ %16 = load <8 x float>, <8 x float>* %4, align 32, !tbaa !18
+ %17 = fsub <8 x float> %15, %16
+ store <8 x float> %17, <8 x float>* %5, align 32, !tbaa !18
+ br label %23
+
+18: ; preds = %2
+ %19 = load <8 x float>, <8 x float>* %4, align 32, !tbaa !18
+ %20 = bitcast <8 x float> %19 to <8 x i32>
+ %21 = xor <8 x i32> %20, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
+ %22 = bitcast <8 x i32> %21 to <8 x float>
+ store <8 x float> %22, <8 x float>* %5, align 32, !tbaa !18
+ br label %23
+
+23: ; preds = %18, %11
+ %24 = load <8 x float>, <8 x float>* %5, align 32, !tbaa !18
+ %25 = bitcast <8 x float>* %5 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %25) #15
+ ret <8 x float> %24
+}
+
+; Function Attrs: nounwind
+declare void @llvm.tpu.loop.parallel() #15
+
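+; The _clES6_f lambdas that follow (E0_, E1_, and so on, each with its own
+; anonymous closure type, one per unrolled call site) are identical
+; instantiations of the x^(-power) helper defined above.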
+; Function Attrs: inlinehint mustprogress nounwind
+define linkonce_odr dso_local noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE0_clES6_f(%class.anon.0* noundef nonnull align 1 dereferenceable(1) %0, <8 x float> noundef %1, float noundef %2) #14 comdat align 2 {
+ %4 = alloca <8 x float>, align 32
+ %5 = alloca %class.anon.0*, align 4
+ %6 = alloca <8 x float>, align 32
+ %7 = alloca float, align 4
+ store %class.anon.0* %0, %class.anon.0** %5, align 4, !tbaa !9
+ store <8 x float> %1, <8 x float>* %6, align 32, !tbaa !18
+ store float %2, float* %7, align 4, !tbaa !7
+ %8 = load %class.anon.0*, %class.anon.0** %5, align 4
+ %9 = load float, float* %7, align 4, !tbaa !7
+ %10 = fpext float %9 to double
+ %11 = fcmp oeq double %10, 5.000000e-01
+ br i1 %11, label %12, label %15
+
+12: ; preds = %3
+ %13 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %14 = call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %13)
+ store <8 x float> %14, <8 x float>* %4, align 32
+ br label %34
+
+15: ; preds = %3
+ %16 = load float, float* %7, align 4, !tbaa !7
+ %17 = fpext float %16 to double
+ %18 = fcmp oeq double %17, -5.000000e-01
+ br i1 %18, label %19, label %24
+
+19: ; preds = %15
+ %20 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %21 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %22 = call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %21)
+ %23 = fmul <8 x float> %20, %22
+ store <8 x float> %23, <8 x float>* %4, align 32
+ br label %34
+
+24: ; preds = %15
+ br label %25
+
+25: ; preds = %24
+ %26 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %27 = call <8 x float> @llvm.tpu.log2.macro.v8f32(<8 x float> %26)
+ %28 = load float, float* %7, align 4, !tbaa !7
+ %29 = fneg float %28
+ %30 = insertelement <8 x float> poison, float %29, i32 0
+ %31 = shufflevector <8 x float> %30, <8 x float> poison, <8 x i32> zeroinitializer
+ %32 = fmul <8 x float> %27, %31
+ %33 = call <8 x float> @llvm.tpu.pow2.macro.v8f32(<8 x float> %32)
+ store <8 x float> %33, <8 x float>* %4, align 32
+ br label %34
+
+34: ; preds = %25, %19, %12
+ %35 = load <8 x float>, <8 x float>* %4, align 32
+ ret <8 x float> %35
+}
+
+; Function Attrs: inlinehint mustprogress nounwind
+define linkonce_odr dso_local noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE1_clES6_f(%class.anon.2* noundef nonnull align 1 dereferenceable(1) %0, <8 x float> noundef %1, float noundef %2) #14 comdat align 2 {
+ %4 = alloca <8 x float>, align 32
+ %5 = alloca %class.anon.2*, align 4
+ %6 = alloca <8 x float>, align 32
+ %7 = alloca float, align 4
+ store %class.anon.2* %0, %class.anon.2** %5, align 4, !tbaa !9
+ store <8 x float> %1, <8 x float>* %6, align 32, !tbaa !18
+ store float %2, float* %7, align 4, !tbaa !7
+ %8 = load %class.anon.2*, %class.anon.2** %5, align 4
+ %9 = load float, float* %7, align 4, !tbaa !7
+ %10 = fpext float %9 to double
+ %11 = fcmp oeq double %10, 5.000000e-01
+ br i1 %11, label %12, label %15
+
+12: ; preds = %3
+ %13 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %14 = call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %13)
+ store <8 x float> %14, <8 x float>* %4, align 32
+ br label %34
+
+15: ; preds = %3
+ %16 = load float, float* %7, align 4, !tbaa !7
+ %17 = fpext float %16 to double
+ %18 = fcmp oeq double %17, -5.000000e-01
+ br i1 %18, label %19, label %24
+
+19: ; preds = %15
+ %20 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %21 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %22 = call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %21)
+ %23 = fmul <8 x float> %20, %22
+ store <8 x float> %23, <8 x float>* %4, align 32
+ br label %34
+
+24: ; preds = %15
+ br label %25
+
+25: ; preds = %24
+ %26 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %27 = call <8 x float> @llvm.tpu.log2.macro.v8f32(<8 x float> %26)
+ %28 = load float, float* %7, align 4, !tbaa !7
+ %29 = fneg float %28
+ %30 = insertelement <8 x float> poison, float %29, i32 0
+ %31 = shufflevector <8 x float> %30, <8 x float> poison, <8 x i32> zeroinitializer
+ %32 = fmul <8 x float> %27, %31
+ %33 = call <8 x float> @llvm.tpu.pow2.macro.v8f32(<8 x float> %32)
+ store <8 x float> %33, <8 x float>* %4, align 32
+ br label %34
+
+34: ; preds = %25, %19, %12
+ %35 = load <8 x float>, <8 x float>* %4, align 32
+ ret <8 x float> %35
+}
+
+; Function Attrs: inlinehint mustprogress nounwind
+define linkonce_odr dso_local noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE2_clES6_f(%class.anon.4* noundef nonnull align 1 dereferenceable(1) %0, <8 x float> noundef %1, float noundef %2) #14 comdat align 2 {
+ %4 = alloca <8 x float>, align 32
+ %5 = alloca %class.anon.4*, align 4
+ %6 = alloca <8 x float>, align 32
+ %7 = alloca float, align 4
+ store %class.anon.4* %0, %class.anon.4** %5, align 4, !tbaa !9
+ store <8 x float> %1, <8 x float>* %6, align 32, !tbaa !18
+ store float %2, float* %7, align 4, !tbaa !7
+ %8 = load %class.anon.4*, %class.anon.4** %5, align 4
+ %9 = load float, float* %7, align 4, !tbaa !7
+ %10 = fpext float %9 to double
+ %11 = fcmp oeq double %10, 5.000000e-01
+ br i1 %11, label %12, label %15
+
+12: ; preds = %3
+ %13 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %14 = call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %13)
+ store <8 x float> %14, <8 x float>* %4, align 32
+ br label %34
+
+15: ; preds = %3
+ %16 = load float, float* %7, align 4, !tbaa !7
+ %17 = fpext float %16 to double
+ %18 = fcmp oeq double %17, -5.000000e-01
+ br i1 %18, label %19, label %24
+
+19: ; preds = %15
+ %20 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %21 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %22 = call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %21)
+ %23 = fmul <8 x float> %20, %22
+ store <8 x float> %23, <8 x float>* %4, align 32
+ br label %34
+
+24: ; preds = %15
+ br label %25
+
+25: ; preds = %24
+ %26 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %27 = call <8 x float> @llvm.tpu.log2.macro.v8f32(<8 x float> %26)
+ %28 = load float, float* %7, align 4, !tbaa !7
+ %29 = fneg float %28
+ %30 = insertelement <8 x float> poison, float %29, i32 0
+ %31 = shufflevector <8 x float> %30, <8 x float> poison, <8 x i32> zeroinitializer
+ %32 = fmul <8 x float> %27, %31
+ %33 = call <8 x float> @llvm.tpu.pow2.macro.v8f32(<8 x float> %32)
+ store <8 x float> %33, <8 x float>* %4, align 32
+ br label %34
+
+34: ; preds = %25, %19, %12
+ %35 = load <8 x float>, <8 x float>* %4, align 32
+ ret <8 x float> %35
+}
+
+; Function Attrs: inlinehint mustprogress nounwind
+define linkonce_odr dso_local noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE3_clES6_f(%class.anon.6* noundef nonnull align 1 dereferenceable(1) %0, <8 x float> noundef %1, float noundef %2) #14 comdat align 2 {
+ %4 = alloca <8 x float>, align 32
+ %5 = alloca %class.anon.6*, align 4
+ %6 = alloca <8 x float>, align 32
+ %7 = alloca float, align 4
+ store %class.anon.6* %0, %class.anon.6** %5, align 4, !tbaa !9
+ store <8 x float> %1, <8 x float>* %6, align 32, !tbaa !18
+ store float %2, float* %7, align 4, !tbaa !7
+ %8 = load %class.anon.6*, %class.anon.6** %5, align 4
+ %9 = load float, float* %7, align 4, !tbaa !7
+ %10 = fpext float %9 to double
+ %11 = fcmp oeq double %10, 5.000000e-01
+ br i1 %11, label %12, label %15
+
+12: ; preds = %3
+ %13 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %14 = call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %13)
+ store <8 x float> %14, <8 x float>* %4, align 32
+ br label %34
+
+15: ; preds = %3
+ %16 = load float, float* %7, align 4, !tbaa !7
+ %17 = fpext float %16 to double
+ %18 = fcmp oeq double %17, -5.000000e-01
+ br i1 %18, label %19, label %24
+
+19: ; preds = %15
+ %20 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %21 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %22 = call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %21)
+ %23 = fmul <8 x float> %20, %22
+ store <8 x float> %23, <8 x float>* %4, align 32
+ br label %34
+
+24: ; preds = %15
+ br label %25
+
+25: ; preds = %24
+ %26 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %27 = call <8 x float> @llvm.tpu.log2.macro.v8f32(<8 x float> %26)
+ %28 = load float, float* %7, align 4, !tbaa !7
+ %29 = fneg float %28
+ %30 = insertelement <8 x float> poison, float %29, i32 0
+ %31 = shufflevector <8 x float> %30, <8 x float> poison, <8 x i32> zeroinitializer
+ %32 = fmul <8 x float> %27, %31
+ %33 = call <8 x float> @llvm.tpu.pow2.macro.v8f32(<8 x float> %32)
+ store <8 x float> %33, <8 x float>* %4, align 32
+ br label %34
+
+34: ; preds = %25, %19, %12
+ %35 = load <8 x float>, <8 x float>* %4, align 32
+ ret <8 x float> %35
+}
+
+; Function Attrs: inlinehint mustprogress nounwind
+define linkonce_odr dso_local noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE4_clES6_f(%class.anon.8* noundef nonnull align 1 dereferenceable(1) %0, <8 x float> noundef %1, float noundef %2) #14 comdat align 2 {
+ %4 = alloca <8 x float>, align 32
+ %5 = alloca %class.anon.8*, align 4
+ %6 = alloca <8 x float>, align 32
+ %7 = alloca float, align 4
+ store %class.anon.8* %0, %class.anon.8** %5, align 4, !tbaa !9
+ store <8 x float> %1, <8 x float>* %6, align 32, !tbaa !18
+ store float %2, float* %7, align 4, !tbaa !7
+ %8 = load %class.anon.8*, %class.anon.8** %5, align 4
+ %9 = load float, float* %7, align 4, !tbaa !7
+ %10 = fpext float %9 to double
+ %11 = fcmp oeq double %10, 5.000000e-01
+ br i1 %11, label %12, label %15
+
+12: ; preds = %3
+ %13 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %14 = call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %13)
+ store <8 x float> %14, <8 x float>* %4, align 32
+ br label %34
+
+15: ; preds = %3
+ %16 = load float, float* %7, align 4, !tbaa !7
+ %17 = fpext float %16 to double
+ %18 = fcmp oeq double %17, -5.000000e-01
+ br i1 %18, label %19, label %24
+
+19: ; preds = %15
+ %20 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %21 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %22 = call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %21)
+ %23 = fmul <8 x float> %20, %22
+ store <8 x float> %23, <8 x float>* %4, align 32
+ br label %34
+
+24: ; preds = %15
+ br label %25
+
+25: ; preds = %24
+ %26 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %27 = call <8 x float> @llvm.tpu.log2.macro.v8f32(<8 x float> %26)
+ %28 = load float, float* %7, align 4, !tbaa !7
+ %29 = fneg float %28
+ %30 = insertelement <8 x float> poison, float %29, i32 0
+ %31 = shufflevector <8 x float> %30, <8 x float> poison, <8 x i32> zeroinitializer
+ %32 = fmul <8 x float> %27, %31
+ %33 = call <8 x float> @llvm.tpu.pow2.macro.v8f32(<8 x float> %32)
+ store <8 x float> %33, <8 x float>* %4, align 32
+ br label %34
+
+34: ; preds = %25, %19, %12
+ %35 = load <8 x float>, <8 x float>* %4, align 32
+ ret <8 x float> %35
+}
+
+; Function Attrs: inlinehint mustprogress nounwind
+define linkonce_odr dso_local noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE5_clES6_f(%class.anon.10* noundef nonnull align 1 dereferenceable(1) %0, <8 x float> noundef %1, float noundef %2) #14 comdat align 2 {
+ %4 = alloca <8 x float>, align 32
+ %5 = alloca %class.anon.10*, align 4
+ %6 = alloca <8 x float>, align 32
+ %7 = alloca float, align 4
+ store %class.anon.10* %0, %class.anon.10** %5, align 4, !tbaa !9
+ store <8 x float> %1, <8 x float>* %6, align 32, !tbaa !18
+ store float %2, float* %7, align 4, !tbaa !7
+ %8 = load %class.anon.10*, %class.anon.10** %5, align 4
+ %9 = load float, float* %7, align 4, !tbaa !7
+ %10 = fpext float %9 to double
+ %11 = fcmp oeq double %10, 5.000000e-01
+ br i1 %11, label %12, label %15
+
+12: ; preds = %3
+ %13 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %14 = call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %13)
+ store <8 x float> %14, <8 x float>* %4, align 32
+ br label %34
+
+15: ; preds = %3
+ %16 = load float, float* %7, align 4, !tbaa !7
+ %17 = fpext float %16 to double
+ %18 = fcmp oeq double %17, -5.000000e-01
+ br i1 %18, label %19, label %24
+
+19: ; preds = %15
+ %20 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %21 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %22 = call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %21)
+ %23 = fmul <8 x float> %20, %22
+ store <8 x float> %23, <8 x float>* %4, align 32
+ br label %34
+
+24: ; preds = %15
+ br label %25
+
+25: ; preds = %24
+ %26 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %27 = call <8 x float> @llvm.tpu.log2.macro.v8f32(<8 x float> %26)
+ %28 = load float, float* %7, align 4, !tbaa !7
+ %29 = fneg float %28
+ %30 = insertelement <8 x float> poison, float %29, i32 0
+ %31 = shufflevector <8 x float> %30, <8 x float> poison, <8 x i32> zeroinitializer
+ %32 = fmul <8 x float> %27, %31
+ %33 = call <8 x float> @llvm.tpu.pow2.macro.v8f32(<8 x float> %32)
+ store <8 x float> %33, <8 x float>* %4, align 32
+ br label %34
+
+34: ; preds = %25, %19, %12
+ %35 = load <8 x float>, <8 x float>* %4, align 32
+ ret <8 x float> %35
+}
+
+; Function Attrs: inlinehint mustprogress nounwind
+define linkonce_odr dso_local noundef <8 x float> @_ZZN10embeddings4Ftrl7ComputeEPNS_20TileSpmemVectorArrayIfEES3_S3_RKS2_ENKUlDv8_ffE6_clES6_f(%class.anon.12* noundef nonnull align 1 dereferenceable(1) %0, <8 x float> noundef %1, float noundef %2) #14 comdat align 2 {
+ %4 = alloca <8 x float>, align 32
+ %5 = alloca %class.anon.12*, align 4
+ %6 = alloca <8 x float>, align 32
+ %7 = alloca float, align 4
+ store %class.anon.12* %0, %class.anon.12** %5, align 4, !tbaa !9
+ store <8 x float> %1, <8 x float>* %6, align 32, !tbaa !18
+ store float %2, float* %7, align 4, !tbaa !7
+ %8 = load %class.anon.12*, %class.anon.12** %5, align 4
+ %9 = load float, float* %7, align 4, !tbaa !7
+ %10 = fpext float %9 to double
+ %11 = fcmp oeq double %10, 5.000000e-01
+ br i1 %11, label %12, label %15
+
+12: ; preds = %3
+ %13 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %14 = call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %13)
+ store <8 x float> %14, <8 x float>* %4, align 32
+ br label %34
+
+15: ; preds = %3
+ %16 = load float, float* %7, align 4, !tbaa !7
+ %17 = fpext float %16 to double
+ %18 = fcmp oeq double %17, -5.000000e-01
+ br i1 %18, label %19, label %24
+
+19: ; preds = %15
+ %20 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %21 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %22 = call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %21)
+ %23 = fmul <8 x float> %20, %22
+ store <8 x float> %23, <8 x float>* %4, align 32
+ br label %34
+
+24: ; preds = %15
+ br label %25
+
+25: ; preds = %24
+ %26 = load <8 x float>, <8 x float>* %6, align 32, !tbaa !18
+ %27 = call <8 x float> @llvm.tpu.log2.macro.v8f32(<8 x float> %26)
+ %28 = load float, float* %7, align 4, !tbaa !7
+ %29 = fneg float %28
+ %30 = insertelement <8 x float> poison, float %29, i32 0
+ %31 = shufflevector <8 x float> %30, <8 x float> poison, <8 x i32> zeroinitializer
+ %32 = fmul <8 x float> %27, %31
+ %33 = call <8 x float> @llvm.tpu.pow2.macro.v8f32(<8 x float> %32)
+ store <8 x float> %33, <8 x float>* %4, align 32
+ br label %34
+
+34: ; preds = %25, %19, %12
+ %35 = load <8 x float>, <8 x float>* %4, align 32
+ ret <8 x float> %35
+}
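+
+; The identical lambda bodies above each compute v^(-p) for the vector
+; argument v and the scalar argument p (presumably the Ftrl learning-rate
+; power): rsqrt(v) when p == 0.5, v * rsqrt(v) (i.e. sqrt(v)) when
+; p == -0.5, and pow2(log2(v) * -p) otherwise.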
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local noundef <8 x float> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIfE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %0) #4 comdat align 2 {
+ %2 = alloca %"class.embeddings::TileSpmemVectorArray"*, align 4
+ %3 = alloca %"class.embeddings::TileSpmemPointer"*, align 4
+ %4 = alloca %"class.embeddings::PointerBase", align 4
+ store %"class.embeddings::TileSpmemVectorArray"* %0, %"class.embeddings::TileSpmemVectorArray"** %2, align 4, !tbaa !9
+ %5 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %2, align 4
+ %6 = bitcast %"class.embeddings::TileSpmemPointer"** %3 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #15
+ %7 = bitcast %"class.embeddings::PointerBase"* %4 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 12, i8* %7) #15
+ %8 = bitcast %"class.embeddings::TileSpmemVectorArray"* %5 to %"class.embeddings::BaseArray"*
+ call void @_ZNK10embeddings9BaseArray7BasePtrEv(%"class.embeddings::PointerBase"* sret(%"class.embeddings::PointerBase") align 4 %4, %"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %8) #20
+ %9 = call noundef %"class.embeddings::TileSpmemPointer"* @_ZN10embeddings4CastINS_16TileSpmemPointerENS_11PointerBaseEEENS_15cast_retty_implIT_T0_E8ret_typeERKS5_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %4) #20
+ %10 = bitcast %"class.embeddings::PointerBase"* %4 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 12, i8* %10) #15
+ store %"class.embeddings::TileSpmemPointer"* %9, %"class.embeddings::TileSpmemPointer"** %3, align 4, !tbaa !9
+ %11 = load %"class.embeddings::TileSpmemPointer"*, %"class.embeddings::TileSpmemPointer"** %3, align 4, !tbaa !9
+ %12 = call noundef i8 addrspace(201)* @_ZNK10embeddings16TileSpmemPointer6RawPtrEv(%"class.embeddings::TileSpmemPointer"* noundef nonnull align 4 dereferenceable(12) %11) #20
+ %13 = bitcast i8 addrspace(201)* %12 to <8 x float> addrspace(201)*
+ %14 = bitcast %"class.embeddings::TileSpmemPointer"** %3 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %14) #15
+ ret <8 x float> addrspace(201)* %13
+}
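+
+; Base() above returns the array's base address as an <8 x float> pointer in
+; tile spmem (address space 201): it calls BaseArray::BasePtr(), casts the
+; returned PointerBase to a TileSpmemPointer, takes its RawPtr(), and bitcasts
+; that to the element type.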
+
+; Function Attrs: mustprogress nounwind
+define linkonce_odr dso_local noundef %"class.embeddings::TileSpmemPointer"* @_ZN10embeddings4CastINS_16TileSpmemPointerENS_11PointerBaseEEENS_15cast_retty_implIT_T0_E8ret_typeERKS5_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %0) #3 comdat {
+ %2 = alloca %"class.embeddings::PointerBase"*, align 4
+ store %"class.embeddings::PointerBase"* %0, %"class.embeddings::PointerBase"** %2, align 4, !tbaa !9
+ %3 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %2, align 4, !tbaa !9
+ %4 = bitcast %"class.embeddings::PointerBase"* %3 to %"class.embeddings::TileSpmemPointer"*
+ ret %"class.embeddings::TileSpmemPointer"* %4
+}
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local void @_ZNK10embeddings9BaseArray7BasePtrEv(%"class.embeddings::PointerBase"* noalias sret(%"class.embeddings::PointerBase") align 4 %0, %"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %1) #4 comdat align 2 {
+ %3 = alloca i8*, align 4
+ %4 = alloca %"class.embeddings::BaseArray"*, align 4
+ %5 = bitcast %"class.embeddings::PointerBase"* %0 to i8*
+ store i8* %5, i8** %3, align 4
+ store %"class.embeddings::BaseArray"* %1, %"class.embeddings::BaseArray"** %4, align 4, !tbaa !9
+ %6 = load %"class.embeddings::BaseArray"*, %"class.embeddings::BaseArray"** %4, align 4
+ %7 = getelementptr inbounds %"class.embeddings::BaseArray", %"class.embeddings::BaseArray"* %6, i32 0, i32 0
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %0, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %7) #20
+ ret void
+}
+
+; Function Attrs: alwaysinline mustprogress nounwind
+define linkonce_odr dso_local noundef i8 addrspace(201)* @_ZNK10embeddings16TileSpmemPointer6RawPtrEv(%"class.embeddings::TileSpmemPointer"* noundef nonnull align 4 dereferenceable(12) %0) #11 comdat align 2 {
+ %2 = alloca %"class.embeddings::TileSpmemPointer"*, align 4
+ store %"class.embeddings::TileSpmemPointer"* %0, %"class.embeddings::TileSpmemPointer"** %2, align 4, !tbaa !9
+ %3 = load %"class.embeddings::TileSpmemPointer"*, %"class.embeddings::TileSpmemPointer"** %2, align 4
+ %4 = bitcast %"class.embeddings::TileSpmemPointer"* %3 to %"class.embeddings::PointerBase"*
+ %5 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %4, i32 0, i32 2
+ %6 = bitcast %"union.embeddings::PointerBase::AnyPtr"* %5 to i32 addrspace(201)**
+ %7 = load i32 addrspace(201)*, i32 addrspace(201)** %6, align 4, !tbaa !18
+ %8 = bitcast i32 addrspace(201)* %7 to i8 addrspace(201)*
+ ret i8 addrspace(201)* %8
+}
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float>) #13
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare <8 x float> @llvm.tpu.pow2.macro.v8f32(<8 x float>) #13
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare <8 x float> @llvm.tpu.log2.macro.v8f32(<8 x float>) #13
+
+; Function Attrs: alwaysinline mustprogress nounwind
+define linkonce_odr dso_local noundef <8 x float> @_ZN10embeddings4Ftrl14ClampSymmetricEDv8_ff(%"class.embeddings::Ftrl"* noundef nonnull align 4 dereferenceable(16) %0, <8 x float> noundef %1, float noundef %2) #16 comdat align 2 {
+ %4 = alloca %"class.embeddings::Ftrl"*, align 4
+ %5 = alloca <8 x float>, align 32
+ %6 = alloca float, align 4
+ %7 = alloca <8 x float>, align 32
+ %8 = alloca <8 x float>, align 32
+ %9 = alloca <8 x float>, align 32
+ store %"class.embeddings::Ftrl"* %0, %"class.embeddings::Ftrl"** %4, align 4, !tbaa !9
+ store <8 x float> %1, <8 x float>* %5, align 32, !tbaa !18
+ store float %2, float* %6, align 4, !tbaa !7
+ %10 = load %"class.embeddings::Ftrl"*, %"class.embeddings::Ftrl"** %4, align 4
+ %11 = bitcast <8 x float>* %7 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %11) #15
+ %12 = load float, float* %6, align 4, !tbaa !7
+ %13 = fcmp oge float %12, 0.000000e+00
+ br i1 %13, label %14, label %16
+
+14: ; preds = %3
+ %15 = load float, float* %6, align 4, !tbaa !7
+ br label %19
+
+16: ; preds = %3
+ %17 = load float, float* %6, align 4, !tbaa !7
+ %18 = fneg float %17
+ br label %19
+
+19: ; preds = %16, %14
+ %20 = phi float [ %15, %14 ], [ %18, %16 ]
+ store float %20, float* %6, align 4, !tbaa !7
+ %21 = bitcast <8 x float>* %8 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %21) #15
+ %22 = load float, float* %6, align 4, !tbaa !7
+ %23 = insertelement <8 x float> poison, float %22, i32 0
+ %24 = shufflevector <8 x float> %23, <8 x float> poison, <8 x i32> zeroinitializer
+ %25 = fneg <8 x float> %24
+ store <8 x float> %25, <8 x float>* %8, align 32, !tbaa !18
+ %26 = bitcast <8 x float>* %9 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %26) #15
+ %27 = load <8 x float>, <8 x float>* %5, align 32, !tbaa !18
+ %28 = load float, float* %6, align 4, !tbaa !7
+ %29 = insertelement <8 x float> poison, float %28, i32 0
+ %30 = shufflevector <8 x float> %29, <8 x float> poison, <8 x i32> zeroinitializer
+ %31 = call <8 x float> @llvm.minimum.v8f32(<8 x float> %27, <8 x float> %30)
+ store <8 x float> %31, <8 x float>* %9, align 32, !tbaa !18
+ %32 = load <8 x float>, <8 x float>* %9, align 32, !tbaa !18
+ %33 = load <8 x float>, <8 x float>* %8, align 32, !tbaa !18
+ %34 = call <8 x float> @llvm.maximum.v8f32(<8 x float> %32, <8 x float> %33)
+ store <8 x float> %34, <8 x float>* %7, align 32, !tbaa !18
+ %35 = load <8 x float>, <8 x float>* %7, align 32, !tbaa !18
+ %36 = bitcast <8 x float>* %9 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %36) #15
+ %37 = bitcast <8 x float>* %8 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %37) #15
+ %38 = bitcast <8 x float>* %7 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %38) #15
+ ret <8 x float> %35
+}
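+
+; ClampSymmetric above clamps every lane of the vector argument to
+; [-|limit|, +|limit|]: the scalar limit is first replaced by its absolute
+; value (the branch on %13), then the result is formed as
+; max(min(v, |limit|), -|limit|) via llvm.minimum/llvm.maximum.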
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare <8 x float> @llvm.minimum.v8f32(<8 x float>, <8 x float>) #12
+
+; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
+declare <8 x float> @llvm.maximum.v8f32(<8 x float>, <8 x float>) #12
+
+; Function Attrs: alwaysinline
+define linkonce_odr dso_local void @_ZN10embeddings15ScratchpadArrayC2ENS_11PointerBaseEi(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::PointerBase"* noundef %1, i32 noundef %2) unnamed_addr #6 comdat align 2 {
+ %4 = alloca %"class.embeddings::ScratchpadArray"*, align 4
+ %5 = alloca i32, align 4
+ %6 = alloca %"class.embeddings::PointerBase", align 4
+ store %"class.embeddings::ScratchpadArray"* %0, %"class.embeddings::ScratchpadArray"** %4, align 4, !tbaa !9
+ store i32 %2, i32* %5, align 4, !tbaa !3
+ %7 = load %"class.embeddings::ScratchpadArray"*, %"class.embeddings::ScratchpadArray"** %4, align 4
+ %8 = bitcast %"class.embeddings::ScratchpadArray"* %7 to %"class.embeddings::BaseArray"*
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %6, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %1) #20
+ %9 = load i32, i32* %5, align 4, !tbaa !3
+ call void @_ZN10embeddings9BaseArrayC2ENS_11PointerBaseEi(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %8, %"class.embeddings::PointerBase"* noundef %6, i32 noundef %9) #20
+ ret void
+}
+
+; Function Attrs: alwaysinline
+define linkonce_odr dso_local void @_ZN10embeddings9BaseArrayC2ENS_11PointerBaseEi(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::PointerBase"* noundef %1, i32 noundef %2) unnamed_addr #6 comdat align 2 {
+ %4 = alloca %"class.embeddings::BaseArray"*, align 4
+ %5 = alloca i32, align 4
+ store %"class.embeddings::BaseArray"* %0, %"class.embeddings::BaseArray"** %4, align 4, !tbaa !9
+ store i32 %2, i32* %5, align 4, !tbaa !3
+ %6 = load %"class.embeddings::BaseArray"*, %"class.embeddings::BaseArray"** %4, align 4
+ %7 = getelementptr inbounds %"class.embeddings::BaseArray", %"class.embeddings::BaseArray"* %6, i32 0, i32 0
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %7, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %1) #20
+ %8 = getelementptr inbounds %"class.embeddings::BaseArray", %"class.embeddings::BaseArray"* %6, i32 0, i32 1
+ %9 = load i32, i32* %5, align 4, !tbaa !3
+ store i32 %9, i32* %8, align 4, !tbaa !32
+ ret void
+}
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local void @_ZN10embeddings11ToBasicTypeIDv8_fE10basic_typeEv(%"class.embeddings::BasicType"* noalias sret(%"class.embeddings::BasicType") align 4 %0) #4 comdat align 2 {
+ %2 = alloca i8*, align 4
+ %3 = alloca %"class.embeddings::SCTY_V8F32", align 4
+ %4 = bitcast %"class.embeddings::BasicType"* %0 to i8*
+ store i8* %4, i8** %2, align 4
+ %5 = bitcast %"class.embeddings::SCTY_V8F32"* %3 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #15
+ call void @_ZN10embeddings10SCTY_V8F32C2Ev(%"class.embeddings::SCTY_V8F32"* noundef nonnull align 4 dereferenceable(4) %3) #20
+ %6 = bitcast %"class.embeddings::SCTY_V8F32"* %3 to %"class.embeddings::BasicType"*
+ call void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %0, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %6) #20
+ %7 = bitcast %"class.embeddings::SCTY_V8F32"* %3 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %7) #15
+ ret void
+}
+
+; Function Attrs: alwaysinline
+define linkonce_odr dso_local void @_ZN10embeddings16TileSpmemPointerC2EPU5AS201vNS_9BasicTypeE(%"class.embeddings::TileSpmemPointer"* noundef nonnull align 4 dereferenceable(12) %0, i8 addrspace(201)* noundef %1, %"class.embeddings::BasicType"* noundef %2) unnamed_addr #6 comdat align 2 {
+ %4 = alloca %"class.embeddings::TileSpmemPointer"*, align 4
+ %5 = alloca i8 addrspace(201)*, align 4
+ %6 = alloca %"class.embeddings::BasicType", align 4
+ store %"class.embeddings::TileSpmemPointer"* %0, %"class.embeddings::TileSpmemPointer"** %4, align 4, !tbaa !9
+ store i8 addrspace(201)* %1, i8 addrspace(201)** %5, align 4, !tbaa !9
+ %7 = load %"class.embeddings::TileSpmemPointer"*, %"class.embeddings::TileSpmemPointer"** %4, align 4
+ %8 = bitcast %"class.embeddings::TileSpmemPointer"* %7 to %"class.embeddings::PointerBase"*
+ call void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %6, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %2) #20
+ %9 = load i8 addrspace(201)*, i8 addrspace(201)** %5, align 4, !tbaa !9
+ call void @_ZN10embeddings11PointerBaseC2ENS_9BasicTypeEPU5AS201v(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %8, %"class.embeddings::BasicType"* noundef %6, i8 addrspace(201)* noundef %9) #20
+ ret void
+}
+
+; Function Attrs: inlinehint
+define linkonce_odr dso_local void @_ZN10embeddings11PointerBaseC2EOS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %0, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %1) unnamed_addr #5 comdat align 2 {
+ %3 = alloca %"class.embeddings::PointerBase"*, align 4
+ %4 = alloca %"class.embeddings::PointerBase"*, align 4
+ store %"class.embeddings::PointerBase"* %0, %"class.embeddings::PointerBase"** %3, align 4, !tbaa !9
+ store %"class.embeddings::PointerBase"* %1, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !9
+ %5 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %3, align 4
+ %6 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %5, i32 0, i32 0
+ %7 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !9
+ %8 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %7, i32 0, i32 0
+ call void @_ZN10embeddings11MemorySpaceC2ERKS0_(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %6, %"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %8) #20
+ %9 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %5, i32 0, i32 1
+ %10 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !9
+ %11 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %10, i32 0, i32 1
+ call void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %9, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %11) #20
+ %12 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %5, i32 0, i32 2
+ %13 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !9
+ %14 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %13, i32 0, i32 2
+ %15 = bitcast %"union.embeddings::PointerBase::AnyPtr"* %12 to i8*
+ %16 = bitcast %"union.embeddings::PointerBase::AnyPtr"* %14 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %15, i8* align 4 %16, i32 4, i1 false), !tbaa.struct !11
+ ret void
+}
+
+define linkonce_odr dso_local void @_ZN10embeddings10SCTY_V8F32C2Ev(%"class.embeddings::SCTY_V8F32"* noundef nonnull align 4 dereferenceable(4) %0) unnamed_addr #17 comdat align 2 {
+ %2 = alloca %"class.embeddings::SCTY_V8F32"*, align 4
+ store %"class.embeddings::SCTY_V8F32"* %0, %"class.embeddings::SCTY_V8F32"** %2, align 4, !tbaa !9
+ %3 = load %"class.embeddings::SCTY_V8F32"*, %"class.embeddings::SCTY_V8F32"** %2, align 4
+ %4 = bitcast %"class.embeddings::SCTY_V8F32"* %3 to %"class.embeddings::BasicType"*
+ call void @_ZN10embeddings9BasicTypeC2ENS_19SparsecoreBasicTypeE(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %4, i32 noundef 5) #20
+ ret void
+}
+
+; Function Attrs: alwaysinline nounwind
+define linkonce_odr dso_local void @_ZN10embeddings9BasicTypeC2ENS_19SparsecoreBasicTypeE(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %0, i32 noundef %1) unnamed_addr #7 comdat align 2 {
+ %3 = alloca %"class.embeddings::BasicType"*, align 4
+ %4 = alloca i32, align 4
+ store %"class.embeddings::BasicType"* %0, %"class.embeddings::BasicType"** %3, align 4, !tbaa !9
+ store i32 %1, i32* %4, align 4, !tbaa !35
+ %5 = load %"class.embeddings::BasicType"*, %"class.embeddings::BasicType"** %3, align 4
+ %6 = getelementptr inbounds %"class.embeddings::BasicType", %"class.embeddings::BasicType"* %5, i32 0, i32 0
+ %7 = load i32, i32* %4, align 4, !tbaa !35
+ store i32 %7, i32* %6, align 4, !tbaa !29
+ ret void
+}
+
+; Function Attrs: alwaysinline
+define linkonce_odr dso_local void @_ZN10embeddings11PointerBaseC2ENS_9BasicTypeEPU5AS201v(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %0, %"class.embeddings::BasicType"* noundef %1, i8 addrspace(201)* noundef %2) unnamed_addr #6 comdat align 2 {
+ %4 = alloca %"class.embeddings::PointerBase"*, align 4
+ %5 = alloca i8 addrspace(201)*, align 4
+ %6 = alloca %"class.embeddings::SCM_TileSpmem", align 4
+ store %"class.embeddings::PointerBase"* %0, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !9
+ store i8 addrspace(201)* %2, i8 addrspace(201)** %5, align 4, !tbaa !9
+ %7 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %4, align 4
+ %8 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %7, i32 0, i32 0
+ %9 = bitcast %"class.embeddings::SCM_TileSpmem"* %6 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %9) #15
+ call void @_ZN10embeddings13SCM_TileSpmemC2Ev(%"class.embeddings::SCM_TileSpmem"* noundef nonnull align 4 dereferenceable(4) %6) #20
+ %10 = bitcast %"class.embeddings::SCM_TileSpmem"* %6 to %"class.embeddings::MemorySpace"*
+ call void @_ZN10embeddings11MemorySpaceC2ERKS0_(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %8, %"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %10) #20
+ %11 = bitcast %"class.embeddings::SCM_TileSpmem"* %6 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %11) #15
+ %12 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %7, i32 0, i32 1
+ call void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %12, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %1) #20
+ %13 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %7, i32 0, i32 2
+ %14 = load i8 addrspace(201)*, i8 addrspace(201)** %5, align 4, !tbaa !9
+ call void @_ZN10embeddings11PointerBase6AnyPtrC2EPU5AS201v(%"union.embeddings::PointerBase::AnyPtr"* noundef nonnull align 4 dereferenceable(4) %13, i8 addrspace(201)* noundef %14) #20
+ ret void
+}
+
+define linkonce_odr dso_local void @_ZN10embeddings13SCM_TileSpmemC2Ev(%"class.embeddings::SCM_TileSpmem"* noundef nonnull align 4 dereferenceable(4) %0) unnamed_addr #17 comdat align 2 {
+ %2 = alloca %"class.embeddings::SCM_TileSpmem"*, align 4
+ store %"class.embeddings::SCM_TileSpmem"* %0, %"class.embeddings::SCM_TileSpmem"** %2, align 4, !tbaa !9
+ %3 = load %"class.embeddings::SCM_TileSpmem"*, %"class.embeddings::SCM_TileSpmem"** %2, align 4
+ %4 = bitcast %"class.embeddings::SCM_TileSpmem"* %3 to %"class.embeddings::MemorySpace"*
+ call void @_ZN10embeddings11MemorySpaceC2ENS_21SparsecoreMemorySpaceE(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %4, i32 noundef 1) #20
+ ret void
+}
+
+; Function Attrs: nounwind
+define linkonce_odr dso_local void @_ZN10embeddings11PointerBase6AnyPtrC2EPU5AS201v(%"union.embeddings::PointerBase::AnyPtr"* noundef nonnull align 4 dereferenceable(4) %0, i8 addrspace(201)* noundef %1) unnamed_addr #18 comdat align 2 {
+ %3 = alloca %"union.embeddings::PointerBase::AnyPtr"*, align 4
+ %4 = alloca i8 addrspace(201)*, align 4
+ store %"union.embeddings::PointerBase::AnyPtr"* %0, %"union.embeddings::PointerBase::AnyPtr"** %3, align 4, !tbaa !9
+ store i8 addrspace(201)* %1, i8 addrspace(201)** %4, align 4, !tbaa !9
+ %5 = load %"union.embeddings::PointerBase::AnyPtr"*, %"union.embeddings::PointerBase::AnyPtr"** %3, align 4
+ %6 = bitcast %"union.embeddings::PointerBase::AnyPtr"* %5 to i32 addrspace(201)**
+ %7 = load i8 addrspace(201)*, i8 addrspace(201)** %4, align 4, !tbaa !9
+ %8 = bitcast i8 addrspace(201)* %7 to i32 addrspace(201)*
+ store i32 addrspace(201)* %8, i32 addrspace(201)** %6, align 4, !tbaa !18
+ ret void
+}
+
+; Function Attrs: alwaysinline nounwind
+define linkonce_odr dso_local void @_ZN10embeddings11MemorySpaceC2ENS_21SparsecoreMemorySpaceE(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %0, i32 noundef %1) unnamed_addr #7 comdat align 2 {
+ %3 = alloca %"class.embeddings::MemorySpace"*, align 4
+ %4 = alloca i32, align 4
+ store %"class.embeddings::MemorySpace"* %0, %"class.embeddings::MemorySpace"** %3, align 4, !tbaa !9
+ store i32 %1, i32* %4, align 4, !tbaa !36
+ %5 = load %"class.embeddings::MemorySpace"*, %"class.embeddings::MemorySpace"** %3, align 4
+ %6 = getelementptr inbounds %"class.embeddings::MemorySpace", %"class.embeddings::MemorySpace"* %5, i32 0, i32 0
+ %7 = load i32, i32* %4, align 4, !tbaa !36
+ store i32 %7, i32* %6, align 4, !tbaa !26
+ ret void
+}
+
+attributes #0 = { mustprogress nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tac-vf" }
+attributes #1 = { mustprogress "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #2 = { argmemonly nocallback nofree nosync nounwind willreturn }
+attributes #3 = { mustprogress nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #4 = { alwaysinline mustprogress "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #5 = { inlinehint "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #6 = { alwaysinline "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #7 = { alwaysinline nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #8 = { alwaysinline mustprogress "frame-pointer"="all" "min-legal-vector-width"="256" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #9 = { mustprogress nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-scs-vf" }
+attributes #10 = { argmemonly nofree nounwind willreturn }
+attributes #11 = { alwaysinline mustprogress nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #12 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
+attributes #13 = { nounwind readnone speculatable willreturn }
+attributes #14 = { inlinehint mustprogress nounwind "frame-pointer"="all" "min-legal-vector-width"="256" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #15 = { nounwind }
+attributes #16 = { alwaysinline mustprogress nounwind "frame-pointer"="all" "min-legal-vector-width"="256" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #17 = { "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #18 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #19 = { alwaysinline nobuiltin "no-builtins" }
+attributes #20 = { nobuiltin "no-builtins" }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"frame-pointer", i32 2}
+!2 = !{!"clang version google3-trunk (18b9c4637099f6ed5414d8778de8c773291a9cf9)"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"int", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C++ TBAA"}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"float", !5, i64 0}
+!9 = !{!10, !10, i64 0}
+!10 = !{!"any pointer", !5, i64 0}
+!11 = !{i64 0, i64 4, !9, i64 0, i64 4, !9, i64 0, i64 4, !9, i64 0, i64 4, !9, i64 0, i64 4, !9}
+!12 = !{!13, !8, i64 0}
+!13 = !{!"_ZTSN10embeddings4FtrlE", !8, i64 0, !8, i64 4, !8, i64 8, !8, i64 12}
+!14 = !{!13, !8, i64 4}
+!15 = !{!13, !8, i64 8}
+!16 = !{!13, !8, i64 12}
+!17 = distinct !{}
+!18 = !{!5, !5, i64 0}
+!19 = distinct !{!19, !20, !21, !22, !23, !24, !25}
+!20 = !{!"llvm.loop.mustprogress"}
+!21 = !{!"llvm.loop.parallel_accesses", !17}
+!22 = !{!"llvm.loop.unroll.disable"}
+!23 = !{!"llvm.loop.vectorize.width", i32 1}
+!24 = !{!"llvm.loop.interleave.count", i32 1}
+!25 = !{!"llvm.loop.vectorize.enable", i1 true}
+!26 = !{!27, !28, i64 0}
+!27 = !{!"_ZTSN10embeddings11MemorySpaceE", !28, i64 0}
+!28 = !{!"_ZTSN10embeddings21SparsecoreMemorySpaceE", !5, i64 0}
+!29 = !{!30, !31, i64 0}
+!30 = !{!"_ZTSN10embeddings9BasicTypeE", !31, i64 0}
+!31 = !{!"_ZTSN10embeddings19SparsecoreBasicTypeE", !5, i64 0}
+!32 = !{!33, !4, i64 12}
+!33 = !{!"_ZTSN10embeddings9BaseArrayE", !34, i64 0, !4, i64 12}
+!34 = !{!"_ZTSN10embeddings11PointerBaseE", !27, i64 0, !30, i64 4, !5, i64 8}
+!35 = !{!31, !31, i64 0}
+!36 = !{!28, !28, i64 0}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/function_call_attr_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/function_call_attr_sc.ll
new file mode 100644
index 0000000..16773ef
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/function_call_attr_sc.ll
@@ -0,0 +1,53 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-TEC-VF
+; RUN: llc < %s -mcpu=sparsecore-scs-vf -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-SCS-VF
+; REQUIRES: tpu
+
+; Test preliminary function call code generation.
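+; The TPU calling convention is requested per function here via the
+; "enable-tpu-abi" attribute (attributes #0 below) rather than via the
+; -tpu-enable-tpu-abi-* flags used by function_call_sc.ll; the checks expect
+; the vector stack/frame pointer setup (spv/fpv) only on the TEC core,
+; presumably because the SCS core is scalar-only.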
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare void @llvm.tpu.init.stack(i32, i32)
+declare i32* @llvm.tpu.inttoptr.i32(i32) nounwind
+
+define void @g2(i32 %x) #0 {
+ %a = call i32* @llvm.tpu.inttoptr.i32(i32 128)
+ store i32 %x, i32* %a, align 4
+ ret void
+}
+
+define void @g1(i32 %x) #0 {
+ call void @g2(i32 %x)
+ ret void
+}
+
+; CHECK-LABEL: f_imm
+; CHECK-TEC-VF: spv = simm.s32 $0x7ff
+; CHECK-TEC-VF: fpv = smov.u32 spv
+; CHECK-SCS-VF-NOT: spv = simm.s32 $0x7ff
+; CHECK-SCS-VF-NOT: fpv = smov.u32 spv
+; CHECK-DAG: sps = simm.s32 $0x1ffff
+; CHECK-DAG: lr = scall.abs g1
+define void @f_imm(i32 %a) #0 {
+ call void @llvm.tpu.init.stack(i32 2047, i32 131071)
+ call void @g1(i32 %a)
+ ret void
+}
+
+attributes #0 = { noinline "enable-tpu-abi" }
+
+!smem.funcs.spill = !{!0, !1, !2}
+!smem.ranges.spill.start = !{!100, !100, !100}
+!smem.ranges.spill.limit = !{!101, !101, !101}
+!tilespmem.funcs.spill = !{!0, !1, !2}
+!tilespmem.ranges.spill.start = !{!100, !100, !100}
+!tilespmem.ranges.spill.limit = !{!101, !101, !101}
+
+!0 = !{void (i32)* @g1}
+!1 = !{void (i32)* @g2}
+!2 = !{void (i32)* @f_imm}
+
+!100 = !{i32 0}
+!101 = !{i32 2048}
\ No newline at end of file
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/function_call_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/function_call_sc.ll
new file mode 100644
index 0000000..0c8a992
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/function_call_sc.ll
@@ -0,0 +1,53 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -tpu-enable-tpu-abi-tec -tpu-enable-tpu-abi-scs | FileCheck %s --check-prefixes=CHECK,CHECK-TEC-VF
+; RUN: llc < %s -mcpu=sparsecore-scs-vf -asm-verbose=false -disable-cgp \
+; RUN: -tpu-enable-tpu-abi-tec -tpu-enable-tpu-abi-scs | FileCheck %s --check-prefixes=CHECK,CHECK-SCS-VF
+; REQUIRES: tpu
+
+; Test preliminary function call code generation.
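+; Same call sequence as function_call_attr_sc.ll, but here the TPU ABI is
+; enabled through the -tpu-enable-tpu-abi-tec / -tpu-enable-tpu-abi-scs flags
+; on the RUN lines instead of a per-function attribute.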
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare void @llvm.tpu.init.stack(i32, i32)
+declare i32* @llvm.tpu.inttoptr.i32(i32) nounwind
+
+define void @g2(i32 %x) #0 {
+ %a = call i32* @llvm.tpu.inttoptr.i32(i32 128)
+ store i32 %x, i32* %a, align 4
+ ret void
+}
+
+define void @g1(i32 %x) #0 {
+ call void @g2(i32 %x)
+ ret void
+}
+
+; CHECK-LABEL: f_imm
+; CHECK-TEC-VF: spv = simm.s32 $0x7ff
+; CHECK-TEC-VF: fpv = smov.u32 spv
+; CHECK-SCS-VF-NOT: spv = simm.s32 $0x7ff
+; CHECK-SCS-VF-NOT: fpv = smov.u32 spv
+; CHECK-DAG: sps = simm.s32 $0x1ffff
+; CHECK-DAG: lr = scall.abs g1
+define void @f_imm(i32 %a) #0 {
+ call void @llvm.tpu.init.stack(i32 2047, i32 131071)
+ call void @g1(i32 %a)
+ ret void
+}
+
+attributes #0 = { noinline }
+
+!smem.funcs.spill = !{!0, !1, !2}
+!smem.ranges.spill.start = !{!100, !100, !100}
+!smem.ranges.spill.limit = !{!101, !101, !101}
+!tilespmem.funcs.spill = !{!0, !1, !2}
+!tilespmem.ranges.spill.start = !{!100, !100, !100}
+!tilespmem.ranges.spill.limit = !{!101, !101, !101}
+
+!0 = !{void (i32)* @g1}
+!1 = !{void (i32)* @g2}
+!2 = !{void (i32)* @f_imm}
+
+!100 = !{i32 0}
+!101 = !{i32 2048}
\ No newline at end of file
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/function_call_stack_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/function_call_stack_sc.ll
new file mode 100644
index 0000000..2d9a38a
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/function_call_stack_sc.ll
@@ -0,0 +1,105 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -tpu-enable-tpu-abi-tec -tpu-enable-tpu-abi-scs | FileCheck %s
+; REQUIRES: tpu
+
+; Test preliminary function call code generation, stack pointer adjustment,
+; and parameter passing.
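+; g1 takes more arguments than fit in registers, so the checks below verify
+; the callee-side prologue/epilogue (lr, fpv and fps saved to the scalar
+; stack, frame pointers established from sps/spv) and that the stack-passed
+; scalar and vector arguments are read relative to fps (smem) and fpv
+; (tilespmem). f_main checks the matching caller-side argument stores and the
+; sps/spv adjustments around the call.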
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare void @g2(i32, i32, <8 x i32>, <8 x i32>);
+
+; CHECK-LABEL: g1
+; CHECK: [smem:sps] = sst lr
+; CHECK: lr = sadd.s32 $-0x1, sps
+; CHECK: [smem:lr] = sst fpv
+; CHECK: lr = sadd.s32 $-0x2, sps
+; CHECK: fpv = smov.u32 spv
+; CHECK: v1 = vld [tilespmem:fpv+$0x8]
+; CHECK: [smem:lr] = sst fps
+; CHECK: fps = smov.u32 sps
+; CHECK: v0 = vld [tilespmem:fpv+$0x0]
+; CHECK: s1 = sld [smem:fps+$0x9]
+; CHECK: s0 = sld [smem:fps+$0x8]
+; CHECK: lr = scall.abs g2
+; CHECK: sps = sadd.s32 $-0x3, sps
+; FIXME(b/237788792): Can be optimized.
+; CHECK: sps = sadd.s32 $-0x1, sps
+; CHECK: lr = sld [smem:fps+$0x0]
+; CHECK: sps = sadd.s32 $0x1, sps
+; CHECK: (pc) = sbr.ind lr
+; CHECK: spv = smov.u32 fpv
+; CHECK: fpv = sld [smem:fps+$-0x1]
+; FIXME(b/237788792): Can be optimized.
+; CHECK: sps = sadd.s32 $0x3, sps
+; CHECK: sps = smov.u32 fps
+; CHECK: fps = sld [smem:fps+$-0x2]
+define void @g1(i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5, i32 %p6, i32 %p7, i32 %p8,
+ i32 %p9, i32 %p10, i32 %p11, i32 %p12, i32 %p13, i32 %p14, i32 %p15, i32 %p16,
+ i32 %p17, i32 %p18, i32 %p19, i32 %p20, i32 %p21, i32 %p22, i32 %p23, i32 %p24,
+ i32 %p25, i32 %p26, i32 %p27, i32 %p28, i32 %p29,
+ <8 x i32> %p30, <8 x i32> %p31, <8 x i32> %p32, <8 x i32> %p33, <8 x i32> %p34, <8 x i32> %p35, <8 x i32> %p36, <8 x i32> %p37,
+ <8 x i32> %p38, <8 x i32> %p39, <8 x i32> %p40, <8 x i32> %p41, <8 x i32> %p42, <8 x i32> %p43, <8 x i32> %p44, <8 x i32> %p45,
+ <8 x i32> %p46, <8 x i32> %p47, <8 x i32> %p48, <8 x i32> %p49, <8 x i32> %p50, <8 x i32> %p51, <8 x i32> %p52, <8 x i32> %p53,
+ <8 x i32> %p54, <8 x i32> %p55, <8 x i32> %p56, <8 x i32> %p57, <8 x i32> %p58, <8 x i32> %p59, <8 x i32> %p60,
+ <8 x i32> %p61, <8 x i32> %p62, <8 x i32> %p63, <8 x i32> %p64, <8 x i32> %p65, <8 x i32> %p66, <8 x i32> %p67, <8 x i32> %p68,
+ <8 x i32> %p69, <8 x i32> %p70, <8 x i32> %p71, <8 x i32> %p72, <8 x i32> %p73, <8 x i32> %p74, <8 x i32> %p75, <8 x i32> %p76,
+ <8 x i32> %p77, <8 x i32> %p78, <8 x i32> %p79) {
+ call void @g2(i32 %p28, i32 %p29, <8 x i32> %p78, <8 x i32> %p79)
+ ret void
+}
+
+; CHECK-LABEL: f_main
+; FIXME(b/237788792): Any future spill locations need to be frame pointer relative.
+; CHECK-NOT: [smem:sps+$0x0] = sst lr
+; CHECK-DAG: s[[param0_loc:[0-9]+]] = sadd.s32 $0x9, sps
+; CHECK-DAG: [smem:s[[param0_loc]]] = sst s1
+; CHECK-DAG: s[[param1_loc:[0-9]+]] = sadd.s32 $0x8, sps
+; CHECK-DAG: [smem:s[[param1_loc]]] = sst s0
+; CHECK-DAG: [tilespmem:spv+$0x8] = vst v{{[0-9]+}}
+; CHECK-DAG: sps = sadd.s32 $-0xa, sps
+; CHECK-DAG: spv = sadd.s32 $-0x10, spv
+; CHECK: lr = scall.abs g1
+; CHECK-DAG: [tilespmem:spv+$0x0] = vst v{{[0-9]+}}
+; CHECK-NOT: lr = sld [smem:$0x0]
+; CHECK-DAG: sps = sadd.s32 $0xa, sps
+; CHECK-DAG: spv = sadd.s32 $0x10, spv
+; CHECK: shalt
+define void @f_main(i32 %ms0, i32 %ms1, i32 %xs, i32 %ys, <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %mv0, <8 x i32> %mv1) {
+ call void @g1(i32 %xs, i32 %ys, i32 %xs, i32 %ys, i32 %xs, i32 %ys, i32 %xs, i32 %ys, i32 %xs, i32 %ys,
+ i32 %xs, i32 %ys, i32 %xs, i32 %ys, i32 %xs, i32 %ys, i32 %xs, i32 %ys, i32 %xs, i32 %ys,
+ i32 %xs, i32 %ys, i32 %xs, i32 %ys, i32 %xs, i32 %ys, i32 %xs, i32 %ms0, i32 %ms1,
+ <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv,
+ <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv,
+ <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv,
+ <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv,
+ <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv,
+ <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv,
+ <8 x i32> %mv0, <8 x i32> %mv1)
+ ret void
+}
+
+!smem.funcs.spill = !{!0, !1, !2}
+!smem.ranges.spill.start = !{!100, !100, !100}
+!smem.ranges.spill.limit = !{!101, !101, !101}
+!tilespmem.funcs.spill = !{!0, !1, !2}
+!tilespmem.ranges.spill.start = !{!100, !100, !100}
+!tilespmem.ranges.spill.limit = !{!101, !101, !101}
+
+!0 = !{void (i32, i32, i32, i32, i32, i32, i32, i32,
+ i32, i32, i32, i32, i32, i32, i32, i32,
+ i32, i32, i32, i32, i32, i32, i32, i32,
+ i32, i32, i32, i32, i32,
+ <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>,
+ <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>,
+ <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>,
+ <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>,
+ <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>,
+ <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>,
+ <8 x i32>, <8 x i32>)* @g1}
+!1 = !{void (i32, i32, <8 x i32>, <8 x i32>)* @g2}
+!2 = !{void (i32, i32, i32, i32, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>)* @f_main}
+
+!100 = !{i32 0}
+!101 = !{i32 2048}
\ No newline at end of file
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/function_call_stack_spill_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/function_call_stack_spill_sc.ll
new file mode 100644
index 0000000..15adac1
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/function_call_stack_spill_sc.ll
@@ -0,0 +1,125 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -tpu-enable-tpu-abi-tec -tpu-enable-tpu-abi-scs -tpu-enable-spill-debug \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+; Test preliminary function call code generation, stack pointer adjustment,
+; and parameter passing in the presence of spill slots.
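+; The llvm.tpu.spill.debug.* calls (with -tpu-enable-spill-debug) force extra
+; scalar and vector spill slots around the call, so the checks can verify that
+; spills are addressed relative to the frame pointers (fps for smem, fpv for
+; tilespmem) rather than the moving stack pointers.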
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare void @g2(i32, i32, <8 x i32>, <8 x i32>);
+declare i32* @llvm.tpu.inttoptr.p0i32(i32)
+declare void @llvm.tpu.spill.debug.i32(i32)
+declare void @llvm.tpu.spill.debug.v8i32(<8 x i32>)
+
+; CHECK-LABEL: g1
+; CHECK: [smem:sps] = sst lr
+; CHECK: lr = sadd.s32 $-0x1, sps
+; CHECK: [smem:lr] = sst fpv
+; CHECK: lr = sadd.s32 $-0x2, sps
+; CHECK: fpv = smov.u32 spv
+; CHECK: v0 = vld [tilespmem:fpv+$0x0]
+; CHECK: [smem:lr] = sst fps
+; CHECK: fps = smov.u32 sps
+; CHECK: v1 = vld [tilespmem:fpv+$0x8]
+; CHECK: s0 = sld [smem:fps+$0x8]
+; CHECK: s1 = sld [smem:fps+$0x9]
+; CHECK: lr = scall.abs g2
+; CHECK: s[[sspill1:[0-9]+]] = simm.s32 $0xe9
+; CHECK: v[[vspill1:[0-9]+]] = vlaneseq.u32
+; CHECK: sps = sadd.s32 $-0x5, sps
+; CHECK: lr = sadd.s32 $-0x3, fps
+; CHECK: spv = sadd.s32 $-0x10, spv
+; FIXME(b/237788792): Can be optimized.
+; CHECK: [tilespmem:fpv+$0x0] = vst v[[vspill1]]
+; CHECK: sps = sadd.s32 $-0x1, sps
+; CHECK: [smem:lr] = sst s[[sspill1]] }
+; CHECK: s[[sspill2:[0-9]+]] = simm.s32 $0xea
+; CHECK: lr = sadd.s32 $-0x4, fps
+; CHECK: [smem:lr] = sst s[[sspill2]]
+; CHECK: lr = sld [smem:fps+$0x0]
+; CHECK: sps = sadd.s32 $0x1, sps
+; CHECK: v[[vspill2:[0-9]+]] = vlaneseq.u32
+; FIXME(b/237788792): Can be optimized.
+; CHECK: spv = sadd.s32 $0x10, spv
+; CHECK: (pc) = sbr.ind lr
+; CHECK: [tilespmem:fpv+$-0x8] = vst v[[vspill2]]
+; FIXME(b/237788792): Can be optimized.
+; CHECK: sps = sadd.s32 $0x5, sps
+; CHECK: spv = smov.u32 fpv
+; CHECK: fpv = sld [smem:fps+$-0x1]
+; CHECK: sps = smov.u32 fps
+; CHECK: fps = sld [smem:fps+$-0x2]
+define void @g1(i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5, i32 %p6, i32 %p7, i32 %p8,
+ i32 %p9, i32 %p10, i32 %p11, i32 %p12, i32 %p13, i32 %p14, i32 %p15, i32 %p16,
+ i32 %p17, i32 %p18, i32 %p19, i32 %p20, i32 %p21, i32 %p22, i32 %p23, i32 %p24,
+ i32 %p25, i32 %p26, i32 %p27, i32 %p28, i32 %p29,
+ <8 x i32> %p30, <8 x i32> %p31, <8 x i32> %p32, <8 x i32> %p33, <8 x i32> %p34, <8 x i32> %p35, <8 x i32> %p36, <8 x i32> %p37,
+ <8 x i32> %p38, <8 x i32> %p39, <8 x i32> %p40, <8 x i32> %p41, <8 x i32> %p42, <8 x i32> %p43, <8 x i32> %p44, <8 x i32> %p45,
+ <8 x i32> %p46, <8 x i32> %p47, <8 x i32> %p48, <8 x i32> %p49, <8 x i32> %p50, <8 x i32> %p51, <8 x i32> %p52, <8 x i32> %p53,
+ <8 x i32> %p54, <8 x i32> %p55, <8 x i32> %p56, <8 x i32> %p57, <8 x i32> %p58, <8 x i32> %p59, <8 x i32> %p60,
+ <8 x i32> %p61, <8 x i32> %p62, <8 x i32> %p63, <8 x i32> %p64, <8 x i32> %p65, <8 x i32> %p66, <8 x i32> %p67, <8 x i32> %p68,
+ <8 x i32> %p69, <8 x i32> %p70, <8 x i32> %p71, <8 x i32> %p72, <8 x i32> %p73, <8 x i32> %p74, <8 x i32> %p75, <8 x i32> %p76,
+ <8 x i32> %p77, <8 x i32> %p78, <8 x i32> %p79) {
+ tail call void @llvm.tpu.spill.debug.i32(i32 233)
+ tail call void @llvm.tpu.spill.debug.v8i32(<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>)
+ call void @g2(i32 %p28, i32 %p29, <8 x i32> %p78, <8 x i32> %p79)
+ tail call void @llvm.tpu.spill.debug.i32(i32 234)
+ tail call void @llvm.tpu.spill.debug.v8i32(<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>)
+ ret void
+}
+
+; CHECK-LABEL: f_main
+; CHECK-NOT: [smem:sps+$0x0] = sst lr
+; CHECK: sps = sadd.s32 $-0xa, sps
+; CHECK: spv = sadd.s32 $-0x10, spv
+; CHECK-DAG: s[[param0_loc:[0-9]+]] = sadd.s32 $0x8, sps
+; CHECK-DAG: s[[param1_loc:[0-9]+]] = sadd.s32 $0x9, sps
+; CHECK-DAG: [smem:s[[param1_loc]]] = sst s1
+; CHECK-DAG: [smem:s[[param0_loc]]] = sst s0
+; CHECK-DAG: [tilespmem:spv+$0x8] = vst v{{[0-9]+}}
+; CHECK: lr = scall.abs g1
+; CHECK: [tilespmem:spv+$0x0] = vst v{{[0-9]+}}
+; CHECK-NOT: lr = sld [smem:$0x0]
+; CHECK-DAG: sps = sadd.s32 $0xa, sps
+; CHECK-DAG: spv = sadd.s32 $0x10, spv
+; CHECK: shalt
+define void @f_main(i32 %ms0, i32 %ms1, i32 %xs, i32 %ys, <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %mv0, <8 x i32> %mv1) {
+ call void @g1(i32 %xs, i32 %ys, i32 %xs, i32 %ys, i32 %xs, i32 %ys, i32 %xs, i32 %ys, i32 %xs, i32 %ys,
+ i32 %xs, i32 %ys, i32 %xs, i32 %ys, i32 %xs, i32 %ys, i32 %xs, i32 %ys, i32 %xs, i32 %ys,
+ i32 %xs, i32 %ys, i32 %xs, i32 %ys, i32 %xs, i32 %ys, i32 %xs, i32 %ms0, i32 %ms1,
+ <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv,
+ <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv,
+ <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv,
+ <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv,
+ <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv,
+ <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv, <8 x i32> %xv, <8 x i32> %yv,
+ <8 x i32> %mv0, <8 x i32> %mv1)
+ ret void
+}
+
+!smem.funcs.spill = !{!0, !1, !2}
+!smem.ranges.spill.start = !{!100, !100, !100}
+!smem.ranges.spill.limit = !{!101, !101, !101}
+!tilespmem.funcs.spill = !{!0, !1, !2}
+!tilespmem.ranges.spill.start = !{!100, !100, !100}
+!tilespmem.ranges.spill.limit = !{!101, !101, !101}
+
+!0 = !{void (i32, i32, i32, i32, i32, i32, i32, i32,
+ i32, i32, i32, i32, i32, i32, i32, i32,
+ i32, i32, i32, i32, i32, i32, i32, i32,
+ i32, i32, i32, i32, i32,
+ <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>,
+ <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>,
+ <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>,
+ <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>,
+ <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>,
+ <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>,
+ <8 x i32>, <8 x i32>)* @g1}
+!1 = !{void (i32, i32, <8 x i32>, <8 x i32>)* @g2}
+!2 = !{void (i32, i32, i32, i32, <8 x i32>, <8 x i32>, <8 x i32>, <8 x i32>)* @f_main}
+
+!100 = !{i32 0}
+!101 = !{i32 2048}
\ No newline at end of file
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/high_pressure2_bc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/high_pressure2_bc.ll
new file mode 100644
index 0000000..2be337b
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/high_pressure2_bc.ll
@@ -0,0 +1,117 @@
+; RUN: llc < %s -march=googletpu -mcpu=barnacore-cc-pf -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+; Tests that the following function, which has very high register pressure,
+; compiles. Without register pressure detection, it fails to allocate
+; registers on Barnacore and tries to spill.
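+; The single barnacore loop below (llvm.tpu.bc.loop.start/llvm.tpu.bc.loop.end)
+; keeps many <8 x float> temporaries live at once; only the function label is
+; checked, so the test passes as long as code generation succeeds without
+; needing to spill.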
+
+; CHECK-LABEL: high_pressure2
+
+define void @high_pressure2(<8 x float> addrspace(207)* %0, <8 x float> addrspace(207)* %1, float %2, i32 %3) {
+entry:
+ call void @llvm.tpu.bc.loop.start(i32 0)
+ br label %loop-start
+
+loop-start: ; preds = %loop-start, %entry
+ %4 = call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* %0, i32 1, i32 3)
+ %5 = bitcast <8 x float> %4 to <8 x i32>
+ %6 = icmp ne <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> <float 0x36A0000000000000, float 0x36A0000000000000, float 0x36A0000000000000, float 0x36A0000000000000, float 0x36A0000000000000, float 0x36A0000000000000, float 0x36A0000000000000, float 0x36A0000000000000>, <8 x float> addrspace(207)* %0, i32 1, i32 3)
+ %7 = call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* %0, i32 1, i32 1)
+ %8 = fmul <8 x float> %4, %4
+ %9 = bitcast <8 x float> %8 to <8 x i32>
+ %10 = bitcast <8 x i32> %9 to <8 x float>
+ %11 = fsub <8 x float> %10, %7
+ %12 = bitcast <8 x float> %11 to <8 x i32>
+ %13 = bitcast <8 x i32> %12 to <8 x float>
+ %14 = fmul <8 x float> %13, <float 7.500000e-01, float 7.500000e-01, float 7.500000e-01, float 7.500000e-01, float 7.500000e-01, float 7.500000e-01, float 7.500000e-01, float 7.500000e-01>
+ %15 = bitcast <8 x float> %14 to <8 x i32>
+ %16 = bitcast <8 x i32> %15 to <8 x float>
+ %17 = fadd <8 x float> %7, %16
+ %18 = bitcast <8 x float> %17 to <8 x i32>
+ %19 = bitcast <8 x float> %7 to <8 x i32>
+ %20 = select <8 x i1> %6, <8 x i32> %18, <8 x i32> %19
+ %21 = bitcast <8 x i32> %20 to <8 x float>
+ call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> %21, <8 x float> addrspace(207)* %0, i32 1, i32 1)
+ %22 = bitcast <8 x i32> %18 to <8 x float>
+ %23 = fadd <8 x float> %22, <float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000>
+ %24 = bitcast <8 x float> %23 to <8 x i32>
+ %25 = call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* %0, i32 1, i32 2)
+ %26 = fadd <8 x float> %25, <float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000>
+ %27 = bitcast <8 x float> %26 to <8 x i32>
+ %28 = bitcast <8 x i32> %24 to <8 x float>
+ %29 = bitcast <8 x i32> %27 to <8 x float>
+ %30 = fmul <8 x float> %28, %29
+ %31 = bitcast <8 x float> %30 to <8 x i32>
+ %32 = bitcast <8 x i32> %31 to <8 x float>
+ %33 = call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %32)
+ %34 = bitcast <8 x i32> %27 to <8 x float>
+ %35 = fmul <8 x float> %4, %34
+ %36 = bitcast <8 x float> %35 to <8 x i32>
+ %37 = bitcast <8 x i32> %36 to <8 x float>
+ %38 = fmul <8 x float> %37, %33
+ %39 = bitcast <8 x float> %38 to <8 x i32>
+ %40 = bitcast <8 x i32> %39 to <8 x float>
+ %41 = bitcast <8 x i32> %39 to <8 x float>
+ %42 = fmul <8 x float> %40, %41
+ %43 = bitcast <8 x float> %42 to <8 x i32>
+ %44 = bitcast <8 x i32> %39 to <8 x float>
+ %45 = fmul <8 x float> %44, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
+ %46 = bitcast <8 x float> %45 to <8 x i32>
+ %47 = bitcast <8 x i32> %43 to <8 x float>
+ %48 = fsub <8 x float> %47, %25
+ %49 = bitcast <8 x float> %48 to <8 x i32>
+ %50 = bitcast <8 x i32> %49 to <8 x float>
+ %51 = fmul <8 x float> %50, <float 7.500000e-01, float 7.500000e-01, float 7.500000e-01, float 7.500000e-01, float 7.500000e-01, float 7.500000e-01, float 7.500000e-01, float 7.500000e-01>
+ %52 = bitcast <8 x float> %51 to <8 x i32>
+ %53 = bitcast <8 x i32> %52 to <8 x float>
+ %54 = fadd <8 x float> %25, %53
+ %55 = bitcast <8 x float> %54 to <8 x i32>
+ %56 = bitcast <8 x float> %25 to <8 x i32>
+ %57 = select <8 x i1> %6, <8 x i32> %55, <8 x i32> %56
+ %58 = bitcast <8 x i32> %57 to <8 x float>
+ call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> %58, <8 x float> addrspace(207)* %0, i32 1, i32 2)
+ %59 = call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* %0, i32 1, i32 0)
+ %60 = fmul <8 x float> %59, <float 0x3FECCCCCC0000000, float 0x3FECCCCCC0000000, float 0x3FECCCCCC0000000, float 0x3FECCCCCC0000000, float 0x3FECCCCCC0000000, float 0x3FECCCCCC0000000, float 0x3FECCCCCC0000000, float 0x3FECCCCCC0000000>
+ %61 = bitcast <8 x float> %60 to <8 x i32>
+ %62 = bitcast <8 x i32> %61 to <8 x float>
+ %63 = bitcast <8 x i32> %46 to <8 x float>
+ %64 = fsub <8 x float> %62, %63
+ %65 = bitcast <8 x float> %64 to <8 x i32>
+ %66 = bitcast <8 x float> %59 to <8 x i32>
+ %67 = select <8 x i1> %6, <8 x i32> %65, <8 x i32> %66
+ %68 = bitcast <8 x i32> %67 to <8 x float>
+ call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> %68, <8 x float> addrspace(207)* %0, i32 1, i32 0)
+ %69 = call i1 @llvm.tpu.bc.loop.end()
+ br i1 %69, label %loop-start, label %loop-out
+
+loop-out: ; preds = %loop-start
+ ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.tpu.bc.loop.start(i32 immarg) #1
+
+; Function Attrs: argmemonly nounwind readonly
+declare <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)*, i32 immarg, i32 immarg) #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.tpu.bc.store.aliaddr.flm(<8 x float>, <8 x float> addrspace(207)*, i32 immarg, i32 immarg) #3
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float>) #4
+
+; Function Attrs: inaccessiblememonly nounwind
+declare i1 @llvm.tpu.bc.loop.end() #5
+
+attributes #0 = { "is-tlp-function" }
+attributes #1 = { nounwind }
+attributes #2 = { argmemonly nounwind readonly }
+attributes #3 = { argmemonly nounwind }
+attributes #4 = { nounwind readnone speculatable willreturn }
+attributes #5 = { inaccessiblememonly nounwind }
+
+!llvm.dbg.cu = !{!0}
+
+!0 = distinct !DICompileUnit(language: 34952, file: !1, producer: "llo2llvm", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "LloTlpModule", directory: "/")
+!2 = !{}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/high_pressure_bc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/high_pressure_bc.ll
new file mode 100644
index 0000000..d5c4938
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/high_pressure_bc.ll
@@ -0,0 +1,93 @@
+; RUN: llc < %s -march=googletpu -mcpu=barnacore-cc-pf -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+; Tests that the following high-register-pressure function compiles. Without register
+; pressure detection, register allocation fails on Barnacore and tries to spill.
+
+; CHECK-LABEL: high_pressure
+define void @high_pressure(<8 x float> addrspace(207)* %0, <8 x float> addrspace(207)* %1, float %2, i32 %3) {
+entry:
+ call void @llvm.tpu.bc.loop.start(i32 0)
+ br label %loop-start
+
+loop-start: ; preds = %loop-start, %entry
+ %4 = call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* %0, i32 1, i32 3)
+ %5 = bitcast <8 x float> %4 to <8 x i32>
+ %6 = icmp ne <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> <float 0x36A0000000000000, float 0x36A0000000000000, float 0x36A0000000000000, float 0x36A0000000000000, float 0x36A0000000000000, float 0x36A0000000000000, float 0x36A0000000000000, float 0x36A0000000000000>, <8 x float> addrspace(207)* %0, i32 1, i32 3)
+ %7 = call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* %0, i32 1, i32 0)
+ %8 = fmul <8 x float> %7, <float 0x3FECCCCCC0000000, float 0x3FECCCCCC0000000, float 0x3FECCCCCC0000000, float 0x3FECCCCCC0000000, float 0x3FECCCCCC0000000, float 0x3FECCCCCC0000000, float 0x3FECCCCCC0000000, float 0x3FECCCCCC0000000>
+ %9 = bitcast <8 x float> %8 to <8 x i32>
+ %10 = call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* %0, i32 1, i32 1)
+ %11 = call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* %0, i32 1, i32 2)
+ %12 = fmul <8 x float> %4, %4
+ %13 = bitcast <8 x float> %12 to <8 x i32>
+ %14 = bitcast <8 x i32> %13 to <8 x float>
+ %15 = fsub <8 x float> %14, %10
+ %16 = bitcast <8 x float> %15 to <8 x i32>
+ %17 = bitcast <8 x i32> %16 to <8 x float>
+ %18 = fmul <8 x float> %17, <float 0x3FC9999980000000, float 0x3FC9999980000000, float 0x3FC9999980000000, float 0x3FC9999980000000, float 0x3FC9999980000000, float 0x3FC9999980000000, float 0x3FC9999980000000, float 0x3FC9999980000000>
+ %19 = bitcast <8 x float> %18 to <8 x i32>
+ %20 = bitcast <8 x i32> %19 to <8 x float>
+ %21 = fadd <8 x float> %10, %20
+ %22 = bitcast <8 x float> %21 to <8 x i32>
+ %23 = bitcast <8 x float> %10 to <8 x i32>
+ %24 = select <8 x i1> %6, <8 x i32> %22, <8 x i32> %23
+ %25 = bitcast <8 x i32> %24 to <8 x float>
+ call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> %25, <8 x float> addrspace(207)* %0, i32 1, i32 1)
+ %26 = bitcast <8 x i32> %22 to <8 x float>
+ %27 = fadd <8 x float> %26, <float 0x3F847AE140000000, float 0x3F847AE140000000, float 0x3F847AE140000000, float 0x3F847AE140000000, float 0x3F847AE140000000, float 0x3F847AE140000000, float 0x3F847AE140000000, float 0x3F847AE140000000>
+ %28 = bitcast <8 x float> %27 to <8 x i32>
+ %29 = bitcast <8 x i32> %28 to <8 x float>
+ %30 = call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %29)
+ %31 = fmul <8 x float> %11, <float 0x3FE6666660000000, float 0x3FE6666660000000, float 0x3FE6666660000000, float 0x3FE6666660000000, float 0x3FE6666660000000, float 0x3FE6666660000000, float 0x3FE6666660000000, float 0x3FE6666660000000>
+ %32 = bitcast <8 x float> %31 to <8 x i32>
+ %33 = fmul <8 x float> %4, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
+ %34 = bitcast <8 x float> %33 to <8 x i32>
+ %35 = bitcast <8 x i32> %34 to <8 x float>
+ %36 = fmul <8 x float> %35, %30
+ %37 = bitcast <8 x float> %36 to <8 x i32>
+ %38 = bitcast <8 x i32> %32 to <8 x float>
+ %39 = bitcast <8 x i32> %37 to <8 x float>
+ %40 = fadd <8 x float> %38, %39
+ %41 = bitcast <8 x float> %40 to <8 x i32>
+ %42 = bitcast <8 x float> %11 to <8 x i32>
+ %43 = select <8 x i1> %6, <8 x i32> %41, <8 x i32> %42
+ %44 = bitcast <8 x i32> %43 to <8 x float>
+ call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> %44, <8 x float> addrspace(207)* %0, i32 1, i32 2)
+ %45 = bitcast <8 x i32> %9 to <8 x float>
+ %46 = bitcast <8 x i32> %41 to <8 x float>
+ %47 = fsub <8 x float> %45, %46
+ %48 = bitcast <8 x float> %47 to <8 x i32>
+ %49 = bitcast <8 x float> %7 to <8 x i32>
+ %50 = select <8 x i1> %6, <8 x i32> %48, <8 x i32> %49
+ %51 = bitcast <8 x i32> %50 to <8 x float>
+ call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> %51, <8 x float> addrspace(207)* %0, i32 1, i32 0)
+ %52 = call i1 @llvm.tpu.bc.loop.end()
+ br i1 %52, label %loop-start, label %loop-out
+
+loop-out: ; preds = %loop-start
+ ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.tpu.bc.loop.start(i32 immarg) #1
+
+; Function Attrs: argmemonly nounwind readonly
+declare <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)*, i32 immarg, i32 immarg) #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.tpu.bc.store.aliaddr.flm(<8 x float>, <8 x float> addrspace(207)*, i32 immarg, i32 immarg) #3
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float>) #4
+
+; Function Attrs: inaccessiblememonly nounwind
+declare i1 @llvm.tpu.bc.loop.end() #5
+
+attributes #0 = { "is-tlp-function" }
+attributes #1 = { nounwind }
+attributes #2 = { argmemonly nounwind readonly }
+attributes #3 = { argmemonly nounwind }
+attributes #4 = { nounwind readnone speculatable willreturn }
+attributes #5 = { inaccessiblememonly nounwind }
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/iar_ldst_aliasing.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/iar_ldst_aliasing.ll
new file mode 100644
index 0000000..07dab44
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/iar_ldst_aliasing.ll
@@ -0,0 +1,21 @@
+; RUN: opt -S -O2 -mcpu=tensorcore-vf < %s | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 @llvm.tpu.set.lane.indexed(<1024 x i32>, i32)
+declare void @llvm.tpu.vst.indexed.i32(<1024 x i32>, <1024 x i32> addrspace(205)*, i32, i32, <1024 x i1>, i32, i32)
+
+; Test that vst.indexed correctly aliases other memory accesses. The load
+; shouldn't get optimized away. vst.indexed needs to alias both intrinsics that
+; write the iar and normal memory accesses.
+; CHECK-LABEL: st_indexed_alias_memaccess
+; CHECK: load <1024 x i32>, <1024 x i32> addrspace(205)*
+define <1024 x i32> @st_indexed_alias_memaccess(<1024 x i32> addrspace(205)* %ptr1, <1024 x i32> addrspace(205)* %ptr2, <1024 x i32> %data, <1024 x i32> %x, <1024 x i1> %vmask, i32 %iar) {
+entry:
+ store <1024 x i32> %x, <1024 x i32> addrspace(205)* %ptr2
+ call void @llvm.tpu.vst.indexed.i32(<1024 x i32> %data, <1024 x i32> addrspace(205)* %ptr1, i32 255, i32 8, <1024 x i1> %vmask, i32 %iar, i32 1)
+ %res = load <1024 x i32>, <1024 x i32> addrspace(205)* %ptr2
+ ret <1024 x i32> %res
+}
\ No newline at end of file
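
(A minimal sketch of the failure the test above guards against, assuming -O2 treated the indexed store as not aliasing %ptr2: ordinary store-to-load forwarding would then drop the load and return %x directly, which is wrong whenever the indexed store can write through %ptr2. The function name is hypothetical and the declarations are the ones from the test above.)

define <1024 x i32> @st_indexed_alias_memaccess_folded(<1024 x i32> addrspace(205)* %ptr1, <1024 x i32> addrspace(205)* %ptr2, <1024 x i32> %data, <1024 x i32> %x, <1024 x i1> %vmask, i32 %iar) {
entry:
  store <1024 x i32> %x, <1024 x i32> addrspace(205)* %ptr2
  call void @llvm.tpu.vst.indexed.i32(<1024 x i32> %data, <1024 x i32> addrspace(205)* %ptr1, i32 255, i32 8, <1024 x i1> %vmask, i32 %iar, i32 1)
  ; load folded to %x -- incorrect if the indexed store aliases %ptr2
  ret <1024 x i32> %x
}
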
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/intrinsics_gf_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/intrinsics_gf_sc.ll
new file mode 100644
index 0000000..724609b
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/intrinsics_gf_sc.ll
@@ -0,0 +1,139 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-gf -asm-verbose=false \
+; RUN: | FileCheck %s
+; RUN: llc < %s -mcpu=sparsecore-tec-gf -asm-verbose=false \
+; RUN: -opaque-pointers | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <16 x i32> @llvm.tpu.mprefix.v16i32(<16 x i1>)
+declare i32 addrspace(201)* @llvm.tpu.inttoptr.p201i32(i32) nounwind
+declare x86_mmx @llvm.tpu.wrcbreg.tilespmem.base(x86_mmx, i32 addrspace(201)*)
+declare x86_mmx @llvm.tpu.wrcbreg.size(x86_mmx, i32)
+declare x86_mmx @llvm.tpu.wrcbreg.offset(x86_mmx, i32)
+declare i32* @llvm.tpu.rdcbreg.smem.base(x86_mmx)
+declare i32 addrspace(201)* @llvm.tpu.rdcbreg.tilespmem.base(x86_mmx)
+declare i32 @llvm.tpu.rdcbreg.size(x86_mmx)
+declare i32 @llvm.tpu.rdcbreg.offset(x86_mmx)
+declare x86_mmx @llvm.tpu.cbreg.add.offset(x86_mmx, i32)
+declare <16 x i32> @llvm.tpu.sc.permute.v16i32(<16 x i32>, <16 x i32>)
+declare <16 x float> @llvm.tpu.sc.permute.v16f32(<16 x float>, <16 x i32>)
+declare <16 x i32> @llvm.tpu.sc.permute.c2i.v16i32(<16 x i32>)
+declare <16 x i32> @llvm.tpu.sc.permute.i2c.v16i32(<16 x i32>)
+declare <16 x float> @llvm.tpu.sc.permute.c2i.v16f32(<16 x float>)
+declare <16 x float> @llvm.tpu.sc.permute.i2c.v16f32(<16 x float>)
+declare <16 x i1> @llvm.tpu.sc.mask.permute.v8i1(<16 x i1>, <16 x i32>)
+declare <16 x i1> @llvm.tpu.sc.mask.permute.i2c.v8i1(<16 x i1>)
+declare <16 x i1> @llvm.tpu.sc.mask.permute.c2i.v8i1(<16 x i1>)
+
+; CHECK-LABEL: mprefix:
+; CHECK: { v0 = vmprefix.xlane vm0;
+define <16 x i32> @mprefix(<16 x i1> %in) {
+entry:
+ %0 = tail call <16 x i32> @llvm.tpu.mprefix.v16i32(<16 x i1> %in)
+ ret <16 x i32> %0
+}
+
+; CHECK-LABEL: func_rdcbreg_tilespmem_base:
+; CHECK: { s0 = rdcbreg [cbreg:$0x0 metadata:$0x0]
+define i32 @func_rdcbreg_tilespmem_base() {
+ %p = tail call i32 addrspace(201)* @llvm.tpu.rdcbreg.tilespmem.base(x86_mmx undef)
+ %r = ptrtoint i32 addrspace(201)* %p to i32
+ ret i32 %r
+}
+
+; CHECK-LABEL: func_wrcbreg_tilespmem_base_r:
+; CHECK: { [cbreg:$0x0 metadata:$0x0] = wrcbreg s0
+; CHECK-NEXT: { [cbreg:$0x0 metadata:$0x1] = wrcbreg s1
+; CHECK-NEXT: { [cbreg:$0x0 metadata:$0x2] = wrcbreg s2
+define x86_mmx @func_wrcbreg_tilespmem_base_r(i32 addrspace(201)* %b, i32 %s, i32 %o) {
+ %1 = tail call x86_mmx @llvm.tpu.wrcbreg.tilespmem.base(x86_mmx undef, i32 addrspace(201)* %b)
+ %2 = tail call x86_mmx @llvm.tpu.wrcbreg.size(x86_mmx %1, i32 %s)
+ %3 = tail call x86_mmx @llvm.tpu.wrcbreg.offset(x86_mmx %2, i32 %o)
+ ret x86_mmx %3
+}
+
+; CHECK-LABEL: func_wrcbreg_tilespmem_base_i:
+; CHECK: { [cbreg:$0x0 metadata:$0x0] = wrcbreg $0x1
+; CHECK-NEXT: { [cbreg:$0x0 metadata:$0x1] = wrcbreg $0x2
+; CHECK-NEXT: { [cbreg:$0x0 metadata:$0x2] = wrcbreg $0x3
+define x86_mmx @func_wrcbreg_tilespmem_base_i() {
+ %s = call i32 addrspace(201)* @llvm.tpu.inttoptr.p201i32(i32 1)
+ %1 = tail call x86_mmx @llvm.tpu.wrcbreg.tilespmem.base(x86_mmx undef, i32 addrspace(201)* %s)
+ %2 = tail call x86_mmx @llvm.tpu.wrcbreg.size(x86_mmx %1, i32 2)
+ %3 = tail call x86_mmx @llvm.tpu.wrcbreg.offset(x86_mmx %2, i32 3)
+ ret x86_mmx %3
+}
+; CHECK-LABEL: sc_permutei:
+; CHECK: { v0 = vperm.xlane v0, v1
+define <16 x i32> @sc_permutei(<16 x i32> %v0, <16 x i32> %v1) {
+entry:
+ %0 = tail call <16 x i32> @llvm.tpu.sc.permute.v16i32(<16 x i32> %v0, <16 x i32> %v1)
+ ret <16 x i32> %0
+}
+
+; CHECK-LABEL: sc_permutef:
+; CHECK: { v0 = vperm.xlane v0, v1
+define <16 x float> @sc_permutef(<16 x float> %v0, <16 x i32> %v1) {
+entry:
+ %0 = tail call <16 x float> @llvm.tpu.sc.permute.v16f32(<16 x float> %v0, <16 x i32> %v1)
+ ret <16 x float> %0
+}
+
+
+; CHECK-LABEL: sc_c2i_permutei:
+; CHECK: { v0 = vperm.xlane.c2i v0
+define <16 x i32> @sc_c2i_permutei(<16 x i32> %v0, <16 x i32> %v1) {
+entry:
+ %0 = tail call <16 x i32> @llvm.tpu.sc.permute.c2i.v16i32(<16 x i32> %v0)
+ ret <16 x i32> %0
+}
+
+; CHECK-LABEL: sc_c2i_permutef:
+; CHECK: { v0 = vperm.xlane.c2i v0
+define <16 x float> @sc_c2i_permutef(<16 x float> %v0) {
+entry:
+ %0 = tail call <16 x float> @llvm.tpu.sc.permute.c2i.v16f32(<16 x float> %v0)
+ ret <16 x float> %0
+}
+
+; CHECK-LABEL: sc_i2c_permutei:
+; CHECK: { v0 = vperm.xlane.i2c v0
+define <16 x i32> @sc_i2c_permutei(<16 x i32> %v0, <16 x i32> %v1) {
+entry:
+ %0 = tail call <16 x i32> @llvm.tpu.sc.permute.i2c.v16i32(<16 x i32> %v0)
+ ret <16 x i32> %0
+}
+
+; CHECK-LABEL: sc_i2c_permutef:
+; CHECK: { v0 = vperm.xlane.i2c v0
+define <16 x float> @sc_i2c_permutef(<16 x float> %v0) {
+entry:
+ %0 = tail call <16 x float> @llvm.tpu.sc.permute.i2c.v16f32(<16 x float> %v0)
+ ret <16 x float> %0
+}
+
+; CHECK-LABEL: sc_mask_permute:
+; CHECK: { vm0 = vmperm.xlane vm0, v0
+define <16 x i1> @sc_mask_permute(<16 x i1> %m0, <16 x i32> %v0) {
+entry:
+ %0 = tail call <16 x i1> @llvm.tpu.sc.mask.permute.v8i1(<16 x i1> %m0, <16 x i32> %v0)
+ ret <16 x i1> %0
+}
+
+; CHECK-LABEL: sc_c2i_mask_permute:
+; CHECK: { vm0 = vmperm.xlane.c2i vm0
+define <16 x i1> @sc_c2i_mask_permute(<16 x i1> %m0) {
+entry:
+ %0 = tail call <16 x i1> @llvm.tpu.sc.mask.permute.c2i.v8i1(<16 x i1> %m0)
+ ret <16 x i1> %0
+}
+
+; CHECK-LABEL: sc_i2c_mask_permute:
+; CHECK: { vm0 = vmperm.xlane.i2c vm0
+define <16 x i1> @sc_i2c_mask_permute(<16 x i1> %m0) {
+entry:
+ %0 = tail call <16 x i1> @llvm.tpu.sc.mask.permute.i2c.v8i1(<16 x i1> %m0)
+ ret <16 x i1> %0
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/intrinsics_gl_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/intrinsics_gl_sc.ll
new file mode 100644
index 0000000..58525e4
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/intrinsics_gl_sc.ll
@@ -0,0 +1,72 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-gl -asm-verbose=false \
+; RUN: -tpu-enable-vld-vst-idx-add | FileCheck %s
+; RUN: llc < %s -mcpu=sparsecore-tec-gl -asm-verbose=false \
+; RUN: -tpu-enable-vld-vst-idx-add -opaque-pointers | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <8 x i32> @llvm.tpu.sc.permute.c2i.v8i32(<8 x i32>)
+declare <8 x i32> @llvm.tpu.sc.permute.i2c.v8i32(<8 x i32>)
+declare <8 x float> @llvm.tpu.sc.permute.c2i.v8f32(<8 x float>)
+declare <8 x float> @llvm.tpu.sc.permute.i2c.v8f32(<8 x float>)
+declare <8 x i1> @llvm.tpu.sc.mask.permute.v8i1(<8 x i1>, <8 x i32>)
+declare <8 x i1> @llvm.tpu.sc.mask.permute.i2c.v8i1(<8 x i1>)
+declare <8 x i1> @llvm.tpu.sc.mask.permute.c2i.v8i1(<8 x i1>)
+
+; CHECK-LABEL: sc_c2i_permutei:
+; CHECK: { v0 = vperm.xlane.c2i v0
+define <8 x i32> @sc_c2i_permutei(<8 x i32> %v0, <8 x i32> %v1) {
+entry:
+ %0 = tail call <8 x i32> @llvm.tpu.sc.permute.c2i.v8i32(<8 x i32> %v0)
+ ret <8 x i32> %0
+}
+
+; CHECK-LABEL: sc_c2i_permutef:
+; CHECK: { v0 = vperm.xlane.c2i v0
+define <8 x float> @sc_c2i_permutef(<8 x float> %v0) {
+entry:
+ %0 = tail call <8 x float> @llvm.tpu.sc.permute.c2i.v8f32(<8 x float> %v0)
+ ret <8 x float> %0
+}
+
+; CHECK-LABEL: sc_i2c_permutei:
+; CHECK: { v0 = vperm.xlane.i2c v0
+define <8 x i32> @sc_i2c_permutei(<8 x i32> %v0, <8 x i32> %v1) {
+entry:
+ %0 = tail call <8 x i32> @llvm.tpu.sc.permute.i2c.v8i32(<8 x i32> %v0)
+ ret <8 x i32> %0
+}
+
+; CHECK-LABEL: sc_i2c_permutef:
+; CHECK: { v0 = vperm.xlane.i2c v0
+define <8 x float> @sc_i2c_permutef(<8 x float> %v0) {
+entry:
+ %0 = tail call <8 x float> @llvm.tpu.sc.permute.i2c.v8f32(<8 x float> %v0)
+ ret <8 x float> %0
+}
+
+; CHECK-LABEL: sc_mask_permute:
+; CHECK: { vm0 = vmperm.xlane vm0, v0
+define <8 x i1> @sc_mask_permute(<8 x i1> %m0, <8 x i32> %v0) {
+entry:
+ %0 = tail call <8 x i1> @llvm.tpu.sc.mask.permute.v8i1(<8 x i1> %m0, <8 x i32> %v0)
+ ret <8 x i1> %0
+}
+
+; CHECK-LABEL: sc_c2i_mask_permute:
+; CHECK: { vm0 = vmperm.xlane.c2i vm0
+define <8 x i1> @sc_c2i_mask_permute(<8 x i1> %m0) {
+entry:
+ %0 = tail call <8 x i1> @llvm.tpu.sc.mask.permute.c2i.v8i1(<8 x i1> %m0)
+ ret <8 x i1> %0
+}
+
+; CHECK-LABEL: sc_i2c_mask_permute:
+; CHECK: { vm0 = vmperm.xlane.i2c vm0
+define <8 x i1> @sc_i2c_mask_permute(<8 x i1> %m0) {
+entry:
+ %0 = tail call <8 x i1> @llvm.tpu.sc.mask.permute.i2c.v8i1(<8 x i1> %m0)
+ ret <8 x i1> %0
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/intrinsics_opt_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/intrinsics_opt_sc.ll
new file mode 100644
index 0000000..e692961
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/intrinsics_opt_sc.ll
@@ -0,0 +1,40 @@
+; RUN: opt -S -O2 -mcpu=sparsecore-tec-vf < %s | FileCheck %s
+; RUN: opt -S -O2 -mcpu=sparsecore-tec-gl < %s | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 @llvm.tpu.tileid()
+
+; CHECK-LABEL: sc_tileid_opt_lt_true
+; CHECK: ret i1 true
+define i1 @sc_tileid_opt_lt_true() {
+ %t = call i32 @llvm.tpu.tileid()
+ %r = icmp slt i32 %t, 16
+ ret i1 %r
+}
+
+; CHECK-LABEL: sc_tileid_opt_ge_true
+; CHECK: ret i1 true
+define i1 @sc_tileid_opt_ge_true() {
+ %t = call i32 @llvm.tpu.tileid()
+ %r = icmp sge i32 %t, 0
+ ret i1 %r
+}
+
+; CHECK-LABEL: sc_tileid_opt_lt_false
+; CHECK: ret i1 false
+define i1 @sc_tileid_opt_lt_false() {
+ %t = call i32 @llvm.tpu.tileid()
+ %r = icmp slt i32 %t, 0
+ ret i1 %r
+}
+
+; CHECK-LABEL: sc_tileid_opt_gt_false
+; CHECK: ret i1 false
+define i1 @sc_tileid_opt_gt_false() {
+ %t = call i32 @llvm.tpu.tileid()
+ %r = icmp sgt i32 %t, 16
+ ret i1 %r
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/intrinsics_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/intrinsics_sc.ll
new file mode 100644
index 0000000..de8d6ea
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/intrinsics_sc.ll
@@ -0,0 +1,1144 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false \
+; RUN: -tpu-enable-vld-vst-idx-add | FileCheck %s
+; RUN: llc < %s -mcpu=sparsecore-tec-gl -asm-verbose=false \
+; RUN: -tpu-enable-vld-vst-idx-add | FileCheck %s
+; RUN: llc < %s -mcpu=sparsecore-tec-gl -asm-verbose=false \
+; RUN: -tpu-enable-vld-vst-idx-add -opaque-pointers | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <8 x i32> @llvm.tpu.mprefix.v8i32(<8 x i1>)
+
+declare <8 x i32> @llvm.tpu.vld.msk.v8i32.p201v8i32(<8 x i1>, <8 x i32> addrspace(201)*)
+declare <8 x i32> @llvm.tpu.vld.msk.idx.v8i32.p201v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>)
+declare <8 x float> @llvm.tpu.vld.msk.idx.v8f32.p201v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>)
+declare <8 x i32> @llvm.tpu.vld.msk.idx.np.v8i32.p201v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>)
+declare <8 x float> @llvm.tpu.vld.msk.idx.np.v8f32.p201v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>)
+declare i32 @llvm.tpu.sld.cb(x86_mmx, i32)
+declare i32 @llvm.tpu.sld.cb.upd(x86_mmx, i32)
+declare void @llvm.tpu.sst.cb(i32, x86_mmx, i32)
+declare void @llvm.tpu.sst.cb.upd(i32, x86_mmx, i32)
+declare void @llvm.tpu.vst.msk.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>)
+declare void @llvm.tpu.vst.msk.add.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>)
+declare void @llvm.tpu.vst.msk.add.p201v8f32.v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x float>)
+declare void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, <8 x i32>)
+declare void @llvm.tpu.vst.msk.idx.p201v8f32.v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>, <8 x float>)
+declare void @llvm.tpu.vst.msk.idx.add.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, <8 x i32>)
+declare void @llvm.tpu.vst.msk.idx.add.p201v8f32.v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>, <8 x float>)
+declare void @llvm.tpu.vst.msk.idx.add.np.p201v8f32.v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>, <8 x float>)
+declare void @llvm.tpu.vst.msk.idx.add.np.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, <8 x i32>)
+declare i32 @llvm.tpu.inttoptr.p209i32(i32) nounwind
+declare i32 addrspace(201)* @llvm.tpu.inttoptr.p201i32(i32) nounwind
+declare i32* @llvm.tpu.inttoptr.p0i32(i32) nounwind
+declare i32* @llvm.tpu.rdcbreg.smem.base(x86_mmx)
+declare i32 addrspace(201)* @llvm.tpu.rdcbreg.tilespmem.base(x86_mmx)
+declare i32 @llvm.tpu.rdcbreg.size(x86_mmx)
+declare i32 @llvm.tpu.rdcbreg.offset(x86_mmx)
+declare x86_mmx @llvm.tpu.wrcbreg.smem.base(x86_mmx, i32*)
+declare x86_mmx @llvm.tpu.wrcbreg.tilespmem.base(x86_mmx, i32 addrspace(201)*)
+declare x86_mmx @llvm.tpu.wrcbreg.size(x86_mmx, i32)
+declare x86_mmx @llvm.tpu.wrcbreg.offset(x86_mmx, i32)
+declare x86_mmx @llvm.tpu.cbreg.add.offset(x86_mmx, i32)
+declare <8 x i32> @llvm.tpu.vld.cb.msk(<8 x i1>, x86_mmx, i32)
+declare <8 x i32> @llvm.tpu.vld.cb.msk.idx.v8i32(<8 x i1>, x86_mmx, i32, <8 x i32>)
+declare <8 x float> @llvm.tpu.vld.cb.msk.idx.v8f32(<8 x i1>, x86_mmx, i32, <8 x i32>)
+declare <8 x i32> @llvm.tpu.vld.cb.msk.idx.np.v8i32(<8 x i1>, x86_mmx, i32, <8 x i32>)
+declare <8 x float> @llvm.tpu.vld.cb.msk.idx.np.v8f32(<8 x i1>, x86_mmx, i32, <8 x i32>)
+declare void @llvm.tpu.vst.cb.msk.v8i32(<8 x i1>, x86_mmx, i32, <8 x i32>)
+declare void @llvm.tpu.vst.cb.msk.idx.v8i32(<8 x i1>, x86_mmx, i32, <8 x i32>, <8 x i32>)
+declare void @llvm.tpu.vst.cb.msk.idx.v8f32(<8 x i1>, x86_mmx, i32, <8 x i32>, <8 x float>)
+declare void @llvm.tpu.vst.cb.msk.idx.add.v8i32(<8 x i1>, x86_mmx, i32, <8 x i32>, <8 x i32>)
+declare void @llvm.tpu.vst.cb.msk.idx.add.v8f32(<8 x i1>, x86_mmx, i32, <8 x i32>, <8 x float>)
+declare void @llvm.tpu.vst.cb.msk.idx.add.np.v8i32(<8 x i1>, x86_mmx, i32, <8 x i32>, <8 x i32>)
+declare void @llvm.tpu.vst.cb.msk.idx.add.np.v8f32(<8 x i1>, x86_mmx, i32, <8 x i32>, <8 x float>)
+declare <8 x i32> @llvm.tpu.vld.cb.upd.msk(<8 x i1>, x86_mmx, i32)
+declare <8 x i32> @llvm.tpu.vld.msk.strided(<8 x i1>, <8 x i32> addrspace(201)*, i32)
+declare <8 x i32> @llvm.tpu.vld.msk.idx.strided.v8i32.p201v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, i32)
+declare <8 x float> @llvm.tpu.vld.msk.idx.strided.v8f32.p201v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>, i32)
+declare void @llvm.tpu.vst.msk.strided(<8 x i1>, <8 x i32> addrspace(201)*, i32, <8 x i32>)
+declare void @llvm.tpu.vst.msk.add.strided.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, i32, <8 x i32>)
+declare void @llvm.tpu.vst.msk.add.strided.p201v8f32.v8f32(<8 x i1>, <8 x float> addrspace(201)*, i32, <8 x float>)
+declare void @llvm.tpu.vst.msk.idx.strided.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, i32, <8 x i32>)
+declare void @llvm.tpu.vst.msk.idx.strided.p201v8f32.v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>, i32, <8 x float>)
+declare void @llvm.tpu.vst.cb.upd.msk.v8i32(<8 x i1>, x86_mmx, i32, <8 x i32>)
+declare void @llvm.tpu.vst.cb.msk.add.v8i32(<8 x i1>, x86_mmx, i32, <8 x i32>)
+declare void @llvm.tpu.vst.cb.msk.add.v8f32(<8 x i1>, x86_mmx, i32, <8 x float>)
+declare void @llvm.tpu.vst.cb.upd.msk.add.v8i32(<8 x i1>, x86_mmx, i32, <8 x i32>)
+declare void @llvm.tpu.vst.cb.upd.msk.add.v8f32(<8 x i1>, x86_mmx, i32, <8 x float>)
+declare <8 x i32> @llvm.tpu.vst.msk.idx.ret.add.np.v8i32.p201v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, <8 x i32>)
+declare <8 x float> @llvm.tpu.vst.msk.idx.ret.add.np.v8f32.p201v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>, <8 x float>)
+declare <8 x i32> @llvm.tpu.vld.cb.msk.strided(<8 x i1>, x86_mmx, i32, i32)
+declare <8 x i32> @llvm.tpu.vld.cb.upd.msk.strided(<8 x i1>, x86_mmx, i32, i32)
+declare void @llvm.tpu.vst.cb.msk.strided.v8i32(<8 x i1>, x86_mmx, i32, i32, <8 x i32>)
+declare void @llvm.tpu.vst.cb.upd.msk.strided.v8i32(<8 x i1>, x86_mmx, i32, i32, <8 x i32>)
+declare void @llvm.tpu.vst.cb.msk.add.strided.v8i32(<8 x i1>, x86_mmx, i32, i32, <8 x i32>)
+declare void @llvm.tpu.vst.cb.upd.msk.add.strided.v8i32(<8 x i1>, x86_mmx, i32, i32, <8 x i32>)
+declare void @llvm.tpu.vst.cb.msk.add.strided.v8f32(<8 x i1>, x86_mmx, i32, i32, <8 x float>)
+declare void @llvm.tpu.vst.cb.upd.msk.add.strided.v8f32(<8 x i1>, x86_mmx, i32, i32, <8 x float>)
+declare <8 x i32> @llvm.tpu.sc.permute.v8i32(<8 x i32>, <8 x i32>)
+declare <8 x float> @llvm.tpu.sc.permute.v8f32(<8 x float>, <8 x i32>)
+
+; CHECK-LABEL: mprefix:
+; CHECK: { v0 = vmprefix.xlane vm0;
+define <8 x i32> @mprefix(<8 x i1> %in) {
+entry:
+ %0 = tail call <8 x i32> @llvm.tpu.mprefix.v8i32(<8 x i1> %in)
+ ret <8 x i32> %0
+}
+
+; CHECK-LABEL: vldmsk:
+; CHECK: v0 = vld.msk [tilespmem:s0+$0x0], vm0;
+define <8 x i32> @vldmsk(<8 x i32> addrspace(201)* %base, <8 x i1> %m) {
+entry:
+ %0 = tail call <8 x i32> @llvm.tpu.vld.msk.v8i32.p201v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %base)
+ ret <8 x i32> %0
+}
+
+; 256 * 8 (vector size) * 4 (bytes) / 4 (TPU GEP lowering adjust for word) = 0x800.
+; CHECK-LABEL: vldmsk_disp:
+; CHECK: v0 = vld.msk [tilespmem:s0+$0x800], vm0;
+define <8 x i32> @vldmsk_disp(<8 x i32> addrspace(201)* %base, <8 x i1> %m) {
+entry:
+ %b = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %base, i32 256
+ %0 = tail call <8 x i32> @llvm.tpu.vld.msk.v8i32.p201v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %b)
+ ret <8 x i32> %0
+}
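
(Spelled out, the scaling the displacement comments in this file rely on, for a GEP by index N over <8 x i32> in tilespmem:)
  disp_words = N * 8 (lanes) * 4 (bytes/lane) / 4 (bytes/word) = 8 * N
  N = 256  -> 2048 = 0x800   (as in vldmsk_disp above)
  N = 1024 -> 8192 = 0x2000  (as in the *_disp vst tests further down)
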
+
+; CHECK-LABEL: vldmsk_strided_0:
+; CHECK: v0 = vld.msk [tilespmem:s0+$0x0 ss:s1], vm0;
+define <8 x i32> @vldmsk_strided_0(<8 x i32> addrspace(201)* %base, <8 x i1> %m, i32 %s) {
+entry:
+ %0 = tail call <8 x i32> @llvm.tpu.vld.msk.strided(<8 x i1> %m, <8 x i32> addrspace(201)* %base, i32 %s)
+ ret <8 x i32> %0
+}
+
+; 256 * 8 (vector size) * 4 (bytes) / 4 (TPU GEP lowering adjust for word) = 0x800.
+; CHECK-LABEL: vldmsk_disp_strided_0:
+; CHECK: v0 = vld.msk [tilespmem:s0+$0x800 ss:s1], vm0;
+define <8 x i32> @vldmsk_disp_strided_0(<8 x i32> addrspace(201)* %base, <8 x i1> %m, i32 %s) {
+entry:
+ %b = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %base, i32 256
+ %0 = tail call <8 x i32> @llvm.tpu.vld.msk.strided(<8 x i1> %m, <8 x i32> addrspace(201)* %b, i32 %s)
+ ret <8 x i32> %0
+}
+
+; CHECK-LABEL: vldmsk_strided_1:
+; CHECK: v0 = vld.msk [tilespmem:s0+$0x0 ss:$0x8], vm0;
+define <8 x i32> @vldmsk_strided_1(<8 x i32> addrspace(201)* %base, <8 x i1> %m) {
+entry:
+ %0 = tail call <8 x i32> @llvm.tpu.vld.msk.strided(<8 x i1> %m, <8 x i32> addrspace(201)* %base, i32 8)
+ ret <8 x i32> %0
+}
+
+; 256 * 8 (vector size) * 4 (bytes) / 4 (TPU GEP lowering adjust for word) = 0x800.
+; CHECK-LABEL: vldmsk_disp_strided_1:
+; CHECK: v0 = vld.msk [tilespmem:s0+$0x800 ss:$0x8], vm0;
+define <8 x i32> @vldmsk_disp_strided_1(<8 x i32> addrspace(201)* %base, <8 x i1> %m) {
+entry:
+ %b = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %base, i32 256
+ %0 = tail call <8 x i32> @llvm.tpu.vld.msk.strided(<8 x i1> %m, <8 x i32> addrspace(201)* %b, i32 8)
+ ret <8 x i32> %0
+}
+
+; CHECK-LABEL: vldmskidxi:
+; CHECK: v0 = vld.idx.msk [tilespmem:v0+s0+$0x0], vm0;
+define <8 x i32> @vldmskidxi(<8 x i32> addrspace(201)* %base, <8 x i32> %off, <8 x i1> %m) {
+entry:
+ %0 = tail call <8 x i32> @llvm.tpu.vld.msk.idx.v8i32.p201v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %base, <8 x i32> %off)
+ ret <8 x i32> %0
+}
+
+
+; CHECK-LABEL: vldmskidx_stridedi_0:
+; CHECK: v0 = vld.idx.msk [tilespmem:v0+s0+$0x0 ss:s1], vm0;
+define <8 x i32> @vldmskidx_stridedi_0(<8 x i32> addrspace(201)* %base, <8 x i32> %off, <8 x i1> %m, i32 %s) {
+entry:
+ %0 = tail call <8 x i32> @llvm.tpu.vld.msk.idx.strided.v8i32.p201v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %base, <8 x i32> %off, i32 %s)
+ ret <8 x i32> %0
+}
+
+; CHECK-LABEL: vldmskidx_stridedi_1:
+; CHECK: v0 = vld.idx.msk [tilespmem:v0+s0+$0x0 ss:$0x12c], vm0;
+define <8 x i32> @vldmskidx_stridedi_1(<8 x i32> addrspace(201)* %base, <8 x i32> %off, <8 x i1> %m) {
+entry:
+ %0 = tail call <8 x i32> @llvm.tpu.vld.msk.idx.strided.v8i32.p201v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %base, <8 x i32> %off, i32 300)
+ ret <8 x i32> %0
+}
+
+; CHECK-LABEL: vldmskidxf:
+; CHECK: v0 = vld.idx.msk [tilespmem:v0+s0+$0x0], vm0;
+define <8 x float> @vldmskidxf(<8 x float> addrspace(201)* %base, <8 x i32> %off, <8 x i1> %m) {
+entry:
+ %0 = tail call <8 x float> @llvm.tpu.vld.msk.idx.v8f32.p201v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %base, <8 x i32> %off)
+ ret <8 x float> %0
+}
+
+; CHECK-LABEL: vldmskidx_stridedf_0:
+; CHECK: v0 = vld.idx.msk [tilespmem:v0+s0+$0x0 ss:s1], vm0;
+define <8 x float> @vldmskidx_stridedf_0(<8 x float> addrspace(201)* %base, <8 x i32> %off, <8 x i1> %m, i32 %s) {
+entry:
+ %0 = tail call <8 x float> @llvm.tpu.vld.msk.idx.strided.v8f32.p201v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %base, <8 x i32> %off, i32 %s)
+ ret <8 x float> %0
+}
+
+; CHECK-LABEL: vldmskidx_stridedf_1:
+; CHECK: v0 = vld.idx.msk [tilespmem:v0+s0+$0x0 ss:$0x400], vm0;
+define <8 x float> @vldmskidx_stridedf_1(<8 x float> addrspace(201)* %base, <8 x i32> %off, <8 x i1> %m) {
+entry:
+ %0 = tail call <8 x float> @llvm.tpu.vld.msk.idx.strided.v8f32.p201v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %base, <8 x i32> %off, i32 1024)
+ ret <8 x float> %0
+}
+
+; CHECK-LABEL: vstmsk:
+; CHECK: [tilespmem:s0+$0x0] = vst.msk vm0, v0;
+define void @vstmsk(<8 x i32> addrspace(201)* %base, <8 x i32> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.msk.p201v8i32.v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %base, <8 x i32> %val)
+ ret void
+}
+
+; 1024 * 8 (vector size) * 4 (bytes) / 4 (TPU GEP lowering adjust for word) = 0x2000.
+; CHECK-LABEL: vstmsk_disp:
+; CHECK: [tilespmem:s0+$0x2000] = vst.msk vm0, v0;
+define void @vstmsk_disp(<8 x i32> addrspace(201)* %base, <8 x i32> %val, <8 x i1> %m) {
+entry:
+ %b = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %base, i32 1024
+ tail call void @llvm.tpu.vst.msk.p201v8i32.v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %b, <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstmskaddi:
+; CHECK: [tilespmem:s0+$0x0] = vst.add.s32.msk vm0, v0;
+define void @vstmskaddi(<8 x i32> addrspace(201)* %base, <8 x i32> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.msk.add.p201v8i32.v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %base, <8 x i32> %val)
+ ret void
+}
+
+; 1024 * 8 (vector size) * 4 (bytes) / 4 (TPU GEP lowering adjust for word) = 0x2000.
+; CHECK-LABEL: vstmskaddi_disp:
+; CHECK: [tilespmem:s0+$0x2000] = vst.add.s32.msk vm0, v0;
+define void @vstmskaddi_disp(<8 x i32> addrspace(201)* %base, <8 x i32> %val, <8 x i1> %m) {
+entry:
+ %b = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %base, i32 1024
+ tail call void @llvm.tpu.vst.msk.add.p201v8i32.v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %b, <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstmskaddf:
+; CHECK: [tilespmem:s0+$0x0] = vst.add.f32.msk vm0, v0;
+define void @vstmskaddf(<8 x float> addrspace(201)* %base, <8 x float> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.msk.add.p201v8f32.v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %base, <8 x float> %val)
+ ret void
+}
+
+; 1024 * 8 (vector size) * 4 (bytes) / 4 (TPU GEP lowering adjust for word) = 0x2000.
+; CHECK-LABEL: vstmskaddf_disp:
+; CHECK: [tilespmem:s0+$0x2000] = vst.add.f32.msk vm0, v0;
+define void @vstmskaddf_disp(<8 x float> addrspace(201)* %base, <8 x float> %val, <8 x i1> %m) {
+entry:
+ %b = getelementptr inbounds <8 x float>, <8 x float> addrspace(201)* %base, i32 1024
+ tail call void @llvm.tpu.vst.msk.add.p201v8f32.v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %b, <8 x float> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstmsk_strided_0:
+; CHECK: [tilespmem:s0+$0x0 ss:s1] = vst.msk vm0, v0;
+define void @vstmsk_strided_0(<8 x i32> addrspace(201)* %base, <8 x i32> %val, <8 x i1> %m, i32 %s) {
+entry:
+ tail call void @llvm.tpu.vst.msk.strided(<8 x i1> %m, <8 x i32> addrspace(201)* %base, i32 %s, <8 x i32> %val)
+ ret void
+}
+
+; 1024 * 8 (vector size) * 4 (bytes) / 4 (TPU GEP lowering adjust for word) = 0x2000.
+; CHECK-LABEL: vstmsk_disp_strided_0:
+; CHECK: [tilespmem:s0+$0x2000 ss:s1] = vst.msk vm0, v0;
+define void @vstmsk_disp_strided_0(<8 x i32> addrspace(201)* %base, <8 x i32> %val, <8 x i1> %m, i32 %s) {
+entry:
+ %b = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %base, i32 1024
+ tail call void @llvm.tpu.vst.msk.strided(<8 x i1> %m, <8 x i32> addrspace(201)* %b, i32 %s, <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstmsk_strided_1:
+; CHECK: [tilespmem:s0+$0x0 ss:$0x8] = vst.msk vm0, v0;
+define void @vstmsk_strided_1(<8 x i32> addrspace(201)* %base, <8 x i32> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.msk.strided(<8 x i1> %m, <8 x i32> addrspace(201)* %base, i32 8, <8 x i32> %val)
+ ret void
+}
+
+; 1024 * 8 (vector size) * 4 (bytes) / 4 (TPU GEP lowering adjust for word) = 0x2000.
+; CHECK-LABEL: vstmsk_disp_strided_1:
+; CHECK: [tilespmem:s0+$0x2000 ss:$0x400] = vst.msk vm0, v0;
+define void @vstmsk_disp_strided_1(<8 x i32> addrspace(201)* %base, <8 x i32> %val, <8 x i1> %m) {
+entry:
+ %b = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %base, i32 1024
+ tail call void @llvm.tpu.vst.msk.strided(<8 x i1> %m, <8 x i32> addrspace(201)* %b, i32 1024, <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstmskadd_stridedi_0:
+; CHECK: [tilespmem:s0+$0x0 ss:s1] = vst.add.s32.msk vm0, v0;
+define void @vstmskadd_stridedi_0(<8 x i32> addrspace(201)* %base, <8 x i32> %val, <8 x i1> %m, i32 %s) {
+entry:
+ tail call void @llvm.tpu.vst.msk.add.strided.p201v8i32.v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %base, i32 %s, <8 x i32> %val)
+ ret void
+}
+
+; 1024 * 8 (vector size) * 4 (bytes) / 4 (TPU GEP lowering adjust for word) = 0x2000.
+; CHECK-LABEL: vstmskadd_disp_stridedi_0:
+; CHECK: [tilespmem:s0+$0x2000 ss:s1] = vst.add.s32.msk vm0, v0;
+define void @vstmskadd_disp_stridedi_0(<8 x i32> addrspace(201)* %base, <8 x i32> %val, <8 x i1> %m, i32 %s) {
+entry:
+ %b = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %base, i32 1024
+ tail call void @llvm.tpu.vst.msk.add.strided.p201v8i32.v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %b, i32 %s, <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstmskadd_stridedi_1:
+; CHECK: [tilespmem:s0+$0x0 ss:$0x8] = vst.add.s32.msk vm0, v0;
+define void @vstmskadd_stridedi_1(<8 x i32> addrspace(201)* %base, <8 x i32> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.msk.add.strided.p201v8i32.v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %base, i32 8, <8 x i32> %val)
+ ret void
+}
+
+; 1024 * 8 (vector size) * 4 (bytes) / 4 (TPU GEP lowering adjust for word) = 0x2000.
+; CHECK-LABEL: vstmskadd_disp_stridedi_1:
+; CHECK: [tilespmem:s0+$0x2000 ss:$0x400] = vst.add.s32.msk vm0, v0;
+define void @vstmskadd_disp_stridedi_1(<8 x i32> addrspace(201)* %base, <8 x i32> %val, <8 x i1> %m) {
+entry:
+ %b = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %base, i32 1024
+ tail call void @llvm.tpu.vst.msk.add.strided.p201v8i32.v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %b, i32 1024, <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstmskadd_stridedf_0:
+; CHECK: [tilespmem:s0+$0x0 ss:s1] = vst.add.f32.msk vm0, v0;
+define void @vstmskadd_stridedf_0(<8 x float> addrspace(201)* %base, <8 x float> %val, <8 x i1> %m, i32 %s) {
+entry:
+ tail call void @llvm.tpu.vst.msk.add.strided.p201v8f32.v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %base, i32 %s, <8 x float> %val)
+ ret void
+}
+
+; 1024 * 8 (vector size) * 4 (bytes) / 4 (TPU GEP lowering adjust for word) = 0x2000.
+; CHECK-LABEL: vstmskadd_disp_stridedf_0:
+; CHECK: [tilespmem:s0+$0x2000 ss:s1] = vst.add.f32.msk vm0, v0;
+define void @vstmskadd_disp_stridedf_0(<8 x float> addrspace(201)* %base, <8 x float> %val, <8 x i1> %m, i32 %s) {
+entry:
+ %b = getelementptr inbounds <8 x float>, <8 x float> addrspace(201)* %base, i32 1024
+ tail call void @llvm.tpu.vst.msk.add.strided.p201v8f32.v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %b, i32 %s, <8 x float> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstmskadd_stridedf_1:
+; CHECK: [tilespmem:s0+$0x0 ss:$0x8] = vst.add.f32.msk vm0, v0;
+define void @vstmskadd_stridedf_1(<8 x float> addrspace(201)* %base, <8 x float> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.msk.add.strided.p201v8f32.v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %base, i32 8, <8 x float> %val)
+ ret void
+}
+
+; 1024 * 8 (vector size) * 4 (bytes) / 4 (TPU GEP lowering adjust for word) = 0x2000.
+; CHECK-LABEL: vstmskadd_disp_stridedf_1:
+; CHECK: [tilespmem:s0+$0x2000 ss:$0x400] = vst.add.f32.msk vm0, v0;
+define void @vstmskadd_disp_stridedf_1(<8 x float> addrspace(201)* %base, <8 x float> %val, <8 x i1> %m) {
+entry:
+ %b = getelementptr inbounds <8 x float>, <8 x float> addrspace(201)* %base, i32 1024
+ tail call void @llvm.tpu.vst.msk.add.strided.p201v8f32.v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %b, i32 1024, <8 x float> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstmskidxi:
+; CHECK: [tilespmem:v0+s0+$0x0] = vst.idx.msk vm0, v1;
+define void @vstmskidxi(<8 x i32> addrspace(201)* %base, <8 x i32> %off, <8 x i32> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %base, <8 x i32> %off, <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstmskidxf:
+; CHECK: [tilespmem:v0+s0+$0x0] = vst.idx.msk vm0, v1;
+define void @vstmskidxf(<8 x float> addrspace(201)* %base, <8 x i32> %off, <8 x float> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.msk.idx.p201v8f32.v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %base, <8 x i32> %off, <8 x float> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstmskidx_stridedi_0:
+; CHECK: [tilespmem:v0+s0+$0x0 ss:s1] = vst.idx.msk vm0, v1;
+define void @vstmskidx_stridedi_0(<8 x i32> addrspace(201)* %base, <8 x i32> %off, <8 x i32> %val, <8 x i1> %m, i32 %s) {
+entry:
+ tail call void @llvm.tpu.vst.msk.idx.strided.p201v8i32.v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %base, <8 x i32> %off, i32 %s, <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstmskidx_stridedi_1:
+; CHECK: [tilespmem:v0+s0+$0x0 ss:$0x800] = vst.idx.msk vm0, v1;
+define void @vstmskidx_stridedi_1(<8 x i32> addrspace(201)* %base, <8 x i32> %off, <8 x i32> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.msk.idx.strided.p201v8i32.v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %base, <8 x i32> %off, i32 2048, <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstmskidx_stridedf_0:
+; CHECK: [tilespmem:v0+s0+$0x0 ss:s1] = vst.idx.msk vm0, v1;
+define void @vstmskidx_stridedf_0(<8 x float> addrspace(201)* %base, <8 x i32> %off, <8 x float> %val, <8 x i1> %m, i32 %s) {
+entry:
+ tail call void @llvm.tpu.vst.msk.idx.strided.p201v8f32.v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %base, <8 x i32> %off, i32 %s, <8 x float> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstmskidx_stridedf_1:
+; CHECK: [tilespmem:v0+s0+$0x0 ss:$0x185] = vst.idx.msk vm0, v1;
+define void @vstmskidx_stridedf_1(<8 x float> addrspace(201)* %base, <8 x i32> %off, <8 x float> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.msk.idx.strided.p201v8f32.v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %base, <8 x i32> %off, i32 389, <8 x float> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstidxaddi:
+; CHECK: [tilespmem:v0+s0+$0x0] = vst.idx.add.s32.msk vm0, v1;
+define void @vstidxaddi(<8 x i32> addrspace(201)* %base, <8 x i32> %off, <8 x i32> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.msk.idx.add.p201v8i32.v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %base, <8 x i32> %off, <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstidxaddf:
+; CHECK: [tilespmem:v0+s0+$0x0] = vst.idx.add.f32.msk vm0, v1;
+define void @vstidxaddf(<8 x float> addrspace(201)* %base, <8 x i32> %off, <8 x float> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.msk.idx.add.p201v8f32.v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %base, <8 x i32> %off, <8 x float> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstidxaddf_np:
+; CHECK: [tilespmem:v0+s0+$0x0] = vst.idx.add.f32.msk vm0, v1;
+define void @vstidxaddf_np(<8 x float> addrspace(201)* %base, <8 x i32> %off, <8 x float> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.msk.idx.add.np.p201v8f32.v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %base, <8 x i32> %off, <8 x float> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstidxaddi_np:
+; CHECK: [tilespmem:v0+s0+$0x0] = vst.idx.add.s32.msk vm0, v1;
+define void @vstidxaddi_np(<8 x i32> addrspace(201)* %base, <8 x i32> %off, <8 x i32> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.msk.idx.add.np.p201v8i32.v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %base, <8 x i32> %off, <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vldmskidxi_np:
+; CHECK: v0 = vld.idx.msk [tilespmem:v0+s0+$0x0], vm0;
+define <8 x i32> @vldmskidxi_np(<8 x i32> addrspace(201)* %base, <8 x i32> %off, <8 x i1> %m) {
+entry:
+ %0 = tail call <8 x i32> @llvm.tpu.vld.msk.idx.np.v8i32.p201v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %base, <8 x i32> %off)
+ ret <8 x i32> %0
+}
+
+; CHECK-LABEL: vldmskidxf_np:
+; CHECK: v0 = vld.idx.msk [tilespmem:v0+s0+$0x0], vm0;
+define <8 x float> @vldmskidxf_np(<8 x float> addrspace(201)* %base, <8 x i32> %off, <8 x i1> %m) {
+entry:
+ %0 = tail call <8 x float> @llvm.tpu.vld.msk.idx.np.v8f32.p201v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %base, <8 x i32> %off)
+ ret <8 x float> %0
+}
+
+; CHECK-LABEL: vldcbmsk:
+; CHECK: v0 = vld.cb.msk [tilespmem:s0+$0x0 cbreg:$0x0], vm0;
+define <8 x i32> @vldcbmsk(i32 %base, <8 x i1> %m) {
+entry:
+ %0 = tail call <8 x i32> @llvm.tpu.vld.cb.msk(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base)
+ ret <8 x i32> %0
+}
+
+; CHECK-LABEL: vldcbidxmski:
+; CHECK: v0 = vld.idx.cb.msk [tilespmem:v0+s0+$0x0 cbreg:$0x0], vm0;
+define <8 x i32> @vldcbidxmski(i32 %base, <8 x i32> %off, <8 x i1> %m) {
+entry:
+ %0 = tail call <8 x i32> @llvm.tpu.vld.cb.msk.idx.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ <8 x i32> %off)
+ ret <8 x i32> %0
+}
+
+; CHECK-LABEL: vldcbidxmskf:
+; CHECK: v0 = vld.idx.cb.msk [tilespmem:v0+s0+$0x0 cbreg:$0x0], vm0;
+define <8 x float> @vldcbidxmskf(i32 %base, <8 x i32> %off, <8 x i1> %m) {
+entry:
+ %0 = tail call <8 x float> @llvm.tpu.vld.cb.msk.idx.v8f32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ <8 x i32> %off)
+ ret <8 x float> %0
+}
+
+; CHECK-LABEL: vldcbidxmski_np:
+; CHECK: v0 = vld.idx.cb.msk [tilespmem:v0+s0+$0x0 cbreg:$0x0], vm0;
+define <8 x i32> @vldcbidxmski_np(i32 %base, <8 x i32> %off, <8 x i1> %m) {
+entry:
+ %0 = tail call <8 x i32> @llvm.tpu.vld.cb.msk.idx.np.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ <8 x i32> %off)
+ ret <8 x i32> %0
+}
+
+; CHECK-LABEL: vldcbidxmskf_np:
+; CHECK: v0 = vld.idx.cb.msk [tilespmem:v0+s0+$0x0 cbreg:$0x0], vm0;
+define <8 x float> @vldcbidxmskf_np(i32 %base, <8 x i32> %off, <8 x i1> %m) {
+entry:
+ %0 = tail call <8 x float> @llvm.tpu.vld.cb.msk.idx.np.v8f32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ <8 x i32> %off)
+ ret <8 x float> %0
+}
+
+; CHECK-LABEL: vstcbmsk:
+; CHECK: [tilespmem:s0+$0x0 cbreg:$0x0] = vst.cb.msk vm0, v0;
+define void @vstcbmsk(i32 %base, <8 x i32> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbidxmski:
+; CHECK: [tilespmem:v0+s0+$0x0 cbreg:$0x0] = vst.idx.cb.msk vm0, v1;
+define void @vstcbidxmski(i32 %base, <8 x i32> %off, <8 x i32> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.idx.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ <8 x i32> %off, <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbidxmskf:
+; CHECK: [tilespmem:v0+s0+$0x0 cbreg:$0x0] = vst.idx.cb.msk vm0, v1;
+define void @vstcbidxmskf(i32 %base, <8 x i32> %off, <8 x float> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.idx.v8f32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ <8 x i32> %off, <8 x float> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbidxaddi:
+; CHECK: [tilespmem:v0+s0+$0x0 cbreg:$0x0] = vst.idx.cb.add.s32.msk vm0, v1;
+define void @vstcbidxaddi(<8 x i1> %m, i32 %base, <8 x i32> %off, <8 x i32> %val) {
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.idx.add.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ <8 x i32> %off, <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbidxaddf:
+; CHECK: [tilespmem:v0+s0+$0x0 cbreg:$0x0] = vst.idx.cb.add.f32.msk vm0, v1;
+define void @vstcbidxaddf(<8 x i1> %m, i32 %base, <8 x i32> %off, <8 x float> %val) {
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.idx.add.v8f32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ <8 x i32> %off, <8 x float> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbidxaddi_np:
+; CHECK: [tilespmem:v0+s0+$0x0 cbreg:$0x0] = vst.idx.cb.add.s32.msk vm0, v1;
+define void @vstcbidxaddi_np(<8 x i1> %m, i32 %base, <8 x i32> %off, <8 x i32> %val) {
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.idx.add.np.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ <8 x i32> %off, <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbidxaddf_np:
+; CHECK: [tilespmem:v0+s0+$0x0 cbreg:$0x0] = vst.idx.cb.add.f32.msk vm0, v1;
+define void @vstcbidxaddf_np(<8 x i1> %m, i32 %base, <8 x i32> %off, <8 x float> %val) {
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.idx.add.np.v8f32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ <8 x i32> %off, <8 x float> %val)
+ ret void
+}
+
+; CHECK-LABEL: vldcbupdmsk:
+; CHECK: v0 = vld.cb.upd.msk [tilespmem:s0+$0x0 cbreg:$0x0], vm0;
+define <8 x i32> @vldcbupdmsk(i32 %base, <8 x i1> %m) {
+entry:
+ %0 = tail call <8 x i32> @llvm.tpu.vld.cb.upd.msk(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base)
+ ret <8 x i32> %0
+}
+
+; CHECK-LABEL: vstcbupdmsk:
+; CHECK: [tilespmem:s0+$0x0 cbreg:$0x0] = vst.cb.upd.msk vm0, v0;
+define void @vstcbupdmsk(i32 %base, <8 x i32> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.cb.upd.msk.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: func_sst_cb_rr:
+; CHECK: { [smem:s1 cbreg:$0x0] = sst.cb s0;
+define void @func_sst_cb_rr(i32 %v, i32 %off) {
+ tail call void @llvm.tpu.sst.cb(i32 %v, x86_mmx undef, i32 %off)
+ ret void
+}
+
+; CHECK-LABEL: func_sst_cb_ri:
+; CHECK: { [smem:$0x1 cbreg:$0x0] = sst.cb s0;
+define void @func_sst_cb_ri(i32 %v) {
+ tail call void @llvm.tpu.sst.cb(i32 %v, x86_mmx undef, i32 1)
+ ret void
+}
+
+; CHECK-LABEL: func_sst_cb_upd_rr:
+; CHECK: { [smem:s1 cbreg:$0x0] = sst.cb.upd s0;
+define void @func_sst_cb_upd_rr(i32 %v, i32 %off) {
+ tail call void @llvm.tpu.sst.cb.upd(i32 %v, x86_mmx undef, i32 %off)
+ ret void
+}
+
+; CHECK-LABEL: func_sst_cb_upd_ri:
+; CHECK: { [smem:$0x1 cbreg:$0x0] = sst.cb.upd s0;
+define void @func_sst_cb_upd_ri(i32 %v) {
+ tail call void @llvm.tpu.sst.cb.upd(i32 %v, x86_mmx undef, i32 1)
+ ret void
+}
+
+; CHECK-LABEL: func_sld_cb_r:
+; CHECK: { s0 = sld.cb [smem:s0 cbreg:$0x0]
+define i32 @func_sld_cb_r(i32 %off) {
+ %r = tail call i32 @llvm.tpu.sld.cb(x86_mmx undef, i32 %off)
+ ret i32 %r
+}
+
+; CHECK-LABEL: func_sld_cb_i:
+; CHECK: { s0 = sld.cb [smem:$0x1 cbreg:$0x0]
+define i32 @func_sld_cb_i() {
+ %r = tail call i32 @llvm.tpu.sld.cb(x86_mmx undef, i32 1)
+ ret i32 %r
+}
+
+; CHECK-LABEL: func_sld_cb_upd_r:
+; CHECK: { s0 = sld.cb.upd [smem:s0 cbreg:$0x0]
+define i32 @func_sld_cb_upd_r(i32 %off) {
+ %r = tail call i32 @llvm.tpu.sld.cb.upd(x86_mmx undef, i32 %off)
+ ret i32 %r
+}
+
+; CHECK-LABEL: func_sld_cb_upd_i:
+; CHECK: { s0 = sld.cb.upd [smem:$0x1 cbreg:$0x0]
+define i32 @func_sld_cb_upd_i() {
+ %r = tail call i32 @llvm.tpu.sld.cb.upd(x86_mmx undef, i32 1)
+ ret i32 %r
+}
+
+define i32 @func_rdcbreg_smem_base() {
+ %p = tail call i32* @llvm.tpu.rdcbreg.smem.base(x86_mmx undef)
+ %r = ptrtoint i32* %p to i32
+ ret i32 %r
+}
+
+; CHECK-LABEL: func_rdcbreg_tilespmem_base:
+; CHECK: { s0 = rdcbreg [cbreg:$0x0 metadata:$0x0]
+define i32 @func_rdcbreg_tilespmem_base() {
+ %p = tail call i32 addrspace(201)* @llvm.tpu.rdcbreg.tilespmem.base(x86_mmx undef)
+ %r = ptrtoint i32 addrspace(201)* %p to i32
+ ret i32 %r
+}
+
+; CHECK-LABEL: func_rdcbreg_size:
+; CHECK: { s0 = rdcbreg [cbreg:$0x0 metadata:$0x1]
+define i32 @func_rdcbreg_size() {
+ %r = tail call i32 @llvm.tpu.rdcbreg.size(x86_mmx undef)
+ ret i32 %r
+}
+
+; CHECK-LABEL: func_rdcbreg_offset:
+; CHECK: { s0 = rdcbreg [cbreg:$0x0 metadata:$0x2]
+define i32 @func_rdcbreg_offset() {
+ %r = tail call i32 @llvm.tpu.rdcbreg.offset(x86_mmx undef)
+ ret i32 %r
+}
+
+; CHECK-LABEL: func_wrcbreg_smem_base_r:
+; CHECK: { [cbreg:$0x0 metadata:$0x0] = wrcbreg s0
+; CHECK-NEXT: { [cbreg:$0x0 metadata:$0x1] = wrcbreg s1
+; CHECK-NEXT: { [cbreg:$0x0 metadata:$0x2] = wrcbreg s2
+define x86_mmx @func_wrcbreg_smem_base_r(i32* %b, i32 %s, i32 %o) {
+ %1 = tail call x86_mmx @llvm.tpu.wrcbreg.smem.base(x86_mmx undef, i32* %b)
+ %2 = tail call x86_mmx @llvm.tpu.wrcbreg.size(x86_mmx %1, i32 %s)
+ %3 = tail call x86_mmx @llvm.tpu.wrcbreg.offset(x86_mmx %2, i32 %o)
+ ret x86_mmx %3
+}
+
+; CHECK-LABEL: func_wrcbreg_smem_base_i:
+; CHECK: { [cbreg:$0x0 metadata:$0x0] = wrcbreg $0x1
+; CHECK-NEXT: { [cbreg:$0x0 metadata:$0x1] = wrcbreg $0x2
+; CHECK-NEXT: { [cbreg:$0x0 metadata:$0x2] = wrcbreg $0x3
+define x86_mmx @func_wrcbreg_smem_base_i() {
+ %b = call i32* @llvm.tpu.inttoptr.p0i32(i32 1)
+ %1 = tail call x86_mmx @llvm.tpu.wrcbreg.smem.base(x86_mmx undef, i32* %b)
+ %2 = tail call x86_mmx @llvm.tpu.wrcbreg.size(x86_mmx %1, i32 2)
+ %3 = tail call x86_mmx @llvm.tpu.wrcbreg.offset(x86_mmx %2, i32 3)
+ ret x86_mmx %3
+}
+
+; CHECK-LABEL: func_wrcbreg_tilespmem_base_r:
+; CHECK: { [cbreg:$0x0 metadata:$0x0] = wrcbreg s0
+; CHECK-NEXT: { [cbreg:$0x0 metadata:$0x1] = wrcbreg s1
+; CHECK-NEXT: { [cbreg:$0x0 metadata:$0x2] = wrcbreg s2
+define x86_mmx @func_wrcbreg_tilespmem_base_r(i32 addrspace(201)* %b, i32 %s, i32 %o) {
+ %1 = tail call x86_mmx @llvm.tpu.wrcbreg.tilespmem.base(x86_mmx undef, i32 addrspace(201)* %b)
+ %2 = tail call x86_mmx @llvm.tpu.wrcbreg.size(x86_mmx %1, i32 %s)
+ %3 = tail call x86_mmx @llvm.tpu.wrcbreg.offset(x86_mmx %2, i32 %o)
+ ret x86_mmx %3
+}
+
+; CHECK-LABEL: func_wrcbreg_tilespmem_base_i:
+; CHECK: { [cbreg:$0x0 metadata:$0x0] = wrcbreg $0x1
+; CHECK-NEXT: { [cbreg:$0x0 metadata:$0x1] = wrcbreg $0x2
+; CHECK-NEXT: { [cbreg:$0x0 metadata:$0x2] = wrcbreg $0x3
+define x86_mmx @func_wrcbreg_tilespmem_base_i() {
+ %s = call i32 addrspace(201)* @llvm.tpu.inttoptr.p201i32(i32 1)
+ %1 = tail call x86_mmx @llvm.tpu.wrcbreg.tilespmem.base(x86_mmx undef, i32 addrspace(201)* %s)
+ %2 = tail call x86_mmx @llvm.tpu.wrcbreg.size(x86_mmx %1, i32 2)
+ %3 = tail call x86_mmx @llvm.tpu.wrcbreg.offset(x86_mmx %2, i32 3)
+ ret x86_mmx %3
+}
+
+; CHECK-LABEL: func_cbreg_add_offset_r:
+; CHECK: { [cbreg:$0x0] = cbreg.add s0;
+define x86_mmx @func_cbreg_add_offset_r(i32 %v) {
+ %1 = tail call x86_mmx @llvm.tpu.cbreg.add.offset(x86_mmx undef, i32 %v)
+ ret x86_mmx %1
+}
+
+; CHECK-LABEL: func_cbreg_add_offset_i:
+; CHECK: { [cbreg:$0x0] = cbreg.add $0x2;
+define x86_mmx @func_cbreg_add_offset_i() {
+ %1 = tail call x86_mmx @llvm.tpu.cbreg.add.offset(x86_mmx undef, i32 2)
+ ret x86_mmx %1
+}
+
+; CHECK-LABEL: vstcbmskaddi:
+; CHECK: [tilespmem:s0+$0x0 cbreg:$0x0] = vst.cb.add.s32.msk vm0, v0;
+define void @vstcbmskaddi(i32 %base, <8 x i32> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.add.v8i32(<8 x i1> %m, x86_mmx undef,
+ i32 %base, <8 x i32> %val)
+ ret void
+}
+
+; Unlike the GEP-based tests above, the cb form takes a plain i32 offset that is used directly: 1024 = 0x400 (no vector/GEP scaling).
+; CHECK-LABEL: vstcbmskaddi_disp:
+; CHECK: [tilespmem:s0+$0x400 cbreg:$0x0] = vst.cb.add.s32.msk vm0, v0;
+define void @vstcbmskaddi_disp(i32 %base, <8 x i32> %val, <8 x i1> %m) {
+entry:
+ %b = add i32 %base, 1024
+ tail call void @llvm.tpu.vst.cb.msk.add.v8i32(<8 x i1> %m, x86_mmx undef, i32 %b, <8 x i32> %val)
+ ret void
+}
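
(For contrast with the GEP-based displacements earlier in this file, a brief worked note on the cb form, assuming the scalar-offset behavior these tests exercise:)
  GEP form: getelementptr <8 x i32>, ... i32 1024  -> displacement 1024 * 8 = 8192 = 0x2000
  cb form:  add i32 %base, 1024                    -> displacement 1024            = 0x400  (no vector scaling)
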
+
+; CHECK-LABEL: vstcbmskaddf:
+; CHECK: [tilespmem:s0+$0x0 cbreg:$0x0] = vst.cb.add.f32.msk vm0, v0;
+define void @vstcbmskaddf(i32 %base, <8 x float> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.add.v8f32(<8 x i1> %m, x86_mmx undef,
+ i32 %base, <8 x float> %val)
+ ret void
+}
+
+; The i32 offset is used directly: 1024 = 0x400 (no vector/GEP scaling).
+; CHECK-LABEL: vstcbmskaddf_disp:
+; CHECK: [tilespmem:s0+$0x400 cbreg:$0x0] = vst.cb.add.f32.msk vm0, v0;
+define void @vstcbmskaddf_disp(i32 %base, <8 x float> %val, <8 x i1> %m) {
+entry:
+ %b = add i32 %base, 1024
+ tail call void @llvm.tpu.vst.cb.msk.add.v8f32(<8 x i1> %m, x86_mmx undef, i32 %b, <8 x float> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbupdmskaddi:
+; CHECK: [tilespmem:s0+$0x0 cbreg:$0x0] = vst.cb.upd.add.s32.msk vm0, v0;
+define void @vstcbupdmskaddi(i32 %base, <8 x i32> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.cb.upd.msk.add.v8i32(<8 x i1> %m, x86_mmx undef,
+ i32 %base, <8 x i32> %val)
+ ret void
+}
+
+; The i32 offset is used directly: 1024 = 0x400 (no vector/GEP scaling).
+; CHECK-LABEL: vstcbupdmskaddi_disp:
+; CHECK: [tilespmem:s0+$0x400 cbreg:$0x0] = vst.cb.upd.add.s32.msk vm0, v0;
+define void @vstcbupdmskaddi_disp(i32 %base, <8 x i32> %val, <8 x i1> %m) {
+entry:
+ %b = add i32 %base, 1024
+ tail call void @llvm.tpu.vst.cb.upd.msk.add.v8i32(<8 x i1> %m, x86_mmx undef, i32 %b, <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbupdmskaddf:
+; CHECK: [tilespmem:s0+$0x0 cbreg:$0x0] = vst.cb.upd.add.f32.msk vm0, v0;
+define void @vstcbupdmskaddf(i32 %base, <8 x float> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.cb.upd.msk.add.v8f32(<8 x i1> %m, x86_mmx undef,
+ i32 %base, <8 x float> %val)
+ ret void
+}
+
+; The i32 offset is used directly: 1024 = 0x400 (no vector/GEP scaling).
+; CHECK-LABEL: vstcbupdmskaddf_disp:
+; CHECK: [tilespmem:s0+$0x400 cbreg:$0x0] = vst.cb.upd.add.f32.msk vm0, v0;
+define void @vstcbupdmskaddf_disp(i32 %base, <8 x float> %val, <8 x i1> %m) {
+entry:
+ %b = add i32 %base, 1024
+ tail call void @llvm.tpu.vst.cb.upd.msk.add.v8f32(<8 x i1> %m, x86_mmx undef, i32 %b, <8 x float> %val)
+ ret void
+}
+
+; CHECK-LABEL: func_vld_vst_msk_idx_addi_np:
+; CHECK: [tilespmem:v0+s0+$0x0], v0 = vst.idx.ret.add.s32.msk vm0, v1;
+define <8 x i32> @func_vld_vst_msk_idx_addi_np(<8 x i32> addrspace(201)* %base, <8 x i32> %idx, <8 x i32> %val, <8 x i1> %m) {
+ %r = tail call <8 x i32> @llvm.tpu.vst.msk.idx.ret.add.np.v8i32.p201v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %base, <8 x i32> %idx, <8 x i32> %val)
+ ret <8 x i32> %r
+}
+
+; CHECK-LABEL: func_vld_vst_msk_idx_addf_np:
+; CHECK: [tilespmem:v0+s0+$0x0], v0 = vst.idx.ret.add.f32.msk vm0, v1;
+define <8 x float> @func_vld_vst_msk_idx_addf_np(<8 x float> addrspace(201)* %base, <8 x i32> %idx, <8 x float> %val, <8 x i1> %m) {
+ %r = tail call <8 x float> @llvm.tpu.vst.msk.idx.ret.add.np.v8f32.p201v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %base, <8 x i32> %idx, <8 x float> %val)
+ ret <8 x float> %r
+}
+
+; CHECK-LABEL: vldcbmskstrided:
+; CHECK: v0 = vld.cb.msk [tilespmem:s0+$0x0 ss:$0x7d cbreg:$0x0], vm0
+; CHECK: v1 = vld.cb.msk [tilespmem:s0+$0x0 ss:s1 cbreg:$0x0], vm0
+define <8 x i32> @vldcbmskstrided(i32 %base, <8 x i1> %m, i32 %ss) {
+entry:
+ %0 = tail call <8 x i32> @llvm.tpu.vld.cb.msk.strided(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ i32 125)
+ %1 = tail call <8 x i32> @llvm.tpu.vld.cb.msk.strided(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ i32 %ss)
+ %r = add <8 x i32> %0, %1
+ ret <8 x i32> %r
+}
+
+; CHECK-LABEL: vldcbupdmskstrided:
+; CHECK: v0 = vld.cb.upd.msk [tilespmem:s0+$0x0 ss:$0x7d cbreg:$0x0], vm0
+; CHECK: v1 = vld.cb.upd.msk [tilespmem:s0+$0x0 ss:s1 cbreg:$0x0], vm0
+define <8 x i32> @vldcbupdmskstrided(i32 %base, <8 x i1> %m, i32 %ss) {
+entry:
+ %0 = tail call <8 x i32> @llvm.tpu.vld.cb.upd.msk.strided(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ i32 125)
+ %1 = tail call <8 x i32> @llvm.tpu.vld.cb.upd.msk.strided(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ i32 %ss)
+ %r = add <8 x i32> %0, %1
+ ret <8 x i32> %r
+}
+
+; CHECK-LABEL: vldcbmskstrided_off:
+; CHECK: v0 = vld.cb.msk [tilespmem:s0+$0x100 ss:$0x7d cbreg:$0x0], vm0
+; CHECK: v1 = vld.cb.msk [tilespmem:s0+$0x100 ss:s1 cbreg:$0x0], vm0
+define <8 x i32> @vldcbmskstrided_off(i32 %base, <8 x i1> %m, i32 %ss) {
+entry:
+ %base_off = add i32 %base, 256
+ %0 = tail call <8 x i32> @llvm.tpu.vld.cb.msk.strided(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base_off,
+ i32 125)
+ %1 = tail call <8 x i32> @llvm.tpu.vld.cb.msk.strided(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base_off,
+ i32 %ss)
+ %r = add <8 x i32> %0, %1
+ ret <8 x i32> %r
+}
+
+; CHECK-LABEL: vldcbupdmskstrided_off:
+; CHECK: v0 = vld.cb.upd.msk [tilespmem:s0+$0xac ss:$0x7d cbreg:$0x0], vm0
+; CHECK: v1 = vld.cb.upd.msk [tilespmem:s0+$0xac ss:s1 cbreg:$0x0], vm0
+define <8 x i32> @vldcbupdmskstrided_off(i32 %base, <8 x i1> %m, i32 %ss) {
+entry:
+ %base_off = add i32 %base, 172
+ %0 = tail call <8 x i32> @llvm.tpu.vld.cb.upd.msk.strided(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base_off,
+ i32 125)
+ %1 = tail call <8 x i32> @llvm.tpu.vld.cb.upd.msk.strided(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base_off,
+ i32 %ss)
+ %r = add <8 x i32> %0, %1
+ ret <8 x i32> %r
+}
+
+; CHECK-LABEL: vstcbmskstrided:
+; CHECK: [tilespmem:s0+$0x0 ss:$0x7d cbreg:$0x0] = vst.cb.msk vm0, v0
+; CHECK: [tilespmem:s0+$0x0 ss:s1 cbreg:$0x0] = vst.cb.msk vm0, v0
+define void @vstcbmskstrided(i32 %base, <8 x i32> %val, <8 x i1> %m, i32 %ss) {
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.strided.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ i32 125,
+ <8 x i32> %val)
+ tail call void @llvm.tpu.vst.cb.msk.strided.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ i32 %ss,
+ <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbupdmskstrided:
+; CHECK: [tilespmem:s0+$0x0 ss:$0x7d cbreg:$0x0] = vst.cb.upd.msk vm0, v0
+; CHECK: [tilespmem:s0+$0x0 ss:s1 cbreg:$0x0] = vst.cb.upd.msk vm0, v0
+define void @vstcbupdmskstrided(i32 %base, <8 x i32> %val, <8 x i1> %m, i32 %ss) {
+entry:
+ tail call void @llvm.tpu.vst.cb.upd.msk.strided.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ i32 125,
+ <8 x i32> %val)
+ tail call void @llvm.tpu.vst.cb.upd.msk.strided.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ i32 %ss,
+ <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbmskstrided_off:
+; CHECK: [tilespmem:s0+$0xc0 ss:$0x7d cbreg:$0x0] = vst.cb.msk vm0, v0
+; CHECK: [tilespmem:s0+$0xc0 ss:s1 cbreg:$0x0] = vst.cb.msk vm0, v0
+define void @vstcbmskstrided_off(i32 %base, <8 x i32> %val, <8 x i1> %m, i32 %ss) {
+entry:
+ %base_off = add i32 %base, 192
+ tail call void @llvm.tpu.vst.cb.msk.strided.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base_off,
+ i32 125,
+ <8 x i32> %val)
+ tail call void @llvm.tpu.vst.cb.msk.strided.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base_off,
+ i32 %ss,
+ <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbupdmskstrided_off:
+; CHECK: [tilespmem:s0+$0xc0 ss:$0x7d cbreg:$0x0] = vst.cb.upd.msk vm0, v0
+; CHECK: [tilespmem:s0+$0xc0 ss:s1 cbreg:$0x0] = vst.cb.upd.msk vm0, v0
+define void @vstcbupdmskstrided_off(i32 %base, <8 x i32> %val, <8 x i1> %m, i32 %ss) {
+entry:
+ %base_off = add i32 %base, 192
+ tail call void @llvm.tpu.vst.cb.upd.msk.strided.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base_off,
+ i32 125,
+ <8 x i32> %val)
+ tail call void @llvm.tpu.vst.cb.upd.msk.strided.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base_off,
+ i32 %ss,
+ <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbmskaddstridedi:
+; CHECK: [tilespmem:s0+$0x0 ss:$0x7d cbreg:$0x0] = vst.cb.add.s32.msk vm0, v0
+; CHECK: [tilespmem:s0+$0x0 ss:s1 cbreg:$0x0] = vst.cb.add.s32.msk vm0, v0
+define void @vstcbmskaddstridedi(i32 %base, <8 x i32> %val, <8 x i1> %m, i32 %ss) {
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.add.strided.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ i32 125,
+ <8 x i32> %val)
+ tail call void @llvm.tpu.vst.cb.msk.add.strided.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ i32 %ss,
+ <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbupdmskaddstridedi:
+; CHECK: [tilespmem:s0+$0x0 ss:$0x7d cbreg:$0x0] = vst.cb.upd.add.s32.msk vm0, v0
+; CHECK: [tilespmem:s0+$0x0 ss:s1 cbreg:$0x0] = vst.cb.upd.add.s32.msk vm0, v0
+define void @vstcbupdmskaddstridedi(i32 %base, <8 x i32> %val, <8 x i1> %m, i32 %ss) {
+entry:
+ tail call void @llvm.tpu.vst.cb.upd.msk.add.strided.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ i32 125,
+ <8 x i32> %val)
+ tail call void @llvm.tpu.vst.cb.upd.msk.add.strided.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ i32 %ss,
+ <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbmskaddstridedi_off:
+; CHECK: [tilespmem:s0+$0xc0 ss:$0x7d cbreg:$0x0] = vst.cb.add.s32.msk vm0, v0
+; CHECK: [tilespmem:s0+$0xc0 ss:s1 cbreg:$0x0] = vst.cb.add.s32.msk vm0, v0
+define void @vstcbmskaddstridedi_off(i32 %base, <8 x i32> %val, <8 x i1> %m, i32 %ss) {
+entry:
+ %base_off = add i32 %base, 192
+ tail call void @llvm.tpu.vst.cb.msk.add.strided.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base_off,
+ i32 125,
+ <8 x i32> %val)
+ tail call void @llvm.tpu.vst.cb.msk.add.strided.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base_off,
+ i32 %ss,
+ <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbupdmskaddstridedi_off:
+; CHECK: [tilespmem:s0+$0xc0 ss:$0x7d cbreg:$0x0] = vst.cb.upd.add.s32.msk vm0, v0
+; CHECK: [tilespmem:s0+$0xc0 ss:s1 cbreg:$0x0] = vst.cb.upd.add.s32.msk vm0, v0
+define void @vstcbupdmskaddstridedi_off(i32 %base, <8 x i32> %val, <8 x i1> %m, i32 %ss) {
+entry:
+ %base_off = add i32 %base, 192
+ tail call void @llvm.tpu.vst.cb.upd.msk.add.strided.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base_off,
+ i32 125,
+ <8 x i32> %val)
+ tail call void @llvm.tpu.vst.cb.upd.msk.add.strided.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base_off,
+ i32 %ss,
+ <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbmskaddstridedf:
+; CHECK: [tilespmem:s0+$0x0 ss:$0x7d cbreg:$0x0] = vst.cb.add.f32.msk vm0, v0
+; CHECK: [tilespmem:s0+$0x0 ss:s1 cbreg:$0x0] = vst.cb.add.f32.msk vm0, v0
+define void @vstcbmskaddstridedf(i32 %base, <8 x float> %val, <8 x i1> %m, i32 %ss) {
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.add.strided.v8f32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ i32 125,
+ <8 x float> %val)
+ tail call void @llvm.tpu.vst.cb.msk.add.strided.v8f32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ i32 %ss,
+ <8 x float> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbupdmskaddstridedf:
+; CHECK: [tilespmem:s0+$0x0 ss:$0x7d cbreg:$0x0] = vst.cb.upd.add.f32.msk vm0, v0
+; CHECK: [tilespmem:s0+$0x0 ss:s1 cbreg:$0x0] = vst.cb.upd.add.f32.msk vm0, v0
+define void @vstcbupdmskaddstridedf(i32 %base, <8 x float> %val, <8 x i1> %m, i32 %ss) {
+entry:
+ tail call void @llvm.tpu.vst.cb.upd.msk.add.strided.v8f32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ i32 125,
+ <8 x float> %val)
+ tail call void @llvm.tpu.vst.cb.upd.msk.add.strided.v8f32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ i32 %ss,
+ <8 x float> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbmskaddstridedf_off:
+; CHECK: [tilespmem:s0+$0xc0 ss:$0x7d cbreg:$0x0] = vst.cb.add.f32.msk vm0, v0
+; CHECK: [tilespmem:s0+$0xc0 ss:s1 cbreg:$0x0] = vst.cb.add.f32.msk vm0, v0
+define void @vstcbmskaddstridedf_off(i32 %base, <8 x float> %val, <8 x i1> %m, i32 %ss) {
+entry:
+ %base_off = add i32 %base, 192
+ tail call void @llvm.tpu.vst.cb.msk.add.strided.v8f32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base_off,
+ i32 125,
+ <8 x float> %val)
+ tail call void @llvm.tpu.vst.cb.msk.add.strided.v8f32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base_off,
+ i32 %ss,
+ <8 x float> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbupdmskaddstridedf_off:
+; CHECK: [tilespmem:s0+$0xc0 ss:$0x7d cbreg:$0x0] = vst.cb.upd.add.f32.msk vm0, v0
+; CHECK: [tilespmem:s0+$0xc0 ss:s1 cbreg:$0x0] = vst.cb.upd.add.f32.msk vm0, v0
+define void @vstcbupdmskaddstridedf_off(i32 %base, <8 x float> %val, <8 x i1> %m, i32 %ss) {
+entry:
+ %base_off = add i32 %base, 192
+ tail call void @llvm.tpu.vst.cb.upd.msk.add.strided.v8f32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base_off,
+ i32 125,
+ <8 x float> %val)
+ tail call void @llvm.tpu.vst.cb.upd.msk.add.strided.v8f32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base_off,
+ i32 %ss,
+ <8 x float> %val)
+ ret void
+}
+
+; CHECK-LABEL: sc_permutei:
+; CHECK: { v0 = vperm.xlane v0, v1
+define <8 x i32> @sc_permutei(<8 x i32> %v0, <8 x i32> %v1) {
+entry:
+ %0 = tail call <8 x i32> @llvm.tpu.sc.permute.v8i32(<8 x i32> %v0, <8 x i32> %v1)
+ ret <8 x i32> %0
+}
+
+; CHECK-LABEL: sc_permutef:
+; CHECK: { v0 = vperm.xlane v0, v1
+define <8 x float> @sc_permutef(<8 x float> %v0, <8 x i32> %v1) {
+entry:
+ %0 = tail call <8 x float> @llvm.tpu.sc.permute.v8f32(<8 x float> %v0, <8 x i32> %v1)
+ ret <8 x float> %0
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/inttoptr.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/inttoptr.ll
new file mode 100644
index 0000000..189b63e
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/inttoptr.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -mcpu=tensorcore-jf -asm-verbose=false | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32) nounwind
+declare i32 @llvm.tpu.ptrtoint.p205v1024f32(<1024 x float> addrspace(205)*) nounwind
+
+; Test that address calculation gets folded.
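+; The folded address is 4 + (2 + 1) * 8 = 28 = 0x1c, assuming each
+; <1024 x float> element occupies eight vmem words (one per sublane); the
+; stride of 8 is inferred from the CHECK value below, not stated in the CL.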
+; CHECK-LABEL: address_folding:
+; CHECK: [vmem:$0x1c] = vst v0
+define void @address_folding(i1 %p, <1024 x float> %x) {
+ %a = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 4)
+ %b = getelementptr inbounds <1024 x float>, <1024 x float> addrspace(205)* %a, i32 2
+ br i1 %p, label %next, label %one
+
+one:
+ %c = call i32 @llvm.tpu.ptrtoint.p205v1024f32(<1024 x float> addrspace(205)* %b)
+ %d = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %c)
+ %e = getelementptr inbounds <1024 x float>, <1024 x float> addrspace(205)* %d, i32 1
+ store <1024 x float> %x, <1024 x float> addrspace(205)* %e
+ br label %one
+next:
+ ret void
+}
\ No newline at end of file
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/latency_override.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/latency_override.ll
new file mode 100644
index 0000000..dfb63f4
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/latency_override.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -tpu-latencies %S/Inputs/latency_override.yml | \
+; RUN: FileCheck %s --check-prefix CHECK-OVERRIDE
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s --check-prefix CHECK-NORMAL
+; REQUIRES: tpu
+
+; The latency override file sets the latency of sld to 4, so check that there
+; are exactly 4 bundles between def and use.
+; CHECK-OVERRIDE: [[x:s[0-9]+]] = sld
+; CHECK-OVERRIDE: sdelay $0x3
+; CHECK-OVERRIDE: {
+; CHECK-OVERRIDE: {
+; CHECK-OVERRIDE-NOT: {
+; CHECK-OVERRIDE: [[x]]
+
+; With the default latency, check that there is exactly one bundle between def
+; and use.
+; CHECK-NORMAL: [[x:s[0-9]+]] = sld
+; CHECK-NORMAL: sdelay $0x2
+; CHECK-NORMAL: {
+; CHECK-NORMAL-NOT: {
+; CHECK-NORMAL: [[x]]
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu-unknown-unknown"
+
+@a = dso_local local_unnamed_addr global i32 4, align 4
+
+define dso_local void @_Z6globali(i32) {
+ %2 = load i32, i32* @a, align 4
+ %3 = add nsw i32 %2, %2
+ store i32 %3, i32* inttoptr (i32 256 to i32*), align 256
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/legacy_spill_limits.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/legacy_spill_limits.ll
new file mode 100644
index 0000000..ce85c31
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/legacy_spill_limits.ll
@@ -0,0 +1,41 @@
+; RUN: opt -S -O2 < %s -mcpu=sparsecore-tec-vf -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu-unknown-unknown"
+
+; Tests that the legacy spill limits work even if there are multiple
+; functions in the module.
+
+; CHECK-LABEL: function1
+define void @function1() #0 {
+entry:
+ ret void
+}
+
+; CHECK-LABEL: function2
+define void @function2() #0 {
+entry:
+ ret void
+}
+
+; CHECK-LABEL: function3
+define void @function3() #0 {
+entry:
+ ret void
+}
+
+attributes #0 = { "target-cpu"="sparsecore-tec-vf" }
+attributes #1 = { nounwind readnone "target-cpu"="sparsecore-tec-vf" }
+attributes #2 = { argmemonly nounwind readonly "target-cpu"="sparsecore-tec-vf" }
+attributes #3 = { inaccessiblememonly nounwind "target-cpu"="sparsecore-tec-vf" }
+
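+; The module-level !smem.spill.* and !vmem.spill.* metadata below provide a
+; single, module-wide ("legacy") spill range per memory: smem words 42..2126
+; and vmem words 22840..32768. Reading start/limit as a half-open interval is
+; an assumption based on the metadata names.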
+!smem.spill.start = !{!0}
+!smem.spill.limit = !{!1}
+!vmem.spill.start = !{!2}
+!vmem.spill.limit = !{!3}
+
+!0 = !{i32 42}
+!1 = !{i32 2126}
+!2 = !{i32 22840}
+!3 = !{i32 32768}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/loop_disable_unroll_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/loop_disable_unroll_sc.ll
new file mode 100644
index 0000000..d6b2d8b
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/loop_disable_unroll_sc.ll
@@ -0,0 +1,41 @@
+; RUN: opt -S -O2 -mcpu=sparsecore-tec-vf -tpu-propagate-disable-unroll < %s \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32* @llvm.tpu.inttoptr.pi32(i32)
+
+; Tests that we can optionally propagate llvm.loop.unroll.disable metadata.
+
+; CHECK-LABEL: @random_loop
+; CHECK: !{{[0-9]+}} = !{!"llvm.loop.unroll.disable", i1 true}
+
+define void @random_loop(i32 %a) {
+entry:
+ %0 = call i32* @llvm.tpu.inttoptr.pi32(i32 0)
+ %1 = call i32* @llvm.tpu.inttoptr.pi32(i32 32)
+ %2 = call i32* @llvm.tpu.inttoptr.pi32(i32 64)
+ br label %for.body.i
+
+for.body.i:
+ %i = phi i32 [ 0, %entry ], [ %ic, %for.body.i ]
+ %idx0 = getelementptr inbounds i32, i32* %0, i32 0
+ %3 = load i32, i32* %idx0, align 4
+ store i32 %3, i32* %1, align 4
+ %idx1 = getelementptr inbounds i32, i32* %0, i32 32
+ %4 = load i32, i32* %idx1, align 4
+ store i32 %4, i32* %2, align 4
+
+ %ic = add i32 %i, 1
+ %cmp.i = icmp slt i32 %ic, %a
+ br i1 %cmp.i, label %for.body.i, label %exit, !llvm.loop !1
+
+exit:
+ ret void
+}
+
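+; Metadata sketch: !1 is the loop ID, initially carrying only
+; llvm.loop.parallel_accesses (!2) for access group !0. The
+; -tpu-propagate-disable-unroll run is expected to add the
+; llvm.loop.unroll.disable entry that the CHECK above matches.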
+!0 = distinct !{}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.parallel_accesses", !0}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/loop_parallel_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/loop_parallel_sc.ll
new file mode 100644
index 0000000..fdb0b32
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/loop_parallel_sc.ll
@@ -0,0 +1,374 @@
+; RUN: opt -S -O2 -mcpu=sparsecore-tec-vf < %s \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare void @llvm.tpu.loop.parallel()
+declare i32* @llvm.tpu.inttoptr.pi32(i32)
+declare void @llvm.tpu.vst.msk.idx.add.np.v8i1.p201v8f32.v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>, <8 x float>)
+declare void @llvm.tpu.vst.msk.idx.add.np.v8i1.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.tpu.vld.msk.idx.np.v8i32.v8i1.p201v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>)
+declare <8 x float> @llvm.tpu.vld.msk.idx.np.v8f32.v8i1.p201v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>)
+declare <8 x i32> @llvm.tpu.vst.msk.idx.ret.add.np.v8i32.v8i1.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, <8 x i32>)
+declare <8 x float> @llvm.tpu.vst.msk.idx.ret.add.np.v8f32.v8i1.p201v8f32.v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>, <8 x float>)
+
+; Tests that alias.scope and noalias metadata is propagated as expected
+; when the tpu-loop-parallel pass is enabled.
+
+; CHECK-LABEL: @parallel_loop
+; CHECK: %[[l1:[0-9]+]] = load i32, i32* %0, align 4, !alias.scope ![[m1:[0-9]+]], !noalias ![[m2:[0-9]+]]
+; CHECK: store i32 %[[l1]], i32* %{{[0-9]+}}, align 4, !alias.scope ![[m1]], !noalias ![[m2]]
+; CHECK: %[[l2:[0-9]+]] = load i32, i32* %idx1, align 4, !alias.scope ![[m3:[0-9]+]], !noalias ![[m4:[0-9]+]]
+; CHECK: store i32 %[[l2]], i32* %{{[0-9]+}}, align 4, !alias.scope ![[m3]], !noalias ![[m4]]
+
+define void @parallel_loop(i32 %a) {
+entry:
+ %0 = call i32* @llvm.tpu.inttoptr.pi32(i32 0)
+ %1 = call i32* @llvm.tpu.inttoptr.pi32(i32 32)
+ %2 = call i32* @llvm.tpu.inttoptr.pi32(i32 64)
+ br label %for.body.i
+
+for.body.i:
+ %i = phi i32 [ 0, %entry ], [ %ic, %for.body.i ]
+ %idx0 = getelementptr inbounds i32, i32* %0, i32 0
+ %3 = load i32, i32* %idx0, align 4
+ store i32 %3, i32* %1, align 4
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !0
+ %idx1 = getelementptr inbounds i32, i32* %0, i32 32
+ %4 = load i32, i32* %idx1, align 4
+ store i32 %4, i32* %2, align 4
+
+ %ic = add i32 %i, 1
+ %cmp.i = icmp slt i32 %ic, %a
+ br i1 %cmp.i, label %for.body.i, label %exit, !llvm.loop !1
+
+exit:
+ ret void
+}
+
+; CHECK-LABEL: @parallel_loop_multiblock
+; CHECK: %[[l3:[0-9]+]] = load i32, i32* %0, align 4, !alias.scope ![[m5:[0-9]+]], !noalias ![[m6:[0-9]+]]
+; CHECK: store i32 %[[l3]], i32* %{{[0-9]+}}, align 4, !alias.scope ![[m5]], !noalias ![[m6]]
+; CHECK: %[[l4:[0-9]+]] = load i32, i32* %idx1, align 4, !alias.scope ![[m7:[0-9]+]], !noalias ![[m8:[0-9]+]]
+; CHECK: store i32 %[[l4]], i32* %{{[0-9]+}}, align 4, !alias.scope ![[m7]], !noalias ![[m8]]
+
+define void @parallel_loop_multiblock(i32 %a, i32 %b) {
+entry:
+ %0 = call i32* @llvm.tpu.inttoptr.pi32(i32 0)
+ %1 = call i32* @llvm.tpu.inttoptr.pi32(i32 32)
+ %2 = call i32* @llvm.tpu.inttoptr.pi32(i32 64)
+ br label %for.body.i
+
+for.body.i:
+ %i = phi i32 [ 0, %entry ], [ %ic, %for.body.loop.i ]
+ %idx0 = getelementptr inbounds i32, i32* %0, i32 0
+ %3 = load i32, i32* %idx0, align 4
+ store i32 %3, i32* %1, align 4
+ %ic1 = add i32 %i, 1
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !0
+ %cmp.side.i = icmp slt i32 %ic1, %b
+ br i1 %cmp.side.i, label %for.body.side.i, label %for.body.loop.i, !llvm.loop !1
+
+for.body.side.i:
+ %idx1 = getelementptr inbounds i32, i32* %0, i32 32
+ %4 = load i32, i32* %idx1, align 4
+ store i32 %4, i32* %2, align 4
+ br label %for.body.loop.i
+
+for.body.loop.i:
+ %ic = add i32 %ic1, 1
+ %cmp.i = icmp slt i32 %ic, %a
+ br i1 %cmp.i, label %for.body.i, label %exit, !llvm.loop !1
+
+exit:
+ ret void
+}
+
+; CHECK-LABEL: @parallel_loop_prop_meta
+; CHECK: %cmp.side.i =
+; CHECK-NEXT: br i1 {{.*}} !llvm.loop ![[m9:[0-9]+]]
+; CHECK: br label {{.*}} !llvm.loop ![[m9]]
+; CHECK: %cmp.i =
+; CHECK-NEXT: br i1 {{.*}} !llvm.loop ![[m9]]
+
+define void @parallel_loop_prop_meta(i32 %a, i32 %b) {
+entry:
+ %0 = call i32* @llvm.tpu.inttoptr.pi32(i32 0)
+ %1 = call i32* @llvm.tpu.inttoptr.pi32(i32 32)
+ %2 = call i32* @llvm.tpu.inttoptr.pi32(i32 64)
+ br label %for.body.i
+
+for.body.i:
+ %i = phi i32 [ 0, %entry ], [ %ic, %for.body.loop.i ]
+ %idx0 = getelementptr inbounds i32, i32* %0, i32 0
+ %3 = load i32, i32* %idx0, align 4
+ store i32 %3, i32* %1, align 4
+ %ic1 = add i32 %i, 1
+ %cmp.side.i = icmp slt i32 %ic1, %b
+ br i1 %cmp.side.i, label %for.body.side.i, label %for.body.loop.i, !llvm.loop !1
+
+for.body.side.i:
+ %idx1 = getelementptr inbounds i32, i32* %0, i32 32
+ %4 = load i32, i32* %idx1, align 4
+ store i32 %4, i32* %2, align 4
+ br label %for.body.loop.i
+
+for.body.loop.i:
+ %ic = add i32 %ic1, 1
+ %cmp.i = icmp slt i32 %ic, %a
+ br i1 %cmp.i, label %for.body.i, label %exit, !llvm.loop !1
+
+exit:
+ ret void
+}
+
+; Tests that the llvm.loop metadata propagates to the inner loop.
+
+; CHECK-LABEL: @parallel_loop_prop_meta_nested_outer
+; CHECK: %cmp.side.i =
+; CHECK-NEXT: br i1 {{.*}} !llvm.loop ![[m9:[0-9]+]]
+; CHECK: %cmp.inner.i =
+; CHECK: br i1 {{.*}} !llvm.loop ![[m9]]
+; CHECK: %cmp.i =
+; CHECK-NEXT: br i1 {{.*}} !llvm.loop ![[m9]]
+
+define void @parallel_loop_prop_meta_nested_outer(i32 %a, i32 %b) {
+entry:
+ %0 = call i32* @llvm.tpu.inttoptr.pi32(i32 0)
+ %1 = call i32* @llvm.tpu.inttoptr.pi32(i32 32)
+ %2 = call i32* @llvm.tpu.inttoptr.pi32(i32 64)
+ br label %for.body.i
+
+for.body.i:
+ %i = phi i32 [ 0, %entry ], [ %ic, %for.body.loop.i ]
+ %idx0 = getelementptr inbounds i32, i32* %0, i32 0
+ %3 = load i32, i32* %idx0, align 4
+ store i32 %3, i32* %1, align 4
+ %ic1 = add i32 %i, 1
+ %cmp.side.i = icmp slt i32 %ic1, %b
+ br i1 %cmp.side.i, label %for.body.inner.i, label %for.body.loop.i
+
+for.body.inner.i:
+ %ii = phi i32 [ 0, %for.body.i ], [ %ic2, %for.body.inner.i ]
+ %idx1 = getelementptr inbounds i32, i32* %0, i32 32
+ %4 = load i32, i32* %idx1, align 4
+ store i32 %4, i32* %2, align 4
+ %ic2 = add i32 %ii, 1
+ %cmp.inner.i = icmp slt i32 %ic2, %4
+ br i1 %cmp.inner.i, label %for.body.inner.i, label %for.body.loop.i
+
+for.body.loop.i:
+ %ic = add i32 %ic1, 1
+ %cmp.i = icmp slt i32 %ic, %a
+ br i1 %cmp.i, label %for.body.i, label %exit, !llvm.loop !1
+
+exit:
+ ret void
+}
+
+; Tests that the llvm.loop metadata does not propagate to the inner loop,
+; because there already is llvm.loop metadata attached.
+
+; CHECK-LABEL: @parallel_loop_prop_meta_nested_outer_not
+; CHECK: %cmp.side.i =
+; CHECK-NEXT: br i1 {{.*}} !llvm.loop ![[m9:[0-9]+]]
+; CHECK: %cmp.inner.i =
+; TODO(b/185953953): The inner loop still ends up getting the
+; llvm.loop metadata from the outer loop, due to SimplifyCFGPass.
+; CHECK-NOT-DISABLED: br i1 {{.*}} !llvm.loop ![[m9]]
+; CHECK: %cmp.i =
+; CHECK-NEXT: br i1 {{.*}} !llvm.loop ![[m9]]
+
+define void @parallel_loop_prop_meta_nested_outer_not(i32 %a, i32 %b) {
+entry:
+ %0 = call i32* @llvm.tpu.inttoptr.pi32(i32 0)
+ %1 = call i32* @llvm.tpu.inttoptr.pi32(i32 32)
+ %2 = call i32* @llvm.tpu.inttoptr.pi32(i32 64)
+ br label %for.body.i
+
+for.body.i:
+ %i = phi i32 [ 0, %entry ], [ %ic, %for.body.loop.i ]
+ %idx0 = getelementptr inbounds i32, i32* %0, i32 0
+ %3 = load i32, i32* %idx0, align 4
+ store i32 %3, i32* %1, align 4
+ %ic1 = add i32 %i, 1
+ %cmp.side.i = icmp slt i32 %ic1, %b
+ br i1 %cmp.side.i, label %for.body.inner.i, label %for.body.loop.i
+
+for.body.inner.i:
+ %ii = phi i32 [ 0, %for.body.i ], [ %ic2, %for.body.inner.i ]
+ %idx1 = getelementptr inbounds i32, i32* %0, i32 32
+ %4 = load i32, i32* %idx1, align 4
+ store i32 %4, i32* %2, align 4
+ %ic2 = add i32 %ii, 1
+ %cmp.inner.i = icmp slt i32 %ic2, %4
+ br i1 %cmp.inner.i, label %for.body.inner.i, label %for.body.loop.i, !llvm.loop !7
+
+for.body.loop.i:
+ %ic = add i32 %ic1, 1
+ %cmp.i = icmp slt i32 %ic, %a
+ br i1 %cmp.i, label %for.body.i, label %exit, !llvm.loop !1
+
+exit:
+ ret void
+}
+
+; Tests that the llvm.loop metadata does not propagate to the outer loop.
+
+; CHECK-LABEL: @parallel_loop_prop_meta_nested_inner
+; CHECK: %cmp.side.i =
+; CHECK-NOT: br i1 {{.*}} !llvm.loop ![[m9:[0-9]+]]
+; CHECK: %cmp.inner.i =
+; CHECK: br i1 {{.*}} !llvm.loop ![[m9]]
+; CHECK: %cmp.i =
+; CHECK-NOT: br i1 {{.*}} !llvm.loop ![[m9]]
+
+define void @parallel_loop_prop_meta_nested_inner(i32 %a, i32 %b) {
+entry:
+ %0 = call i32* @llvm.tpu.inttoptr.pi32(i32 0)
+ %1 = call i32* @llvm.tpu.inttoptr.pi32(i32 32)
+ %2 = call i32* @llvm.tpu.inttoptr.pi32(i32 64)
+ br label %for.body.i
+
+for.body.i:
+ %i = phi i32 [ 0, %entry ], [ %ic, %for.body.loop.i ]
+ %idx0 = getelementptr inbounds i32, i32* %0, i32 0
+ %3 = load i32, i32* %idx0, align 4
+ store i32 %3, i32* %1, align 4
+ %ic1 = add i32 %i, 1
+ %cmp.side.i = icmp slt i32 %ic1, %b
+ br i1 %cmp.side.i, label %for.body.inner.i, label %for.body.loop.i
+
+for.body.inner.i:
+ %ii = phi i32 [ 0, %for.body.i ], [ %ic2, %for.body.inner.i ]
+ %idx1 = getelementptr inbounds i32, i32* %0, i32 32
+ %4 = load i32, i32* %idx1, align 4
+ store i32 %4, i32* %2, align 4
+ %ic2 = add i32 %ii, 1
+ %cmp.inner.i = icmp slt i32 %ic2, %4
+ br i1 %cmp.inner.i, label %for.body.inner.i, label %for.body.loop.i, !llvm.loop !1
+
+for.body.loop.i:
+ %ic = add i32 %ic1, 1
+ %cmp.i = icmp slt i32 %ic, %a
+ br i1 %cmp.i, label %for.body.i, label %exit
+
+exit:
+ ret void
+}
+
+; Tests that alias.scope and noalias metadata is propagated as expected when
+; the tpu-loop-parallel pass is enabled and that .np intrinsics are exempt.
+
+; CHECK-LABEL: @parallel_loop_np
+; CHECK-LABEL: for.body.i:
+; CHECK: {{.*}} phi i32
+; CHECK-NOT: {{.*}} @llvm.tpu.vld.msk.idx.np.v8f32.v8i1{{.*}}, !alias.scope !{{[0-9]+}}, !noalias !{{[0-9]+}}
+; CHECK-NOT: {{.*}} @llvm.tpu.vld.msk.idx.np.v8i32.v8i1{{.*}}, !alias.scope !{{[0-9]+}}, !noalias !{{[0-9]+}}
+; CHECK-NOT: {{.*}} @llvm.tpu.vst.msk.idx.add.np.v8i1.p201v8f32.v8f32{{.*}}, !alias.scope !{{[0-9]+}}, !noalias !{{[0-9]+}}
+; CHECK-NOT: {{.*}} @llvm.tpu.vst.msk.idx.add.np.v8i1.p201v8i32.v8i32{{.*}}, !alias.scope !{{[0-9]+}}, !noalias !{{[0-9]+}}
+; CHECK-NOT: {{.*}} @llvm.tpu.vst.msk.idx.ret.add.np.v8f32.v8i1.p201v8f32{{.*}}, !alias.scope !{{[0-9]+}}, !noalias !{{[0-9]+}}
+; CHECK-NOT: {{.*}} @llvm.tpu.vst.msk.idx.ret.add.np.v8i32.v8i1.p201v8i32{{.*}}, !alias.scope !{{[0-9]+}}, !noalias !{{[0-9]+}}
+
+define void @parallel_loop_np(i32 %a, <8 x i1> %m, <8 x i32> addrspace(201)* %basei, <8 x float> addrspace(201)* %basef, <8 x i32> %off) {
+entry:
+ br label %for.body.i
+
+for.body.i:
+ %i = phi i32 [ 0, %entry ], [ %ic, %for.body.i ]
+ %0 = tail call <8 x float> @llvm.tpu.vld.msk.idx.np.v8f32.v8i1.p201v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %basef, <8 x i32> %off)
+ %1 = tail call <8 x i32> @llvm.tpu.vld.msk.idx.np.v8i32.v8i1.p201v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %basei, <8 x i32> %off)
+ tail call void @llvm.tpu.vst.msk.idx.add.np.v8i1.p201v8f32.v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %basef, <8 x i32> %off, <8 x float> %0)
+ tail call void @llvm.tpu.vst.msk.idx.add.np.v8i1.p201v8i32.v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %basei, <8 x i32> %off, <8 x i32> %1)
+ %2 = tail call <8 x float> @llvm.tpu.vst.msk.idx.ret.add.np.v8f32.v8i1.p201v8f32.v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %basef, <8 x i32> %off, <8 x float> %0)
+ %3 = tail call <8 x i32> @llvm.tpu.vst.msk.idx.ret.add.np.v8i32.v8i1.p201v8i32.v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %basei, <8 x i32> %off, <8 x i32> %1)
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !0
+ %4 = tail call <8 x float> @llvm.tpu.vld.msk.idx.np.v8f32.v8i1.p201v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %basef, <8 x i32> %off)
+ %5 = tail call <8 x i32> @llvm.tpu.vld.msk.idx.np.v8i32.v8i1.p201v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %basei, <8 x i32> %off)
+ tail call void @llvm.tpu.vst.msk.idx.add.np.v8i1.p201v8f32.v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %basef, <8 x i32> %off, <8 x float> %4)
+ tail call void @llvm.tpu.vst.msk.idx.add.np.v8i1.p201v8i32.v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %basei, <8 x i32> %off, <8 x i32> %5)
+ %6 = tail call <8 x float> @llvm.tpu.vst.msk.idx.ret.add.np.v8f32.v8i1.p201v8f32.v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %basef, <8 x i32> %off, <8 x float> %4)
+ %7 = tail call <8 x i32> @llvm.tpu.vst.msk.idx.ret.add.np.v8i32.v8i1.p201v8i32.v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %basei, <8 x i32> %off, <8 x i32> %5)
+
+ %ic = add i32 %i, 1
+ %cmp.i = icmp slt i32 %ic, %a
+ br i1 %cmp.i, label %for.body.i, label %exit, !llvm.loop !1
+
+exit:
+ ret void
+}
+
+
+; Tests that alias.scope and noalias metadata is propagated as expected when
+; the tpu-loop-parallel pass is enabled and .np intrinsics are exempt, but
+; there is noalias metadata w.r.t. regular loads/stores. The test does not
+; inspect the metadata itself; it only checks that it is present.
+
+; CHECK-LABEL: @parallel_loop_np_mix
+; CHECK-LABEL: for.body.i:
+; CHECK: {{.*}} phi i32
+; CHECK: %[[l0:[l0-9]+]] = load i32, i32* %0, align 4, !alias.scope ![[m0:[0-9]+]], !noalias ![[m1:[0-9]+]]
+; CHECK: store i32 %[[l0]], i32* %{{[0-9]+}}, align 4, !alias.scope ![[m0]], !noalias ![[m1]]
+; CHECK: {{.*}} @llvm.tpu.vld.msk.idx.np.v8f32.v8i1{{.*}}, !alias.scope !{{[0-9]+}}, !noalias !{{[0-9]+}}
+; CHECK: {{.*}} @llvm.tpu.vld.msk.idx.np.v8i32.v8i1{{.*}}, !alias.scope !{{[0-9]+}}, !noalias !{{[0-9]+}}
+; CHECK: {{.*}} @llvm.tpu.vst.msk.idx.add.np.v8i1.p201v8f32.v8f32{{.*}}, !alias.scope !{{[0-9]+}}, !noalias !{{[0-9]+}}
+; CHECK: {{.*}} @llvm.tpu.vst.msk.idx.add.np.v8i1.p201v8i32.v8i32{{.*}}, !alias.scope !{{[0-9]+}}, !noalias !{{[0-9]+}}
+; CHECK: {{.*}} @llvm.tpu.vst.msk.idx.ret.add.np.v8f32.v8i1.p201v8f32{{.*}}, !alias.scope !{{[0-9]+}}, !noalias !{{[0-9]+}}
+; CHECK: {{.*}} @llvm.tpu.vst.msk.idx.ret.add.np.v8i32.v8i1.p201v8i32{{.*}}, !alias.scope !{{[0-9]+}}, !noalias !{{[0-9]+}}
+; CHECK: %[[l1:[l0-9]+]] = load i32, i32* %idx1, align 4, !alias.scope ![[m2:[0-9]+]], !noalias ![[m3:[0-9]+]]
+; CHECK: store i32 %[[l1]], i32* %{{[0-9]+}}, align 4, !alias.scope ![[m2]], !noalias ![[m3]]
+; CHECK: {{.*}} @llvm.tpu.vld.msk.idx.np.v8f32.v8i1{{.*}}, !alias.scope !{{[0-9]+}}, !noalias !{{[0-9]+}}
+; CHECK: {{.*}} @llvm.tpu.vld.msk.idx.np.v8i32.v8i1{{.*}}, !alias.scope !{{[0-9]+}}, !noalias !{{[0-9]+}}
+; CHECK: {{.*}} @llvm.tpu.vst.msk.idx.add.np.v8i1.p201v8f32.v8f32{{.*}}, !alias.scope !{{[0-9]+}}, !noalias !{{[0-9]+}}
+; CHECK: {{.*}} @llvm.tpu.vst.msk.idx.add.np.v8i1.p201v8i32.v8i32{{.*}}, !alias.scope !{{[0-9]+}}, !noalias !{{[0-9]+}}
+; CHECK: {{.*}} @llvm.tpu.vst.msk.idx.ret.add.np.v8f32.v8i1.p201v8f32{{.*}}, !alias.scope !{{[0-9]+}}, !noalias !{{[0-9]+}}
+; CHECK: {{.*}} @llvm.tpu.vst.msk.idx.ret.add.np.v8i32.v8i1.p201v8i32{{.*}}, !alias.scope !{{[0-9]+}}, !noalias !{{[0-9]+}}
+
+define void @parallel_loop_np_mix(i32 %a, <8 x i1> %m, <8 x i32> addrspace(201)* %basei, <8 x float> addrspace(201)* %basef, <8 x i32> %off) {
+entry:
+ %0 = call i32* @llvm.tpu.inttoptr.pi32(i32 0)
+ %1 = call i32* @llvm.tpu.inttoptr.pi32(i32 32)
+ %2 = call i32* @llvm.tpu.inttoptr.pi32(i32 64)
+ br label %for.body.i
+
+for.body.i:
+ %i = phi i32 [ 0, %entry ], [ %ic, %for.body.i ]
+ %idx0 = getelementptr inbounds i32, i32* %0, i32 0
+ %l0 = load i32, i32* %idx0, align 4
+ store i32 %l0, i32* %1, align 4
+ %3 = tail call <8 x float> @llvm.tpu.vld.msk.idx.np.v8f32.v8i1.p201v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %basef, <8 x i32> %off)
+ %4 = tail call <8 x i32> @llvm.tpu.vld.msk.idx.np.v8i32.v8i1.p201v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %basei, <8 x i32> %off)
+ tail call void @llvm.tpu.vst.msk.idx.add.np.v8i1.p201v8f32.v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %basef, <8 x i32> %off, <8 x float> %3)
+ tail call void @llvm.tpu.vst.msk.idx.add.np.v8i1.p201v8i32.v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %basei, <8 x i32> %off, <8 x i32> %4)
+ %5 = tail call <8 x float> @llvm.tpu.vst.msk.idx.ret.add.np.v8f32.v8i1.p201v8f32.v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %basef, <8 x i32> %off, <8 x float> %3)
+ %6 = tail call <8 x i32> @llvm.tpu.vst.msk.idx.ret.add.np.v8i32.v8i1.p201v8i32.v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %basei, <8 x i32> %off, <8 x i32> %4)
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !0
+ %idx1 = getelementptr inbounds i32, i32* %0, i32 8
+ %l1 = load i32, i32* %idx1, align 4
+ store i32 %l1, i32* %2, align 4
+ %7 = tail call <8 x float> @llvm.tpu.vld.msk.idx.np.v8f32.v8i1.p201v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %basef, <8 x i32> %off)
+ %8 = tail call <8 x i32> @llvm.tpu.vld.msk.idx.np.v8i32.v8i1.p201v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %basei, <8 x i32> %off)
+ tail call void @llvm.tpu.vst.msk.idx.add.np.v8i1.p201v8f32.v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %basef, <8 x i32> %off, <8 x float> %7)
+ tail call void @llvm.tpu.vst.msk.idx.add.np.v8i1.p201v8i32.v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %basei, <8 x i32> %off, <8 x i32> %8)
+ %9 = tail call <8 x float> @llvm.tpu.vst.msk.idx.ret.add.np.v8f32.v8i1.p201v8f32.v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %basef, <8 x i32> %off, <8 x float> %7)
+ %10 = tail call <8 x i32> @llvm.tpu.vst.msk.idx.ret.add.np.v8i32.v8i1.p201v8i32.v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %basei, <8 x i32> %off, <8 x i32> %8)
+
+ %ic = add i32 %i, 1
+ %cmp.i = icmp slt i32 %ic, %a
+ br i1 %cmp.i, label %for.body.i, label %exit, !llvm.loop !1
+
+exit:
+ ret void
+}
+
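+; Loop metadata used above: !1 is the loop ID carrying the
+; llvm.loop.parallel_accesses entry (access group !0) plus unroll and
+; vectorize hints, while !7 is a bare loop ID used by
+; parallel_loop_prop_meta_nested_outer_not to block propagation.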
+!0 = distinct !{}
+!1 = distinct !{!1, !2, !3, !4, !5, !6}
+!2 = !{!"llvm.loop.parallel_accesses", !0}
+!3 = !{!"llvm.loop.unroll.disable"}
+!4 = !{!"llvm.loop.vectorize.width", i32 1}
+!5 = !{!"llvm.loop.interleave.count", i32 1}
+!6 = !{!"llvm.loop.vectorize.enable", i1 true}
+!7 = distinct !{!7}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/loops_bc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/loops_bc.ll
new file mode 100644
index 0000000..2f6b479
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/loops_bc.ll
@@ -0,0 +1,93 @@
+; RUN: llc < %s -march=googletpu -mcpu=barnacore-cc-pf -disable-cgp \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+declare void @llvm.tpu.bc.loop.start(i32)
+declare i1 @llvm.tpu.bc.loop.end()
+declare <8 x float> addrspace(207)* @llvm.tpu.inttoptr.p207v8f32(i32) nounwind
+
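+; Each loop below is bracketed by llvm.tpu.bc.loop.start and
+; llvm.tpu.bc.loop.end; the CHECK lines match the resulting loop_start
+; instruction and the trailing #HALT.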
+; CHECK: loop_empty:
+; CHECK: _ = loop_start $0x1, $0x1
+; CHECK: // Block address taken
+; CHECK: _ = #HALT
+define void @loop_empty() {
+ call void @llvm.tpu.bc.loop.start(i32 1)
+ br label %ls
+
+ls:
+ %cont = call i1 @llvm.tpu.bc.loop.end()
+ br i1 %cont, label %ls, label %cnt
+
+cnt:
+ ret void
+}
+
+; CHECK: loop_nonempty:
+; CHECK: _ = loop_start $0x1, $0x0
+; CHECK: { v0 = vld.f32 [bmem:$0x2a] }
+; CHECK: { [bmem:s0] = vst.f32 v0 }
+; CHECK: _ = #HALT
+define void @loop_nonempty(<8 x float> addrspace(207)* %addr) {
+ call void @llvm.tpu.bc.loop.start(i32 0)
+ br label %ls
+
+ls:
+ %a = call <8 x float> addrspace(207)* @llvm.tpu.inttoptr.p207v8f32(i32 42)
+ %b = load <8 x float>, <8 x float> addrspace(207)* %a
+ store <8 x float> %b, <8 x float> addrspace(207)* %addr
+ %cont = call i1 @llvm.tpu.bc.loop.end()
+ br i1 %cont, label %ls, label %cnt
+
+cnt:
+ ret void
+}
+
+; CHECK: loop_aggphi:
+; CHECK: _ = loop_start $0x1, $0x0
+; CHECK: { _ = vnop }
+; CHECK: { v0.ali = vld.f32 [bmem:$0x2a] }
+; CHECK: _ = #HALT
+declare [6 x <8 x float>] @llvm.tpu.bc.insertvalue.loopindex([6 x <8 x float>], <8 x float>)
+define [6 x <8 x float>] @loop_aggphi([6 x <8 x float>] %agg, <8 x float> addrspace(207)* %addr) {
+entry:
+ call void @llvm.tpu.bc.loop.start(i32 0)
+ br label %ls
+
+ls:
+ %aggphi = phi [6 x <8 x float>] [ %agg, %entry ], [ %aggupdate, %ls ]
+ %a = call <8 x float> addrspace(207)* @llvm.tpu.inttoptr.p207v8f32(i32 42)
+ %b = load <8 x float>, <8 x float> addrspace(207)* %a
+ %aggupdate = call [6 x <8 x float>] @llvm.tpu.bc.insertvalue.loopindex([6 x <8 x float>] %aggphi, <8 x float> %b)
+ %cont = call i1 @llvm.tpu.bc.loop.end()
+ br i1 %cont, label %ls, label %cnt
+
+cnt:
+ ret [6 x <8 x float>] %aggupdate
+}
+
+declare <8 x float> @llvm.tpu.bc.load.aliaddr(<8 x float> addrspace(207)*, i32) argmemonly
+declare void @llvm.tpu.bc.store.concat.aliaddr(<8 x float>, i32) inaccessiblememonly
+declare void @llvm.tpu.bc.shift.aliaddr(i32) inaccessiblememonly
+
+; CHECK: loop_vm:
+; CHECK: v{{[0-9]+}} = vld.f32 [bmem:s0] ali_addr:$0x1;
+; CHECK: vm0 = veq.f32 ps:$1 v{{[0-9]+}}, $0.0;
+; CHECK: v{{[0-9]+}} = vsel ps:$2 vm0, $0x0, v{{[0-9]+}}
+; CHECK: (concat_reg) = vst.f32 ps:$3 v{{[0-9]+}} ali_addr:$0x1;
+; CHECK: (cdfifo_reg) = shift ps:$3 (concat_reg) aliaddr:$0x1 }
+define void @loop_vm([6 x <8 x float>] %agg, <8 x float> addrspace(207)* %addr, <8 x float> %b) {
+entry:
+ call void @llvm.tpu.bc.loop.start(i32 0)
+ br label %ls
+
+ls:
+ %weight = call <8 x float> @llvm.tpu.bc.load.aliaddr(<8 x float> addrspace(207)* %addr, i32 1)
+ %c = fcmp oeq <8 x float> %weight, zeroinitializer
+ %r = select <8 x i1> %c, <8 x float> zeroinitializer, <8 x float> %b
+ call void @llvm.tpu.bc.store.concat.aliaddr(<8 x float> %r, i32 1)
+ call void @llvm.tpu.bc.shift.aliaddr(i32 1)
+ %cont = call i1 @llvm.tpu.bc.loop.end()
+ br i1 %cont, label %ls, label %cnt
+cnt:
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/mask_pred_spill_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/mask_pred_spill_sc.ll
new file mode 100644
index 0000000..5c97979
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/mask_pred_spill_sc.ll
@@ -0,0 +1,61 @@
+; RUN: llc -O0 -o - %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+define void @mask_spill(<8 x i1> %arg) {
+; CHECK-LABEL: mask_spill:
+; CHECK: { v0 = vimm.s32 $0x0 }
+; CHECK-NEXT: { v0 = vsel vm0, $0xffffffff, v0 }
+; CHECK-NEXT: { [tilespmem:$0xc0] = vst v0 }
+; CHECK-NEXT: { lr = call fun }
+; CHECK-NEXT: { [[x:v[0-9]+]] = vld [tilespmem:$0xc0] }
+; CHECK: { vm0 = vne.s32 [[x]], $0x0 }
+; CHECK-NEXT: { lr = call fun2 }
+; CHECK-NEXT: { _ = shalt }
+entry:
+ call void @fun()
+ call void @fun2(<8 x i1> %arg)
+ ret void
+}
+
+define void @pred_spill(i1 %arg) {
+; CHECK-LABEL: pred_spill:
+; CHECK: { s0 = simm.s32 @!p0 $0x0 }
+; CHECK-NEXT: { s0 = simm.s32 @p0 $0x1 }
+; CHECK-NEXT: { [smem:$0xc7] = sst s0 }
+; CHECK-NEXT: { lr = call fun }
+; CHECK-NEXT: { [[x:s[0-9]+]] = sld [smem:$0xc7] }
+; CHECK-NEXT: _ = snop }
+; CHECK-NEXT: _ = snop }
+; CHECK-NEXT: { p0 = seq.s32 [[x]], $0x1 }
+; CHECK-NEXT: { lr = call fun3 }
+; CHECK-NEXT: { _ = shalt }
+entry:
+ call void @fun()
+ call void @fun3(i1 %arg)
+ ret void
+}
+
+declare i32* @llvm.tpu.inttoptr.p0i32(i32)
+declare void @fun()
+declare void @fun2(<8 x i1> %m)
+declare void @fun3(i1 %p)
+
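+; Per-function spill ranges: the !*.funcs.spill lists name @mask_spill and
+; @pred_spill, and the parallel !*.ranges.spill.start/limit lists give each a
+; 100..200 range, which is why the spill slots above ($0xc0 = 192 and
+; $0xc7 = 199) land inside it. Pairing the lists positionally is an inference
+; from their layout.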
+!smem.funcs.spill = !{!0, !1}
+!smem.ranges.spill.start = !{!2, !2}
+!smem.ranges.spill.limit = !{!3, !3}
+
+!tilespmem.funcs.spill = !{!0, !1}
+!tilespmem.ranges.spill.start = !{!2, !2}
+!tilespmem.ranges.spill.limit = !{!3, !3}
+
+!vmem.funcs.spill = !{!0, !1}
+!vmem.ranges.spill.start = !{!2, !2}
+!vmem.ranges.spill.limit = !{!3, !3}
+
+!0 = !{void (<8 x i1>)* @mask_spill}
+!1 = !{void (i1)* @pred_spill}
+!2 = !{i32 100}
+!3 = !{i32 200}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/mask_pred_spill_tc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/mask_pred_spill_tc.ll
new file mode 100644
index 0000000..a0d841f
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/mask_pred_spill_tc.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 -o - %s -mcpu=tensorcore-jf -asm-verbose=false -disable-cgp -tpu-skip-fast-opt | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+define void @mask_spill(<1024 x i1> %arg) {
+; CHECK-LABEL: mask_spill:
+; CHECK: { v0 = vimm.s32 $0x0 }
+; CHECK-NEXT: { v0 = vsel vm0, $0xffffffff, v0 }
+; CHECK-NEXT: { [vmem:$0xc0] = vst v0 }
+; CHECK-NEXT: { _ = vnop;
+; CHECK-NEXT: lr = call fun }
+; CHECK-NEXT: { _ = vnop }
+; CHECK-NEXT: { _ = vnop }
+; CHECK-NEXT: { _ = vnop }
+; CHECK-NEXT: { [[x:v[0-9]+]] = vld [vmem:$0xc0] }
+; CHECK-NEXT: { vm0 = vne.s32 [[x]], $0x0 }
+; CHECK-NEXT: { lr = call fun2 }
+; CHECK-NEXT: { _ = shalt }
+entry:
+ call void @fun()
+ call void @fun2(<1024 x i1> %arg)
+ ret void
+}
+
+define void @pred_spill(i1 %arg) {
+; CHECK-LABEL: pred_spill:
+; CHECK: { s0 = simm.s32 @!p0 $0x0 }
+; CHECK-NEXT: { s0 = simm.s32 @p0 $0x1 }
+; CHECK-NEXT: { [smem:$0xc7] = sst s0 }
+; CHECK-NEXT: { lr = call fun }
+; CHECK-NEXT: { [[x:s[0-9]+]] = sld [smem:$0xc7] }
+; CHECK-NEXT: { _ = vnop }
+; CHECK-NEXT: { p0 = seq.s32 [[x]], $0x1 }
+; CHECK-NEXT: { lr = call fun3 }
+; CHECK-NEXT: { _ = shalt }
+entry:
+ call void @fun()
+ call void @fun3(i1 %arg)
+ ret void
+}
+
+declare i32* @llvm.tpu.inttoptr.p0i32(i32)
+declare void @fun()
+declare void @fun2(<1024 x i1> %m)
+declare void @fun3(i1 %p)
+
+!smem.funcs.spill = !{!0, !1}
+!smem.ranges.spill.start = !{!2, !2}
+!smem.ranges.spill.limit = !{!3, !3}
+
+!tilespmem.funcs.spill = !{!0, !1}
+!tilespmem.ranges.spill.start = !{!2, !2}
+!tilespmem.ranges.spill.limit = !{!3, !3}
+
+!vmem.funcs.spill = !{!0, !1}
+!vmem.ranges.spill.start = !{!2, !2}
+!vmem.ranges.spill.limit = !{!3, !3}
+
+!0 = !{void (<1024 x i1>)* @mask_spill}
+!1 = !{void (i1)* @pred_spill}
+!2 = !{i32 100}
+!3 = !{i32 200}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/memops_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/memops_sc.ll
new file mode 100644
index 0000000..89fc30e
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/memops_sc.ll
@@ -0,0 +1,68 @@
+; RUN: llc -O2 < %s -mcpu=sparsecore-tec-vf -asm-verbose=false \
+; RUN: -opaque-pointers | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; Tests that we can break down llvm.memset and llvm.memcpy.
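+; A 64-byte memset is expected to lower to two 32-byte <8 x i32> stores (the
+; vst pairs at word offsets 0x40 and 0x48 below), and the 16-byte memcpy to
+; four scalar word load/store pairs; this reading of the CHECK lines is an
+; inference, not taken from the CL description.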
+
+declare void @llvm.memset.p201i8.i64(i8 addrspace(201)*, i8, i64, i1)
+declare void @llvm.memcpy.p0.p0.i32(ptr, ptr, i32, i1)
+declare <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32)
+
+; CHECK-LABEL: memset_0:
+; CHECK: v[[v0:[0-9]+]] = vimm.s32 $0x0 }
+; CHECK: [tilespmem:$0x40] = vst v[[v0]]
+; CHECK: [tilespmem:$0x48] = vst v[[v0]]
+define void @memset_0() {
+entry:
+ %0 = tail call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 64)
+ %1 = bitcast <8 x i32> addrspace(201)* %0 to i8 addrspace(201)*
+ call void @llvm.memset.p201i8.i64(i8 addrspace(201)* align 32 %1, i8 0, i64 64, i1 false)
+ ret void
+}
+
+; CHECK-LABEL: memset_15:
+; CHECK: v[[v0:[0-9]+]] = vimm.s32 $-0x1 }
+; CHECK: [tilespmem:$0x40] = vst v[[v0]]
+; CHECK: [tilespmem:$0x48] = vst v[[v0]]
+define void @memset_15() {
+entry:
+ %0 = tail call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 64)
+ %1 = bitcast <8 x i32> addrspace(201)* %0 to i8 addrspace(201)*
+ call void @llvm.memset.p201i8.i64(i8 addrspace(201)* align 32 %1, i8 255, i64 64, i1 false)
+ ret void
+}
+
+; CHECK-LABEL: memset_0_long:
+; CHECK: v[[v0:[0-9]+]] = vimm.s32 $0x0 }
+; CHECK: [tilespmem:$0x40] = vst v[[v0]]
+; CHECK: [tilespmem:$0x48] = vst v[[v0]]
+; CHECK: [tilespmem:$0x50] = vst v[[v0]]
+; CHECK: [tilespmem:$0x58] = vst v[[v0]]
+define void @memset_0_long() {
+entry:
+ %0 = tail call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 64)
+ %1 = bitcast <8 x i32> addrspace(201)* %0 to i8 addrspace(201)*
+ call void @llvm.memset.p201i8.i64(i8 addrspace(201)* align 32 %1, i8 0, i64 128, i1 false)
+ ret void
+}
+
+; CHECK-LABEL: memcpy_test:
+; CHECK: s[[s0:[0-9]+]] = sld [smem:s[[sin:[0-9]+]]+$0x0]
+; CHECK: [smem:s[[s4:[0-9]+]]] = sst s[[s0]]
+; CHECK: s[[s1:[0-9]+]] = sld [smem:s[[sin]]+$0x1]
+; CHECK: s[[s5:[0-9]+]] = sadd.s32 $0x1, s[[s4]]
+; CHECK: [smem:s[[s5]]] = sst s[[s1]]
+; CHECK: s[[s2:[0-9]+]] = sld [smem:s[[sin]]+$0x2]
+; CHECK: s[[s5:[0-9]+]] = sadd.s32 $0x2, s[[s4]]
+; CHECK: [smem:s[[s5]]] = sst s[[s2]]
+; CHECK: s[[s3:[0-9]+]] = sld [smem:s[[sin]]+$0x3]
+; CHECK: s[[s6:[0-9]+]] = sadd.s32 $0x3, s[[s4]]
+; CHECK: [smem:s[[s6]]] = sst s[[s3]]
+define void @memcpy_test(ptr %s, ptr %d) {
+entry:
+ call void @llvm.memcpy.p0.p0.i32(ptr align 4 %d, ptr align 4 %s, i32 16, i1 false)
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/misc_bf16_gl_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/misc_bf16_gl_sc.ll
new file mode 100644
index 0000000..7533ac7
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/misc_bf16_gl_sc.ll
@@ -0,0 +1,35 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-gl -asm-verbose=false \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "googletpu"
+
+declare <8 x i32> @llvm.tpu.vlaneseq.v8i32() readnone
+declare <16 x bfloat> @llvm.tpu.vlaneseq.c.bf16() readnone
+declare <16 x bfloat> @llvm.tpu.vlaneseq.i.bf16() readnone
+
+; CHECK-LABEL: vlaneseq_u32:
+; CHECK: v{{[0-9]+}} = vlaneseq.u32
+define <8 x i32> @vlaneseq_u32() {
+ %a = call <8 x i32> @llvm.tpu.vlaneseq.v8i32() readnone
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: vlaneseq_c_bf16:
+; CHECK: v{{[0-9]+}} = vlaneseq.c.bf16
+define <16 x bfloat> @vlaneseq_c_bf16() {
+ %a = call <16 x bfloat> @llvm.tpu.vlaneseq.c.bf16() readnone
+ ret <16 x bfloat> %a
+}
+
+; CHECK-LABEL: vlaneseq_i_bf16:
+; CHECK: v{{[0-9]+}} = vlaneseq.i.bf16
+define <16 x bfloat> @vlaneseq_i_bf16() {
+ %a = call <16 x bfloat> @llvm.tpu.vlaneseq.i.bf16() readnone
+ ret <16 x bfloat> %a
+}
+
+attributes #0 = { "implicit-section-name"=".text.tile_execute" "target-cpu"="sparsecore-tec-gl" }
+attributes #1 = { "implicit-section-name"=".text.tile_access" "target-cpu"="sparsecore-tac-gl" }
+attributes #2 = { "implicit-section-name"=".text.scs" "target-cpu"="sparsecore-scs-gl" }
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/misc_tc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/misc_tc.ll
new file mode 100644
index 0000000..eabe0a1
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/misc_tc.ll
@@ -0,0 +1,87 @@
+; RUN: llc < %s -mcpu=tensorcore-jf -asm-verbose=false -disable-cgp -print-encoding-annotations | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare void @llvm.tpu.vtrace(i32) nounwind
+
+; CHECK-LABEL: vtrace_r:
+; CHECK: _ = vtrace s{{[0-9]+}}
+define void @vtrace_r(i32 %op) {
+ call void @llvm.tpu.vtrace(i32 %op)
+ ret void
+}
+
+; CHECK-LABEL: vtrace_i:
+; CHECK: _ = vtrace $0x1
+define void @vtrace_i() {
+ call void @llvm.tpu.vtrace(i32 1)
+ ret void
+}
+
+declare void @llvm.tpu.vsettm(i32) nounwind
+
+; CHECK-LABEL: vsettm_r:
+; CHECK: (tm) = vsettm s{{[0-9]+}}
+define void @vsettm_r(i32 %op) {
+ call void @llvm.tpu.vsettm(i32 %op)
+ ret void
+}
+
+; CHECK-LABEL: vsettm_i:
+; CHECK: (tm) = vsettm $0x1
+define void @vsettm_i() {
+ call void @llvm.tpu.vsettm(i32 1)
+ ret void
+}
+
+declare void @llvm.tpu.halt.trap(i1)
+
+; CHECK-LABEL: predicated_shalt:
+; CHECK: _ = shalt @p{{[0-9]+}}
+; CHECK: _ = shalt
+define void @predicated_shalt(i32 %x) {
+ %flag = icmp ne i32 %x, 42
+ call void @llvm.tpu.halt.trap(i1 %flag)
+ ret void
+}
+
+; CHECK-LABEL: false_shalt:
+; CHECK-NOT: _ = shalt @p{{[0-9]+}}
+; CHECK: _ = shalt
+define void @false_shalt(i32 %x) {
+ call void @llvm.tpu.halt.trap(i1 false)
+ ret void
+}
+
+declare void @llvm.tpu.tc.setrngseed(<1024 x i32>) nounwind
+declare <1024 x i32> @llvm.tpu.tc.getrngseed() nounwind
+declare <1024 x i32> @llvm.tpu.tc.vrng() nounwind
+
+; CHECK-LABEL: prng:
+; CHECK: _ = setrngseed v{{[0-9]+}}
+; CHECK-NEXT: vdelay $0x8
+; CHECK-NEXT: v{{[0-9]+}} = getrngseed
+; CHECK-NEXT: v{{[0-9]+}} = vrng.8x128.u32
+; CHECK-NEXT: vdelay $0x6
+; CHECK-NEXT: v{{[0-9]+}} = vrng.8x128.u32
+define void @prng(<1024 x i32> %x) {
+ call void @llvm.tpu.tc.setrngseed(<1024 x i32> %x)
+ %y = call <1024 x i32> @llvm.tpu.tc.getrngseed()
+ %z = call <1024 x i32> @llvm.tpu.tc.vrng()
+ %a = call <1024 x i32> @llvm.tpu.tc.vrng()
+ ret void
+}
+
+declare void @llvm.tpu.dma.vmem.to.hbm.p203i8192(i32 addrspace(204)*, <1024 x i32> addrspace(205)*, i8192 addrspace(203)*, i32) argmemonly nounwind
+declare i8192 addrspace(203)* @llvm.tpu.inttoptr.p203i8192(i32) nounwind readnone
+
+; CHECK-LABEL: dma_vmem_to_hbm_with_gep:
+; CHECK: { (slot_s0) s2 = simm.s32 $0x3d8000 }
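+; The GEP index 4030464 reappears directly as the immediate $0x3d8000, which
+; suggests i8192 elements in the HBM address space are addressed at per-element
+; granularity here; this is inferred from the CHECK line above.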
+define void @dma_vmem_to_hbm_with_gep(<1024 x i32> addrspace(205)* %src, i32 addrspace(204)* %flag) {
+ %base = tail call i8192 addrspace(203)* @llvm.tpu.inttoptr.p203i8192(i32 0)
+ %dest = getelementptr i8192, i8192 addrspace(203)* %base, i32 4030464
+ tail call void @llvm.tpu.dma.vmem.to.hbm.p203i8192(i32 addrspace(204)* %flag, <1024 x i32> addrspace(205)* %src, i8192 addrspace(203)* %dest, i32 4)
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/mxu.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/mxu.ll
new file mode 100644
index 0000000..652d657
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/mxu.ll
@@ -0,0 +1,237 @@
+; RUN: llc < %s -mcpu=tensorcore-jf -asm-verbose=false -disable-cgp | FileCheck %s --check-prefixes CHECK,CHECK-JF
+; RUN: llc < %s -mcpu=tensorcore-pf -asm-verbose=false -disable-cgp | FileCheck %s --check-prefixes CHECK,CHECK-PF
+; REQUIRES: tpu
+
+; Tests MXU intrinsic code generation.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 @llvm.tpu.vmatpush.f32(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.low.f32(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.hi.f32(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.packed.f32(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.xpose.f32(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.low.xpose.f32(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.hi.xpose.f32(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.packed.xpose.f32(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vdwg(i32, i32) nounwind
+declare i32 @llvm.tpu.vdwg.xpose(i32, i32) nounwind
+declare i32 @llvm.tpu.vmatmul.f32(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare {i32, i32, i32} @llvm.tpu.vmatmul.f32.dwg.gsfn(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare {i32, i32, i32} @llvm.tpu.vmatmul.f32.dwg.gsft(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatmul.low.f32(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare {i32, i32, i32} @llvm.tpu.vmatmul.low.f32.dwg.gsfn(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare {i32, i32, i32} @llvm.tpu.vmatmul.low.f32.dwg.gsft(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatmul.hi.f32(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare {i32, i32, i32} @llvm.tpu.vmatmul.hi.f32.dwg.gsfn(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare {i32, i32, i32} @llvm.tpu.vmatmul.hi.f32.dwg.gsft(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatmul.packed.f32(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare {i32, i32, i32} @llvm.tpu.vmatmul.packed.f32.dwg.gsfn(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare {i32, i32, i32} @llvm.tpu.vmatmul.packed.f32.dwg.gsft(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare <1024 x float> @llvm.tpu.vmatres.f32(i32, i32) nounwind
+
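+; The tests below build an all-true <1024 x i1> mask by inserting true into
+; lane 0 and splatting it with a zeroinitializer shufflevector mask; masked
+; variants such as matmulpacked_mask take the mask as an argument instead.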
+; CHECK-LABEL: matpush:
+; CHECK: (gsfn0) = vmatpush.f32 v{{[0-9]+}}
+ define void @matpush(<1024 x float> %v) {
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ %push = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ ret void
+}
+
+; CHECK-LABEL: matpushmxu1:
+; CHECK: (gsfn1) = vmatpush.f32 v{{[0-9]+}}
+ define void @matpushmxu1(<1024 x float> %v) {
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ %push = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %v, <1024 x i1> %mask, i32 1, i32 undef)
+ ret void
+}
+
+; CHECK-LABEL: matpushlow:
+; CHECK: (gsfn0) = vmatpush.low.f32 v{{[0-9]+}}
+ define void @matpushlow(<1024 x float> %v) {
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ %push = call i32 @llvm.tpu.vmatpush.low.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ ret void
+}
+
+; CHECK-LABEL: matpushhi:
+; CHECK: (gsfn0) = vmatpush.hi.f32 v{{[0-9]+}}
+ define void @matpushhi(<1024 x float> %v) {
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ %push = call i32 @llvm.tpu.vmatpush.hi.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ ret void
+}
+
+; CHECK-LABEL: matpushpacked:
+; CHECK: (gsfn0) = vmatpush.packed.f32 v{{[0-9]+}}
+ define void @matpushpacked(<1024 x float> %v) {
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ %push = call i32 @llvm.tpu.vmatpush.packed.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ ret void
+}
+
+; CHECK-LABEL: matpushxpos:
+; CHECK-JF: (gsfn0) = vmatpush.xpose.f32 v{{[0-9]+}}
+; CHECK-PF: (gsft0) = vmatpush.xpose.f32 v{{[0-9]+}}
+ define void @matpushxpos(<1024 x float> %v) {
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ %push = call i32 @llvm.tpu.vmatpush.xpose.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ ret void
+}
+
+; CHECK-LABEL: matpushxposlow:
+; CHECK-JF: (gsfn0) = vmatpush.low.xpose.f32 v{{[0-9]+}}
+; CHECK-PF: (gsft0) = vmatpush.low.xpose.f32 v{{[0-9]+}}
+ define void @matpushxposlow(<1024 x float> %v) {
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ %push = call i32 @llvm.tpu.vmatpush.low.xpose.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ ret void
+}
+
+; CHECK-LABEL: matpushxposhi:
+; CHECK-JF: (gsfn0) = vmatpush.hi.xpose.f32 v{{[0-9]+}}
+; CHECK-PF: (gsft0) = vmatpush.hi.xpose.f32 v{{[0-9]+}}
+ define void @matpushxposhi(<1024 x float> %v) {
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ %push = call i32 @llvm.tpu.vmatpush.hi.xpose.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ ret void
+}
+
+; CHECK-LABEL: matpushxpospacked:
+; CHECK-JF: (gsfn0) = vmatpush.packed.xpose.f32 v{{[0-9]+}}
+; CHECK-PF: (gsft0) = vmatpush.packed.xpose.f32 v{{[0-9]+}}
+ define void @matpushxpospacked(<1024 x float> %v) {
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ %push = call i32 @llvm.tpu.vmatpush.packed.xpose.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ ret void
+}
+
+; CHECK-LABEL: dwg:
+; CHECK: (gmr0) = vdwg.f16 (gsfn0)
+ define void @dwg(<1024 x float> %v) {
+ %d = call i32 @llvm.tpu.vdwg(i32 0, i32 undef)
+ ret void
+}
+
+; CHECK-LABEL: dwg_xpose:
+; CHECK: (gmr0) = vdwg.f16 (gsft0);
+ define void @dwg_xpose(<1024 x float> %v) {
+ %d = call i32 @llvm.tpu.vdwg.xpose(i32 0, i32 undef)
+ ret void
+}
+
+; CHECK-LABEL: matmul:
+; CHECK: (mrf0) = vmatmul.f32 v{{[0-9]+}}
+; CHECK: v{{[0-9]+}} = vmatres.8x128.f32 (mrf0)
+ define <1024 x float> @matmul(<1024 x float> %v) {
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ %matmul = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %matres = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul)
+ ret <1024 x float> %matres
+}
+
+; CHECK-LABEL: matmullow:
+; CHECK: (mrf0) = vmatmul.low.f32 v{{[0-9]+}}
+; CHECK: v{{[0-9]+}} = vmatres.8x128.f32 (mrf0)
+ define <1024 x float> @matmullow(<1024 x float> %v) {
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ %matmul = call i32 @llvm.tpu.vmatmul.low.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %matres = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul)
+ ret <1024 x float> %matres
+}
+
+; CHECK-LABEL: matmulhi:
+; CHECK: (mrf0) = vmatmul.hi.f32 v{{[0-9]+}}
+; CHECK: v{{[0-9]+}} = vmatres.8x128.f32 (mrf0)
+ define <1024 x float> @matmulhi(<1024 x float> %v) {
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ %matmul = call i32 @llvm.tpu.vmatmul.hi.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %matres = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul)
+ ret <1024 x float> %matres
+}
+
+; CHECK-LABEL: matmulpacked:
+; CHECK: (mrf0) = vmatmul.packed.f32 v{{[0-9]+}}
+; CHECK: v{{[0-9]+}} = vmatres.8x128.f32 (mrf0)
+ define <1024 x float> @matmulpacked(<1024 x float> %v) {
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ %matmul = call i32 @llvm.tpu.vmatmul.packed.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %matres1 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul)
+ %matres2 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul)
+ ret <1024 x float> %matres2
+}
+
+; CHECK-LABEL: matmulpacked_mask:
+; CHECK: (mrf0) = vmatmul.packed.msk.f32 vm0, v0
+; CHECK: v{{[0-9]+}} = vmatres.8x128.f32 (mrf0)
+ define <1024 x float> @matmulpacked_mask(<1024 x float> %v, <1024 x i1> %mask) {
+ %matmul = call i32 @llvm.tpu.vmatmul.packed.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %matres1 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul)
+ %matres2 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul)
+ ret <1024 x float> %matres2
+}
+
+; CHECK-LABEL: matmul_dwg:
+; CHECK: ((gmr0), (gsfn0), (mrf0)) = vmatmul.dwg v{{[0-9]+}}
+; CHECK: ((gmr0), (gsft0), (mrf0)) = vmatmul.dwg v{{[0-9]+}}
+; CHECK: ((gmr0), (gsfn0), (mrf0)) = vmatmul.low.dwg v{{[0-9]+}}
+; CHECK: ((gmr0), (gsft0), (mrf0)) = vmatmul.low.dwg v{{[0-9]+}}
+; CHECK: ((gmr0), (gsfn0), (mrf0)) = vmatmul.hi.dwg v{{[0-9]+}}
+; CHECK: ((gmr0), (gsft0), (mrf0)) = vmatmul.hi.dwg v{{[0-9]+}}
+; CHECK: ((gmr0), (gsfn0), (mrf0)) = vmatmul.packed.dwg v{{[0-9]+}}
+; CHECK: ((gmr0), (gsft0), (mrf0)) = vmatmul.packed.dwg v{{[0-9]+}}
+ define <1024 x float> @matmul_dwg(<1024 x float> %v) {
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+
+ %matmulres1 = call {i32, i32, i32} @llvm.tpu.vmatmul.f32.dwg.gsfn(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %mrf1 = extractvalue { i32, i32, i32 } %matmulres1, 0
+ %matres1 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf1)
+
+ %matmulres2 = call {i32, i32, i32} @llvm.tpu.vmatmul.f32.dwg.gsft(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %mrf2 = extractvalue { i32, i32, i32 } %matmulres2, 0
+ %matres2 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf2)
+
+ %matmulres3 = call {i32, i32, i32} @llvm.tpu.vmatmul.low.f32.dwg.gsfn(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %mrf3 = extractvalue { i32, i32, i32 } %matmulres3, 0
+ %matres3 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf3)
+
+ %matmulres4 = call {i32, i32, i32} @llvm.tpu.vmatmul.low.f32.dwg.gsft(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %mrf4 = extractvalue { i32, i32, i32 } %matmulres4, 0
+ %matres4 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf4)
+
+ %matmulres5 = call {i32, i32, i32} @llvm.tpu.vmatmul.hi.f32.dwg.gsfn(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %mrf5 = extractvalue { i32, i32, i32 } %matmulres5, 0
+ %matres5 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf5)
+
+ %matmulres6 = call {i32, i32, i32} @llvm.tpu.vmatmul.hi.f32.dwg.gsft(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %mrf6 = extractvalue { i32, i32, i32 } %matmulres6, 0
+ %matres6 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf6)
+
+ %matmulres7 = call {i32, i32, i32} @llvm.tpu.vmatmul.packed.f32.dwg.gsfn(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %mrf7 = extractvalue { i32, i32, i32 } %matmulres7, 0
+ %matres7 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf7)
+ %matres8 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf7)
+
+ %matmulres9 = call {i32, i32, i32} @llvm.tpu.vmatmul.packed.f32.dwg.gsft(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %mrf9 = extractvalue { i32, i32, i32 } %matmulres9, 0
+ %matres9 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf9)
+ %matres10 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf9)
+
+ ret <1024 x float> %matres10
+}
+
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/mxu_scoreboard.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/mxu_scoreboard.ll
new file mode 100644
index 0000000..96e786b
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/mxu_scoreboard.ll
@@ -0,0 +1,71 @@
+; RUN: llc < %s -mcpu=tensorcore-pf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+; Test mxu scheduling
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 @llvm.tpu.vmatpush.f32(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vdwg(i32, i32) nounwind
+declare i32 @llvm.tpu.vmatmul.f32(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatmul.packed.f32(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare <1024 x float> @llvm.tpu.vmatres.f32(i32, i32) nounwind
+
+; CHECK-LABEL: matpush:
+; CHECK: { (gsfn0) = vmatpush.f32 v0;
+; CHECK-NEXT: _ = vdelay $0x7 }
+; CHECK-NEXT: { (gsfn0) = vmatpush.f32 v1;
+define void @matpush(<1024 x float> %v0, <1024 x float> %v1) {
+ %v = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ %push0 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %v0, <1024 x i1> %mask, i32 0, i32 undef)
+ %push1 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 %push0)
+ ret void
+}
+
+; matmul and matres instructions need to be 8 cycles apart.
+; Between the first matmul and the first matres there should be 83 cycles.
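+; As a rough check of that number (assuming each bundle issues in one cycle):
+; 1 (vmatmul v0) + 7 (vdelay $0x7) + 1 (vmatmul v1) + 74 (vdelay $0x4a) = 83.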
+; CHECK-LABEL: matmul:
+; CHECK: (mrf0) = vmatmul.f32 v0;
+; CHECK-NEXT: _ = vdelay $0x7 }
+; CHECK-NEXT: { (mrf0) = vmatmul.f32 v1;
+; CHECK-NEXT: _ = vdelay $0x4a }
+; CHECK-NEXT: { v{{[0-9]+}} = vmatres.8x128.f32 (mrf0);
+; CHECK-NEXT: _ = vdelay $0x7 }
+; CHECK-NEXT: { v{{[0-9]+}} = vmatres.8x128.f32 (mrf0)
+ define void @matmul(<1024 x float> %v0, <1024 x float> %v1) {
+ %v = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ %matmul = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v0, <1024 x i1> %mask, i32 0, i32 undef)
+ %matres = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul)
+ %matmul2 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 undef)
+ %matres2 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul2)
+ ret void
+}
+
+; matmul.packed instructions need to be 16 cycles apart.
+; Between the first matmul and the first matres there should be 83 cycles.
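+; As a rough check (assuming each bundle issues in one cycle):
+; 1 (vmatmul.packed v0) + 15 (vdelay $0xf) + 1 (vmatmul.packed v1) + 66 (vdelay $0x42) = 83.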
+; CHECK-LABEL: matmulpacked:
+; CHECK: (mrf0) = vmatmul.packed.f32 v0;
+; CHECK-NEXT: _ = vdelay $0xf }
+; CHECK-NEXT: { (mrf0) = vmatmul.packed.f32 v1;
+; CHECK-NEXT: _ = vdelay $0x42 }
+; CHECK-NEXT: { v{{[0-9]+}} = vmatres.8x128.f32 (mrf0);
+; CHECK-NEXT: _ = vdelay $0x7 }
+; CHECK-NEXT: { v{{[0-9]+}} = vmatres.8x128.f32 (mrf0);
+; CHECK-NEXT: _ = vdelay $0x7 }
+; CHECK-NEXT: { v{{[0-9]+}} = vmatres.8x128.f32 (mrf0);
+; CHECK-NEXT: _ = vdelay $0x7 }
+; CHECK-NEXT: { v{{[0-9]+}} = vmatres.8x128.f32 (mrf0)
+ define <1024 x float> @matmulpacked(<1024 x float> %v0, <1024 x float> %v1) {
+ %v = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ %matmul = call i32 @llvm.tpu.vmatmul.packed.f32(<1024 x float> %v0, <1024 x i1> %mask, i32 0, i32 undef)
+ %matres = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul)
+ %matres1 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul)
+ %matmul2 = call i32 @llvm.tpu.vmatmul.packed.f32(<1024 x float> %v1, <1024 x i1> %mask, i32 0, i32 undef)
+ %matres2 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul2)
+ %matres3 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul2)
+ ret <1024 x float> %matres3
+}
\ No newline at end of file
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/mxu_vf.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/mxu_vf.ll
new file mode 100644
index 0000000..11d17e2
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/mxu_vf.ll
@@ -0,0 +1,383 @@
+; RUN: llc < %s -mcpu=tensorcore-vf -asm-verbose=false -disable-cgp | FileCheck %s --check-prefix CHECK
+; REQUIRES: tpu
+
+; Test mxu intrinsics code generation
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 @llvm.tpu.vmatpush.msra(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.if8.bf16.msra(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.bf16.msra(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.bf8.bf16.msra(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.u8.msra(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.s8.msra(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.u4.msra(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.s4.msra(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.msra.xpose(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.if8.bf16.msra.xpose(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.bf16.msra.xpose(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.bf8.bf16.msra.xpose(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.u8.msra.xpose(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.s8.msra.xpose(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.u4.msra.xpose(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.s4.msra.xpose(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.msrb(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.if8.bf16.msrb(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.bf16.msrb(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.bf8.bf16.msrb(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.u8.msrb(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.s8.msrb(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.u4.msrb(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.s4.msrb(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.msrb.xpose(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.if8.bf16.msrb.xpose(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.bf16.msrb.xpose(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.bf8.bf16.msrb.xpose(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.u8.msrb.xpose(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.s8.msrb.xpose(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.u4.msrb.xpose(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatpush.s4.msrb.xpose(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vlgmr.msra(i32, i32) nounwind
+declare i32 @llvm.tpu.vllmr.msra(i32, i32) nounwind
+declare i32 @llvm.tpu.vlgmr.msrb(i32, i32) nounwind
+declare i32 @llvm.tpu.vllmr.msrb(i32, i32) nounwind
+declare i32 @llvm.tpu.vmatmul.f32(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatmul.if8.bf16(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatmul.bf16(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatmul.bf8.bf16(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatmul.u8(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatmul.s8(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatmul.u4(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare i32 @llvm.tpu.vmatmul.s4(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare <1024 x float> @llvm.tpu.vmatres.f32(i32, i32) nounwind
+
+; CHECK-LABEL: matpush:
+; CHECK: (msra0) = vmatpush v{{[0-9]+}}
+; CHECK: (msrb0) = vmatpush v{{[0-9]+}}
+ define void @matpush(<1024 x float> %v) {
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ %push.msra = call i32 @llvm.tpu.vmatpush.msra(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push.msrb = call i32 @llvm.tpu.vmatpush.msrb(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ ret void
+}
+
+; CHECK-LABEL: matpushmxun:
+; CHECK: (msra1) = vmatpush v{{[0-9]+}}
+; CHECK: (msra2) = vmatpush v{{[0-9]+}}
+; CHECK: (msra3) = vmatpush v{{[0-9]+}}
+; CHECK: (msrb1) = vmatpush v{{[0-9]+}}
+; CHECK: (msrb2) = vmatpush v{{[0-9]+}}
+; CHECK: (msrb3) = vmatpush v{{[0-9]+}}
+ define void @matpushmxun(<1024 x float> %v) {
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ %push.msra1 = call i32 @llvm.tpu.vmatpush.msra(<1024 x float> %v, <1024 x i1> %mask, i32 1, i32 undef)
+ %push.msrb1 = call i32 @llvm.tpu.vmatpush.msrb(<1024 x float> %v, <1024 x i1> %mask, i32 1, i32 undef)
+ %push.msra2 = call i32 @llvm.tpu.vmatpush.msra(<1024 x float> %v, <1024 x i1> %mask, i32 2, i32 undef)
+ %push.msrb2 = call i32 @llvm.tpu.vmatpush.msrb(<1024 x float> %v, <1024 x i1> %mask, i32 2, i32 undef)
+ %push.msra3 = call i32 @llvm.tpu.vmatpush.msra(<1024 x float> %v, <1024 x i1> %mask, i32 3, i32 undef)
+ %push.msrb3 = call i32 @llvm.tpu.vmatpush.msrb(<1024 x float> %v, <1024 x i1> %mask, i32 3, i32 undef)
+ ret void
+}
+
+; CHECK-LABEL: matpush_vf_vreg_interpretations:
+; CHECK: (msra0) = vmatpush.if8.bf16 v0
+; CHECK: (msra1) = vmatpush.if8.bf16 v0
+; CHECK: (msrb0) = vmatpush.if8.bf16 v0
+; CHECK: (msrb1) = vmatpush.if8.bf16 v0
+; CHECK: (msra0) = vmatpush.bf16 v0
+; CHECK: (msra1) = vmatpush.bf16 v0
+; CHECK: (msrb0) = vmatpush.bf16 v0
+; CHECK: (msrb1) = vmatpush.bf16 v0
+; CHECK: (msra0) = vmatpush.bf8.bf16 v0
+; CHECK: (msra1) = vmatpush.bf8.bf16 v0
+; CHECK: (msrb0) = vmatpush.bf8.bf16 v0
+; CHECK: (msrb1) = vmatpush.bf8.bf16 v0
+; CHECK: (msra0) = vmatpush.u8 v0
+; CHECK: (msra1) = vmatpush.u8 v0
+; CHECK: (msrb0) = vmatpush.u8 v0
+; CHECK: (msrb1) = vmatpush.u8 v0
+; CHECK: (msra0) = vmatpush.u4 v0
+; CHECK: (msra1) = vmatpush.u4 v0
+; CHECK: (msrb0) = vmatpush.u4 v0
+; CHECK: (msrb1) = vmatpush.u4 v0
+; CHECK: (msra0) = vmatpush.s8 v0
+; CHECK: (msra1) = vmatpush.s8 v0
+; CHECK: (msrb0) = vmatpush.s8 v0
+; CHECK: (msrb1) = vmatpush.s8 v0
+; CHECK: (msra0) = vmatpush.s4 v0
+; CHECK: (msra1) = vmatpush.s4 v0
+; CHECK: (msrb0) = vmatpush.s4 v0
+; CHECK: (msrb1) = vmatpush.s4 v0
+ define void @matpush_vf_vreg_interpretations(<1024 x float> %v) {
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ %push.msra0.if8.bf16 = call i32 @llvm.tpu.vmatpush.if8.bf16.msra(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push.msrb0.if8.bf16 = call i32 @llvm.tpu.vmatpush.if8.bf16.msrb(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push.msra0.bf16 = call i32 @llvm.tpu.vmatpush.bf16.msra(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push.msrb0.bf16 = call i32 @llvm.tpu.vmatpush.bf16.msrb(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push.msra0.bf8.bf16 = call i32 @llvm.tpu.vmatpush.bf8.bf16.msra(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push.msrb0.bf8.bf16 = call i32 @llvm.tpu.vmatpush.bf8.bf16.msrb(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push.msra0.u8 = call i32 @llvm.tpu.vmatpush.u8.msra(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push.msrb0.u8 = call i32 @llvm.tpu.vmatpush.u8.msrb(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push.msra0.u4 = call i32 @llvm.tpu.vmatpush.u4.msra(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push.msrb0.u4 = call i32 @llvm.tpu.vmatpush.u4.msrb(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push.msra0.s8 = call i32 @llvm.tpu.vmatpush.s8.msra(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push.msrb0.s8 = call i32 @llvm.tpu.vmatpush.s8.msrb(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push.msra0.s4 = call i32 @llvm.tpu.vmatpush.s4.msra(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push.msrb0.s4 = call i32 @llvm.tpu.vmatpush.s4.msrb(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %push.msra1.if8.bf16 = call i32 @llvm.tpu.vmatpush.if8.bf16.msra(<1024 x float> %v, <1024 x i1> %mask, i32 1, i32 undef)
+ %push.msrb1.if8.bf16 = call i32 @llvm.tpu.vmatpush.if8.bf16.msrb(<1024 x float> %v, <1024 x i1> %mask, i32 1, i32 undef)
+ %push.msra1.bf16 = call i32 @llvm.tpu.vmatpush.bf16.msra(<1024 x float> %v, <1024 x i1> %mask, i32 1, i32 undef)
+ %push.msrb1.bf16 = call i32 @llvm.tpu.vmatpush.bf16.msrb(<1024 x float> %v, <1024 x i1> %mask, i32 1, i32 undef)
+ %push.msra1.bf8.bf16 = call i32 @llvm.tpu.vmatpush.bf8.bf16.msra(<1024 x float> %v, <1024 x i1> %mask, i32 1, i32 undef)
+ %push.msrb1.bf8.bf16 = call i32 @llvm.tpu.vmatpush.bf8.bf16.msrb(<1024 x float> %v, <1024 x i1> %mask, i32 1, i32 undef)
+ %push.msra1.u8 = call i32 @llvm.tpu.vmatpush.u8.msra(<1024 x float> %v, <1024 x i1> %mask, i32 1, i32 undef)
+ %push.msrb1.u8 = call i32 @llvm.tpu.vmatpush.u8.msrb(<1024 x float> %v, <1024 x i1> %mask, i32 1, i32 undef)
+ %push.msra1.u4 = call i32 @llvm.tpu.vmatpush.u4.msra(<1024 x float> %v, <1024 x i1> %mask, i32 1, i32 undef)
+ %push.msrb1.u4 = call i32 @llvm.tpu.vmatpush.u4.msrb(<1024 x float> %v, <1024 x i1> %mask, i32 1, i32 undef)
+ %push.msra1.s8 = call i32 @llvm.tpu.vmatpush.s8.msra(<1024 x float> %v, <1024 x i1> %mask, i32 1, i32 undef)
+ %push.msrb1.s8 = call i32 @llvm.tpu.vmatpush.s8.msrb(<1024 x float> %v, <1024 x i1> %mask, i32 1, i32 undef)
+ %push.msra1.s4 = call i32 @llvm.tpu.vmatpush.s4.msra(<1024 x float> %v, <1024 x i1> %mask, i32 1, i32 undef)
+ %push.msrb1.s4 = call i32 @llvm.tpu.vmatpush.s4.msrb(<1024 x float> %v, <1024 x i1> %mask, i32 1, i32 undef)
+ ret void
+}
+
+; CHECK-LABEL: dwg:
+; CHECK: (gmr0) = vlgmr (msra0)
+; CHECK: (gmr1) = vlgmr (msra1)
+; CHECK: (gmr2) = vlgmr (msra2)
+; CHECK: (gmr3) = vlgmr (msra3)
+; CHECK: (gmr0) = vlgmr (msrb0)
+; CHECK: (gmr1) = vlgmr (msrb1)
+; CHECK: (gmr2) = vlgmr (msrb2)
+; CHECK: (gmr3) = vlgmr (msrb3)
+; CHECK: (gmr0) = vlgmr (msra0)
+; CHECK: (lmr1) = vllmr (msra1)
+; CHECK: (lmr2) = vllmr (msra2)
+; CHECK: (lmr3) = vllmr (msra3)
+; CHECK: (lmr0) = vllmr (msrb0)
+; CHECK: (lmr1) = vllmr (msrb1)
+; CHECK: (lmr2) = vllmr (msrb2)
+; CHECK: (lmr3) = vllmr (msrb3)
+ define void @dwg(<1024 x float> %v) {
+ %d1 = call i32 @llvm.tpu.vlgmr.msra(i32 0, i32 undef)
+ %d2 = call i32 @llvm.tpu.vlgmr.msra(i32 1, i32 undef)
+ %d3 = call i32 @llvm.tpu.vlgmr.msra(i32 2, i32 undef)
+ %d4 = call i32 @llvm.tpu.vlgmr.msra(i32 3, i32 undef)
+ %d5 = call i32 @llvm.tpu.vlgmr.msrb(i32 0, i32 undef)
+ %d6 = call i32 @llvm.tpu.vlgmr.msrb(i32 1, i32 undef)
+ %d7 = call i32 @llvm.tpu.vlgmr.msrb(i32 2, i32 undef)
+ %d8 = call i32 @llvm.tpu.vlgmr.msrb(i32 3, i32 undef)
+ %d9 = call i32 @llvm.tpu.vlgmr.msra(i32 0, i32 undef)
+ %d10 = call i32 @llvm.tpu.vllmr.msra(i32 1, i32 undef)
+ %d11 = call i32 @llvm.tpu.vllmr.msra(i32 2, i32 undef)
+ %d12 = call i32 @llvm.tpu.vllmr.msra(i32 3, i32 undef)
+ %d13 = call i32 @llvm.tpu.vllmr.msrb(i32 0, i32 undef)
+ %d14 = call i32 @llvm.tpu.vllmr.msrb(i32 1, i32 undef)
+ %d15 = call i32 @llvm.tpu.vllmr.msrb(i32 2, i32 undef)
+ %d16 = call i32 @llvm.tpu.vllmr.msrb(i32 3, i32 undef)
+ ret void
+}
+
+; CHECK-LABEL: matmulvf:
+; CHECK: (mrf0) = vmatmul v0
+; CHECK: (mrf0) = vmatmul.if8.bf16 v0
+; CHECK: (mrf0) = vmatmul.bf16 v0
+; CHECK: (mrf0) = vmatmul.bf8.bf16 v0
+; CHECK: (mrf0) = vmatmul.u8 v0
+; CHECK: (mrf0) = vmatmul.s8 v0
+; CHECK: (mrf0) = vmatmul.u4 v0
+; CHECK: (mrf0) = vmatmul.s4 v0
+; CHECK: v25 = vmatres.8x128 (mrf0)
+; CHECK: v26 = vmatres.8x128 (mrf0)
+; CHECK: v27 = vmatres.8x128 (mrf0)
+; CHECK: v28 = vmatres.8x128 (mrf0)
+; CHECK: v29 = vmatres.8x128 (mrf0)
+; CHECK: v30 = vmatres.8x128 (mrf0)
+; CHECK: v31 = vmatres.8x128 (mrf0)
+; CHECK: v0 = vmatres.8x128 (mrf0)
+ define <1024 x float> @matmulvf(<1024 x float> %v) {
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+
+ %matmul1 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %matres1 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul1)
+ %matmul2 = call i32 @llvm.tpu.vmatmul.if8.bf16(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %matres2 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul2)
+ %matmul3 = call i32 @llvm.tpu.vmatmul.bf16(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %matres3 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul3)
+ %matmul4 = call i32 @llvm.tpu.vmatmul.bf8.bf16(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %matres4 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul4)
+ %matmul5 = call i32 @llvm.tpu.vmatmul.u8(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %matres5 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul5)
+ %matmul6 = call i32 @llvm.tpu.vmatmul.s8(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %matres6 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul6)
+ %matmul7 = call i32 @llvm.tpu.vmatmul.u4(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %matres7 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul7)
+ %matmul8 = call i32 @llvm.tpu.vmatmul.s4(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %matres8 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %matmul8)
+
+ ret <1024 x float> %matres8
+}
+
+
+; fused vmatmul
+declare {i32, i32, i32} @llvm.tpu.vmatmul.f32.lgmr.msra(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare {i32, i32, i32} @llvm.tpu.vmatmul.f32.lgmr.msrb(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare {i32, i32, i32} @llvm.tpu.vmatmul.if8.bf16.lgmr.msra(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare {i32, i32, i32} @llvm.tpu.vmatmul.if8.bf16.lgmr.msrb(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare {i32, i32, i32} @llvm.tpu.vmatmul.bf16.lgmr.msra(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare {i32, i32, i32} @llvm.tpu.vmatmul.bf16.lgmr.msrb(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare {i32, i32, i32} @llvm.tpu.vmatmul.bf8.bf16.lgmr.msra(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare {i32, i32, i32} @llvm.tpu.vmatmul.bf8.bf16.lgmr.msrb(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare {i32, i32, i32} @llvm.tpu.vmatmul.u8.lgmr.msra(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare {i32, i32, i32} @llvm.tpu.vmatmul.u8.lgmr.msrb(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare {i32, i32, i32} @llvm.tpu.vmatmul.s8.lgmr.msra(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare {i32, i32, i32} @llvm.tpu.vmatmul.s8.lgmr.msrb(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare {i32, i32, i32} @llvm.tpu.vmatmul.u4.lgmr.msra(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare {i32, i32, i32} @llvm.tpu.vmatmul.u4.lgmr.msrb(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare {i32, i32, i32} @llvm.tpu.vmatmul.s4.lgmr.msra(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+declare {i32, i32, i32} @llvm.tpu.vmatmul.s4.lgmr.msrb(<1024 x float>, <1024 x i1>, i32, i32) nounwind
+
+
+; CHECK: ((mrf0), (gmr0), (msra0)) = vmatmul.lgmr v{{[0-9]+}}
+; CHECK: ((mrf0), (gmr0), (msrb0)) = vmatmul.lgmr v{{[0-9]+}}
+; CHECK: ((mrf0), (gmr0), (msra0)) = vmatmul.if8.bf16.lgmr v{{[0-9]+}}
+; CHECK: ((mrf0), (gmr0), (msrb0)) = vmatmul.if8.bf16.lgmr v{{[0-9]+}}
+; CHECK: ((mrf0), (gmr0), (msra0)) = vmatmul.bf16.lgmr v{{[0-9]+}}
+; CHECK: ((mrf0), (gmr0), (msrb0)) = vmatmul.bf16.lgmr v{{[0-9]+}}
+; CHECK: ((mrf0), (gmr0), (msra0)) = vmatmul.bf8.bf16.lgmr v{{[0-9]+}}
+; CHECK: ((mrf0), (gmr0), (msrb0)) = vmatmul.bf8.bf16.lgmr v{{[0-9]+}}
+; CHECK: ((mrf0), (gmr0), (msra0)) = vmatmul.u8.lgmr v{{[0-9]+}}
+; CHECK: ((mrf0), (gmr0), (msrb0)) = vmatmul.u8.lgmr v{{[0-9]+}}
+; CHECK: ((mrf0), (gmr0), (msra0)) = vmatmul.s8.lgmr v{{[0-9]+}}
+; CHECK: ((mrf0), (gmr0), (msrb0)) = vmatmul.s8.lgmr v{{[0-9]+}}
+; CHECK: ((mrf0), (gmr0), (msra0)) = vmatmul.u4.lgmr v{{[0-9]+}}
+; CHECK: ((mrf0), (gmr0), (msrb0)) = vmatmul.u4.lgmr v{{[0-9]+}}
+; CHECK: ((mrf0), (gmr0), (msra0)) = vmatmul.s4.lgmr v{{[0-9]+}}
+; CHECK: ((mrf0), (gmr0), (msrb0)) = vmatmul.s4.lgmr v{{[0-9]+}}
+define <1024 x float> @matmullgmr(<1024 x float> %v) {
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+
+ %matmulres1 = call {i32, i32, i32} @llvm.tpu.vmatmul.f32.lgmr.msra(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %mrf1 = extractvalue { i32, i32, i32 } %matmulres1, 0
+ %matres1 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf1)
+
+ %matmulres2 = call {i32, i32, i32} @llvm.tpu.vmatmul.f32.lgmr.msrb(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %mrf2 = extractvalue { i32, i32, i32 } %matmulres2, 0
+ %matres2 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf2)
+
+ %matmulres3 = call {i32, i32, i32} @llvm.tpu.vmatmul.if8.bf16.lgmr.msra(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %mrf3 = extractvalue { i32, i32, i32 } %matmulres3, 0
+ %matres3 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf3)
+
+ %matmulres4 = call {i32, i32, i32} @llvm.tpu.vmatmul.if8.bf16.lgmr.msrb(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %mrf4 = extractvalue { i32, i32, i32 } %matmulres4, 0
+ %matres4 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf4)
+
+ %matmulres5 = call {i32, i32, i32} @llvm.tpu.vmatmul.bf16.lgmr.msra(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %mrf5 = extractvalue { i32, i32, i32 } %matmulres5, 0
+ %matres5 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf5)
+
+ %matmulres6 = call {i32, i32, i32} @llvm.tpu.vmatmul.bf16.lgmr.msrb(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %mrf6 = extractvalue { i32, i32, i32 } %matmulres6, 0
+ %matres6 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf6)
+
+ %matmulres7 = call {i32, i32, i32} @llvm.tpu.vmatmul.bf8.bf16.lgmr.msra(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %mrf7 = extractvalue { i32, i32, i32 } %matmulres7, 0
+ %matres7 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf7)
+
+ %matmulres8 = call {i32, i32, i32} @llvm.tpu.vmatmul.bf8.bf16.lgmr.msrb(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %mrf8 = extractvalue { i32, i32, i32 } %matmulres8, 0
+ %matres8 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf8)
+
+ %matmulres9 = call {i32, i32, i32} @llvm.tpu.vmatmul.u8.lgmr.msra(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %mrf9 = extractvalue { i32, i32, i32 } %matmulres9, 0
+ %matres9 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf9)
+
+ %matmulres10 = call {i32, i32, i32} @llvm.tpu.vmatmul.u8.lgmr.msrb(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %mrf10 = extractvalue { i32, i32, i32 } %matmulres10, 0
+ %matres10 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf10)
+
+ %matmulres11 = call {i32, i32, i32} @llvm.tpu.vmatmul.s8.lgmr.msra(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %mrf11 = extractvalue { i32, i32, i32 } %matmulres11, 0
+ %matres11 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf11)
+
+ %matmulres12 = call {i32, i32, i32} @llvm.tpu.vmatmul.s8.lgmr.msrb(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %mrf12 = extractvalue { i32, i32, i32 } %matmulres12, 0
+ %matres12 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf12)
+
+ %matmulres13 = call {i32, i32, i32} @llvm.tpu.vmatmul.u4.lgmr.msra(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %mrf13 = extractvalue { i32, i32, i32 } %matmulres13, 0
+ %matres13 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf13)
+
+ %matmulres14 = call {i32, i32, i32} @llvm.tpu.vmatmul.u4.lgmr.msrb(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %mrf14 = extractvalue { i32, i32, i32 } %matmulres14, 0
+ %matres14 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf14)
+
+ %matmulres15 = call {i32, i32, i32} @llvm.tpu.vmatmul.s4.lgmr.msra(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %mrf15 = extractvalue { i32, i32, i32 } %matmulres15, 0
+ %matres15 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf15)
+
+ %matmulres16 = call {i32, i32, i32} @llvm.tpu.vmatmul.s4.lgmr.msrb(<1024 x float> %v, <1024 x i1> %mask, i32 0, i32 undef)
+ %mrf16 = extractvalue { i32, i32, i32 } %matmulres16, 0
+ %matres16 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf16)
+
+ ret <1024 x float> %matres16
+}
+
+; fused vmatmul with lmr
+declare {i32, i32} @llvm.tpu.vmatmul.lmr(i32, i32) nounwind
+declare {i32, i32, i32, i32} @llvm.tpu.vmatmul.lmr.lgmr.msra(i32, i32, i32) nounwind
+declare {i32, i32, i32, i32} @llvm.tpu.vmatmul.lmr.lgmr.msrb(i32, i32, i32) nounwind
+declare {i32, i32} @llvm.tpu.vmatmul.bf16.lmr(i32, i32) nounwind
+declare {i32, i32, i32, i32} @llvm.tpu.vmatmul.bf16.lmr.lgmr.msra(i32, i32, i32) nounwind
+declare {i32, i32, i32, i32} @llvm.tpu.vmatmul.bf16.lmr.lgmr.msrb(i32, i32, i32) nounwind
+
+; CHECK: ((mrf0), (lmr0)) = vmatmul.lmr.16
+; CHECK: ((mrf0), (lmr0), (gmr0), (msra0)) = vmatmul.lmr.16.lgmr
+; CHECK: ((mrf0), (lmr0), (gmr0), (msrb0)) = vmatmul.lmr.16.lgmr
+define <1024 x float> @matmullmr(<1024 x float> %v) {
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+
+ %matmulres1 = call {i32, i32} @llvm.tpu.vmatmul.lmr(i32 0, i32 16)
+ %mrf1 = extractvalue { i32, i32} %matmulres1, 0
+ %matres1 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf1)
+
+ %matmulres2 = call {i32, i32, i32, i32} @llvm.tpu.vmatmul.lmr.lgmr.msra(i32 0, i32 16, i32 undef)
+ %mrf2 = extractvalue { i32, i32, i32, i32} %matmulres2, 0
+ %matres2 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf2)
+
+ %matmulres3 = call {i32, i32, i32, i32} @llvm.tpu.vmatmul.lmr.lgmr.msrb(i32 0, i32 16, i32 undef)
+ %mrf3 = extractvalue { i32, i32, i32, i32} %matmulres3, 0
+ %matres3 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf3)
+
+ ret <1024 x float> %matres3
+}
+
+; CHECK: ((mrf0), (lmr0)) = vmatmul.lmr.16
+; CHECK: ((mrf0), (lmr0), (gmr0), (msra0)) = vmatmul.lmr.16.lgmr
+; CHECK: ((mrf0), (lmr0), (gmr0), (msrb0)) = vmatmul.lmr.16.lgmr
+define <1024 x float> @matmulbf16lmr(<1024 x float> %v) {
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+
+ %matmulres1 = call {i32, i32} @llvm.tpu.vmatmul.bf16.lmr(i32 0, i32 16)
+ %mrf1 = extractvalue { i32, i32} %matmulres1, 0
+ %matres1 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf1)
+
+ %matmulres2 = call {i32, i32, i32, i32} @llvm.tpu.vmatmul.bf16.lmr.lgmr.msra(i32 0, i32 16, i32 undef)
+ %mrf2 = extractvalue { i32, i32, i32, i32} %matmulres2, 0
+ %matres2 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf2)
+
+ %matmulres3 = call {i32, i32, i32, i32} @llvm.tpu.vmatmul.bf16.lmr.lgmr.msrb(i32 0, i32 16, i32 undef)
+ %mrf3 = extractvalue { i32, i32, i32, i32} %matmulres3, 0
+ %matres3 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %mrf3)
+
+ ret <1024 x float> %matres3
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/pack_unpack_bf16_gl_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/pack_unpack_bf16_gl_sc.ll
new file mode 100644
index 0000000..a44e30e
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/pack_unpack_bf16_gl_sc.ll
@@ -0,0 +1,685 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-gl -asm-verbose=false \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "googletpu"
+
+declare <16 x bfloat> @llvm.tpu.pack.i.f32.bf16(<8 x float>, <8 x float>)
+declare <16 x i16> @llvm.tpu.pack.i.b32.b16(<8 x i32>, <8 x i32>)
+declare <32 x i8> @llvm.tpu.pack.i.b16.b8(<16 x i16>, <16 x i16>)
+declare <64 x i4> @llvm.tpu.pack.i.b8.b4(<32 x i8>, <32 x i8>)
+declare <16 x bfloat> @llvm.tpu.pack.c.f32.bf16(<8 x float>, <8 x float>)
+declare <16 x i16> @llvm.tpu.pack.c.b32.b16(<8 x i32>, <8 x i32>)
+declare <32 x i8> @llvm.tpu.pack.c.b16.b8(<16 x i16>, <16 x i16>)
+declare <64 x i4> @llvm.tpu.pack.c.b8.b4(<32 x i8>, <32 x i8>)
+declare <128 x i2> @llvm.tpu.pack.c.b4.b2(<64 x i4>, <64 x i4>)
+declare <256 x i1> @llvm.tpu.pack.c.b2.b1(<128 x i2>, <128 x i2>)
+declare <32 x i8> @llvm.tpu.pack.c.bf16.s8(<16 x bfloat>, <16 x bfloat>)
+declare <32 x i8> @llvm.tpu.pack.c.bf16.u8(<16 x bfloat>, <16 x bfloat>)
+declare <32 x i8> @llvm.tpu.pack.i.bf16.s8(<16 x bfloat>, <16 x bfloat>)
+declare <32 x i8> @llvm.tpu.pack.i.bf16.u8(<16 x bfloat>, <16 x bfloat>)
+
+declare <8 x float> @llvm.tpu.unpack.i.l.bf16.f32(<16 x bfloat>)
+declare <8 x float> @llvm.tpu.unpack.i.u.bf16.f32(<16 x bfloat>)
+declare <8 x i32> @llvm.tpu.unpack.i.l.s16.s32(<16 x i16>)
+declare <8 x i32> @llvm.tpu.unpack.i.u.s16.s32(<16 x i16>)
+declare <8 x float> @llvm.tpu.unpack.c.l.bf16.f32(<16 x bfloat>)
+declare <8 x float> @llvm.tpu.unpack.c.u.bf16.f32(<16 x bfloat>)
+declare <8 x float> @llvm.tpu.unpack.c.l.hf16.f32(<16 x half>)
+declare <8 x float> @llvm.tpu.unpack.c.u.hf16.f32(<16 x half>)
+declare <8 x i32> @llvm.tpu.unpack.c.l.s16.s32(<16 x i16>)
+declare <8 x i32> @llvm.tpu.unpack.c.u.s16.s32(<16 x i16>)
+declare <8 x float> @llvm.tpu.unpack.c.0.bf8.f32(<32 x i8>)
+declare <8 x float> @llvm.tpu.unpack.c.1.bf8.f32(<32 x i8>)
+declare <8 x float> @llvm.tpu.unpack.c.2.bf8.f32(<32 x i8>)
+declare <8 x float> @llvm.tpu.unpack.c.3.bf8.f32(<32 x i8>)
+declare <8 x i32> @llvm.tpu.unpack.c.0.s8.s32(<32 x i8>)
+declare <8 x i32> @llvm.tpu.unpack.c.1.s8.s32(<32 x i8>)
+declare <8 x i32> @llvm.tpu.unpack.c.2.s8.s32(<32 x i8>)
+declare <8 x i32> @llvm.tpu.unpack.c.3.s8.s32(<32 x i8>)
+declare <32 x i8> @llvm.tpu.unpack.c.l.s4.s8(<64 x i4>)
+declare <32 x i8> @llvm.tpu.unpack.c.u.s4.s8(<64 x i4>)
+declare <64 x i4> @llvm.tpu.unpack.c.l.s2.s4(<128 x i2>)
+declare <64 x i4> @llvm.tpu.unpack.c.u.s2.s4(<128 x i2>)
+declare <128 x i2> @llvm.tpu.unpack.c.l.s1.s2(<256 x i1>)
+declare <128 x i2> @llvm.tpu.unpack.c.u.s1.s2(<256 x i1>)
+declare <8 x float> @llvm.tpu.unpack.c.0.if8.f32(<32 x i8>)
+declare <8 x float> @llvm.tpu.unpack.c.1.if8.f32(<32 x i8>)
+declare <8 x float> @llvm.tpu.unpack.c.2.if8.f32(<32 x i8>)
+declare <8 x float> @llvm.tpu.unpack.c.3.if8.f32(<32 x i8>)
+declare <16 x bfloat> @llvm.tpu.unpack.c.l.s8.bf16(<32 x i8>)
+declare <16 x bfloat> @llvm.tpu.unpack.c.u.s8.bf16(<32 x i8>)
+declare <16 x bfloat> @llvm.tpu.unpack.c.l.u8.bf16(<32 x i8>)
+declare <16 x bfloat> @llvm.tpu.unpack.c.u.u8.bf16(<32 x i8>)
+declare <16 x i16> @llvm.tpu.unpack.c.l.s8.s16(<32 x i8>)
+declare <16 x i16> @llvm.tpu.unpack.c.u.s8.s16(<32 x i8>)
+declare <16 x i16> @llvm.tpu.unpack.c.l.u8.u16(<32 x i8>)
+declare <16 x i16> @llvm.tpu.unpack.c.u.u8.u16(<32 x i8>)
+declare <16 x i16> @llvm.tpu.unpack.i.l.s8.s16(<32 x i8>)
+declare <16 x i16> @llvm.tpu.unpack.i.u.s8.s16(<32 x i8>)
+declare <16 x i16> @llvm.tpu.unpack.i.l.u8.u16(<32 x i8>)
+declare <16 x i16> @llvm.tpu.unpack.i.u.u8.u16(<32 x i8>)
+declare <16 x bfloat> @llvm.tpu.unpack.i.l.s8.bf16(<32 x i8>)
+declare <16 x bfloat> @llvm.tpu.unpack.i.u.s8.bf16(<32 x i8>)
+declare <16 x bfloat> @llvm.tpu.unpack.i.l.u8.bf16(<32 x i8>)
+declare <16 x bfloat> @llvm.tpu.unpack.i.u.u8.bf16(<32 x i8>)
+declare <16 x i16> @llvm.tpu.unpack.ic.l.s8.s16(<32 x i8>)
+declare <16 x i16> @llvm.tpu.unpack.ic.u.s8.s16(<32 x i8>)
+declare <16 x i16> @llvm.tpu.unpack.ic.l.u8.u16(<32 x i8>)
+declare <16 x i16> @llvm.tpu.unpack.ic.u.u8.u16(<32 x i8>)
+declare <16 x bfloat> @llvm.tpu.unpack.ic.l.s8.bf16(<32 x i8>)
+declare <16 x bfloat> @llvm.tpu.unpack.ic.u.s8.bf16(<32 x i8>)
+declare <16 x bfloat> @llvm.tpu.unpack.ic.l.u8.bf16(<32 x i8>)
+declare <16 x bfloat> @llvm.tpu.unpack.ic.u.u8.bf16(<32 x i8>)
+
+declare <8 x float> @llvm.tpu.vcvt.s32.f32(<8 x i32>)
+declare <8 x i32> @llvm.tpu.vcvt.f32.s32(<8 x float>)
+declare <32 x i8> @llvm.tpu.vcvt.f32.bf8(<8 x float>)
+declare <32 x i8> @llvm.tpu.vcvt.f32.if8(<8 x float>)
+declare <16 x bfloat> @llvm.tpu.vcvt.f32.bf16(<8 x float>)
+declare <16 x half> @llvm.tpu.vcvt.f32.hf16(<8 x float>)
+declare <32 x i8> @llvm.tpu.vcvt.sr.f32.bf8(<8 x i32>, <8 x float>)
+declare <32 x i8> @llvm.tpu.vcvt.sr.f32.if8(<8 x i32>, <8 x float>)
+declare <16 x bfloat> @llvm.tpu.vcvt.sr.f32.bf16(<8 x i32>, <8 x float>)
+declare <16 x half> @llvm.tpu.vcvt.sr.f32.hf16(<8 x i32>, <8 x float>)
+declare <32 x i8> @llvm.tpu.vcvt.bf16.s8(<16 x bfloat>)
+declare <32 x i8> @llvm.tpu.vcvt.bf16.u8(<16 x bfloat>)
+declare <64 x i4> @llvm.tpu.vcvt.bf16.s4(<16 x bfloat>)
+declare <64 x i4> @llvm.tpu.vcvt.bf16.u4(<16 x bfloat>)
+declare <16 x bfloat> @llvm.tpu.vcvt.s8.bf16(<32 x i8>)
+declare <16 x bfloat> @llvm.tpu.vcvt.u8.bf16(<32 x i8>)
+declare <16 x bfloat> @llvm.tpu.vcvt.s4.bf16(<64 x i4>)
+declare <16 x bfloat> @llvm.tpu.vcvt.u4.bf16(<64 x i4>)
+
+; CHECK-LABEL: pack_i_f32_bf16
+; CHECK: v{{[0-9]+}} = vpack.i.f32.bf16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x bfloat> @pack_i_f32_bf16(<8 x float> %x) {
+ %res = call <16 x bfloat> @llvm.tpu.pack.i.f32.bf16(<8 x float> %x, <8 x float> %x)
+ ret <16 x bfloat> %res
+}
+
+; CHECK-LABEL: pack_i_b32_b16
+; CHECK: v{{[0-9]+}} = vpack.i.b32.b16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i16> @pack_i_b32_b16(<8 x i32> %x) {
+ %res = call <16 x i16> @llvm.tpu.pack.i.b32.b16(<8 x i32> %x, <8 x i32> %x)
+ ret <16 x i16> %res
+}
+
+; CHECK-LABEL: pack_i_b16_b8
+; CHECK: v{{[0-9]+}} = vpack.i.b16.b8 v{{[0-9]+}}, v{{[0-9]+}}
+define <32 x i8> @pack_i_b16_b8(<16 x i16> %x) {
+ %res = call <32 x i8> @llvm.tpu.pack.i.b16.b8(<16 x i16> %x, <16 x i16> %x)
+ ret <32 x i8> %res
+}
+
+; CHECK-LABEL: pack_i_b8_b4
+; CHECK: v{{[0-9]+}} = vpack.i.b8.b4 v{{[0-9]+}}, v{{[0-9]+}}
+define <64 x i4> @pack_i_b8_b4(<32 x i8> %x) {
+ %res = call <64 x i4> @llvm.tpu.pack.i.b8.b4(<32 x i8> %x, <32 x i8> %x)
+ ret <64 x i4> %res
+}
+
+; CHECK-LABEL: pack_c_f32_bf16
+; CHECK: v{{[0-9]+}} = vpack.c.f32.bf16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x bfloat> @pack_c_f32_bf16(<8 x float> %x) {
+ %res = call <16 x bfloat> @llvm.tpu.pack.c.f32.bf16(<8 x float> %x, <8 x float> %x)
+ ret <16 x bfloat> %res
+}
+
+; CHECK-LABEL: pack_c_b32_b16
+; CHECK: v{{[0-9]+}} = vpack.c.b32.b16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i16> @pack_c_b32_b16(<8 x i32> %x) {
+ %res = call <16 x i16> @llvm.tpu.pack.c.b32.b16(<8 x i32> %x, <8 x i32> %x)
+ ret <16 x i16> %res
+}
+
+; CHECK-LABEL: pack_c_b16_b8
+; CHECK: v{{[0-9]+}} = vpack.c.b16.b8 v{{[0-9]+}}, v{{[0-9]+}}
+define <32 x i8> @pack_c_b16_b8(<16 x i16> %x) {
+ %res = call <32 x i8> @llvm.tpu.pack.c.b16.b8(<16 x i16> %x, <16 x i16> %x)
+ ret <32 x i8> %res
+}
+
+; CHECK-LABEL: pack_c_b8_b4
+; CHECK: v{{[0-9]+}} = vpack.c.b8.b4 v{{[0-9]+}}, v{{[0-9]+}}
+define <64 x i4> @pack_c_b8_b4(<32 x i8> %x) {
+ %res = call <64 x i4> @llvm.tpu.pack.c.b8.b4(<32 x i8> %x, <32 x i8> %x)
+ ret <64 x i4> %res
+}
+
+; CHECK-LABEL: pack_c_b4_b2
+; CHECK: v{{[0-9]+}} = vpack.c.b4.b2 v{{[0-9]+}}, v{{[0-9]+}}
+define <128 x i2> @pack_c_b4_b2(<64 x i4> %x) {
+ %res = call <128 x i2> @llvm.tpu.pack.c.b4.b2(<64 x i4> %x, <64 x i4> %x)
+ ret <128 x i2> %res
+}
+
+; CHECK-LABEL: pack_c_b2_b1
+; CHECK: v{{[0-9]+}} = vpack.c.b2.b1 v{{[0-9]+}}, v{{[0-9]+}}
+define <256 x i1> @pack_c_b2_b1(<128 x i2> %x) {
+ %res = call <256 x i1> @llvm.tpu.pack.c.b2.b1(<128 x i2> %x, <128 x i2> %x)
+ ret <256 x i1> %res
+}
+
+; CHECK-LABEL: pack_c_bf16_s8
+; CHECK: v{{[0-9]+}} = vpack.c.bf16.s8 v{{[0-9]+}}, v{{[0-9]+}}
+define <32 x i8> @pack_c_bf16_s8(<16 x bfloat> %x) {
+ %res = tail call <32 x i8> @llvm.tpu.pack.c.bf16.s8(<16 x bfloat> %x, <16 x bfloat> %x)
+ ret <32 x i8> %res
+}
+
+; CHECK-LABEL: pack_c_bf16_u8
+; CHECK: v{{[0-9]+}} = vpack.c.bf16.u8 v{{[0-9]+}}, v{{[0-9]+}}
+define <32 x i8> @pack_c_bf16_u8(<16 x bfloat> %x) {
+ %res = tail call <32 x i8> @llvm.tpu.pack.c.bf16.u8(<16 x bfloat> %x, <16 x bfloat> %x)
+ ret <32 x i8> %res
+}
+
+; CHECK-LABEL: pack_i_bf16_s8
+; CHECK: v{{[0-9]+}} = vpack.i.bf16.s8 v{{[0-9]+}}, v{{[0-9]+}}
+define <32 x i8> @pack_i_bf16_s8(<16 x bfloat> %x) {
+ %res = tail call <32 x i8> @llvm.tpu.pack.i.bf16.s8(<16 x bfloat> %x, <16 x bfloat> %x)
+ ret <32 x i8> %res
+}
+
+; CHECK-LABEL: pack_i_bf16_u8
+; CHECK: v{{[0-9]+}} = vpack.i.bf16.u8 v{{[0-9]+}}, v{{[0-9]+}}
+define <32 x i8> @pack_i_bf16_u8(<16 x bfloat> %x) {
+ %res = tail call <32 x i8> @llvm.tpu.pack.i.bf16.u8(<16 x bfloat> %x, <16 x bfloat> %x)
+ ret <32 x i8> %res
+}
+
+; CHECK-LABEL: unpack_i_l_bf16_f32
+; CHECK: v{{[0-9]+}} = vunpack.i.l.bf16.f32 v{{[0-9]+}}
+define <8 x float> @unpack_i_l_bf16_f32(<16 x bfloat> %x) {
+ %res = call <8 x float> @llvm.tpu.unpack.i.l.bf16.f32(<16 x bfloat> %x)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: unpack_i_u_bf16_f32
+; CHECK: v{{[0-9]+}} = vunpack.i.u.bf16.f32 v{{[0-9]+}}
+define <8 x float> @unpack_i_u_bf16_f32(<16 x bfloat> %x) {
+ %res = call <8 x float> @llvm.tpu.unpack.i.u.bf16.f32(<16 x bfloat> %x)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: unpack_i_l_s16_s32
+; CHECK: v{{[0-9]+}} = vunpack.i.l.s16.s32
+define <8 x i32> @unpack_i_l_s16_s32(<16 x i16> %x) {
+ %res = call <8 x i32> @llvm.tpu.unpack.i.l.s16.s32(<16 x i16> %x)
+ ret <8 x i32> %res
+}
+
+; CHECK-LABEL: unpack_i_u_s16_s32
+; CHECK: v{{[0-9]+}} = vunpack.i.u.s16.s32
+define <8 x i32> @unpack_i_u_s16_s32(<16 x i16> %x) {
+ %res = call <8 x i32> @llvm.tpu.unpack.i.u.s16.s32(<16 x i16> %x)
+ ret <8 x i32> %res
+}
+
+; CHECK-LABEL: unpack_c_l_bf16_f32
+; CHECK: v{{[0-9]+}} = vunpack.c.l.bf16.f32
+define <8 x float> @unpack_c_l_bf16_f32(<16 x bfloat> %x) {
+ %res = call <8 x float> @llvm.tpu.unpack.c.l.bf16.f32(<16 x bfloat> %x)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: unpack_c_u_bf16_f32
+; CHECK: v{{[0-9]+}} = vunpack.c.u.bf16.f32
+define <8 x float> @unpack_c_u_bf16_f32(<16 x bfloat> %x) {
+ %res = call <8 x float> @llvm.tpu.unpack.c.u.bf16.f32(<16 x bfloat> %x)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: unpack_c_l_hf16_f32
+; CHECK: v{{[0-9]+}} = vunpack.c.l.hf16.f32
+define <8 x float> @unpack_c_l_hf16_f32(<16 x half> %x) {
+ %res = call <8 x float> @llvm.tpu.unpack.c.l.hf16.f32(<16 x half> %x)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: unpack_c_u_hf16_f32
+; CHECK: v{{[0-9]+}} = vunpack.c.u.hf16.f32
+define <8 x float> @unpack_c_u_hf16_f32(<16 x half> %x) {
+ %res = call <8 x float> @llvm.tpu.unpack.c.u.hf16.f32(<16 x half> %x)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: unpack_c_l_s16_s32
+; CHECK: v{{[0-9]+}} = vunpack.c.l.s16.s32
+define <8 x i32> @unpack_c_l_s16_s32(<16 x i16> %x) {
+ %res = call <8 x i32> @llvm.tpu.unpack.c.l.s16.s32(<16 x i16> %x)
+ ret <8 x i32> %res
+}
+
+; CHECK-LABEL: unpack_c_u_s16_s32
+; CHECK: v{{[0-9]+}} = vunpack.c.u.s16.s32
+define <8 x i32> @unpack_c_u_s16_s32(<16 x i16> %x) {
+ %res = call <8 x i32> @llvm.tpu.unpack.c.u.s16.s32(<16 x i16> %x)
+ ret <8 x i32> %res
+}
+
+; CHECK-LABEL: unpack_c_0_bf8_f32
+; CHECK: v{{[0-9]+}} = vunpack.c.0.bf8.f32
+define <8 x float> @unpack_c_0_bf8_f32(<32 x i8> %x) {
+ %res = call <8 x float> @llvm.tpu.unpack.c.0.bf8.f32(<32 x i8> %x)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: unpack_c_1_bf8_f32
+; CHECK: v{{[0-9]+}} = vunpack.c.1.bf8.f32
+define <8 x float> @unpack_c_1_bf8_f32(<32 x i8> %x) {
+ %res = call <8 x float> @llvm.tpu.unpack.c.1.bf8.f32(<32 x i8> %x)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: unpack_c_2_bf8_f32
+; CHECK: v{{[0-9]+}} = vunpack.c.2.bf8.f32
+define <8 x float> @unpack_c_2_bf8_f32(<32 x i8> %x) {
+ %res = call <8 x float> @llvm.tpu.unpack.c.2.bf8.f32(<32 x i8> %x)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: unpack_c_3_bf8_f32
+; CHECK: v{{[0-9]+}} = vunpack.c.3.bf8.f32
+define <8 x float> @unpack_c_3_bf8_f32(<32 x i8> %x) {
+ %res = call <8 x float> @llvm.tpu.unpack.c.3.bf8.f32(<32 x i8> %x)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: unpack_c_0_s8_s32
+; CHECK: v{{[0-9]+}} = vunpack.c.0.s8.s32
+define <8 x i32> @unpack_c_0_s8_s32(<32 x i8> %x) {
+ %res = call <8 x i32> @llvm.tpu.unpack.c.0.s8.s32(<32 x i8> %x)
+ ret <8 x i32> %res
+}
+
+; CHECK-LABEL: unpack_c_1_s8_s32
+; CHECK: v{{[0-9]+}} = vunpack.c.1.s8.s32
+define <8 x i32> @unpack_c_1_s8_s32(<32 x i8> %x) {
+ %res = call <8 x i32> @llvm.tpu.unpack.c.1.s8.s32(<32 x i8> %x)
+ ret <8 x i32> %res
+}
+
+; CHECK-LABEL: unpack_c_2_s8_s32
+; CHECK: v{{[0-9]+}} = vunpack.c.2.s8.s32
+define <8 x i32> @unpack_c_2_s8_s32(<32 x i8> %x) {
+ %res = call <8 x i32> @llvm.tpu.unpack.c.2.s8.s32(<32 x i8> %x)
+ ret <8 x i32> %res
+}
+
+; CHECK-LABEL: unpack_c_3_s8_s32
+; CHECK: v{{[0-9]+}} = vunpack.c.3.s8.s32
+define <8 x i32> @unpack_c_3_s8_s32(<32 x i8> %x) {
+ %res = call <8 x i32> @llvm.tpu.unpack.c.3.s8.s32(<32 x i8> %x)
+ ret <8 x i32> %res
+}
+
+; CHECK-LABEL: unpack_c_l_s4_s8
+; CHECK: v{{[0-9]+}} = vunpack.c.l.s4.s8
+define <32 x i8> @unpack_c_l_s4_s8(<64 x i4> %x) {
+ %res = call <32 x i8> @llvm.tpu.unpack.c.l.s4.s8(<64 x i4> %x)
+ ret <32 x i8> %res
+}
+
+; CHECK-LABEL: unpack_c_u_s4_s8
+; CHECK: v{{[0-9]+}} = vunpack.c.u.s4.s8
+define <32 x i8> @unpack_c_u_s4_s8(<64 x i4> %x) {
+ %res = call <32 x i8> @llvm.tpu.unpack.c.u.s4.s8(<64 x i4> %x)
+ ret <32 x i8> %res
+}
+
+; CHECK-LABEL: unpack_c_l_s2_s4
+; CHECK: v{{[0-9]+}} = vunpack.c.l.s2.s4
+define <64 x i4> @unpack_c_l_s2_s4(<128 x i2> %x) {
+ %res = call <64 x i4> @llvm.tpu.unpack.c.l.s2.s4(<128 x i2> %x)
+ ret <64 x i4> %res
+}
+
+; CHECK-LABEL: unpack_c_u_s2_s4
+; CHECK: v{{[0-9]+}} = vunpack.c.u.s2.s4
+define <64 x i4> @unpack_c_u_s2_s4(<128 x i2> %x) {
+ %res = call <64 x i4> @llvm.tpu.unpack.c.u.s2.s4(<128 x i2> %x)
+ ret <64 x i4> %res
+}
+
+; CHECK-LABEL: unpack_c_l_s1_s2
+; CHECK: v{{[0-9]+}} = vunpack.c.l.s1.s2
+define <128 x i2> @unpack_c_l_s1_s2(<256 x i1> %x) {
+ %res = call <128 x i2> @llvm.tpu.unpack.c.l.s1.s2(<256 x i1> %x)
+ ret <128 x i2> %res
+}
+
+; CHECK-LABEL: unpack_c_u_s1_s2
+; CHECK: v{{[0-9]+}} = vunpack.c.u.s1.s2
+define <128 x i2> @unpack_c_u_s1_s2(<256 x i1> %x) {
+ %res = call <128 x i2> @llvm.tpu.unpack.c.u.s1.s2(<256 x i1> %x)
+ ret <128 x i2> %res
+}
+
+; CHECK-LABEL: unpack_c_0_if8_f32
+; CHECK: v{{[0-9]+}} = vunpack.c.0.if8.f32
+define <8 x float> @unpack_c_0_if8_f32(<32 x i8> %x) {
+ %res = call <8 x float> @llvm.tpu.unpack.c.0.if8.f32(<32 x i8> %x)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: unpack_c_1_if8_f32
+; CHECK: v{{[0-9]+}} = vunpack.c.1.if8.f32
+define <8 x float> @unpack_c_1_if8_f32(<32 x i8> %x) {
+ %res = call <8 x float> @llvm.tpu.unpack.c.1.if8.f32(<32 x i8> %x)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: unpack_c_2_if8_f32
+; CHECK: v{{[0-9]+}} = vunpack.c.2.if8.f32
+define <8 x float> @unpack_c_2_if8_f32(<32 x i8> %x) {
+ %res = call <8 x float> @llvm.tpu.unpack.c.2.if8.f32(<32 x i8> %x)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: unpack_c_3_if8_f32
+; CHECK: v{{[0-9]+}} = vunpack.c.3.if8.f32
+define <8 x float> @unpack_c_3_if8_f32(<32 x i8> %x) {
+ %res = call <8 x float> @llvm.tpu.unpack.c.3.if8.f32(<32 x i8> %x)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: unpack_c_l_s8_bf16
+; CHECK: v{{[0-9]+}} = vunpack.c.l.s8.bf16
+define <16 x bfloat> @unpack_c_l_s8_bf16(<32 x i8> %x) {
+ %res = call <16 x bfloat> @llvm.tpu.unpack.c.l.s8.bf16(<32 x i8> %x)
+ ret <16 x bfloat> %res
+}
+
+; CHECK-LABEL: unpack_c_u_s8_bf16
+; CHECK: v{{[0-9]+}} = vunpack.c.u.s8.bf16
+define <16 x bfloat> @unpack_c_u_s8_bf16(<32 x i8> %x) {
+ %res = call <16 x bfloat> @llvm.tpu.unpack.c.u.s8.bf16(<32 x i8> %x)
+ ret <16 x bfloat> %res
+}
+
+; CHECK-LABEL: unpack_c_l_u8_bf16
+; CHECK: v{{[0-9]+}} = vunpack.c.l.u8.bf16
+define <16 x bfloat> @unpack_c_l_u8_bf16(<32 x i8> %x) {
+ %res = call <16 x bfloat> @llvm.tpu.unpack.c.l.u8.bf16(<32 x i8> %x)
+ ret <16 x bfloat> %res
+}
+
+; CHECK-LABEL: unpack_c_u_u8_bf16
+; CHECK: v{{[0-9]+}} = vunpack.c.u.u8.bf16
+define <16 x bfloat> @unpack_c_u_u8_bf16(<32 x i8> %x) {
+ %res = call <16 x bfloat> @llvm.tpu.unpack.c.u.u8.bf16(<32 x i8> %x)
+ ret <16 x bfloat> %res
+}
+
+; CHECK-LABEL: unpack_c_l_s8_s16
+; CHECK: v{{[0-9]+}} = vunpack.c.l.s8.s16
+define <16 x i16> @unpack_c_l_s8_s16(<32 x i8> %x) {
+ %res = call <16 x i16> @llvm.tpu.unpack.c.l.s8.s16(<32 x i8> %x)
+ ret <16 x i16> %res
+}
+
+; CHECK-LABEL: unpack_c_u_s8_s16
+; CHECK: v{{[0-9]+}} = vunpack.c.u.s8.s16
+define <16 x i16> @unpack_c_u_s8_s16(<32 x i8> %x) {
+ %res = call <16 x i16> @llvm.tpu.unpack.c.u.s8.s16(<32 x i8> %x)
+ ret <16 x i16> %res
+}
+
+; CHECK-LABEL: unpack_c_l_u8_u16
+; CHECK: v{{[0-9]+}} = vunpack.c.l.u8.u16
+define <16 x i16> @unpack_c_l_u8_u16(<32 x i8> %x) {
+ %res = call <16 x i16> @llvm.tpu.unpack.c.l.u8.u16(<32 x i8> %x)
+ ret <16 x i16> %res
+}
+
+; CHECK-LABEL: unpack_c_u_u8_u16
+; CHECK: v{{[0-9]+}} = vunpack.c.u.u8.u16
+define <16 x i16> @unpack_c_u_u8_u16(<32 x i8> %x) {
+ %res = call <16 x i16> @llvm.tpu.unpack.c.u.u8.u16(<32 x i8> %x)
+ ret <16 x i16> %res
+}
+
+; CHECK-LABEL: unpack_i_l_s8_s16
+; CHECK: v{{[0-9]+}} = vunpack.i.l.s8.s16
+define <16 x i16> @unpack_i_l_s8_s16(<32 x i8> %x) {
+ %res = call <16 x i16> @llvm.tpu.unpack.i.l.s8.s16(<32 x i8> %x)
+ ret <16 x i16> %res
+}
+
+; CHECK-LABEL: unpack_i_u_s8_s16
+; CHECK: v{{[0-9]+}} = vunpack.i.u.s8.s16
+define <16 x i16> @unpack_i_u_s8_s16(<32 x i8> %x) {
+ %res = call <16 x i16> @llvm.tpu.unpack.i.u.s8.s16(<32 x i8> %x)
+ ret <16 x i16> %res
+}
+
+; CHECK-LABEL: unpack_i_l_u8_u16
+; CHECK: v{{[0-9]+}} = vunpack.i.l.u8.u16
+define <16 x i16> @unpack_i_l_u8_u16(<32 x i8> %x) {
+ %res = call <16 x i16> @llvm.tpu.unpack.i.l.u8.u16(<32 x i8> %x)
+ ret <16 x i16> %res
+}
+
+; CHECK-LABEL: unpack_i_u_u8_u16
+; CHECK: v{{[0-9]+}} = vunpack.i.u.u8.u16
+define <16 x i16> @unpack_i_u_u8_u16(<32 x i8> %x) {
+ %res = call <16 x i16> @llvm.tpu.unpack.i.u.u8.u16(<32 x i8> %x)
+ ret <16 x i16> %res
+}
+
+; CHECK-LABEL: unpack_i_l_s8_bf16
+; CHECK: v{{[0-9]+}} = vunpack.i.l.s8.bf16
+define <16 x bfloat> @unpack_i_l_s8_bf16(<32 x i8> %x) {
+ %res = call <16 x bfloat> @llvm.tpu.unpack.i.l.s8.bf16(<32 x i8> %x)
+ ret <16 x bfloat> %res
+}
+
+; CHECK-LABEL: unpack_i_u_s8_bf16
+; CHECK: v{{[0-9]+}} = vunpack.i.u.s8.bf16
+define <16 x bfloat> @unpack_i_u_s8_bf16(<32 x i8> %x) {
+ %res = call <16 x bfloat> @llvm.tpu.unpack.i.u.s8.bf16(<32 x i8> %x)
+ ret <16 x bfloat> %res
+}
+
+; CHECK-LABEL: unpack_i_l_u8_bf16
+; CHECK: v{{[0-9]+}} = vunpack.i.l.u8.bf16
+define <16 x bfloat> @unpack_i_l_u8_bf16(<32 x i8> %x) {
+ %res = call <16 x bfloat> @llvm.tpu.unpack.i.l.u8.bf16(<32 x i8> %x)
+ ret <16 x bfloat> %res
+}
+
+; CHECK-LABEL: unpack_i_u_u8_bf16
+; CHECK: v{{[0-9]+}} = vunpack.i.u.u8.bf16
+define <16 x bfloat> @unpack_i_u_u8_bf16(<32 x i8> %x) {
+ %res = call <16 x bfloat> @llvm.tpu.unpack.i.u.u8.bf16(<32 x i8> %x)
+ ret <16 x bfloat> %res
+}
+
+; CHECK-LABEL: unpack_ic_l_s8_s16
+; CHECK: v{{[0-9]+}} = vunpack.ic.l.s8.s16
+define <16 x i16> @unpack_ic_l_s8_s16(<32 x i8> %x) {
+ %res = call <16 x i16> @llvm.tpu.unpack.ic.l.s8.s16(<32 x i8> %x)
+ ret <16 x i16> %res
+}
+
+; CHECK-LABEL: unpack_ic_u_s8_s16
+; CHECK: v{{[0-9]+}} = vunpack.ic.u.s8.s16
+define <16 x i16> @unpack_ic_u_s8_s16(<32 x i8> %x) {
+ %res = call <16 x i16> @llvm.tpu.unpack.ic.u.s8.s16(<32 x i8> %x)
+ ret <16 x i16> %res
+}
+
+; CHECK-LABEL: unpack_ic_l_u8_u16
+; CHECK: v{{[0-9]+}} = vunpack.ic.l.u8.u16
+define <16 x i16> @unpack_ic_l_u8_u16(<32 x i8> %x) {
+ %res = call <16 x i16> @llvm.tpu.unpack.ic.l.u8.u16(<32 x i8> %x)
+ ret <16 x i16> %res
+}
+
+; CHECK-LABEL: unpack_ic_u_u8_u16
+; CHECK: v{{[0-9]+}} = vunpack.ic.u.u8.u16
+define <16 x i16> @unpack_ic_u_u8_u16(<32 x i8> %x) {
+ %res = call <16 x i16> @llvm.tpu.unpack.ic.u.u8.u16(<32 x i8> %x)
+ ret <16 x i16> %res
+}
+
+; CHECK-LABEL: unpack_ic_l_s8_bf16
+; CHECK: v{{[0-9]+}} = vunpack.ic.l.s8.bf16
+define <16 x bfloat> @unpack_ic_l_s8_bf16(<32 x i8> %x) {
+ %res = call <16 x bfloat> @llvm.tpu.unpack.ic.l.s8.bf16(<32 x i8> %x)
+ ret <16 x bfloat> %res
+}
+
+; CHECK-LABEL: unpack_ic_u_s8_bf16
+; CHECK: v{{[0-9]+}} = vunpack.ic.u.s8.bf16
+define <16 x bfloat> @unpack_ic_u_s8_bf16(<32 x i8> %x) {
+ %res = call <16 x bfloat> @llvm.tpu.unpack.ic.u.s8.bf16(<32 x i8> %x)
+ ret <16 x bfloat> %res
+}
+
+; CHECK-LABEL: unpack_ic_l_u8_bf16
+; CHECK: v{{[0-9]+}} = vunpack.ic.l.u8.bf16
+define <16 x bfloat> @unpack_ic_l_u8_bf16(<32 x i8> %x) {
+ %res = call <16 x bfloat> @llvm.tpu.unpack.ic.l.u8.bf16(<32 x i8> %x)
+ ret <16 x bfloat> %res
+}
+
+; CHECK-LABEL: unpack_ic_u_u8_bf16
+; CHECK: v{{[0-9]+}} = vunpack.ic.u.u8.bf16
+define <16 x bfloat> @unpack_ic_u_u8_bf16(<32 x i8> %x) {
+ %res = call <16 x bfloat> @llvm.tpu.unpack.ic.u.u8.bf16(<32 x i8> %x)
+ ret <16 x bfloat> %res
+}
+
+; CHECK-LABEL: cvt_s32_f32
+; CHECK: v{{[0-9]+}} = vcvt.s32.f32 v{{[0-9]+}}
+define <8 x float> @cvt_s32_f32(<8 x i32> %x) {
+ %res = call <8 x float> @llvm.tpu.vcvt.s32.f32(<8 x i32> %x)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: cvt_f32_s32
+; CHECK: v{{[0-9]+}} = vcvt.f32.s32 v{{[0-9]+}}
+define <8 x i32> @cvt_f32_s32(<8 x float> %x) {
+ %res = call <8 x i32> @llvm.tpu.vcvt.f32.s32(<8 x float> %x)
+ ret <8 x i32> %res
+}
+
+; CHECK-LABEL: cvt_f32_bf8
+; CHECK: v{{[0-9]+}} = vcvt.f32.bf8 v{{[0-9]+}}
+define <32 x i8> @cvt_f32_bf8(<8 x float> %x) {
+ %res = call <32 x i8> @llvm.tpu.vcvt.f32.bf8(<8 x float> %x)
+ ret <32 x i8> %res
+}
+
+; CHECK-LABEL: cvt_f32_if8
+; CHECK: v{{[0-9]+}} = vcvt.f32.if8 v{{[0-9]+}}
+define <32 x i8> @cvt_f32_if8(<8 x float> %x) {
+ %res = call <32 x i8> @llvm.tpu.vcvt.f32.if8(<8 x float> %x)
+ ret <32 x i8> %res
+}
+
+; CHECK-LABEL: cvt_f32_bf16
+; CHECK: v{{[0-9]+}} = vcvt.f32.bf16 v{{[0-9]+}}
+define <16 x bfloat> @cvt_f32_bf16(<8 x float> %x) {
+ %res = call <16 x bfloat> @llvm.tpu.vcvt.f32.bf16(<8 x float> %x)
+ ret <16 x bfloat> %res
+}
+
+; CHECK-LABEL: cvt_f32_hf16
+; CHECK: v{{[0-9]+}} = vcvt.f32.hf16 v{{[0-9]+}}
+define <16 x half> @cvt_f32_hf16(<8 x float> %x) {
+ %res = call <16 x half> @llvm.tpu.vcvt.f32.hf16(<8 x float> %x)
+ ret <16 x half> %res
+}
+
+; CHECK-LABEL: cvt_sr_f32_bf8
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.bf8 v{{[0-9]+}}, v{{[0-9]+}}
+define <32 x i8> @cvt_sr_f32_bf8(<8 x i32> %y, <8 x float> %x) {
+ %res = call <32 x i8> @llvm.tpu.vcvt.sr.f32.bf8(<8 x i32> %y, <8 x float> %x)
+ ret <32 x i8> %res
+}
+
+; CHECK-LABEL: cvt_sr_f32_if8
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.if8 v{{[0-9]+}}, v{{[0-9]+}}
+define <32 x i8> @cvt_sr_f32_if8(<8 x i32> %y, <8 x float> %x) {
+ %res = call <32 x i8> @llvm.tpu.vcvt.sr.f32.if8(<8 x i32> %y, <8 x float> %x)
+ ret <32 x i8> %res
+}
+
+; CHECK-LABEL: cvt_sr_f32_bf16
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.bf16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x bfloat> @cvt_sr_f32_bf16(<8 x i32> %y, <8 x float> %x) {
+ %res = call <16 x bfloat> @llvm.tpu.vcvt.sr.f32.bf16(<8 x i32> %y, <8 x float> %x)
+ ret <16 x bfloat> %res
+}
+
+; CHECK-LABEL: cvt_sr_f32_hf16
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.hf16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x half> @cvt_sr_f32_hf16(<8 x i32> %y, <8 x float> %x) {
+ %res = call <16 x half> @llvm.tpu.vcvt.sr.f32.hf16(<8 x i32> %y, <8 x float> %x)
+ ret <16 x half> %res
+}
+
+; CHECK-LABEL: cvt_bf16_s8
+; CHECK: v{{[0-9]+}} = vcvt.bf16.s8 v{{[0-9]+}}
+define <32 x i8> @cvt_bf16_s8(<16 x bfloat> %x) {
+ %res = call <32 x i8> @llvm.tpu.vcvt.bf16.s8(<16 x bfloat> %x)
+ ret <32 x i8> %res
+}
+
+; CHECK-LABEL: cvt_bf16_u8
+; CHECK: v{{[0-9]+}} = vcvt.bf16.u8 v{{[0-9]+}}
+define <32 x i8> @cvt_bf16_u8(<16 x bfloat> %x) {
+ %res = call <32 x i8> @llvm.tpu.vcvt.bf16.u8(<16 x bfloat> %x)
+ ret <32 x i8> %res
+}
+
+; CHECK-LABEL: cvt_bf16_s4
+; CHECK: v{{[0-9]+}} = vcvt.bf16.s4 v{{[0-9]+}}
+define <64 x i4> @cvt_bf16_s4(<16 x bfloat> %x) {
+ %res = call <64 x i4> @llvm.tpu.vcvt.bf16.s4(<16 x bfloat> %x)
+ ret <64 x i4> %res
+}
+
+; CHECK-LABEL: cvt_bf16_u4
+; CHECK: v{{[0-9]+}} = vcvt.bf16.u4 v{{[0-9]+}}
+define <64 x i4> @cvt_bf16_u4(<16 x bfloat> %x) {
+ %res = call <64 x i4> @llvm.tpu.vcvt.bf16.u4(<16 x bfloat> %x)
+ ret <64 x i4> %res
+}
+
+; CHECK-LABEL: cvt_s8_bf16
+; CHECK: v{{[0-9]+}} = vcvt.s8.bf16 v{{[0-9]+}}
+define <16 x bfloat> @cvt_s8_bf16(<32 x i8> %x) {
+ %res = call <16 x bfloat> @llvm.tpu.vcvt.s8.bf16(<32 x i8> %x)
+ ret <16 x bfloat> %res
+}
+
+; CHECK-LABEL: cvt_u8_bf16
+; CHECK: v{{[0-9]+}} = vcvt.u8.bf16 v{{[0-9]+}}
+define <16 x bfloat> @cvt_u8_bf16(<32 x i8> %x) {
+ %res = call <16 x bfloat> @llvm.tpu.vcvt.u8.bf16(<32 x i8> %x)
+ ret <16 x bfloat> %res
+}
+
+; CHECK-LABEL: cvt_s4_bf16
+; CHECK: v{{[0-9]+}} = vcvt.s4.bf16 v{{[0-9]+}}
+define <16 x bfloat> @cvt_s4_bf16(<64 x i4> %x) {
+ %res = call <16 x bfloat> @llvm.tpu.vcvt.s4.bf16(<64 x i4> %x)
+ ret <16 x bfloat> %res
+}
+
+; CHECK-LABEL: cvt_u4_bf16
+; CHECK: v{{[0-9]+}} = vcvt.u4.bf16 v{{[0-9]+}}
+define <16 x bfloat> @cvt_u4_bf16(<64 x i4> %x) {
+ %res = call <16 x bfloat> @llvm.tpu.vcvt.u4.bf16(<64 x i4> %x)
+ ret <16 x bfloat> %res
+}
+
+attributes #0 = { "implicit-section-name"=".text.tile_execute" "target-cpu"="sparsecore-tec-gl" }
+attributes #1 = { "implicit-section-name"=".text.tile_access" "target-cpu"="sparsecore-tac-gl" }
+attributes #2 = { "implicit-section-name"=".text.scs" "target-cpu"="sparsecore-scs-gl" }
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/post_iv_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/post_iv_sc.ll
new file mode 100644
index 0000000..19f10cf
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/post_iv_sc.ll
@@ -0,0 +1,80 @@
+; RUN: opt -S -O2 -mcpu=sparsecore-tec-vf < %s \
+; RUN: | llc -mcpu=sparsecore-tec-vf -stop-after=tpu-pipeliner \
+; RUN: -tpu-pipeliner-annotate-for-testing \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare void @llvm.tpu.loop.parallel()
+declare i32* @llvm.tpu.inttoptr.pi32(i32)
+
+; Tests that we transform to post-loop-comparison IV updates in order
+; to reduce the loop's height, even though LSR does the opposite.
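+;
+; As an illustrative note (derived from the IR and CHECK lines below): in
+; pre_iv_update_loop the compare consumes the incremented value %ic, while in
+; post_iv_update_loop the compare consumes the pre-increment value %i and the
+; increment follows. In both, the expected machine code has the compare
+; reading the PHI value directly, with the increment after it.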
+
+; CHECK-LABEL: pre_iv_update_loop
+; CHECK-LABEL: bb.1.for.body.i:
+; CHECK: %[[p:[0-9]+]]:gpr = PHI %{{[0-9]+}}, %bb.{{[0-9]+}}, %[[a:[0-9]+]], %bb.1
+; CHECK: %{{[0-9]+}}:ppr = CMPNEri %[[p]]
+; CHECK: %[[a]]:gpr = ADDri %[[p]]
+
+define void @pre_iv_update_loop(i32 %b) {
+entry:
+ %0 = call i32* @llvm.tpu.inttoptr.pi32(i32 0)
+ %1 = call i32* @llvm.tpu.inttoptr.pi32(i32 32)
+ %2 = call i32* @llvm.tpu.inttoptr.pi32(i32 64)
+ %3 = call i32* @llvm.tpu.inttoptr.pi32(i32 96)
+ br label %for.body.i
+
+for.body.i:
+ %i = phi i32 [ 0, %entry ], [ %ic, %for.body.i ]
+ %idx0 = getelementptr inbounds i32, i32* %0, i32 %i
+ %4 = load i32, i32* %idx0, align 4
+ store i32 %4, i32* %1, align 4
+ %idx1 = getelementptr inbounds i32, i32* %0, i32 32
+ %5 = load i32, i32* %idx1, align 4
+ store i32 %5, i32* %2, align 4
+ %idx2 = getelementptr inbounds i32, i32* %0, i32 16
+ %6 = load i32, i32* %idx2, align 4
+ store i32 %6, i32* %3, align 4
+ %ic = add nuw nsw i32 %i, 1
+ %cmp.i = icmp ne i32 %ic, %b
+ br i1 %cmp.i, label %for.body.i, label %exit
+
+exit:
+ ret void
+}
+
+; CHECK-LABEL: post_iv_update_loop
+; CHECK-LABEL: bb.1.for.body.i:
+; CHECK: %[[p:[0-9]+]]:gpr = PHI %{{[0-9]+}}, %bb.{{[0-9]+}}, %[[a:[0-9]+]], %bb.1
+; CHECK: %{{[0-9]+}}:ppr = CMPNEri %[[p]]
+; CHECK: %[[a]]:gpr = ADDri %[[p]]
+
+define void @post_iv_update_loop(i32 %b) {
+entry:
+ %0 = call i32* @llvm.tpu.inttoptr.pi32(i32 0)
+ %1 = call i32* @llvm.tpu.inttoptr.pi32(i32 32)
+ %2 = call i32* @llvm.tpu.inttoptr.pi32(i32 64)
+ %3 = call i32* @llvm.tpu.inttoptr.pi32(i32 96)
+ br label %for.body.i
+
+for.body.i:
+ %i = phi i32 [ 0, %entry ], [ %ic, %for.body.i ]
+ %idx0 = getelementptr inbounds i32, i32* %0, i32 %i
+ %4 = load i32, i32* %idx0, align 4
+ store i32 %4, i32* %1, align 4
+ %idx1 = getelementptr inbounds i32, i32* %0, i32 32
+ %5 = load i32, i32* %idx1, align 4
+ store i32 %5, i32* %2, align 4
+ %idx2 = getelementptr inbounds i32, i32* %0, i32 16
+ %6 = load i32, i32* %idx2, align 4
+ store i32 %6, i32* %3, align 4
+ %cmp.i = icmp ne i32 %i, %b
+ %ic = add nuw nsw i32 %i, 1
+ br i1 %cmp.i, label %for.body.i, label %exit
+
+exit:
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/pred_spill_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/pred_spill_sc.ll
new file mode 100644
index 0000000..0a19579
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/pred_spill_sc.ll
@@ -0,0 +1,133 @@
+; RUN: llc -O2 < %s -mcpu=sparsecore-tec-vf -asm-verbose=false | FileCheck %s
+; REQUIRES: tpu
+
+; Tests that we get a predicate register spill. Note that we disable
+; optimizations via a function attribute in order to disable pre-RA scheduling,
+; which otherwise makes it hard to model a predicate register spill.
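+;
+; As a reading aid (paraphrasing the CHECK lines below), the expected spill
+; sequence materializes the predicate into a scalar register via two
+; predicated simm.s32 moves (0/1), stores that scalar to smem, reloads it,
+; and regenerates the predicate with a seq.s32 against 1.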
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+!smem.funcs.spill = !{!0}
+!smem.ranges.spill.start = !{!1}
+!smem.ranges.spill.limit = !{!2}
+!tilespmem.ranges.spill.start = !{!1}
+!tilespmem.ranges.spill.limit = !{!2}
+
+!0 = !{void (<8 x i32> addrspace(201)*)* @spill_pred_to_smem}
+!1 = !{i32 100}
+!2 = !{i32 200}
+
+; Function Attrs: nounwind readnone
+declare <8 x i32> @llvm.tpu.vlaneseq() #0
+declare <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32) nounwind
+attributes #0 = { nounwind readnone }
+attributes #1 = { optnone noinline }
+
+@unknownglob1 = global i32 zeroinitializer
+@unknownglob2 = global i32 zeroinitializer
+@unknownglob3 = global i32 zeroinitializer
+@unknownglob4 = global i32 zeroinitializer
+@unknownglob5 = global i32 zeroinitializer
+@unknownglob6 = global i32 zeroinitializer
+@unknownglob7 = global i32 zeroinitializer
+@unknownglob8 = global i32 zeroinitializer
+@unknownglob9 = global i32 zeroinitializer
+@unknownglob10 = global i32 zeroinitializer
+@unknownglob11 = global i32 zeroinitializer
+@unknownglob12 = global i32 zeroinitializer
+@unknownglob13 = global i32 zeroinitializer
+@unknownglob14 = global i32 zeroinitializer
+@unknownglob15 = global i32 zeroinitializer
+@unknownglob16 = global i32 zeroinitializer
+
+; CHECK-LABEL: spill_pred_to_smem:
+; CHECK: s[[#sp:]] = simm.s32 @!p[[#pr:]] $0x0
+; CHECK: s[[#sp]] = simm.s32 @p[[#pr]] $0x1
+; CHECK: [smem:$0xc7] = sst s[[#sp]]
+; CHECK: s[[#sf:]] = sld [smem:$0xc7]
+; CHECK: p{{[0-9]+}} = seq.s32 s[[#sf]], $0x1
+define void @spill_pred_to_smem(<8 x i32> addrspace(201)* %unknownptr) #1 {
+entry:
+ %laneseq = call <8 x i32> @llvm.tpu.vlaneseq()
+ %splatinsert = insertelement <8 x i32> undef, i32 127, i32 0
+ %splat = shufflevector <8 x i32> %splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
+ %base_rec = and <8 x i32> %laneseq, %splat
+ %zero = xor i32 1, 1
+
+ %unknownval1 = load i32, i32* @unknownglob1
+ %unknownval2 = load i32, i32* @unknownglob2
+ %unknownval3 = load i32, i32* @unknownglob3
+ %unknownval4 = load i32, i32* @unknownglob4
+ %unknownval5 = load i32, i32* @unknownglob5
+ %unknownval6 = load i32, i32* @unknownglob6
+ %unknownval7 = load i32, i32* @unknownglob7
+ %unknownval8 = load i32, i32* @unknownglob8
+ %unknownval9 = load i32, i32* @unknownglob9
+ %unknownval10 = load i32, i32* @unknownglob10
+ %unknownval11 = load i32, i32* @unknownglob11
+ %unknownval12 = load i32, i32* @unknownglob12
+ %unknownval13 = load i32, i32* @unknownglob13
+ %unknownval14 = load i32, i32* @unknownglob14
+ %unknownval15 = load i32, i32* @unknownglob15
+ %unknownval16 = load i32, i32* @unknownglob16
+
+ %cmp1 = icmp eq i32 %zero, %unknownval1
+ %cmp2 = icmp eq i32 %zero, %unknownval2
+ %cmp3 = icmp eq i32 %zero, %unknownval3
+ %cmp4 = icmp eq i32 %zero, %unknownval4
+ %cmp5 = icmp eq i32 %zero, %unknownval5
+ %cmp6 = icmp eq i32 %zero, %unknownval6
+ %cmp7 = icmp eq i32 %zero, %unknownval7
+ %cmp8 = icmp eq i32 %zero, %unknownval8
+ %cmp9 = icmp eq i32 %zero, %unknownval9
+ %cmp10 = icmp eq i32 %zero, %unknownval10
+ %cmp11 = icmp eq i32 %zero, %unknownval11
+ %cmp12 = icmp eq i32 %zero, %unknownval12
+ %cmp13 = icmp eq i32 %zero, %unknownval13
+ %cmp14 = icmp eq i32 %zero, %unknownval14
+ %cmp15 = icmp eq i32 %zero, %unknownval15
+ %cmp16 = icmp eq i32 %zero, %unknownval16
+
+ %unknownval = load <8 x i32>, <8 x i32> addrspace(201)* %unknownptr
+ %sel1 = select i1 %cmp1, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel2 = select i1 %cmp2, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel3 = select i1 %cmp3, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel4 = select i1 %cmp4, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel5 = select i1 %cmp5, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel6 = select i1 %cmp6, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel7 = select i1 %cmp7, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel8 = select i1 %cmp8, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel9 = select i1 %cmp9, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel10 = select i1 %cmp10, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel11 = select i1 %cmp11, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel12 = select i1 %cmp12, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel13 = select i1 %cmp13, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel14 = select i1 %cmp14, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel15 = select i1 %cmp15, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel16 = select i1 %cmp16, <8 x i32> %unknownval, <8 x i32> %base_rec
+
+ %add1 = add <8 x i32> %sel1, %sel1
+ %add2 = add <8 x i32> %sel2, %add1
+ %add3 = add <8 x i32> %sel3, %add2
+ %add4 = add <8 x i32> %sel4, %add3
+ %add5 = add <8 x i32> %sel5, %add4
+ %add6 = add <8 x i32> %sel6, %add5
+ %add7 = add <8 x i32> %sel7, %add6
+ %add8 = add <8 x i32> %sel8, %add7
+ %add9 = add <8 x i32> %sel9, %add8
+ %add10 = add <8 x i32> %sel10, %add9
+ %add11 = add <8 x i32> %sel11, %add10
+ %add12 = add <8 x i32> %sel12, %add11
+ %add13 = add <8 x i32> %sel13, %add12
+ %add14 = add <8 x i32> %sel14, %add13
+ %add15 = add <8 x i32> %sel15, %add14
+ %add16 = add <8 x i32> %sel16, %add15
+
+ %result = sub <8 x i32> %add16, %base_rec
+
+ %result_addr = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 0)
+ store <8 x i32> %result, <8 x i32> addrspace(201)* %result_addr
+
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/proximal_yogi_bc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/proximal_yogi_bc.ll
new file mode 100644
index 0000000..9192cc5
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/proximal_yogi_bc.ll
@@ -0,0 +1,221 @@
+; RUN: opt -march=googletpu -mcpu=barnacore-cc-pf < %s -passes=always-inline,instcombine | llc -march=googletpu -mcpu=barnacore-cc-pf -O3 | FileCheck %s
+; REQUIRES: tpu
+
+declare void @llvm.tpu.bc.loop.start(i32)
+declare i1 @llvm.tpu.bc.loop.end()
+declare <8 x float> @llvm.tpu.bc.load.aliaddr(<8 x float> addrspace(207)*, i32) argmemonly
+declare <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)*, i32, i32) argmemonly
+declare void @llvm.tpu.bc.store.aliaddr(<8 x float>, <8 x float> addrspace(207)*, i32) argmemonly
+declare void @llvm.tpu.bc.store.aliaddr.flm(<8 x float>, <8 x float> addrspace(207)*, i32, i32) argmemonly
+declare void @llvm.tpu.bc.store.concat.aliaddr(<8 x float>, i32) inaccessiblememonly
+declare void @llvm.tpu.bc.shift.aliaddr(i32) inaccessiblememonly
+declare <8 x float> @llvm.tpu.bc.select.predicate(i32, <8 x float>, <8 x float>) readnone
+declare [16 x <8 x float>] @llvm.tpu.bc.insertvalue.loopindex.a16([16 x <8 x float>], <8 x float>) readnone
+declare <8 x float> @llvm.tpu.bc.extractvalue.loopindex.a16([16 x <8 x float>]) readnone
+declare [8 x <8 x float>] @llvm.tpu.bc.insertvalue.loopindex.a8([8 x <8 x float>], <8 x float>) readnone
+declare <8 x float> @llvm.tpu.bc.extractvalue.loopindex.a8([8 x <8 x float>]) readnone
+declare i32 @llvm.tpu.rsqrt(<8 x float>) inaccessiblememonly
+declare i32 @llvm.tpu.rcp(<8 x float>) inaccessiblememonly
+declare <8 x float> @llvm.tpu.eup.pop(i32) inaccessiblememonly
+declare <8 x float> @llvm.tpu.clamp.symmetric(<8 x float>, <8 x float>) readnone
+
+; Proximal Yogi (bear).
+; https://critique.corp.google.com/#review/271680647
+
+; This is the expected golden output.
+; CHECK: { _ = loop_start $0x19, $0x1 } // num: 0
+; CHECK: { _ = vnop } // num: 1
+; CHECK-NEXT: { _ = vnop } // num: 2
+; CHECK-NEXT: { _ = vnop } // num: 3
+; CHECK-NEXT: { _ = vnop } // num: 4
+; CHECK-NEXT: { v0 = vld.f32 [bmem:s1] ali_addr:$0x1 flm:$2 } // num: 5
+; CHECK-NEXT: { v0 = vmul.f32 $0.5, v0;
+; CHECK-NEXT: v4 = vpop ps:$1 (erf) } // pop %2, num: 6
+; CHECK-NEXT: { v2 = vld.f32 [bmem:s0] ali_addr:$0x1;
+; CHECK-NEXT: v4 = vmul.f32 ps:$1 $0.5, v4 } // num: 7
+; CHECK-NEXT: { v4 = vmul.f32 v2, v2 } // num: 8
+; CHECK-NEXT: { v6 = vld.f32 [bmem:s1] ali_addr:$0x1 flm:$1;
+; CHECK-NEXT: v6 = vmul.f32 ps:$1 $0.0, v4 } // num: 9
+; CHECK-NEXT: { v2 = vmul.f32 $0.5, v2;
+; CHECK-NEXT: v6 = vsub.f32 v4, v6 } // num: 10
+; CHECK-NEXT: { v6 = vadd.f32 ps:$1 $1.0, v6 } // num: 11
+; CHECK-NEXT: { v4 = vmul.f32 $0.5, v4;
+; CHECK-NEXT: v2 = vadd.f32 v2, v0 } // num: 12
+; CHECK-NEXT: { v0 = vand.u32 $-0x80000000, v6;
+; CHECK-NEXT: (erf) = vrcp.f32 ps:$1 v6 } // push %0, num: 13
+; CHECK-NEXT: { v0 = vmul.f32 ps:$1 $0.5, v0;
+; CHECK-NEXT: v0 = vxor.u32 v4, v0 } // num: 14
+; CHECK-NEXT: { v2 = vmul.f32 ps:$1 v2, v4;
+; CHECK-NEXT: (erf) = vrsqrt.f32 v0 } // push %1, num: 15
+; CHECK-NEXT: { _ = vnop } // num: 16
+; CHECK-NEXT: { v0 = vsub.f32 ps:$1 v0, v2;
+; CHECK-NEXT: v2 = vmul.f32 ps:$1 $0.5, v4 } // num: 17
+; CHECK-NEXT: { _ = vnop } // num: 18
+; CHECK-NEXT: { v2 = vclamps.f32 ps:$1 v0, v2 } // num: 19
+; CHECK-NEXT: { v0 = vsub.f32 ps:$1 v0, v2;
+; CHECK-NEXT: v2 = vpop ps:$1 (erf) } // pop %0, num: 20
+; CHECK-NEXT: { v4 = vpop (erf) } // pop %1, num: 21
+; CHECK-NEXT: { [bmem:s1] = vst.f32 v0 ali_addr:$0x1 flm:$1;
+; CHECK-NEXT: v0 = vmul.f32 v0, v4 } // num: 22
+; CHECK-NEXT: { _ = vnop } // num: 23
+; CHECK-NEXT: { v0 = vadd.f32 $0.5, v0 } // num: 24
+; CHECK-NEXT: { [bmem:s1] = vst.f32 v2 ali_addr:$0x1 flm:$2 } // num: 25
+; CHECK-NEXT: { [bmem:s1] = vst.f32 ps:$1 v0 ali_addr:$0x1;
+; CHECK-NEXT: (erf) = vrcp.f32 v0;
+; CHECK-NEXT: v0 = vld.f32 [bmem:s1] ali_addr:$0x1 } // push %2, num: 26
+
+; The sign_and_mul function computes sign(x) * y.
+; The sign function computes sign(x), with the one exception that it returns
+; 1 for +0 and -1 for -0.
+define internal <8 x float> @sign_and_mul(<8 x float> %x, <8 x float> %y) alwaysinline {
+ %xi = bitcast <8 x float> %x to <8 x i32>
+ %yi = bitcast <8 x float> %y to <8 x i32>
+ %a = and <8 x i32> %xi, <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648>
+ %b = xor <8 x i32> %a, %yi
+ %bf = bitcast <8 x i32> %b to <8 x float>
+ ret <8 x float> %bf
+}
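+
+; Illustrative example (values chosen for illustration only): with x = -2.5
+; and y = 4.0, the extracted sign bit of x is 0x80000000 in each lane, and
+; xor-ing it into y flips y's sign bit, giving -4.0 == sign(-2.5) * 4.0.
+; With x = +0.0 the extracted bit is 0, so y is returned unchanged, matching
+; the sign(+0) == +1 convention described above.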
+
+; w = (b - c * math_ops.sign(b)) * a
+; w = math_ops.cast(math_ops.abs(b) > c, dtype=b.dtype) * w
+;
+; The following expression computes:
+; cast(math_ops.abs(b) > c) * (b - c * sign(b))
+; The transformation is non-trivial but equivalence can be verified by
+; examining the following four cases:
+; (1) b > 0, abs(b) > c
+; (2) b > 0, abs(b) < c
+; (3) b < 0, abs(b) > c
+; (4) b < 0, abs(b) < c
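+;
+; Worked check of two of these cases (illustrative values, assuming
+; llvm.tpu.clamp.symmetric(b, c) clamps b into [-c, c]), with a = 1:
+; case (1), b = 3, c = 2: clamp = 2, b - clamp = 1 = b - c * sign(b), and
+; abs(b) > c makes the cast 1, so both forms give 1;
+; case (2), b = 1, c = 2: clamp = 1, b - clamp = 0, and abs(b) < c makes the
+; cast 0, so both forms give 0.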
+define internal <8 x float> @solve(<8 x float> %a, <8 x float> %b, <8 x float> %c) alwaysinline {
+ %clamp = call <8 x float> @llvm.tpu.clamp.symmetric(<8 x float> %b, <8 x float> %c)
+ %numerator = fsub <8 x float> %b, %clamp
+ %mul = fmul <8 x float> %numerator, %a
+ ret <8 x float> %mul
+}
+
+define internal <8 x float> @splat(float %a) alwaysinline {
+ %b = insertelement <8 x float> undef, float %a, i32 0
+ %c = shufflevector <8 x float> %b, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x float> %c
+}
+
+define internal void @yogi(<8 x float> addrspace(207)* noalias %grad_off,
+ <8 x float> addrspace(207)* noalias %sram_off,
+ float %weight_decay_factor_s,
+ float %beta1_s,
+ float %beta2_s,
+ float %transformed_learning_rate_s,
+ float %epsilon_s,
+ float %l1_s,
+ float %l2_s) alwaysinline {
+entry:
+ %weight_decay_factor = call <8 x float> @splat(float %weight_decay_factor_s)
+ %beta1 = call <8 x float> @splat(float %beta1_s)
+ %beta2 = call <8 x float> @splat(float %beta2_s)
+ %transformed_learning_rate = call <8 x float> @splat(float %transformed_learning_rate_s)
+ %epsilon = call <8 x float> @splat(float %epsilon_s)
+ %l1 = call <8 x float> @splat(float %l1_s)
+ %l2 = call <8 x float> @splat(float %l2_s)
+
+ call void @llvm.tpu.bc.loop.start(i32 0)
+ br label %loop
+
+loop:
+ ; Load gradient
+ %grad = call <8 x float> @llvm.tpu.bc.load.aliaddr(<8 x float> addrspace(207)* %grad_off, i32 1)
+
+ ; Clip gradient (TODO: assume no clipping for now).
+
+ ; Load weight
+ %raw_weight = call <8 x float> @llvm.tpu.bc.load.aliaddr(<8 x float> addrspace(207)* %sram_off, i32 1)
+
+ ; Apply weight decay
+ %one_minus_decay_factor = fsub <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %weight_decay_factor
+ %weight = fmul <8 x float> %raw_weight, %one_minus_decay_factor
+
+ ; Load v with flm=1
+ %v = call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* %sram_off, i32 1, i32 1)
+
+ ; G^2
+ %gradient_squared = fmul <8 x float> %grad, %grad
+
+ ; G^2 - v
+ %gradient_squared_minus_v = fsub <8 x float> %gradient_squared, %v
+
+ ; sign = math_ops.sign(grad2 - v_slice)
+ ; new_v = v_slice + (1-beta2_t) * sign * grad2
+ ; v_t = scatter_update_fn(v, indices, v_scaled_g_values)
+ %one_minus_beta2 = fsub <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %beta2
+ %g2_by_1_minus_beta2 = fmul <8 x float> %gradient_squared, %one_minus_beta2
+ %new_v = call <8 x float> @sign_and_mul(<8 x float> %gradient_squared_minus_v, <8 x float> %g2_by_1_minus_beta2)
+
+ ; Yogi LR scaling coefficient.
+ ; lr_scaling_coeff = 1 / (sqrt(v_scaled_g_values) + epsilon)
+ ; (sqrt(new_v) is computed below as new_v * rsqrt(new_v).)
+ %eup = call i32 @llvm.tpu.rsqrt(<8 x float> %new_v)
+ %rsqrt_v = call <8 x float> @llvm.tpu.eup.pop(i32 %eup)
+ %v_by_rsqrt_v = fmul <8 x float> %new_v, %rsqrt_v
+ %v_by_rsqrt_v_plus_e = fadd <8 x float> %v_by_rsqrt_v, %epsilon
+ %eup2 = call i32 @llvm.tpu.rcp(<8 x float> %v_by_rsqrt_v_plus_e)
+ %lr_scaling_coeff = call <8 x float> @llvm.tpu.eup.pop(i32 %eup2)
+
+ ; Yogi effective LR
+ %per_coord_lr = fmul <8 x float> %transformed_learning_rate, %lr_scaling_coeff
+
+ ; 1/(1 + l2_t * per_coord_lr)
+ ; Computing the inverse and special-casing l2=0 optimizes the case where
+ ; l1>0 and l2=0: we can avoid a reciprocal operation in the critical path.
+ %inv_scaled_1 = fmul <8 x float> %l2, %per_coord_lr
+ %inv_scaled_2 = fadd <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %inv_scaled_1
+ %eup3 = call i32 @llvm.tpu.rcp(<8 x float> %inv_scaled_2)
+ %inv_scaled_3 = call <8 x float> @llvm.tpu.eup.pop(i32 %eup3)
+ %l2_is_zero = fcmp oeq float %l2_s, 0.0
+ %inv_scaled_lr = select i1 %l2_is_zero, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, <8 x float> %inv_scaled_3
+
+ ; TODO(jmolloy): only implementing the harder case; assume beta1 != 0.0f.
+ ; Load m with flm=2
+ %m = call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* %sram_off, i32 1, i32 2)
+
+ ; m_scaled_g_values = grad * (1 - beta1_t)
+ %one_minus_beta1 = fsub <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %beta1
+ %scaled_gradient = fmul <8 x float> %grad, %one_minus_beta1
+
+ ; m_slice = array_ops.gather(m, indices) * beta1_t + m_scaled_g_values
+ ; m_t = scatter_update_fn(m, indices, m_slice)
+ %m_by_beta1 = fmul <8 x float> %m, %beta1
+ %new_m = fadd <8 x float> %m_by_beta1, %scaled_gradient
+
+ ; Store new m (with flm=2).
+ call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> %new_m, <8 x float> addrspace(207)* %sram_off, i32 1, i32 2)
+
+ ; Step 1: Gradient descent
+ ; var_slice = array_ops.gather(var, indices)
+ ; new_var = var_slice - per_coord_lr * m_slice
+ %per_coord_lr_by_new_m = fmul <8 x float> %per_coord_lr, %new_m
+ %new_weight = fsub <8 x float> %weight, %per_coord_lr_by_new_m
+
+ ; Step 2: Prox operator
+ ; TODO(jmolloy): Assuming l1 > 0.0f here.
+ %per_coord_lr_by_l1 = fmul <8 x float> %per_coord_lr, %l1
+ %new_weight_prox = call <8 x float> @solve(<8 x float> %inv_scaled_lr, <8 x float> %new_weight, <8 x float> %per_coord_lr_by_l1)
+
+ ; Clip new weight (TODO: noop)
+
+ ; Store new weight.
+ call void @llvm.tpu.bc.store.aliaddr(<8 x float> %new_weight_prox, <8 x float> addrspace(207)* %sram_off, i32 1)
+
+ ; Store new v: If gradient is zero, v is unchanged (flm=1)
+ call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> %new_v, <8 x float> addrspace(207)* %sram_off, i32 1, i32 1)
+
+ %loopend = call i1 @llvm.tpu.bc.loop.end()
+ br i1 %loopend, label %loop, label %out
+
+out:
+ ret void
+}
+
+define void @yogi_imm(<8 x float> addrspace(207)* noalias %grad_off,
+ <8 x float> addrspace(207)* noalias %sram_off) {
+ call void @yogi(<8 x float> addrspace(207)* %grad_off, <8 x float> addrspace(207)* %sram_off, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.0)
+ ret void
+}
\ No newline at end of file
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/radix_sort_pathological_super_pass_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/radix_sort_pathological_super_pass_sc.ll
new file mode 100644
index 0000000..6612c16
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/radix_sort_pathological_super_pass_sc.ll
@@ -0,0 +1,3408 @@
+; RUN: opt -S -O2 -mcpu=sparsecore-tec-vf < %s \
+; RUN: | llc -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -tpu-pipeliner-strategy=swingslack -tpu-fixed-vregs=32-63 \
+; RUN: -tpu-enable-pipeliner-super-pass -tpu-pipeliner-annotate-for-testing \
+; RUN: -enable-pre-spill -debug-only=tpu-loop-analysis -tpu-enable-loop-analysis \
+; RUN: -improve-prolog-epilog-aa=false 2>&1 | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; CHECK: Post-RA pipelined loop bb.5 (from bb.3): II=16
+; CHECK: Post-RA pipelined loop bb.8 (from bb.6): II=13
+; CHECK: Post-RA pipelined loop bb.13 (from bb.9): II=74
+
+%"class.embeddings::SparsecoreMemoryAllocator" = type { i32*, <8 x i32> addrspace(201)*, i32 addrspace(204)*, %"class.embeddings::impl::StaticAllocator", %"class.embeddings::impl::StaticAllocator", %"class.embeddings::impl::StaticAllocator" }
+%"class.embeddings::impl::StaticAllocator" = type { i32, i32 }
+%"class.embeddings::PointerBase" = type { %"class.embeddings::MemorySpace", %"class.embeddings::BasicType", %"union.embeddings::PointerBase::AnyPtr" }
+%"class.embeddings::MemorySpace" = type { i32 }
+%"class.embeddings::BasicType" = type { i32 }
+%"union.embeddings::PointerBase::AnyPtr" = type { i32* }
+%"class.embeddings::TileSpmemVectorArray" = type { %"class.embeddings::ScratchpadArray" }
+%"class.embeddings::ScratchpadArray" = type { %"class.embeddings::BaseArray" }
+%"class.embeddings::BaseArray" = type { %"class.embeddings::PointerBase", i32 }
+%"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem" = type { i32, %"class.embeddings::SparsecoreMemoryAllocator"*, %"class.embeddings::ScratchpadArray", %"class.embeddings::ScratchpadArray", %"class.embeddings::ScratchpadArray", %"class.embeddings::ScratchpadArray", %"class.embeddings::ScratchpadArray" }
+%"class.embeddings::TileSpmemPointer" = type { %"class.embeddings::PointerBase" }
+%"class.embeddings::SCM_TileSpmem" = type { %"class.embeddings::MemorySpace" }
+%"class.embeddings::SCTY_V8S32" = type { %"class.embeddings::BasicType" }
+%"class.embeddings::RadixSortIteration" = type { i32, %"class.embeddings::ScratchpadArray"* }
+%"class.embeddings::SmemPointer" = type { %"class.embeddings::PointerBase" }
+%"class.embeddings::SmemArray" = type { %"class.embeddings::ScratchpadArray" }
+%"class.embeddings::SmemArray.0" = type { %"class.embeddings::ScratchpadArray" }
+%"class.embeddings::SmemArray.1" = type { %"class.embeddings::ScratchpadArray" }
+%"class.embeddings::TileSpmemVectorArray.2" = type { %"class.embeddings::ScratchpadArray" }
+%"class.embeddings::TileSpmemVectorArray.3" = type { %"class.embeddings::ScratchpadArray" }
+%"class.embeddings::SCM_Smem" = type { %"class.embeddings::MemorySpace" }
+%"struct.embeddings::UniqueResult" = type { <8 x i32>, <8 x i32>, <8 x i32> }
+%"struct.embeddings::AddScanResult" = type { <8 x i32>, <8 x i32> }
+
+$_ZN10embeddings25SparsecoreMemoryAllocatorC2EPiS2_PU5AS201Dv8_iS5_PU5AS204ViS7_ = comdat any
+
+$_ZN10embeddings14PointerFactoryIPU5AS201Dv8_iE6CreateES3_ = comdat any
+
+$_ZN10embeddings11PointerBaseC2ERKS0_ = comdat any
+
+$_ZN10embeddings20TileSpmemVectorArrayIiEC2ENS_11PointerBaseEi = comdat any
+
+$_ZN10embeddings20TileSpmemVectorArrayIiEC2ERKS1_ = comdat any
+
+$_ZN10embeddings37RadixSortKeyValueTileSpmemToTileSpmemILi4EEC2EPNS_25SparsecoreMemoryAllocatorENS_20TileSpmemVectorArrayIiEES5_ = comdat any
+
+$_ZN10embeddings37RadixSortKeyValueTileSpmemToTileSpmemILi4EE4SortEv = comdat any
+
+$_ZN10embeddings4impl15StaticAllocatorC2Eii = comdat any
+
+$_ZN10embeddings11MemorySpaceC2ERKS0_ = comdat any
+
+$_ZN10embeddings9BasicTypeC2ERKS0_ = comdat any
+
+$_ZN10embeddings9BaseArrayC2ERKS0_ = comdat any
+
+$_ZN10embeddings15ScratchpadArrayC2ENS_11PointerBaseEi = comdat any
+
+$_ZN10embeddings9BaseArrayC2ENS_11PointerBaseEi = comdat any
+
+$_ZN10embeddings11ToBasicTypeIDv8_iE10basic_typeEv = comdat any
+
+$_ZN10embeddings16TileSpmemPointerC2EPU5AS201vNS_9BasicTypeE = comdat any
+
+$_ZN10embeddings11PointerBaseC2EOS0_ = comdat any
+
+$_ZN10embeddings10SCTY_V8S32C2Ev = comdat any
+
+$_ZN10embeddings9BasicTypeC2ENS_19SparsecoreBasicTypeE = comdat any
+
+$_ZN10embeddings11PointerBaseC2ENS_9BasicTypeEPU5AS201v = comdat any
+
+$_ZN10embeddings13SCM_TileSpmemC2Ev = comdat any
+
+$_ZN10embeddings11PointerBase6AnyPtrC2EPU5AS201v = comdat any
+
+$_ZN10embeddings11MemorySpaceC2ENS_21SparsecoreMemorySpaceE = comdat any
+
+$_ZN10embeddings15ScratchpadArrayC2ERKS0_ = comdat any
+
+$_ZN10embeddings17ScratchpadFactory6CreateEPNS_25SparsecoreMemoryAllocatorENS_11MemorySpaceENS_9BasicTypeEi = comdat any
+
+$_ZNK10embeddings9BaseArray11ElementTypeEv = comdat any
+
+$_ZNK10embeddings9BaseArray8ElementsEv = comdat any
+
+$_ZNK10embeddings9BasicType11SizeInBytesEv = comdat any
+
+$_ZNK10embeddings11MemorySpace15WordSizeInBytesEv = comdat any
+
+$_ZNK10embeddings11MemorySpace12memory_spaceEv = comdat any
+
+$_ZNK10embeddings9BasicType4typeEv = comdat any
+
+$_ZN10embeddings25SparsecoreMemoryAllocator12AllocateSmemEi = comdat any
+
+$_ZN10embeddings11SmemPointerC2EPvNS_9BasicTypeE = comdat any
+
+$_ZN10embeddings9SmemArrayIiEC2ENS_11PointerBaseEi = comdat any
+
+$_ZN10embeddings15ScratchpadArrayC2EOS0_ = comdat any
+
+$_ZN10embeddings9SmemArrayIjEC2ENS_11PointerBaseEi = comdat any
+
+$_ZN10embeddings9SmemArrayIfEC2ENS_11PointerBaseEi = comdat any
+
+$_ZN10embeddings25SparsecoreMemoryAllocator17AllocateTileSpmemEi = comdat any
+
+$_ZN10embeddings20TileSpmemVectorArrayIjEC2ENS_11PointerBaseEi = comdat any
+
+$_ZN10embeddings20TileSpmemVectorArrayIfEC2ENS_11PointerBaseEi = comdat any
+
+$_ZN10embeddings9word_sizeIPvE5bytesEv = comdat any
+
+$_ZN10embeddings9word_sizeIPU5AS201vE5bytesEv = comdat any
+
+$_ZN10embeddings9word_sizeIPU5AS202vE5bytesEv = comdat any
+
+$_ZN10embeddings9word_sizeIPU5AS203vE5bytesEv = comdat any
+
+$_ZN10embeddings9word_sizeIPU5AS204vE5bytesEv = comdat any
+
+$_ZN10embeddings4impl15StaticAllocator8AllocateEi = comdat any
+
+$_ZN10embeddings11PointerBaseC2ENS_9BasicTypeEPv = comdat any
+
+$_ZN10embeddings8SCM_SmemC2Ev = comdat any
+
+$_ZN10embeddings11PointerBase6AnyPtrC2EPv = comdat any
+
+$_ZN10embeddings9BaseArrayC2EOS0_ = comdat any
+
+$_ZNK10embeddings11PointerBase10value_typeEv = comdat any
+
+$_ZN10embeddings18RadixSortIterationIiLi4EEC2EPNS_15ScratchpadArrayE = comdat any
+
+$_ZN10embeddings18RadixSortIterationIiLi4EE12ClearBucketsEv = comdat any
+
+$_ZN10embeddings18RadixSortIterationIiLi4EE13HistogramKeysEiNS_20TileSpmemVectorArrayIiEE = comdat any
+
+$_ZN10embeddings4CastINS_20TileSpmemVectorArrayIiEENS_15ScratchpadArrayEEENS_15cast_retty_implIT_T0_E8ret_typeERKS6_ = comdat any
+
+$_ZN10embeddings18RadixSortIterationIiLi4EE11ScanBucketsEv = comdat any
+
+$_ZN10embeddings18RadixSortIterationIiLi4EE14RankAndPermuteIiEEvibNS_20TileSpmemVectorArrayIiEENS3_IT_EEPS4_PS6_ = comdat any
+
+$_ZN10embeddings20TileSpmemVectorArrayIiEclEi = comdat any
+
+$_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv = comdat any
+
+$_ZN10embeddings4CastINS_16TileSpmemPointerENS_11PointerBaseEEENS_15cast_retty_implIT_T0_E8ret_typeERKS5_ = comdat any
+
+$_ZNK10embeddings9BaseArray7BasePtrEv = comdat any
+
+$_ZNK10embeddings16TileSpmemPointer6RawPtrEv = comdat any
+
+$_ZN10embeddings18RadixSortIterationIiLi4EE9GetDigitsEDv8_ii = comdat any
+
+$_ZN10embeddings12VectorUniqueIDv8_iEENS_12UniqueResultES1_T_ = comdat any
+
+$_ZN10embeddings3tpuIDv8_iE6uniqueEPS1_S3_S1_ = comdat any
+
+$_ZN10embeddings13VectorAddScanIDv8_iEENS_13AddScanResultIT_EES1_S3_ = comdat any
+
+$_ZN10embeddings3tpuIDv8_iE7addscanEPS1_S3_ = comdat any
+
+$_ZN10embeddings18RadixSortIterationIiLi4EE16DecrementBucketsEv = comdat any
+
+$_ZN10embeddings18RadixSortIterationIiLi4EE16IncrementBucketsEv = comdat any
+
+$_ZN10embeddings18RadixSortIterationIiLi4EE12AddToBucketsEi = comdat any
+
+@__sc_scs_entry = dso_local alias i32, bitcast (void ()* @scs to i32*)
+@__sc_tile_access_entry = dso_local alias i32, bitcast (void ()* @tile_access to i32*)
+@__sc_tile_execute_entry = dso_local alias i32, bitcast (void ()* @tile_execute to i32*)
+
+; Function Attrs: mustprogress nounwind
+define dso_local void @tile_access() #0 section ".text.tile_access" {
+ ret void
+}
+
+; Function Attrs: mustprogress
+define dso_local void @tile_execute() #1 section ".text.tile_execute" {
+ %1 = alloca i32, align 4
+ %2 = alloca i32*, align 4
+ %3 = alloca i32*, align 4
+ %4 = alloca i32, align 4
+ %5 = alloca i32 addrspace(204)*, align 4
+ %6 = alloca i32 addrspace(204)*, align 4
+ %7 = alloca i32, align 4
+ %8 = alloca <8 x i32> addrspace(201)*, align 4
+ %9 = alloca <8 x i32> addrspace(201)*, align 4
+ %10 = alloca %"class.embeddings::SparsecoreMemoryAllocator", align 4
+ %11 = alloca %"class.embeddings::PointerBase", align 4
+ %12 = alloca %"class.embeddings::PointerBase", align 4
+ %13 = alloca i32, align 4
+ %14 = alloca %"class.embeddings::TileSpmemVectorArray", align 4
+ %15 = alloca %"class.embeddings::PointerBase", align 4
+ %16 = alloca %"class.embeddings::TileSpmemVectorArray", align 4
+ %17 = alloca %"class.embeddings::PointerBase", align 4
+ %18 = alloca %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem", align 4
+ %19 = alloca %"class.embeddings::TileSpmemVectorArray", align 4
+ %20 = alloca %"class.embeddings::TileSpmemVectorArray", align 4
+ %21 = bitcast i32* %1 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %21) #3
+ store i32 512, i32* %1, align 4, !tbaa !3
+ %22 = bitcast i32** %2 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %22) #3
+ %23 = call i32* @llvm.tpu.alloca.smem(i32 512)
+ store i32* %23, i32** %2, align 4, !tbaa !7
+ %24 = bitcast i32** %3 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %24) #3
+ %25 = load i32*, i32** %2, align 4, !tbaa !7
+ %26 = getelementptr inbounds i32, i32* %25, i32 512
+ store i32* %26, i32** %3, align 4, !tbaa !7
+ %27 = bitcast i32* %4 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %27) #3
+ store i32 10, i32* %4, align 4, !tbaa !3
+ %28 = bitcast i32 addrspace(204)** %5 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %28) #3
+ %29 = call i32 addrspace(204)* @llvm.tpu.alloca.sflag(i32 10)
+ store i32 addrspace(204)* %29, i32 addrspace(204)** %5, align 4, !tbaa !7
+ %30 = bitcast i32 addrspace(204)** %6 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %30) #3
+ %31 = load i32 addrspace(204)*, i32 addrspace(204)** %5, align 4, !tbaa !7
+ %32 = getelementptr inbounds i32, i32 addrspace(204)* %31, i32 10
+ store i32 addrspace(204)* %32, i32 addrspace(204)** %6, align 4, !tbaa !7
+ %33 = bitcast i32* %7 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %33) #3
+ store i32 65535, i32* %7, align 4, !tbaa !3
+ %34 = bitcast <8 x i32> addrspace(201)** %8 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %34) #3
+ %35 = call i32 addrspace(201)* @llvm.tpu.alloca.tilespmem(i32 65535)
+ %st = bitcast i32 addrspace(201)* %35 to <8 x i32> addrspace(201)*
+ store <8 x i32> addrspace(201)* %st, <8 x i32> addrspace(201)** %8, align 4, !tbaa !7
+ %36 = bitcast <8 x i32> addrspace(201)** %9 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %36) #3
+ %37 = load <8 x i32> addrspace(201)*, <8 x i32> addrspace(201)** %8, align 4, !tbaa !7
+ %38 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %37, i32 65535
+ store <8 x i32> addrspace(201)* %38, <8 x i32> addrspace(201)** %9, align 4, !tbaa !7
+ %39 = call i32 addrspace(201)* @llvm.tpu.alloca.tilespmem(i32 32768)
+ %40 = bitcast %"class.embeddings::SparsecoreMemoryAllocator"* %10 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 36, i8* %40) #3
+ %41 = load i32*, i32** %2, align 4, !tbaa !7
+ %42 = load i32*, i32** %3, align 4, !tbaa !7
+ %43 = load <8 x i32> addrspace(201)*, <8 x i32> addrspace(201)** %8, align 4, !tbaa !7
+ %44 = load <8 x i32> addrspace(201)*, <8 x i32> addrspace(201)** %9, align 4, !tbaa !7
+ %45 = load i32 addrspace(204)*, i32 addrspace(204)** %5, align 4, !tbaa !7
+ %46 = load i32 addrspace(204)*, i32 addrspace(204)** %6, align 4, !tbaa !7
+ call void @_ZN10embeddings25SparsecoreMemoryAllocatorC2EPiS2_PU5AS201Dv8_iS5_PU5AS204ViS7_(%"class.embeddings::SparsecoreMemoryAllocator"* noundef nonnull align 4 dereferenceable(36) %10, i32* noundef %41, i32* noundef %42, <8 x i32> addrspace(201)* noundef %43, <8 x i32> addrspace(201)* noundef %44, i32 addrspace(204)* noundef %45, i32 addrspace(204)* noundef %46) #21
+ %47 = bitcast %"class.embeddings::PointerBase"* %11 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 12, i8* %47) #3
+ %48 = call noundef i32 @_ZN12_GLOBAL__N_19ParameterEi(i32 noundef 0) #21
+ %49 = inttoptr i32 %48 to <8 x i32> addrspace(201)*
+ call void @_ZN10embeddings14PointerFactoryIPU5AS201Dv8_iE6CreateES3_(%"class.embeddings::PointerBase"* sret(%"class.embeddings::PointerBase") align 4 %11, <8 x i32> addrspace(201)* noundef %49) #21
+ %50 = bitcast %"class.embeddings::PointerBase"* %12 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 12, i8* %50) #3
+ %51 = call noundef i32 @_ZN12_GLOBAL__N_19ParameterEi(i32 noundef 1) #21
+ %52 = inttoptr i32 %51 to <8 x i32> addrspace(201)*
+ call void @_ZN10embeddings14PointerFactoryIPU5AS201Dv8_iE6CreateES3_(%"class.embeddings::PointerBase"* sret(%"class.embeddings::PointerBase") align 4 %12, <8 x i32> addrspace(201)* noundef %52) #21
+ %53 = bitcast i32* %13 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %53) #3
+ %54 = call noundef i32 @_ZN12_GLOBAL__N_19ParameterEi(i32 noundef 2) #21
+ store i32 %54, i32* %13, align 4, !tbaa !3
+ %55 = bitcast %"class.embeddings::TileSpmemVectorArray"* %14 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %55) #3
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %15, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %11) #21
+ %56 = load i32, i32* %13, align 4, !tbaa !3
+ %57 = sdiv i32 %56, 8
+ call void @_ZN10embeddings20TileSpmemVectorArrayIiEC2ENS_11PointerBaseEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %14, %"class.embeddings::PointerBase"* noundef %15, i32 noundef %57) #21
+ %58 = bitcast %"class.embeddings::TileSpmemVectorArray"* %16 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %58) #3
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %17, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %12) #21
+ %59 = load i32, i32* %13, align 4, !tbaa !3
+ %60 = sdiv i32 %59, 8
+ call void @_ZN10embeddings20TileSpmemVectorArrayIiEC2ENS_11PointerBaseEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %16, %"class.embeddings::PointerBase"* noundef %17, i32 noundef %60) #21
+ %61 = bitcast %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* %18 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 88, i8* %61) #3
+ call void @_ZN10embeddings20TileSpmemVectorArrayIiEC2ERKS1_(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %19, %"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %14) #21
+ call void @_ZN10embeddings20TileSpmemVectorArrayIiEC2ERKS1_(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %20, %"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %16) #21
+ call void @_ZN10embeddings37RadixSortKeyValueTileSpmemToTileSpmemILi4EEC2EPNS_25SparsecoreMemoryAllocatorENS_20TileSpmemVectorArrayIiEES5_(%"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* noundef nonnull align 4 dereferenceable(88) %18, %"class.embeddings::SparsecoreMemoryAllocator"* noundef %10, %"class.embeddings::TileSpmemVectorArray"* noundef %19, %"class.embeddings::TileSpmemVectorArray"* noundef %20) #21
+ call void @_ZN10embeddings37RadixSortKeyValueTileSpmemToTileSpmemILi4EE4SortEv(%"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* noundef nonnull align 4 dereferenceable(88) %18) #21
+ call void @_ZN12_GLOBAL__N_16ReturnEii(i32 noundef 1, i32 noundef 0) #21
+ %62 = bitcast %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* %18 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 88, i8* %62) #3
+ %63 = bitcast %"class.embeddings::TileSpmemVectorArray"* %16 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %63) #3
+ %64 = bitcast %"class.embeddings::TileSpmemVectorArray"* %14 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %64) #3
+ %65 = bitcast i32* %13 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %65) #3
+ %66 = bitcast %"class.embeddings::PointerBase"* %12 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 12, i8* %66) #3
+ %67 = bitcast %"class.embeddings::PointerBase"* %11 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 12, i8* %67) #3
+ %68 = bitcast %"class.embeddings::SparsecoreMemoryAllocator"* %10 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 36, i8* %68) #3
+ %69 = bitcast <8 x i32> addrspace(201)** %9 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %69) #3
+ %70 = bitcast <8 x i32> addrspace(201)** %8 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %70) #3
+ %71 = bitcast i32* %7 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %71) #3
+ %72 = bitcast i32 addrspace(204)** %6 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %72) #3
+ %73 = bitcast i32 addrspace(204)** %5 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %73) #3
+ %74 = bitcast i32* %4 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %74) #3
+ %75 = bitcast i32** %3 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %75) #3
+ %76 = bitcast i32** %2 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %76) #3
+ %77 = bitcast i32* %1 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %77) #3
+ ret void
+}
+
+; Function Attrs: argmemonly nocallback nofree nosync nounwind willreturn
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #2
+
+; Function Attrs: nounwind
+declare i32* @llvm.tpu.alloca.smem(i32) #3
+
+; Function Attrs: nounwind
+declare i32 addrspace(204)* @llvm.tpu.alloca.sflag(i32) #3
+
+; Function Attrs: nounwind
+declare i32 addrspace(201)* @llvm.tpu.alloca.tilespmem(i32) #3
+
+; Function Attrs: alwaysinline
+define linkonce_odr dso_local void @_ZN10embeddings25SparsecoreMemoryAllocatorC2EPiS2_PU5AS201Dv8_iS5_PU5AS204ViS7_(%"class.embeddings::SparsecoreMemoryAllocator"* noundef nonnull align 4 dereferenceable(36) %0, i32* noundef %1, i32* noundef %2, <8 x i32> addrspace(201)* noundef %3, <8 x i32> addrspace(201)* noundef %4, i32 addrspace(204)* noundef %5, i32 addrspace(204)* noundef %6) unnamed_addr #4 comdat align 2 {
+ %8 = alloca %"class.embeddings::SparsecoreMemoryAllocator"*, align 4
+ %9 = alloca i32*, align 4
+ %10 = alloca i32*, align 4
+ %11 = alloca <8 x i32> addrspace(201)*, align 4
+ %12 = alloca <8 x i32> addrspace(201)*, align 4
+ %13 = alloca i32 addrspace(204)*, align 4
+ %14 = alloca i32 addrspace(204)*, align 4
+ store %"class.embeddings::SparsecoreMemoryAllocator"* %0, %"class.embeddings::SparsecoreMemoryAllocator"** %8, align 4, !tbaa !7
+ store i32* %1, i32** %9, align 4, !tbaa !7
+ store i32* %2, i32** %10, align 4, !tbaa !7
+ store <8 x i32> addrspace(201)* %3, <8 x i32> addrspace(201)** %11, align 4, !tbaa !7
+ store <8 x i32> addrspace(201)* %4, <8 x i32> addrspace(201)** %12, align 4, !tbaa !7
+ store i32 addrspace(204)* %5, i32 addrspace(204)** %13, align 4, !tbaa !7
+ store i32 addrspace(204)* %6, i32 addrspace(204)** %14, align 4, !tbaa !7
+ %15 = load %"class.embeddings::SparsecoreMemoryAllocator"*, %"class.embeddings::SparsecoreMemoryAllocator"** %8, align 4
+ %16 = getelementptr inbounds %"class.embeddings::SparsecoreMemoryAllocator", %"class.embeddings::SparsecoreMemoryAllocator"* %15, i32 0, i32 0
+ %17 = load i32*, i32** %9, align 4, !tbaa !7
+ store i32* %17, i32** %16, align 4, !tbaa !9
+ %18 = getelementptr inbounds %"class.embeddings::SparsecoreMemoryAllocator", %"class.embeddings::SparsecoreMemoryAllocator"* %15, i32 0, i32 1
+ %19 = load <8 x i32> addrspace(201)*, <8 x i32> addrspace(201)** %11, align 4, !tbaa !7
+ store <8 x i32> addrspace(201)* %19, <8 x i32> addrspace(201)** %18, align 4, !tbaa !12
+ %20 = getelementptr inbounds %"class.embeddings::SparsecoreMemoryAllocator", %"class.embeddings::SparsecoreMemoryAllocator"* %15, i32 0, i32 2
+ %21 = load i32 addrspace(204)*, i32 addrspace(204)** %13, align 4, !tbaa !7
+ store i32 addrspace(204)* %21, i32 addrspace(204)** %20, align 4, !tbaa !13
+ %22 = getelementptr inbounds %"class.embeddings::SparsecoreMemoryAllocator", %"class.embeddings::SparsecoreMemoryAllocator"* %15, i32 0, i32 3
+ %23 = load i32*, i32** %10, align 4, !tbaa !7
+ %24 = load i32*, i32** %9, align 4, !tbaa !7
+ %25 = ptrtoint i32* %23 to i32
+ %26 = ptrtoint i32* %24 to i32
+ %27 = sub i32 %25, %26
+ %28 = sdiv exact i32 %27, 4
+ call void @_ZN10embeddings4impl15StaticAllocatorC2Eii(%"class.embeddings::impl::StaticAllocator"* noundef nonnull align 4 dereferenceable(8) %22, i32 noundef 0, i32 noundef %28) #22
+ %29 = getelementptr inbounds %"class.embeddings::SparsecoreMemoryAllocator", %"class.embeddings::SparsecoreMemoryAllocator"* %15, i32 0, i32 4
+ %30 = load <8 x i32> addrspace(201)*, <8 x i32> addrspace(201)** %12, align 4, !tbaa !7
+ %31 = getelementptr inbounds %"class.embeddings::SparsecoreMemoryAllocator", %"class.embeddings::SparsecoreMemoryAllocator"* %15, i32 0, i32 1
+ %32 = load <8 x i32> addrspace(201)*, <8 x i32> addrspace(201)** %31, align 4, !tbaa !12
+ %33 = ptrtoint <8 x i32> addrspace(201)* %30 to i32
+ %34 = ptrtoint <8 x i32> addrspace(201)* %32 to i32
+ %35 = sub i32 %33, %34
+ %36 = sdiv exact i32 %35, 32
+ call void @_ZN10embeddings4impl15StaticAllocatorC2Eii(%"class.embeddings::impl::StaticAllocator"* noundef nonnull align 4 dereferenceable(8) %29, i32 noundef 0, i32 noundef %36) #22
+ %37 = getelementptr inbounds %"class.embeddings::SparsecoreMemoryAllocator", %"class.embeddings::SparsecoreMemoryAllocator"* %15, i32 0, i32 5
+ %38 = load i32 addrspace(204)*, i32 addrspace(204)** %14, align 4, !tbaa !7
+ %39 = load i32 addrspace(204)*, i32 addrspace(204)** %13, align 4, !tbaa !7
+ %40 = ptrtoint i32 addrspace(204)* %38 to i32
+ %41 = ptrtoint i32 addrspace(204)* %39 to i32
+ %42 = sub i32 %40, %41
+ %43 = sdiv exact i32 %42, 4
+ call void @_ZN10embeddings4impl15StaticAllocatorC2Eii(%"class.embeddings::impl::StaticAllocator"* noundef nonnull align 4 dereferenceable(8) %37, i32 noundef 0, i32 noundef %43) #22
+ ret void
+}
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local void @_ZN10embeddings14PointerFactoryIPU5AS201Dv8_iE6CreateES3_(%"class.embeddings::PointerBase"* noalias sret(%"class.embeddings::PointerBase") align 4 %0, <8 x i32> addrspace(201)* noundef %1) #5 comdat align 2 {
+ %3 = alloca i8*, align 4
+ %4 = alloca <8 x i32> addrspace(201)*, align 4
+ %5 = alloca %"class.embeddings::TileSpmemPointer", align 4
+ %6 = alloca %"class.embeddings::BasicType", align 4
+ %7 = bitcast %"class.embeddings::PointerBase"* %0 to i8*
+ store i8* %7, i8** %3, align 4
+ store <8 x i32> addrspace(201)* %1, <8 x i32> addrspace(201)** %4, align 4, !tbaa !7
+ %8 = bitcast %"class.embeddings::TileSpmemPointer"* %5 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 12, i8* %8) #3
+ %9 = load <8 x i32> addrspace(201)*, <8 x i32> addrspace(201)** %4, align 4, !tbaa !7
+ %10 = bitcast <8 x i32> addrspace(201)* %9 to i8 addrspace(201)*
+ call void @_ZN10embeddings11ToBasicTypeIDv8_iE10basic_typeEv(%"class.embeddings::BasicType"* sret(%"class.embeddings::BasicType") align 4 %6) #22
+ call void @_ZN10embeddings16TileSpmemPointerC2EPU5AS201vNS_9BasicTypeE(%"class.embeddings::TileSpmemPointer"* noundef nonnull align 4 dereferenceable(12) %5, i8 addrspace(201)* noundef %10, %"class.embeddings::BasicType"* noundef %6) #22
+ %11 = bitcast %"class.embeddings::TileSpmemPointer"* %5 to %"class.embeddings::PointerBase"*
+ call void @_ZN10embeddings11PointerBaseC2EOS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %0, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %11) #22
+ %12 = bitcast %"class.embeddings::TileSpmemPointer"* %5 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 12, i8* %12) #3
+ ret void
+}
+
+; Function Attrs: mustprogress nounwind
+define internal noundef i32 @_ZN12_GLOBAL__N_19ParameterEi(i32 noundef %0) #6 {
+ %2 = alloca i32, align 4
+ %3 = alloca i32*, align 4
+ store i32 %0, i32* %2, align 4, !tbaa !3
+ %4 = bitcast i32** %3 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #3
+ %5 = load i32, i32* %2, align 4, !tbaa !3
+ %6 = add nsw i32 256, %5
+ %7 = inttoptr i32 %6 to i32*
+ store i32* %7, i32** %3, align 4, !tbaa !7
+ %8 = load i32*, i32** %3, align 4, !tbaa !7
+ %9 = load i32, i32* %8, align 4, !tbaa !3
+ %10 = bitcast i32** %3 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %10) #3
+ ret i32 %9
+}
+
+; Function Attrs: inlinehint
+define linkonce_odr dso_local void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %0, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %1) unnamed_addr #7 comdat align 2 {
+ %3 = alloca %"class.embeddings::PointerBase"*, align 4
+ %4 = alloca %"class.embeddings::PointerBase"*, align 4
+ store %"class.embeddings::PointerBase"* %0, %"class.embeddings::PointerBase"** %3, align 4, !tbaa !7
+ store %"class.embeddings::PointerBase"* %1, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !7
+ %5 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %3, align 4
+ %6 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %5, i32 0, i32 0
+ %7 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !7
+ %8 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %7, i32 0, i32 0
+ call void @_ZN10embeddings11MemorySpaceC2ERKS0_(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %6, %"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %8) #22
+ %9 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %5, i32 0, i32 1
+ %10 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !7
+ %11 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %10, i32 0, i32 1
+ call void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %9, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %11) #22
+ %12 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %5, i32 0, i32 2
+ %13 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !7
+ %14 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %13, i32 0, i32 2
+ %15 = bitcast %"union.embeddings::PointerBase::AnyPtr"* %12 to i8*
+ %16 = bitcast %"union.embeddings::PointerBase::AnyPtr"* %14 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %15, i8* align 4 %16, i32 4, i1 false), !tbaa.struct !14
+ ret void
+}
+
+; Function Attrs: alwaysinline
+define linkonce_odr dso_local void @_ZN10embeddings20TileSpmemVectorArrayIiEC2ENS_11PointerBaseEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::PointerBase"* noundef %1, i32 noundef %2) unnamed_addr #4 comdat align 2 {
+ %4 = alloca %"class.embeddings::TileSpmemVectorArray"*, align 4
+ %5 = alloca i32, align 4
+ %6 = alloca %"class.embeddings::PointerBase", align 4
+ store %"class.embeddings::TileSpmemVectorArray"* %0, %"class.embeddings::TileSpmemVectorArray"** %4, align 4, !tbaa !7
+ store i32 %2, i32* %5, align 4, !tbaa !3
+ %7 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %4, align 4
+ %8 = bitcast %"class.embeddings::TileSpmemVectorArray"* %7 to %"class.embeddings::ScratchpadArray"*
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %6, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %1) #22
+ %9 = load i32, i32* %5, align 4, !tbaa !3
+ call void @_ZN10embeddings15ScratchpadArrayC2ENS_11PointerBaseEi(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %8, %"class.embeddings::PointerBase"* noundef %6, i32 noundef %9) #22
+ ret void
+}
+
+; Function Attrs: inlinehint
+define linkonce_odr dso_local void @_ZN10embeddings20TileSpmemVectorArrayIiEC2ERKS1_(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %1) unnamed_addr #7 comdat align 2 {
+ %3 = alloca %"class.embeddings::TileSpmemVectorArray"*, align 4
+ %4 = alloca %"class.embeddings::TileSpmemVectorArray"*, align 4
+ store %"class.embeddings::TileSpmemVectorArray"* %0, %"class.embeddings::TileSpmemVectorArray"** %3, align 4, !tbaa !7
+ store %"class.embeddings::TileSpmemVectorArray"* %1, %"class.embeddings::TileSpmemVectorArray"** %4, align 4, !tbaa !7
+ %5 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %3, align 4
+ %6 = bitcast %"class.embeddings::TileSpmemVectorArray"* %5 to %"class.embeddings::ScratchpadArray"*
+ %7 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %4, align 4, !tbaa !7
+ %8 = bitcast %"class.embeddings::TileSpmemVectorArray"* %7 to %"class.embeddings::ScratchpadArray"*
+ call void @_ZN10embeddings15ScratchpadArrayC2ERKS0_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %6, %"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %8) #22
+ ret void
+}
+
+; Function Attrs: alwaysinline
+define linkonce_odr dso_local void @_ZN10embeddings37RadixSortKeyValueTileSpmemToTileSpmemILi4EEC2EPNS_25SparsecoreMemoryAllocatorENS_20TileSpmemVectorArrayIiEES5_(%"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* noundef nonnull align 4 dereferenceable(88) %0, %"class.embeddings::SparsecoreMemoryAllocator"* noundef %1, %"class.embeddings::TileSpmemVectorArray"* noundef %2, %"class.embeddings::TileSpmemVectorArray"* noundef %3) unnamed_addr #4 comdat align 2 {
+ %5 = alloca %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"*, align 4
+ %6 = alloca %"class.embeddings::SparsecoreMemoryAllocator"*, align 4
+ %7 = alloca %"class.embeddings::MemorySpace", align 4
+ %8 = alloca %"class.embeddings::SCM_TileSpmem", align 4
+ %9 = alloca %"class.embeddings::BasicType", align 4
+ %10 = alloca %"class.embeddings::MemorySpace", align 4
+ %11 = alloca %"class.embeddings::SCM_TileSpmem", align 4
+ %12 = alloca %"class.embeddings::BasicType", align 4
+ %13 = alloca %"class.embeddings::MemorySpace", align 4
+ %14 = alloca %"class.embeddings::SCM_TileSpmem", align 4
+ %15 = alloca %"class.embeddings::BasicType", align 4
+ %16 = alloca %"class.embeddings::SCTY_V8S32", align 4
+ store %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* %0, %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"** %5, align 4, !tbaa !7
+ store %"class.embeddings::SparsecoreMemoryAllocator"* %1, %"class.embeddings::SparsecoreMemoryAllocator"** %6, align 4, !tbaa !7
+ %17 = load %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"*, %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"** %5, align 4
+ %18 = getelementptr inbounds %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem", %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* %17, i32 0, i32 0
+ store i32 16, i32* %18, align 4, !tbaa !15
+ %19 = getelementptr inbounds %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem", %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* %17, i32 0, i32 1
+ %20 = load %"class.embeddings::SparsecoreMemoryAllocator"*, %"class.embeddings::SparsecoreMemoryAllocator"** %6, align 4, !tbaa !7
+ store %"class.embeddings::SparsecoreMemoryAllocator"* %20, %"class.embeddings::SparsecoreMemoryAllocator"** %19, align 4, !tbaa !18
+ %21 = getelementptr inbounds %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem", %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* %17, i32 0, i32 2
+ %22 = bitcast %"class.embeddings::TileSpmemVectorArray"* %2 to %"class.embeddings::ScratchpadArray"*
+ call void @_ZN10embeddings15ScratchpadArrayC2ERKS0_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %21, %"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %22) #22
+ %23 = getelementptr inbounds %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem", %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* %17, i32 0, i32 3
+ %24 = bitcast %"class.embeddings::TileSpmemVectorArray"* %3 to %"class.embeddings::ScratchpadArray"*
+ call void @_ZN10embeddings15ScratchpadArrayC2ERKS0_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %23, %"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %24) #22
+ %25 = getelementptr inbounds %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem", %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* %17, i32 0, i32 4
+ %26 = load %"class.embeddings::SparsecoreMemoryAllocator"*, %"class.embeddings::SparsecoreMemoryAllocator"** %6, align 4, !tbaa !7
+ %27 = bitcast %"class.embeddings::SCM_TileSpmem"* %8 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %27) #3
+ call void @_ZN10embeddings13SCM_TileSpmemC2Ev(%"class.embeddings::SCM_TileSpmem"* noundef nonnull align 4 dereferenceable(4) %8) #22
+ %28 = bitcast %"class.embeddings::SCM_TileSpmem"* %8 to %"class.embeddings::MemorySpace"*
+ call void @_ZN10embeddings11MemorySpaceC2ERKS0_(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %7, %"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %28) #22
+ %29 = bitcast %"class.embeddings::TileSpmemVectorArray"* %2 to %"class.embeddings::BaseArray"*
+ call void @_ZNK10embeddings9BaseArray11ElementTypeEv(%"class.embeddings::BasicType"* sret(%"class.embeddings::BasicType") align 4 %9, %"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %29) #22
+ %30 = bitcast %"class.embeddings::TileSpmemVectorArray"* %2 to %"class.embeddings::BaseArray"*
+ %31 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %30) #22
+ call void @_ZN10embeddings17ScratchpadFactory6CreateEPNS_25SparsecoreMemoryAllocatorENS_11MemorySpaceENS_9BasicTypeEi(%"class.embeddings::ScratchpadArray"* sret(%"class.embeddings::ScratchpadArray") align 4 %25, %"class.embeddings::SparsecoreMemoryAllocator"* noundef %26, %"class.embeddings::MemorySpace"* noundef %7, %"class.embeddings::BasicType"* noundef %9, i32 noundef %31) #22
+ %32 = bitcast %"class.embeddings::SCM_TileSpmem"* %8 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #3
+ %33 = getelementptr inbounds %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem", %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* %17, i32 0, i32 5
+ %34 = load %"class.embeddings::SparsecoreMemoryAllocator"*, %"class.embeddings::SparsecoreMemoryAllocator"** %6, align 4, !tbaa !7
+ %35 = bitcast %"class.embeddings::SCM_TileSpmem"* %11 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %35) #3
+ call void @_ZN10embeddings13SCM_TileSpmemC2Ev(%"class.embeddings::SCM_TileSpmem"* noundef nonnull align 4 dereferenceable(4) %11) #22
+ %36 = bitcast %"class.embeddings::SCM_TileSpmem"* %11 to %"class.embeddings::MemorySpace"*
+ call void @_ZN10embeddings11MemorySpaceC2ERKS0_(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %10, %"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %36) #22
+ %37 = bitcast %"class.embeddings::TileSpmemVectorArray"* %3 to %"class.embeddings::BaseArray"*
+ call void @_ZNK10embeddings9BaseArray11ElementTypeEv(%"class.embeddings::BasicType"* sret(%"class.embeddings::BasicType") align 4 %12, %"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %37) #22
+ %38 = bitcast %"class.embeddings::TileSpmemVectorArray"* %3 to %"class.embeddings::BaseArray"*
+ %39 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %38) #22
+ call void @_ZN10embeddings17ScratchpadFactory6CreateEPNS_25SparsecoreMemoryAllocatorENS_11MemorySpaceENS_9BasicTypeEi(%"class.embeddings::ScratchpadArray"* sret(%"class.embeddings::ScratchpadArray") align 4 %33, %"class.embeddings::SparsecoreMemoryAllocator"* noundef %34, %"class.embeddings::MemorySpace"* noundef %10, %"class.embeddings::BasicType"* noundef %12, i32 noundef %39) #22
+ %40 = bitcast %"class.embeddings::SCM_TileSpmem"* %11 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %40) #3
+ %41 = getelementptr inbounds %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem", %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* %17, i32 0, i32 6
+ %42 = load %"class.embeddings::SparsecoreMemoryAllocator"*, %"class.embeddings::SparsecoreMemoryAllocator"** %6, align 4, !tbaa !7
+ %43 = bitcast %"class.embeddings::SCM_TileSpmem"* %14 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %43) #3
+ call void @_ZN10embeddings13SCM_TileSpmemC2Ev(%"class.embeddings::SCM_TileSpmem"* noundef nonnull align 4 dereferenceable(4) %14) #22
+ %44 = bitcast %"class.embeddings::SCM_TileSpmem"* %14 to %"class.embeddings::MemorySpace"*
+ call void @_ZN10embeddings11MemorySpaceC2ERKS0_(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %13, %"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %44) #22
+ %45 = bitcast %"class.embeddings::SCTY_V8S32"* %16 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %45) #3
+ call void @_ZN10embeddings10SCTY_V8S32C2Ev(%"class.embeddings::SCTY_V8S32"* noundef nonnull align 4 dereferenceable(4) %16) #22
+ %46 = bitcast %"class.embeddings::SCTY_V8S32"* %16 to %"class.embeddings::BasicType"*
+ call void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %15, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %46) #22
+ %47 = getelementptr inbounds %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem", %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* %17, i32 0, i32 0
+ %48 = load i32, i32* %47, align 4, !tbaa !15
+ %49 = mul nsw i32 8, %48
+ %50 = sdiv i32 %49, 8
+ call void @_ZN10embeddings17ScratchpadFactory6CreateEPNS_25SparsecoreMemoryAllocatorENS_11MemorySpaceENS_9BasicTypeEi(%"class.embeddings::ScratchpadArray"* sret(%"class.embeddings::ScratchpadArray") align 4 %41, %"class.embeddings::SparsecoreMemoryAllocator"* noundef %42, %"class.embeddings::MemorySpace"* noundef %13, %"class.embeddings::BasicType"* noundef %15, i32 noundef %50) #22
+ %51 = bitcast %"class.embeddings::SCTY_V8S32"* %16 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %51) #3
+ %52 = bitcast %"class.embeddings::SCM_TileSpmem"* %14 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %52) #3
+ ret void
+}
+
+; Function Attrs: alwaysinline mustprogress
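+; Sort(): runs eight radix passes (the <4> template argument suggests 4 bits per
+; pass over 32-bit keys). Each pass clears the histogram buckets, histograms the
+; keys, scans the buckets, ranks-and-permutes the key/value pairs, and then swaps
+; the source arrays (fields 2/3) with the destination arrays (fields 4/5).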
+define linkonce_odr dso_local void @_ZN10embeddings37RadixSortKeyValueTileSpmemToTileSpmemILi4EE4SortEv(%"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* noundef nonnull align 4 dereferenceable(88) %0) #5 comdat align 2 {
+ %2 = alloca %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"*, align 4
+ %3 = alloca i32, align 4
+ %4 = alloca %"class.embeddings::RadixSortIteration", align 4
+ %5 = alloca i32, align 4
+ %6 = alloca %"class.embeddings::TileSpmemVectorArray", align 4
+ %7 = alloca %"class.embeddings::TileSpmemVectorArray", align 4
+ %8 = alloca %"class.embeddings::TileSpmemVectorArray", align 4
+ %9 = alloca %"class.embeddings::ScratchpadArray", align 4
+ %10 = alloca %"class.embeddings::ScratchpadArray", align 4
+ store %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* %0, %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"** %2, align 4, !tbaa !7
+ %11 = load %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"*, %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"** %2, align 4
+ %12 = bitcast i32* %3 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %12) #3
+ store i32 8, i32* %3, align 4, !tbaa !3
+ %13 = bitcast %"class.embeddings::RadixSortIteration"* %4 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 8, i8* %13) #3
+ %14 = getelementptr inbounds %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem", %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* %11, i32 0, i32 6
+ call void @_ZN10embeddings18RadixSortIterationIiLi4EEC2EPNS_15ScratchpadArrayE(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %4, %"class.embeddings::ScratchpadArray"* noundef %14) #22
+ %15 = bitcast i32* %5 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %15) #3
+ store i32 0, i32* %5, align 4, !tbaa !3
+ br label %16
+
+16: ; preds = %55, %1
+ %17 = load i32, i32* %5, align 4, !tbaa !3
+ %18 = load i32, i32* %3, align 4, !tbaa !3
+ %19 = icmp slt i32 %17, %18
+ br i1 %19, label %22, label %20
+
+20: ; preds = %16
+ %21 = bitcast i32* %5 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %21) #3
+ br label %58
+
+22: ; preds = %16
+ call void @_ZN10embeddings18RadixSortIterationIiLi4EE12ClearBucketsEv(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %4) #22
+ %23 = load i32, i32* %5, align 4, !tbaa !3
+ %24 = getelementptr inbounds %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem", %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* %11, i32 0, i32 2
+ %25 = call noundef %"class.embeddings::TileSpmemVectorArray"* @_ZN10embeddings4CastINS_20TileSpmemVectorArrayIiEENS_15ScratchpadArrayEEENS_15cast_retty_implIT_T0_E8ret_typeERKS6_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %24) #22
+ call void @_ZN10embeddings20TileSpmemVectorArrayIiEC2ERKS1_(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %6, %"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %25) #22
+ call void @_ZN10embeddings18RadixSortIterationIiLi4EE13HistogramKeysEiNS_20TileSpmemVectorArrayIiEE(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %4, i32 noundef %23, %"class.embeddings::TileSpmemVectorArray"* noundef %6) #22
+ call void @_ZN10embeddings18RadixSortIterationIiLi4EE11ScanBucketsEv(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %4) #22
+ %26 = load i32, i32* %5, align 4, !tbaa !3
+ %27 = getelementptr inbounds %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem", %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* %11, i32 0, i32 2
+ %28 = call noundef %"class.embeddings::TileSpmemVectorArray"* @_ZN10embeddings4CastINS_20TileSpmemVectorArrayIiEENS_15ScratchpadArrayEEENS_15cast_retty_implIT_T0_E8ret_typeERKS6_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %27) #22
+ call void @_ZN10embeddings20TileSpmemVectorArrayIiEC2ERKS1_(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %7, %"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %28) #22
+ %29 = getelementptr inbounds %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem", %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* %11, i32 0, i32 3
+ %30 = call noundef %"class.embeddings::TileSpmemVectorArray"* @_ZN10embeddings4CastINS_20TileSpmemVectorArrayIiEENS_15ScratchpadArrayEEENS_15cast_retty_implIT_T0_E8ret_typeERKS6_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %29) #22
+ call void @_ZN10embeddings20TileSpmemVectorArrayIiEC2ERKS1_(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %8, %"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %30) #22
+ %31 = getelementptr inbounds %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem", %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* %11, i32 0, i32 4
+ %32 = call noundef %"class.embeddings::TileSpmemVectorArray"* @_ZN10embeddings4CastINS_20TileSpmemVectorArrayIiEENS_15ScratchpadArrayEEENS_15cast_retty_implIT_T0_E8ret_typeERKS6_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %31) #22
+ %33 = getelementptr inbounds %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem", %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* %11, i32 0, i32 5
+ %34 = call noundef %"class.embeddings::TileSpmemVectorArray"* @_ZN10embeddings4CastINS_20TileSpmemVectorArrayIiEENS_15ScratchpadArrayEEENS_15cast_retty_implIT_T0_E8ret_typeERKS6_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %33) #22
+ call void @_ZN10embeddings18RadixSortIterationIiLi4EE14RankAndPermuteIiEEvibNS_20TileSpmemVectorArrayIiEENS3_IT_EEPS4_PS6_(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %4, i32 noundef %26, i1 noundef zeroext false, %"class.embeddings::TileSpmemVectorArray"* noundef %7, %"class.embeddings::TileSpmemVectorArray"* noundef %8, %"class.embeddings::TileSpmemVectorArray"* noundef %32, %"class.embeddings::TileSpmemVectorArray"* noundef %34) #22
+ %35 = bitcast %"class.embeddings::ScratchpadArray"* %9 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %35) #3
+ %36 = getelementptr inbounds %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem", %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* %11, i32 0, i32 2
+ call void @_ZN10embeddings15ScratchpadArrayC2ERKS0_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %9, %"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %36) #22
+ %37 = bitcast %"class.embeddings::ScratchpadArray"* %10 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %37) #3
+ %38 = getelementptr inbounds %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem", %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* %11, i32 0, i32 3
+ call void @_ZN10embeddings15ScratchpadArrayC2ERKS0_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %10, %"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %38) #22
+ %39 = getelementptr inbounds %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem", %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* %11, i32 0, i32 4
+ %40 = getelementptr inbounds %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem", %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* %11, i32 0, i32 2
+ %41 = bitcast %"class.embeddings::ScratchpadArray"* %40 to i8*
+ %42 = bitcast %"class.embeddings::ScratchpadArray"* %39 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %41, i8* align 4 %42, i32 16, i1 false)
+ %43 = getelementptr inbounds %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem", %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* %11, i32 0, i32 5
+ %44 = getelementptr inbounds %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem", %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* %11, i32 0, i32 3
+ %45 = bitcast %"class.embeddings::ScratchpadArray"* %44 to i8*
+ %46 = bitcast %"class.embeddings::ScratchpadArray"* %43 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %45, i8* align 4 %46, i32 16, i1 false)
+ %47 = getelementptr inbounds %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem", %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* %11, i32 0, i32 4
+ %48 = bitcast %"class.embeddings::ScratchpadArray"* %47 to i8*
+ %49 = bitcast %"class.embeddings::ScratchpadArray"* %9 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %48, i8* align 4 %49, i32 16, i1 false)
+ %50 = getelementptr inbounds %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem", %"class.embeddings::RadixSortKeyValueTileSpmemToTileSpmem"* %11, i32 0, i32 5
+ %51 = bitcast %"class.embeddings::ScratchpadArray"* %50 to i8*
+ %52 = bitcast %"class.embeddings::ScratchpadArray"* %10 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %51, i8* align 4 %52, i32 16, i1 false)
+ %53 = bitcast %"class.embeddings::ScratchpadArray"* %10 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %53) #3
+ %54 = bitcast %"class.embeddings::ScratchpadArray"* %9 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %54) #3
+ br label %55
+
+55: ; preds = %22
+ %56 = load i32, i32* %5, align 4, !tbaa !3
+ %57 = add nsw i32 %56, 1
+ store i32 %57, i32* %5, align 4, !tbaa !3
+ br label %16, !llvm.loop !19
+
+58: ; preds = %20
+ %59 = bitcast %"class.embeddings::RadixSortIteration"* %4 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 8, i8* %59) #3
+ %60 = bitcast i32* %3 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %60) #3
+ ret void
+}
+
+; Function Attrs: mustprogress nounwind
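+; (anonymous namespace)::Return(value, offset): writes value to the raw address
+; 256 + offset; presumably a fixed mailbox region used to hand results back to
+; the runtime.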
+define internal void @_ZN12_GLOBAL__N_16ReturnEii(i32 noundef %0, i32 noundef %1) #6 {
+ %3 = alloca i32, align 4
+ %4 = alloca i32, align 4
+ %5 = alloca i32*, align 4
+ store i32 %0, i32* %3, align 4, !tbaa !3
+ store i32 %1, i32* %4, align 4, !tbaa !3
+ %6 = bitcast i32** %5 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #3
+ %7 = load i32, i32* %4, align 4, !tbaa !3
+ %8 = add nsw i32 256, %7
+ %9 = inttoptr i32 %8 to i32*
+ store i32* %9, i32** %5, align 4, !tbaa !7
+ %10 = load i32, i32* %3, align 4, !tbaa !3
+ %11 = load i32*, i32** %5, align 4, !tbaa !7
+ store i32 %10, i32* %11, align 4, !tbaa !3
+ %12 = bitcast i32** %5 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %12) #3
+ ret void
+}
+
+; Function Attrs: argmemonly nocallback nofree nosync nounwind willreturn
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #2
+
+; Function Attrs: mustprogress nounwind
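+; scs(): empty stub emitted into its own ".text.scs" section.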
+define dso_local void @scs() #8 section ".text.scs" {
+ ret void
+}
+
+; Function Attrs: alwaysinline nounwind
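+; StaticAllocator(int, int): stores the two arguments (presumably the base
+; offset and the size of the statically managed region) into its two fields.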
+define linkonce_odr dso_local void @_ZN10embeddings4impl15StaticAllocatorC2Eii(%"class.embeddings::impl::StaticAllocator"* noundef nonnull align 4 dereferenceable(8) %0, i32 noundef %1, i32 noundef %2) unnamed_addr #9 comdat align 2 {
+ %4 = alloca %"class.embeddings::impl::StaticAllocator"*, align 4
+ %5 = alloca i32, align 4
+ %6 = alloca i32, align 4
+ store %"class.embeddings::impl::StaticAllocator"* %0, %"class.embeddings::impl::StaticAllocator"** %4, align 4, !tbaa !7
+ store i32 %1, i32* %5, align 4, !tbaa !3
+ store i32 %2, i32* %6, align 4, !tbaa !3
+ %7 = load %"class.embeddings::impl::StaticAllocator"*, %"class.embeddings::impl::StaticAllocator"** %4, align 4
+ %8 = getelementptr inbounds %"class.embeddings::impl::StaticAllocator", %"class.embeddings::impl::StaticAllocator"* %7, i32 0, i32 0
+ %9 = load i32, i32* %5, align 4, !tbaa !3
+ store i32 %9, i32* %8, align 4, !tbaa !21
+ %10 = getelementptr inbounds %"class.embeddings::impl::StaticAllocator", %"class.embeddings::impl::StaticAllocator"* %7, i32 0, i32 1
+ %11 = load i32, i32* %6, align 4, !tbaa !3
+ store i32 %11, i32* %10, align 4, !tbaa !22
+ ret void
+}
+
+; Function Attrs: alwaysinline nounwind
+define linkonce_odr dso_local void @_ZN10embeddings11MemorySpaceC2ERKS0_(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %0, %"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %1) unnamed_addr #9 comdat align 2 {
+ %3 = alloca %"class.embeddings::MemorySpace"*, align 4
+ %4 = alloca %"class.embeddings::MemorySpace"*, align 4
+ store %"class.embeddings::MemorySpace"* %0, %"class.embeddings::MemorySpace"** %3, align 4, !tbaa !7
+ store %"class.embeddings::MemorySpace"* %1, %"class.embeddings::MemorySpace"** %4, align 4, !tbaa !7
+ %5 = load %"class.embeddings::MemorySpace"*, %"class.embeddings::MemorySpace"** %3, align 4
+ %6 = load %"class.embeddings::MemorySpace"*, %"class.embeddings::MemorySpace"** %4, align 4, !tbaa !7
+ %7 = getelementptr inbounds %"class.embeddings::MemorySpace", %"class.embeddings::MemorySpace"* %6, i32 0, i32 0
+ %8 = load i32, i32* %7, align 4, !tbaa !23
+ %9 = getelementptr inbounds %"class.embeddings::MemorySpace", %"class.embeddings::MemorySpace"* %5, i32 0, i32 0
+ store i32 %8, i32* %9, align 4, !tbaa !23
+ ret void
+}
+
+; Function Attrs: alwaysinline nounwind
+define linkonce_odr dso_local void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %0, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %1) unnamed_addr #9 comdat align 2 {
+ %3 = alloca %"class.embeddings::BasicType"*, align 4
+ %4 = alloca %"class.embeddings::BasicType"*, align 4
+ store %"class.embeddings::BasicType"* %0, %"class.embeddings::BasicType"** %3, align 4, !tbaa !7
+ store %"class.embeddings::BasicType"* %1, %"class.embeddings::BasicType"** %4, align 4, !tbaa !7
+ %5 = load %"class.embeddings::BasicType"*, %"class.embeddings::BasicType"** %3, align 4
+ %6 = load %"class.embeddings::BasicType"*, %"class.embeddings::BasicType"** %4, align 4, !tbaa !7
+ %7 = getelementptr inbounds %"class.embeddings::BasicType", %"class.embeddings::BasicType"* %6, i32 0, i32 0
+ %8 = load i32, i32* %7, align 4, !tbaa !26
+ %9 = getelementptr inbounds %"class.embeddings::BasicType", %"class.embeddings::BasicType"* %5, i32 0, i32 0
+ store i32 %8, i32* %9, align 4, !tbaa !26
+ ret void
+}
+
+; Function Attrs: argmemonly nofree nounwind willreturn
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg) #10
+
+; Function Attrs: inlinehint
+define linkonce_odr dso_local void @_ZN10embeddings9BaseArrayC2ERKS0_(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %1) unnamed_addr #7 comdat align 2 {
+ %3 = alloca %"class.embeddings::BaseArray"*, align 4
+ %4 = alloca %"class.embeddings::BaseArray"*, align 4
+ store %"class.embeddings::BaseArray"* %0, %"class.embeddings::BaseArray"** %3, align 4, !tbaa !7
+ store %"class.embeddings::BaseArray"* %1, %"class.embeddings::BaseArray"** %4, align 4, !tbaa !7
+ %5 = load %"class.embeddings::BaseArray"*, %"class.embeddings::BaseArray"** %3, align 4
+ %6 = getelementptr inbounds %"class.embeddings::BaseArray", %"class.embeddings::BaseArray"* %5, i32 0, i32 0
+ %7 = load %"class.embeddings::BaseArray"*, %"class.embeddings::BaseArray"** %4, align 4, !tbaa !7
+ %8 = getelementptr inbounds %"class.embeddings::BaseArray", %"class.embeddings::BaseArray"* %7, i32 0, i32 0
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %6, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %8) #22
+ %9 = getelementptr inbounds %"class.embeddings::BaseArray", %"class.embeddings::BaseArray"* %5, i32 0, i32 1
+ %10 = load %"class.embeddings::BaseArray"*, %"class.embeddings::BaseArray"** %4, align 4, !tbaa !7
+ %11 = getelementptr inbounds %"class.embeddings::BaseArray", %"class.embeddings::BaseArray"* %10, i32 0, i32 1
+ %12 = load i32, i32* %11, align 4, !tbaa !29
+ store i32 %12, i32* %9, align 4, !tbaa !29
+ ret void
+}
+
+; Function Attrs: alwaysinline
+define linkonce_odr dso_local void @_ZN10embeddings15ScratchpadArrayC2ENS_11PointerBaseEi(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::PointerBase"* noundef %1, i32 noundef %2) unnamed_addr #4 comdat align 2 {
+ %4 = alloca %"class.embeddings::ScratchpadArray"*, align 4
+ %5 = alloca i32, align 4
+ %6 = alloca %"class.embeddings::PointerBase", align 4
+ store %"class.embeddings::ScratchpadArray"* %0, %"class.embeddings::ScratchpadArray"** %4, align 4, !tbaa !7
+ store i32 %2, i32* %5, align 4, !tbaa !3
+ %7 = load %"class.embeddings::ScratchpadArray"*, %"class.embeddings::ScratchpadArray"** %4, align 4
+ %8 = bitcast %"class.embeddings::ScratchpadArray"* %7 to %"class.embeddings::BaseArray"*
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %6, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %1) #22
+ %9 = load i32, i32* %5, align 4, !tbaa !3
+ call void @_ZN10embeddings9BaseArrayC2ENS_11PointerBaseEi(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %8, %"class.embeddings::PointerBase"* noundef %6, i32 noundef %9) #22
+ ret void
+}
+
+; Function Attrs: alwaysinline
+define linkonce_odr dso_local void @_ZN10embeddings9BaseArrayC2ENS_11PointerBaseEi(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::PointerBase"* noundef %1, i32 noundef %2) unnamed_addr #4 comdat align 2 {
+ %4 = alloca %"class.embeddings::BaseArray"*, align 4
+ %5 = alloca i32, align 4
+ store %"class.embeddings::BaseArray"* %0, %"class.embeddings::BaseArray"** %4, align 4, !tbaa !7
+ store i32 %2, i32* %5, align 4, !tbaa !3
+ %6 = load %"class.embeddings::BaseArray"*, %"class.embeddings::BaseArray"** %4, align 4
+ %7 = getelementptr inbounds %"class.embeddings::BaseArray", %"class.embeddings::BaseArray"* %6, i32 0, i32 0
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %7, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %1) #22
+ %8 = getelementptr inbounds %"class.embeddings::BaseArray", %"class.embeddings::BaseArray"* %6, i32 0, i32 1
+ %9 = load i32, i32* %5, align 4, !tbaa !3
+ store i32 %9, i32* %8, align 4, !tbaa !29
+ ret void
+}
+
+; Function Attrs: alwaysinline mustprogress
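+; ToBasicType<v8s32>::basic_type(): returns the BasicType of a default-constructed
+; SCTY_V8S32, i.e. the 8-lane s32 vector type.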
+define linkonce_odr dso_local void @_ZN10embeddings11ToBasicTypeIDv8_iE10basic_typeEv(%"class.embeddings::BasicType"* noalias sret(%"class.embeddings::BasicType") align 4 %0) #5 comdat align 2 {
+ %2 = alloca i8*, align 4
+ %3 = alloca %"class.embeddings::SCTY_V8S32", align 4
+ %4 = bitcast %"class.embeddings::BasicType"* %0 to i8*
+ store i8* %4, i8** %2, align 4
+ %5 = bitcast %"class.embeddings::SCTY_V8S32"* %3 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #3
+ call void @_ZN10embeddings10SCTY_V8S32C2Ev(%"class.embeddings::SCTY_V8S32"* noundef nonnull align 4 dereferenceable(4) %3) #22
+ %6 = bitcast %"class.embeddings::SCTY_V8S32"* %3 to %"class.embeddings::BasicType"*
+ call void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %0, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %6) #22
+ %7 = bitcast %"class.embeddings::SCTY_V8S32"* %3 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %7) #3
+ ret void
+}
+
+; Function Attrs: alwaysinline
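+; TileSpmemPointer(void addrspace(201)*, BasicType): forwards the type and the
+; tile-SPMEM address to the PointerBase(BasicType, addrspace(201)*) constructor.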
+define linkonce_odr dso_local void @_ZN10embeddings16TileSpmemPointerC2EPU5AS201vNS_9BasicTypeE(%"class.embeddings::TileSpmemPointer"* noundef nonnull align 4 dereferenceable(12) %0, i8 addrspace(201)* noundef %1, %"class.embeddings::BasicType"* noundef %2) unnamed_addr #4 comdat align 2 {
+ %4 = alloca %"class.embeddings::TileSpmemPointer"*, align 4
+ %5 = alloca i8 addrspace(201)*, align 4
+ %6 = alloca %"class.embeddings::BasicType", align 4
+ store %"class.embeddings::TileSpmemPointer"* %0, %"class.embeddings::TileSpmemPointer"** %4, align 4, !tbaa !7
+ store i8 addrspace(201)* %1, i8 addrspace(201)** %5, align 4, !tbaa !7
+ %7 = load %"class.embeddings::TileSpmemPointer"*, %"class.embeddings::TileSpmemPointer"** %4, align 4
+ %8 = bitcast %"class.embeddings::TileSpmemPointer"* %7 to %"class.embeddings::PointerBase"*
+ call void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %6, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %2) #22
+ %9 = load i8 addrspace(201)*, i8 addrspace(201)** %5, align 4, !tbaa !7
+ call void @_ZN10embeddings11PointerBaseC2ENS_9BasicTypeEPU5AS201v(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %8, %"class.embeddings::BasicType"* noundef %6, i8 addrspace(201)* noundef %9) #22
+ ret void
+}
+
+; Function Attrs: inlinehint
+define linkonce_odr dso_local void @_ZN10embeddings11PointerBaseC2EOS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %0, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %1) unnamed_addr #7 comdat align 2 {
+ %3 = alloca %"class.embeddings::PointerBase"*, align 4
+ %4 = alloca %"class.embeddings::PointerBase"*, align 4
+ store %"class.embeddings::PointerBase"* %0, %"class.embeddings::PointerBase"** %3, align 4, !tbaa !7
+ store %"class.embeddings::PointerBase"* %1, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !7
+ %5 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %3, align 4
+ %6 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %5, i32 0, i32 0
+ %7 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !7
+ %8 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %7, i32 0, i32 0
+ call void @_ZN10embeddings11MemorySpaceC2ERKS0_(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %6, %"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %8) #22
+ %9 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %5, i32 0, i32 1
+ %10 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !7
+ %11 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %10, i32 0, i32 1
+ call void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %9, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %11) #22
+ %12 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %5, i32 0, i32 2
+ %13 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !7
+ %14 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %13, i32 0, i32 2
+ %15 = bitcast %"union.embeddings::PointerBase::AnyPtr"* %12 to i8*
+ %16 = bitcast %"union.embeddings::PointerBase::AnyPtr"* %14 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %15, i8* align 4 %16, i32 4, i1 false), !tbaa.struct !14
+ ret void
+}
+
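+; SCTY_V8S32(): tags the BasicType base with SparsecoreBasicType value 3, which
+; the class name indicates is the 8-lane s32 vector type.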
+define linkonce_odr dso_local void @_ZN10embeddings10SCTY_V8S32C2Ev(%"class.embeddings::SCTY_V8S32"* noundef nonnull align 4 dereferenceable(4) %0) unnamed_addr #11 comdat align 2 {
+ %2 = alloca %"class.embeddings::SCTY_V8S32"*, align 4
+ store %"class.embeddings::SCTY_V8S32"* %0, %"class.embeddings::SCTY_V8S32"** %2, align 4, !tbaa !7
+ %3 = load %"class.embeddings::SCTY_V8S32"*, %"class.embeddings::SCTY_V8S32"** %2, align 4
+ %4 = bitcast %"class.embeddings::SCTY_V8S32"* %3 to %"class.embeddings::BasicType"*
+ call void @_ZN10embeddings9BasicTypeC2ENS_19SparsecoreBasicTypeE(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %4, i32 noundef 3) #22
+ ret void
+}
+
+; Function Attrs: alwaysinline nounwind
+define linkonce_odr dso_local void @_ZN10embeddings9BasicTypeC2ENS_19SparsecoreBasicTypeE(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %0, i32 noundef %1) unnamed_addr #9 comdat align 2 {
+ %3 = alloca %"class.embeddings::BasicType"*, align 4
+ %4 = alloca i32, align 4
+ store %"class.embeddings::BasicType"* %0, %"class.embeddings::BasicType"** %3, align 4, !tbaa !7
+ store i32 %1, i32* %4, align 4, !tbaa !32
+ %5 = load %"class.embeddings::BasicType"*, %"class.embeddings::BasicType"** %3, align 4
+ %6 = getelementptr inbounds %"class.embeddings::BasicType", %"class.embeddings::BasicType"* %5, i32 0, i32 0
+ %7 = load i32, i32* %4, align 4, !tbaa !32
+ store i32 %7, i32* %6, align 4, !tbaa !26
+ ret void
+}
+
+; Function Attrs: alwaysinline
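+; PointerBase(BasicType, void addrspace(201)*): sets the memory space from a
+; temporary SCM_TileSpmem, copies the element type, and stores the tile-SPMEM
+; address into the AnyPtr union.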
+define linkonce_odr dso_local void @_ZN10embeddings11PointerBaseC2ENS_9BasicTypeEPU5AS201v(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %0, %"class.embeddings::BasicType"* noundef %1, i8 addrspace(201)* noundef %2) unnamed_addr #4 comdat align 2 {
+ %4 = alloca %"class.embeddings::PointerBase"*, align 4
+ %5 = alloca i8 addrspace(201)*, align 4
+ %6 = alloca %"class.embeddings::SCM_TileSpmem", align 4
+ store %"class.embeddings::PointerBase"* %0, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !7
+ store i8 addrspace(201)* %2, i8 addrspace(201)** %5, align 4, !tbaa !7
+ %7 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %4, align 4
+ %8 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %7, i32 0, i32 0
+ %9 = bitcast %"class.embeddings::SCM_TileSpmem"* %6 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %9) #3
+ call void @_ZN10embeddings13SCM_TileSpmemC2Ev(%"class.embeddings::SCM_TileSpmem"* noundef nonnull align 4 dereferenceable(4) %6) #22
+ %10 = bitcast %"class.embeddings::SCM_TileSpmem"* %6 to %"class.embeddings::MemorySpace"*
+ call void @_ZN10embeddings11MemorySpaceC2ERKS0_(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %8, %"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %10) #22
+ %11 = bitcast %"class.embeddings::SCM_TileSpmem"* %6 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %11) #3
+ %12 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %7, i32 0, i32 1
+ call void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %12, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %1) #22
+ %13 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %7, i32 0, i32 2
+ %14 = load i8 addrspace(201)*, i8 addrspace(201)** %5, align 4, !tbaa !7
+ call void @_ZN10embeddings11PointerBase6AnyPtrC2EPU5AS201v(%"union.embeddings::PointerBase::AnyPtr"* noundef nonnull align 4 dereferenceable(4) %13, i8 addrspace(201)* noundef %14) #22
+ ret void
+}
+
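+; SCM_TileSpmem(): initializes the MemorySpace base with SparsecoreMemorySpace
+; value 1 (tile SPMEM).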
+define linkonce_odr dso_local void @_ZN10embeddings13SCM_TileSpmemC2Ev(%"class.embeddings::SCM_TileSpmem"* noundef nonnull align 4 dereferenceable(4) %0) unnamed_addr #11 comdat align 2 {
+ %2 = alloca %"class.embeddings::SCM_TileSpmem"*, align 4
+ store %"class.embeddings::SCM_TileSpmem"* %0, %"class.embeddings::SCM_TileSpmem"** %2, align 4, !tbaa !7
+ %3 = load %"class.embeddings::SCM_TileSpmem"*, %"class.embeddings::SCM_TileSpmem"** %2, align 4
+ %4 = bitcast %"class.embeddings::SCM_TileSpmem"* %3 to %"class.embeddings::MemorySpace"*
+ call void @_ZN10embeddings11MemorySpaceC2ENS_21SparsecoreMemorySpaceE(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %4, i32 noundef 1) #22
+ ret void
+}
+
+; Function Attrs: nounwind
+define linkonce_odr dso_local void @_ZN10embeddings11PointerBase6AnyPtrC2EPU5AS201v(%"union.embeddings::PointerBase::AnyPtr"* noundef nonnull align 4 dereferenceable(4) %0, i8 addrspace(201)* noundef %1) unnamed_addr #12 comdat align 2 {
+ %3 = alloca %"union.embeddings::PointerBase::AnyPtr"*, align 4
+ %4 = alloca i8 addrspace(201)*, align 4
+ store %"union.embeddings::PointerBase::AnyPtr"* %0, %"union.embeddings::PointerBase::AnyPtr"** %3, align 4, !tbaa !7
+ store i8 addrspace(201)* %1, i8 addrspace(201)** %4, align 4, !tbaa !7
+ %5 = load %"union.embeddings::PointerBase::AnyPtr"*, %"union.embeddings::PointerBase::AnyPtr"** %3, align 4
+ %6 = bitcast %"union.embeddings::PointerBase::AnyPtr"* %5 to i32 addrspace(201)**
+ %7 = load i8 addrspace(201)*, i8 addrspace(201)** %4, align 4, !tbaa !7
+ %8 = bitcast i8 addrspace(201)* %7 to i32 addrspace(201)*
+ store i32 addrspace(201)* %8, i32 addrspace(201)** %6, align 4, !tbaa !33
+ ret void
+}
+
+; Function Attrs: alwaysinline nounwind
+define linkonce_odr dso_local void @_ZN10embeddings11MemorySpaceC2ENS_21SparsecoreMemorySpaceE(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %0, i32 noundef %1) unnamed_addr #9 comdat align 2 {
+ %3 = alloca %"class.embeddings::MemorySpace"*, align 4
+ %4 = alloca i32, align 4
+ store %"class.embeddings::MemorySpace"* %0, %"class.embeddings::MemorySpace"** %3, align 4, !tbaa !7
+ store i32 %1, i32* %4, align 4, !tbaa !34
+ %5 = load %"class.embeddings::MemorySpace"*, %"class.embeddings::MemorySpace"** %3, align 4
+ %6 = getelementptr inbounds %"class.embeddings::MemorySpace", %"class.embeddings::MemorySpace"* %5, i32 0, i32 0
+ %7 = load i32, i32* %4, align 4, !tbaa !34
+ store i32 %7, i32* %6, align 4, !tbaa !23
+ ret void
+}
+
+; Function Attrs: inlinehint
+define linkonce_odr dso_local void @_ZN10embeddings15ScratchpadArrayC2ERKS0_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %1) unnamed_addr #7 comdat align 2 {
+ %3 = alloca %"class.embeddings::ScratchpadArray"*, align 4
+ %4 = alloca %"class.embeddings::ScratchpadArray"*, align 4
+ store %"class.embeddings::ScratchpadArray"* %0, %"class.embeddings::ScratchpadArray"** %3, align 4, !tbaa !7
+ store %"class.embeddings::ScratchpadArray"* %1, %"class.embeddings::ScratchpadArray"** %4, align 4, !tbaa !7
+ %5 = load %"class.embeddings::ScratchpadArray"*, %"class.embeddings::ScratchpadArray"** %3, align 4
+ %6 = bitcast %"class.embeddings::ScratchpadArray"* %5 to %"class.embeddings::BaseArray"*
+ %7 = load %"class.embeddings::ScratchpadArray"*, %"class.embeddings::ScratchpadArray"** %4, align 4, !tbaa !7
+ %8 = bitcast %"class.embeddings::ScratchpadArray"* %7 to %"class.embeddings::BaseArray"*
+ call void @_ZN10embeddings9BaseArrayC2ERKS0_(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %6, %"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %8) #22
+ ret void
+}
+
+; Function Attrs: alwaysinline mustprogress
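+; ScratchpadFactory::Create(allocator, space, type, count): converts the element
+; count into words (SizeInBytes(type) * count / WordSizeInBytes(space)), then
+; allocates the backing store. Memory space 0 uses AllocateSmem and wraps the
+; result in SmemArray<int/unsigned/float>; memory space 1 uses AllocateTileSpmem
+; and wraps it in TileSpmemVectorArray<int/unsigned/float>. Any other space/type
+; combination falls through to a null, zero-length SmemArray.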
+define linkonce_odr dso_local void @_ZN10embeddings17ScratchpadFactory6CreateEPNS_25SparsecoreMemoryAllocatorENS_11MemorySpaceENS_9BasicTypeEi(%"class.embeddings::ScratchpadArray"* noalias sret(%"class.embeddings::ScratchpadArray") align 4 %0, %"class.embeddings::SparsecoreMemoryAllocator"* noundef %1, %"class.embeddings::MemorySpace"* noundef %2, %"class.embeddings::BasicType"* noundef %3, i32 noundef %4) #5 comdat align 2 {
+ %6 = alloca i8*, align 4
+ %7 = alloca %"class.embeddings::SparsecoreMemoryAllocator"*, align 4
+ %8 = alloca i32, align 4
+ %9 = alloca i32, align 4
+ %10 = alloca %"class.embeddings::SmemPointer", align 4
+ %11 = alloca %"class.embeddings::BasicType", align 4
+ %12 = alloca %"class.embeddings::SmemArray", align 4
+ %13 = alloca %"class.embeddings::PointerBase", align 4
+ %14 = alloca i32, align 4
+ %15 = alloca %"class.embeddings::SmemPointer", align 4
+ %16 = alloca %"class.embeddings::BasicType", align 4
+ %17 = alloca %"class.embeddings::SmemArray.0", align 4
+ %18 = alloca %"class.embeddings::PointerBase", align 4
+ %19 = alloca %"class.embeddings::SmemPointer", align 4
+ %20 = alloca %"class.embeddings::BasicType", align 4
+ %21 = alloca %"class.embeddings::SmemArray.1", align 4
+ %22 = alloca %"class.embeddings::PointerBase", align 4
+ %23 = alloca %"class.embeddings::TileSpmemPointer", align 4
+ %24 = alloca %"class.embeddings::BasicType", align 4
+ %25 = alloca %"class.embeddings::TileSpmemVectorArray", align 4
+ %26 = alloca %"class.embeddings::PointerBase", align 4
+ %27 = alloca %"class.embeddings::TileSpmemPointer", align 4
+ %28 = alloca %"class.embeddings::BasicType", align 4
+ %29 = alloca %"class.embeddings::TileSpmemVectorArray.2", align 4
+ %30 = alloca %"class.embeddings::PointerBase", align 4
+ %31 = alloca %"class.embeddings::TileSpmemPointer", align 4
+ %32 = alloca %"class.embeddings::BasicType", align 4
+ %33 = alloca %"class.embeddings::TileSpmemVectorArray.3", align 4
+ %34 = alloca %"class.embeddings::PointerBase", align 4
+ %35 = alloca %"class.embeddings::SmemPointer", align 4
+ %36 = alloca %"class.embeddings::BasicType", align 4
+ %37 = alloca %"class.embeddings::SmemArray", align 4
+ %38 = alloca %"class.embeddings::PointerBase", align 4
+ %39 = bitcast %"class.embeddings::ScratchpadArray"* %0 to i8*
+ store i8* %39, i8** %6, align 4
+ store %"class.embeddings::SparsecoreMemoryAllocator"* %1, %"class.embeddings::SparsecoreMemoryAllocator"** %7, align 4, !tbaa !7
+ store i32 %4, i32* %8, align 4, !tbaa !3
+ %40 = bitcast i32* %9 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %40) #3
+ %41 = call noundef i32 @_ZNK10embeddings9BasicType11SizeInBytesEv(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %3) #22
+ %42 = load i32, i32* %8, align 4, !tbaa !3
+ %43 = mul nsw i32 %41, %42
+ %44 = call noundef i32 @_ZNK10embeddings11MemorySpace15WordSizeInBytesEv(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %2) #22
+ %45 = sdiv i32 %43, %44
+ store i32 %45, i32* %9, align 4, !tbaa !3
+ %46 = call noundef i32 @_ZNK10embeddings11MemorySpace12memory_spaceEv(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %2) #22
+ %47 = icmp eq i32 %46, 0
+ br i1 %47, label %48, label %91
+
+48: ; preds = %5
+ %49 = call noundef i32 @_ZNK10embeddings9BasicType4typeEv(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %3) #22
+ switch i32 %49, label %90 [
+ i32 0, label %50
+ i32 1, label %63
+ i32 2, label %76
+ i32 3, label %89
+ i32 4, label %89
+ i32 5, label %89
+ ]
+
+50: ; preds = %48
+ %51 = bitcast %"class.embeddings::SmemPointer"* %10 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 12, i8* %51) #3
+ %52 = load %"class.embeddings::SparsecoreMemoryAllocator"*, %"class.embeddings::SparsecoreMemoryAllocator"** %7, align 4, !tbaa !7
+ %53 = load i32, i32* %9, align 4, !tbaa !3
+ %54 = call noundef i8* @_ZN10embeddings25SparsecoreMemoryAllocator12AllocateSmemEi(%"class.embeddings::SparsecoreMemoryAllocator"* noundef nonnull align 4 dereferenceable(36) %52, i32 noundef %53) #22
+ %55 = bitcast i8* %54 to i32*
+ %56 = bitcast i32* %55 to i8*
+ call void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %11, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %3) #22
+ call void @_ZN10embeddings11SmemPointerC2EPvNS_9BasicTypeE(%"class.embeddings::SmemPointer"* noundef nonnull align 4 dereferenceable(12) %10, i8* noundef %56, %"class.embeddings::BasicType"* noundef %11) #22
+ %57 = bitcast %"class.embeddings::SmemArray"* %12 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %57) #3
+ %58 = bitcast %"class.embeddings::SmemPointer"* %10 to %"class.embeddings::PointerBase"*
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %13, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %58) #22
+ %59 = load i32, i32* %8, align 4, !tbaa !3
+ call void @_ZN10embeddings9SmemArrayIiEC2ENS_11PointerBaseEi(%"class.embeddings::SmemArray"* noundef nonnull align 4 dereferenceable(16) %12, %"class.embeddings::PointerBase"* noundef %13, i32 noundef %59) #22
+ %60 = bitcast %"class.embeddings::SmemArray"* %12 to %"class.embeddings::ScratchpadArray"*
+ call void @_ZN10embeddings15ScratchpadArrayC2EOS0_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %60) #22
+ %61 = bitcast %"class.embeddings::SmemArray"* %12 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %61) #3
+ store i32 1, i32* %14, align 4
+ %62 = bitcast %"class.embeddings::SmemPointer"* %10 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 12, i8* %62) #3
+ br label %145
+
+63: ; preds = %48
+ %64 = bitcast %"class.embeddings::SmemPointer"* %15 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 12, i8* %64) #3
+ %65 = load %"class.embeddings::SparsecoreMemoryAllocator"*, %"class.embeddings::SparsecoreMemoryAllocator"** %7, align 4, !tbaa !7
+ %66 = load i32, i32* %9, align 4, !tbaa !3
+ %67 = call noundef i8* @_ZN10embeddings25SparsecoreMemoryAllocator12AllocateSmemEi(%"class.embeddings::SparsecoreMemoryAllocator"* noundef nonnull align 4 dereferenceable(36) %65, i32 noundef %66) #22
+ %68 = bitcast i8* %67 to i32*
+ %69 = bitcast i32* %68 to i8*
+ call void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %16, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %3) #22
+ call void @_ZN10embeddings11SmemPointerC2EPvNS_9BasicTypeE(%"class.embeddings::SmemPointer"* noundef nonnull align 4 dereferenceable(12) %15, i8* noundef %69, %"class.embeddings::BasicType"* noundef %16) #22
+ %70 = bitcast %"class.embeddings::SmemArray.0"* %17 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %70) #3
+ %71 = bitcast %"class.embeddings::SmemPointer"* %15 to %"class.embeddings::PointerBase"*
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %18, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %71) #22
+ %72 = load i32, i32* %8, align 4, !tbaa !3
+ call void @_ZN10embeddings9SmemArrayIjEC2ENS_11PointerBaseEi(%"class.embeddings::SmemArray.0"* noundef nonnull align 4 dereferenceable(16) %17, %"class.embeddings::PointerBase"* noundef %18, i32 noundef %72) #22
+ %73 = bitcast %"class.embeddings::SmemArray.0"* %17 to %"class.embeddings::ScratchpadArray"*
+ call void @_ZN10embeddings15ScratchpadArrayC2EOS0_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %73) #22
+ %74 = bitcast %"class.embeddings::SmemArray.0"* %17 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %74) #3
+ store i32 1, i32* %14, align 4
+ %75 = bitcast %"class.embeddings::SmemPointer"* %15 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 12, i8* %75) #3
+ br label %145
+
+76: ; preds = %48
+ %77 = bitcast %"class.embeddings::SmemPointer"* %19 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 12, i8* %77) #3
+ %78 = load %"class.embeddings::SparsecoreMemoryAllocator"*, %"class.embeddings::SparsecoreMemoryAllocator"** %7, align 4, !tbaa !7
+ %79 = load i32, i32* %9, align 4, !tbaa !3
+ %80 = call noundef i8* @_ZN10embeddings25SparsecoreMemoryAllocator12AllocateSmemEi(%"class.embeddings::SparsecoreMemoryAllocator"* noundef nonnull align 4 dereferenceable(36) %78, i32 noundef %79) #22
+ %81 = bitcast i8* %80 to float*
+ %82 = bitcast float* %81 to i8*
+ call void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %20, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %3) #22
+ call void @_ZN10embeddings11SmemPointerC2EPvNS_9BasicTypeE(%"class.embeddings::SmemPointer"* noundef nonnull align 4 dereferenceable(12) %19, i8* noundef %82, %"class.embeddings::BasicType"* noundef %20) #22
+ %83 = bitcast %"class.embeddings::SmemArray.1"* %21 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %83) #3
+ %84 = bitcast %"class.embeddings::SmemPointer"* %19 to %"class.embeddings::PointerBase"*
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %22, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %84) #22
+ %85 = load i32, i32* %8, align 4, !tbaa !3
+ call void @_ZN10embeddings9SmemArrayIfEC2ENS_11PointerBaseEi(%"class.embeddings::SmemArray.1"* noundef nonnull align 4 dereferenceable(16) %21, %"class.embeddings::PointerBase"* noundef %22, i32 noundef %85) #22
+ %86 = bitcast %"class.embeddings::SmemArray.1"* %21 to %"class.embeddings::ScratchpadArray"*
+ call void @_ZN10embeddings15ScratchpadArrayC2EOS0_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %86) #22
+ %87 = bitcast %"class.embeddings::SmemArray.1"* %21 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %87) #3
+ store i32 1, i32* %14, align 4
+ %88 = bitcast %"class.embeddings::SmemPointer"* %19 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 12, i8* %88) #3
+ br label %145
+
+89: ; preds = %48, %48, %48
+ br label %90
+
+90: ; preds = %48, %89
+ br label %138
+
+91: ; preds = %5
+ %92 = call noundef i32 @_ZNK10embeddings11MemorySpace12memory_spaceEv(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %2) #22
+ %93 = icmp eq i32 %92, 1
+ br i1 %93, label %94, label %137
+
+94: ; preds = %91
+ %95 = call noundef i32 @_ZNK10embeddings9BasicType4typeEv(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %3) #22
+ switch i32 %95, label %136 [
+ i32 0, label %96
+ i32 1, label %96
+ i32 2, label %96
+ i32 3, label %97
+ i32 4, label %110
+ i32 5, label %123
+ ]
+
+96: ; preds = %94, %94, %94
+ br label %136
+
+97: ; preds = %94
+ %98 = bitcast %"class.embeddings::TileSpmemPointer"* %23 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 12, i8* %98) #3
+ %99 = load %"class.embeddings::SparsecoreMemoryAllocator"*, %"class.embeddings::SparsecoreMemoryAllocator"** %7, align 4, !tbaa !7
+ %100 = load i32, i32* %9, align 4, !tbaa !3
+ %101 = call noundef i8 addrspace(201)* @_ZN10embeddings25SparsecoreMemoryAllocator17AllocateTileSpmemEi(%"class.embeddings::SparsecoreMemoryAllocator"* noundef nonnull align 4 dereferenceable(36) %99, i32 noundef %100) #22
+ %102 = bitcast i8 addrspace(201)* %101 to <8 x i32> addrspace(201)*
+ %103 = bitcast <8 x i32> addrspace(201)* %102 to i8 addrspace(201)*
+ call void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %24, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %3) #22
+ call void @_ZN10embeddings16TileSpmemPointerC2EPU5AS201vNS_9BasicTypeE(%"class.embeddings::TileSpmemPointer"* noundef nonnull align 4 dereferenceable(12) %23, i8 addrspace(201)* noundef %103, %"class.embeddings::BasicType"* noundef %24) #22
+ %104 = bitcast %"class.embeddings::TileSpmemVectorArray"* %25 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %104) #3
+ %105 = bitcast %"class.embeddings::TileSpmemPointer"* %23 to %"class.embeddings::PointerBase"*
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %26, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %105) #22
+ %106 = load i32, i32* %8, align 4, !tbaa !3
+ call void @_ZN10embeddings20TileSpmemVectorArrayIiEC2ENS_11PointerBaseEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %25, %"class.embeddings::PointerBase"* noundef %26, i32 noundef %106) #22
+ %107 = bitcast %"class.embeddings::TileSpmemVectorArray"* %25 to %"class.embeddings::ScratchpadArray"*
+ call void @_ZN10embeddings15ScratchpadArrayC2EOS0_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %107) #22
+ %108 = bitcast %"class.embeddings::TileSpmemVectorArray"* %25 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %108) #3
+ store i32 1, i32* %14, align 4
+ %109 = bitcast %"class.embeddings::TileSpmemPointer"* %23 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 12, i8* %109) #3
+ br label %145
+
+110: ; preds = %94
+ %111 = bitcast %"class.embeddings::TileSpmemPointer"* %27 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 12, i8* %111) #3
+ %112 = load %"class.embeddings::SparsecoreMemoryAllocator"*, %"class.embeddings::SparsecoreMemoryAllocator"** %7, align 4, !tbaa !7
+ %113 = load i32, i32* %9, align 4, !tbaa !3
+ %114 = call noundef i8 addrspace(201)* @_ZN10embeddings25SparsecoreMemoryAllocator17AllocateTileSpmemEi(%"class.embeddings::SparsecoreMemoryAllocator"* noundef nonnull align 4 dereferenceable(36) %112, i32 noundef %113) #22
+ %115 = bitcast i8 addrspace(201)* %114 to <8 x i32> addrspace(201)*
+ %116 = bitcast <8 x i32> addrspace(201)* %115 to i8 addrspace(201)*
+ call void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %28, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %3) #22
+ call void @_ZN10embeddings16TileSpmemPointerC2EPU5AS201vNS_9BasicTypeE(%"class.embeddings::TileSpmemPointer"* noundef nonnull align 4 dereferenceable(12) %27, i8 addrspace(201)* noundef %116, %"class.embeddings::BasicType"* noundef %28) #22
+ %117 = bitcast %"class.embeddings::TileSpmemVectorArray.2"* %29 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %117) #3
+ %118 = bitcast %"class.embeddings::TileSpmemPointer"* %27 to %"class.embeddings::PointerBase"*
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %30, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %118) #22
+ %119 = load i32, i32* %8, align 4, !tbaa !3
+ call void @_ZN10embeddings20TileSpmemVectorArrayIjEC2ENS_11PointerBaseEi(%"class.embeddings::TileSpmemVectorArray.2"* noundef nonnull align 4 dereferenceable(16) %29, %"class.embeddings::PointerBase"* noundef %30, i32 noundef %119) #22
+ %120 = bitcast %"class.embeddings::TileSpmemVectorArray.2"* %29 to %"class.embeddings::ScratchpadArray"*
+ call void @_ZN10embeddings15ScratchpadArrayC2EOS0_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %120) #22
+ %121 = bitcast %"class.embeddings::TileSpmemVectorArray.2"* %29 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %121) #3
+ store i32 1, i32* %14, align 4
+ %122 = bitcast %"class.embeddings::TileSpmemPointer"* %27 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 12, i8* %122) #3
+ br label %145
+
+123: ; preds = %94
+ %124 = bitcast %"class.embeddings::TileSpmemPointer"* %31 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 12, i8* %124) #3
+ %125 = load %"class.embeddings::SparsecoreMemoryAllocator"*, %"class.embeddings::SparsecoreMemoryAllocator"** %7, align 4, !tbaa !7
+ %126 = load i32, i32* %9, align 4, !tbaa !3
+ %127 = call noundef i8 addrspace(201)* @_ZN10embeddings25SparsecoreMemoryAllocator17AllocateTileSpmemEi(%"class.embeddings::SparsecoreMemoryAllocator"* noundef nonnull align 4 dereferenceable(36) %125, i32 noundef %126) #22
+ %128 = bitcast i8 addrspace(201)* %127 to <8 x float> addrspace(201)*
+ %129 = bitcast <8 x float> addrspace(201)* %128 to i8 addrspace(201)*
+ call void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %32, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %3) #22
+ call void @_ZN10embeddings16TileSpmemPointerC2EPU5AS201vNS_9BasicTypeE(%"class.embeddings::TileSpmemPointer"* noundef nonnull align 4 dereferenceable(12) %31, i8 addrspace(201)* noundef %129, %"class.embeddings::BasicType"* noundef %32) #22
+ %130 = bitcast %"class.embeddings::TileSpmemVectorArray.3"* %33 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %130) #3
+ %131 = bitcast %"class.embeddings::TileSpmemPointer"* %31 to %"class.embeddings::PointerBase"*
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %34, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %131) #22
+ %132 = load i32, i32* %8, align 4, !tbaa !3
+ call void @_ZN10embeddings20TileSpmemVectorArrayIfEC2ENS_11PointerBaseEi(%"class.embeddings::TileSpmemVectorArray.3"* noundef nonnull align 4 dereferenceable(16) %33, %"class.embeddings::PointerBase"* noundef %34, i32 noundef %132) #22
+ %133 = bitcast %"class.embeddings::TileSpmemVectorArray.3"* %33 to %"class.embeddings::ScratchpadArray"*
+ call void @_ZN10embeddings15ScratchpadArrayC2EOS0_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %133) #22
+ %134 = bitcast %"class.embeddings::TileSpmemVectorArray.3"* %33 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %134) #3
+ store i32 1, i32* %14, align 4
+ %135 = bitcast %"class.embeddings::TileSpmemPointer"* %31 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 12, i8* %135) #3
+ br label %145
+
+136: ; preds = %94, %96
+ br label %137
+
+137: ; preds = %136, %91
+ br label %138
+
+138: ; preds = %137, %90
+ %139 = bitcast %"class.embeddings::SmemPointer"* %35 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 12, i8* %139) #3
+ call void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %36, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %3) #22
+ call void @_ZN10embeddings11SmemPointerC2EPvNS_9BasicTypeE(%"class.embeddings::SmemPointer"* noundef nonnull align 4 dereferenceable(12) %35, i8* noundef null, %"class.embeddings::BasicType"* noundef %36) #22
+ %140 = bitcast %"class.embeddings::SmemArray"* %37 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %140) #3
+ %141 = bitcast %"class.embeddings::SmemPointer"* %35 to %"class.embeddings::PointerBase"*
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %38, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %141) #22
+ call void @_ZN10embeddings9SmemArrayIiEC2ENS_11PointerBaseEi(%"class.embeddings::SmemArray"* noundef nonnull align 4 dereferenceable(16) %37, %"class.embeddings::PointerBase"* noundef %38, i32 noundef 0) #22
+ %142 = bitcast %"class.embeddings::SmemArray"* %37 to %"class.embeddings::ScratchpadArray"*
+ call void @_ZN10embeddings15ScratchpadArrayC2EOS0_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %142) #22
+ %143 = bitcast %"class.embeddings::SmemArray"* %37 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %143) #3
+ store i32 1, i32* %14, align 4
+ %144 = bitcast %"class.embeddings::SmemPointer"* %35 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 12, i8* %144) #3
+ br label %145
+
+145: ; preds = %138, %123, %110, %97, %76, %63, %50
+ %146 = bitcast i32* %9 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %146) #3
+ ret void
+}
+
+; Function Attrs: alwaysinline mustprogress
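+; BaseArray::ElementType(): returns the value type of the array's base pointer.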
+define linkonce_odr dso_local void @_ZNK10embeddings9BaseArray11ElementTypeEv(%"class.embeddings::BasicType"* noalias sret(%"class.embeddings::BasicType") align 4 %0, %"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %1) #5 comdat align 2 {
+ %3 = alloca i8*, align 4
+ %4 = alloca %"class.embeddings::BaseArray"*, align 4
+ %5 = bitcast %"class.embeddings::BasicType"* %0 to i8*
+ store i8* %5, i8** %3, align 4
+ store %"class.embeddings::BaseArray"* %1, %"class.embeddings::BaseArray"** %4, align 4, !tbaa !7
+ %6 = load %"class.embeddings::BaseArray"*, %"class.embeddings::BaseArray"** %4, align 4
+ %7 = getelementptr inbounds %"class.embeddings::BaseArray", %"class.embeddings::BaseArray"* %6, i32 0, i32 0
+ call void @_ZNK10embeddings11PointerBase10value_typeEv(%"class.embeddings::BasicType"* sret(%"class.embeddings::BasicType") align 4 %0, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %7) #22
+ ret void
+}
+
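+; embeddings::BaseArray::Elements() const (demangled). Loads field 1 of the
+; array, presumably the element count.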
+; Function Attrs: alwaysinline mustprogress nounwind
+define linkonce_odr dso_local noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %0) #13 comdat align 2 {
+ %2 = alloca %"class.embeddings::BaseArray"*, align 4
+ store %"class.embeddings::BaseArray"* %0, %"class.embeddings::BaseArray"** %2, align 4, !tbaa !7
+ %3 = load %"class.embeddings::BaseArray"*, %"class.embeddings::BaseArray"** %2, align 4
+ %4 = getelementptr inbounds %"class.embeddings::BaseArray", %"class.embeddings::BaseArray"* %3, i32 0, i32 1
+ %5 = load i32, i32* %4, align 4, !tbaa !29
+ ret i32 %5
+}
+
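+; embeddings::BasicType::SizeInBytes() const (demangled). Switches on type():
+; values 0-2 yield 4 bytes and values 3-5 yield 32 bytes (presumably scalar
+; word types vs. 8 x 32-bit vector types); any other value is unreachable.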
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local noundef i32 @_ZNK10embeddings9BasicType11SizeInBytesEv(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %0) #5 comdat align 2 {
+ %2 = alloca i32, align 4
+ %3 = alloca %"class.embeddings::BasicType"*, align 4
+ store %"class.embeddings::BasicType"* %0, %"class.embeddings::BasicType"** %3, align 4, !tbaa !7
+ %4 = load %"class.embeddings::BasicType"*, %"class.embeddings::BasicType"** %3, align 4
+ %5 = call noundef i32 @_ZNK10embeddings9BasicType4typeEv(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %4) #22
+ switch i32 %5, label %10 [
+ i32 0, label %6
+ i32 1, label %6
+ i32 2, label %7
+ i32 3, label %8
+ i32 4, label %8
+ i32 5, label %9
+ ]
+
+6: ; preds = %1, %1
+ store i32 4, i32* %2, align 4
+ br label %11
+
+7: ; preds = %1
+ store i32 4, i32* %2, align 4
+ br label %11
+
+8: ; preds = %1, %1
+ store i32 32, i32* %2, align 4
+ br label %11
+
+9: ; preds = %1
+ store i32 32, i32* %2, align 4
+ br label %11
+
+10: ; preds = %1
+ unreachable
+
+11: ; preds = %9, %8, %7, %6
+ %12 = load i32, i32* %2, align 4
+ ret i32 %12
+}
+
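+; embeddings::MemorySpace::WordSizeInBytes() const (demangled). Dispatches on
+; memory_space() (values 0-4) to the matching word_size<T>::bytes()
+; specialization for that address space; any other value is unreachable.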
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local noundef i32 @_ZNK10embeddings11MemorySpace15WordSizeInBytesEv(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %0) #5 comdat align 2 {
+ %2 = alloca i32, align 4
+ %3 = alloca %"class.embeddings::MemorySpace"*, align 4
+ store %"class.embeddings::MemorySpace"* %0, %"class.embeddings::MemorySpace"** %3, align 4, !tbaa !7
+ %4 = load %"class.embeddings::MemorySpace"*, %"class.embeddings::MemorySpace"** %3, align 4
+ %5 = call noundef i32 @_ZNK10embeddings11MemorySpace12memory_spaceEv(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %4) #22
+ switch i32 %5, label %16 [
+ i32 0, label %6
+ i32 1, label %8
+ i32 2, label %10
+ i32 3, label %12
+ i32 4, label %14
+ ]
+
+6: ; preds = %1
+ %7 = call noundef i32 @_ZN10embeddings9word_sizeIPvE5bytesEv() #22
+ store i32 %7, i32* %2, align 4
+ br label %17
+
+8: ; preds = %1
+ %9 = call noundef i32 @_ZN10embeddings9word_sizeIPU5AS201vE5bytesEv() #22
+ store i32 %9, i32* %2, align 4
+ br label %17
+
+10: ; preds = %1
+ %11 = call noundef i32 @_ZN10embeddings9word_sizeIPU5AS202vE5bytesEv() #22
+ store i32 %11, i32* %2, align 4
+ br label %17
+
+12: ; preds = %1
+ %13 = call noundef i32 @_ZN10embeddings9word_sizeIPU5AS203vE5bytesEv() #22
+ store i32 %13, i32* %2, align 4
+ br label %17
+
+14: ; preds = %1
+ %15 = call noundef i32 @_ZN10embeddings9word_sizeIPU5AS204vE5bytesEv() #22
+ store i32 %15, i32* %2, align 4
+ br label %17
+
+16: ; preds = %1
+ unreachable
+
+17: ; preds = %14, %12, %10, %8, %6
+ %18 = load i32, i32* %2, align 4
+ ret i32 %18
+}
+
+; Function Attrs: alwaysinline mustprogress nounwind
+define linkonce_odr dso_local noundef i32 @_ZNK10embeddings11MemorySpace12memory_spaceEv(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %0) #13 comdat align 2 {
+ %2 = alloca %"class.embeddings::MemorySpace"*, align 4
+ store %"class.embeddings::MemorySpace"* %0, %"class.embeddings::MemorySpace"** %2, align 4, !tbaa !7
+ %3 = load %"class.embeddings::MemorySpace"*, %"class.embeddings::MemorySpace"** %2, align 4
+ %4 = getelementptr inbounds %"class.embeddings::MemorySpace", %"class.embeddings::MemorySpace"* %3, i32 0, i32 0
+ %5 = load i32, i32* %4, align 4, !tbaa !23
+ ret i32 %5
+}
+
+; Function Attrs: alwaysinline mustprogress nounwind
+define linkonce_odr dso_local noundef i32 @_ZNK10embeddings9BasicType4typeEv(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %0) #13 comdat align 2 {
+ %2 = alloca %"class.embeddings::BasicType"*, align 4
+ store %"class.embeddings::BasicType"* %0, %"class.embeddings::BasicType"** %2, align 4, !tbaa !7
+ %3 = load %"class.embeddings::BasicType"*, %"class.embeddings::BasicType"** %2, align 4
+ %4 = getelementptr inbounds %"class.embeddings::BasicType", %"class.embeddings::BasicType"* %3, i32 0, i32 0
+ %5 = load i32, i32* %4, align 4, !tbaa !26
+ ret i32 %5
+}
+
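+; embeddings::SparsecoreMemoryAllocator::AllocateSmem(int) (demangled). Bumps
+; the smem StaticAllocator (field 3) by the requested size and returns a
+; pointer that many i32 words past the smem base pointer (field 0).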
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local noundef i8* @_ZN10embeddings25SparsecoreMemoryAllocator12AllocateSmemEi(%"class.embeddings::SparsecoreMemoryAllocator"* noundef nonnull align 4 dereferenceable(36) %0, i32 noundef %1) #5 comdat align 2 {
+ %3 = alloca %"class.embeddings::SparsecoreMemoryAllocator"*, align 4
+ %4 = alloca i32, align 4
+ %5 = alloca i32, align 4
+ store %"class.embeddings::SparsecoreMemoryAllocator"* %0, %"class.embeddings::SparsecoreMemoryAllocator"** %3, align 4, !tbaa !7
+ store i32 %1, i32* %4, align 4, !tbaa !3
+ %6 = load %"class.embeddings::SparsecoreMemoryAllocator"*, %"class.embeddings::SparsecoreMemoryAllocator"** %3, align 4
+ %7 = bitcast i32* %5 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #3
+ %8 = getelementptr inbounds %"class.embeddings::SparsecoreMemoryAllocator", %"class.embeddings::SparsecoreMemoryAllocator"* %6, i32 0, i32 3
+ %9 = load i32, i32* %4, align 4, !tbaa !3
+ %10 = call noundef i32 @_ZN10embeddings4impl15StaticAllocator8AllocateEi(%"class.embeddings::impl::StaticAllocator"* noundef nonnull align 4 dereferenceable(8) %8, i32 noundef %9) #22
+ store i32 %10, i32* %5, align 4, !tbaa !3
+ %11 = getelementptr inbounds %"class.embeddings::SparsecoreMemoryAllocator", %"class.embeddings::SparsecoreMemoryAllocator"* %6, i32 0, i32 0
+ %12 = load i32*, i32** %11, align 4, !tbaa !9
+ %13 = load i32, i32* %5, align 4, !tbaa !3
+ %14 = getelementptr inbounds i32, i32* %12, i32 %13
+ %15 = bitcast i32* %14 to i8*
+ %16 = bitcast i32* %5 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %16) #3
+ ret i8* %15
+}
+
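+; The SmemPointer, SmemArray<T>, ScratchpadArray, and TileSpmemVectorArray<T>
+; constructors among the definitions that follow (names demangled from the
+; symbols) are thin wrappers: each copies its BasicType or PointerBase
+; argument and delegates to the PointerBase / ScratchpadArray base-class
+; constructor.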
+; Function Attrs: alwaysinline
+define linkonce_odr dso_local void @_ZN10embeddings11SmemPointerC2EPvNS_9BasicTypeE(%"class.embeddings::SmemPointer"* noundef nonnull align 4 dereferenceable(12) %0, i8* noundef %1, %"class.embeddings::BasicType"* noundef %2) unnamed_addr #4 comdat align 2 {
+ %4 = alloca %"class.embeddings::SmemPointer"*, align 4
+ %5 = alloca i8*, align 4
+ %6 = alloca %"class.embeddings::BasicType", align 4
+ store %"class.embeddings::SmemPointer"* %0, %"class.embeddings::SmemPointer"** %4, align 4, !tbaa !7
+ store i8* %1, i8** %5, align 4, !tbaa !7
+ %7 = load %"class.embeddings::SmemPointer"*, %"class.embeddings::SmemPointer"** %4, align 4
+ %8 = bitcast %"class.embeddings::SmemPointer"* %7 to %"class.embeddings::PointerBase"*
+ call void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %6, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %2) #22
+ %9 = load i8*, i8** %5, align 4, !tbaa !7
+ call void @_ZN10embeddings11PointerBaseC2ENS_9BasicTypeEPv(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %8, %"class.embeddings::BasicType"* noundef %6, i8* noundef %9) #22
+ ret void
+}
+
+; Function Attrs: alwaysinline
+define linkonce_odr dso_local void @_ZN10embeddings9SmemArrayIiEC2ENS_11PointerBaseEi(%"class.embeddings::SmemArray"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::PointerBase"* noundef %1, i32 noundef %2) unnamed_addr #4 comdat align 2 {
+ %4 = alloca %"class.embeddings::SmemArray"*, align 4
+ %5 = alloca i32, align 4
+ %6 = alloca %"class.embeddings::PointerBase", align 4
+ store %"class.embeddings::SmemArray"* %0, %"class.embeddings::SmemArray"** %4, align 4, !tbaa !7
+ store i32 %2, i32* %5, align 4, !tbaa !3
+ %7 = load %"class.embeddings::SmemArray"*, %"class.embeddings::SmemArray"** %4, align 4
+ %8 = bitcast %"class.embeddings::SmemArray"* %7 to %"class.embeddings::ScratchpadArray"*
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %6, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %1) #22
+ %9 = load i32, i32* %5, align 4, !tbaa !3
+ call void @_ZN10embeddings15ScratchpadArrayC2ENS_11PointerBaseEi(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %8, %"class.embeddings::PointerBase"* noundef %6, i32 noundef %9) #22
+ ret void
+}
+
+; Function Attrs: inlinehint
+define linkonce_odr dso_local void @_ZN10embeddings15ScratchpadArrayC2EOS0_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %1) unnamed_addr #7 comdat align 2 {
+ %3 = alloca %"class.embeddings::ScratchpadArray"*, align 4
+ %4 = alloca %"class.embeddings::ScratchpadArray"*, align 4
+ store %"class.embeddings::ScratchpadArray"* %0, %"class.embeddings::ScratchpadArray"** %3, align 4, !tbaa !7
+ store %"class.embeddings::ScratchpadArray"* %1, %"class.embeddings::ScratchpadArray"** %4, align 4, !tbaa !7
+ %5 = load %"class.embeddings::ScratchpadArray"*, %"class.embeddings::ScratchpadArray"** %3, align 4
+ %6 = bitcast %"class.embeddings::ScratchpadArray"* %5 to %"class.embeddings::BaseArray"*
+ %7 = load %"class.embeddings::ScratchpadArray"*, %"class.embeddings::ScratchpadArray"** %4, align 4, !tbaa !7
+ %8 = bitcast %"class.embeddings::ScratchpadArray"* %7 to %"class.embeddings::BaseArray"*
+ call void @_ZN10embeddings9BaseArrayC2EOS0_(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %6, %"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %8) #22
+ ret void
+}
+
+; Function Attrs: alwaysinline
+define linkonce_odr dso_local void @_ZN10embeddings9SmemArrayIjEC2ENS_11PointerBaseEi(%"class.embeddings::SmemArray.0"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::PointerBase"* noundef %1, i32 noundef %2) unnamed_addr #4 comdat align 2 {
+ %4 = alloca %"class.embeddings::SmemArray.0"*, align 4
+ %5 = alloca i32, align 4
+ %6 = alloca %"class.embeddings::PointerBase", align 4
+ store %"class.embeddings::SmemArray.0"* %0, %"class.embeddings::SmemArray.0"** %4, align 4, !tbaa !7
+ store i32 %2, i32* %5, align 4, !tbaa !3
+ %7 = load %"class.embeddings::SmemArray.0"*, %"class.embeddings::SmemArray.0"** %4, align 4
+ %8 = bitcast %"class.embeddings::SmemArray.0"* %7 to %"class.embeddings::ScratchpadArray"*
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %6, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %1) #22
+ %9 = load i32, i32* %5, align 4, !tbaa !3
+ call void @_ZN10embeddings15ScratchpadArrayC2ENS_11PointerBaseEi(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %8, %"class.embeddings::PointerBase"* noundef %6, i32 noundef %9) #22
+ ret void
+}
+
+; Function Attrs: alwaysinline
+define linkonce_odr dso_local void @_ZN10embeddings9SmemArrayIfEC2ENS_11PointerBaseEi(%"class.embeddings::SmemArray.1"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::PointerBase"* noundef %1, i32 noundef %2) unnamed_addr #4 comdat align 2 {
+ %4 = alloca %"class.embeddings::SmemArray.1"*, align 4
+ %5 = alloca i32, align 4
+ %6 = alloca %"class.embeddings::PointerBase", align 4
+ store %"class.embeddings::SmemArray.1"* %0, %"class.embeddings::SmemArray.1"** %4, align 4, !tbaa !7
+ store i32 %2, i32* %5, align 4, !tbaa !3
+ %7 = load %"class.embeddings::SmemArray.1"*, %"class.embeddings::SmemArray.1"** %4, align 4
+ %8 = bitcast %"class.embeddings::SmemArray.1"* %7 to %"class.embeddings::ScratchpadArray"*
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %6, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %1) #22
+ %9 = load i32, i32* %5, align 4, !tbaa !3
+ call void @_ZN10embeddings15ScratchpadArrayC2ENS_11PointerBaseEi(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %8, %"class.embeddings::PointerBase"* noundef %6, i32 noundef %9) #22
+ ret void
+}
+
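+; embeddings::SparsecoreMemoryAllocator::AllocateTileSpmem(int) (demangled).
+; Bump-allocates from the tile-spmem StaticAllocator (field 4), divides the
+; returned word offset by 8 to index the <8 x i32> addrspace(201) base pointer
+; (field 1), and returns the result as an i8 addrspace(201)*.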
+; Function Attrs: alwaysinline mustprogress nounwind
+define linkonce_odr dso_local noundef i8 addrspace(201)* @_ZN10embeddings25SparsecoreMemoryAllocator17AllocateTileSpmemEi(%"class.embeddings::SparsecoreMemoryAllocator"* noundef nonnull align 4 dereferenceable(36) %0, i32 noundef %1) #13 comdat align 2 {
+ %3 = alloca %"class.embeddings::SparsecoreMemoryAllocator"*, align 4
+ %4 = alloca i32, align 4
+ %5 = alloca i32, align 4
+ store %"class.embeddings::SparsecoreMemoryAllocator"* %0, %"class.embeddings::SparsecoreMemoryAllocator"** %3, align 4, !tbaa !7
+ store i32 %1, i32* %4, align 4, !tbaa !3
+ %6 = load %"class.embeddings::SparsecoreMemoryAllocator"*, %"class.embeddings::SparsecoreMemoryAllocator"** %3, align 4
+ %7 = bitcast i32* %5 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #3
+ %8 = getelementptr inbounds %"class.embeddings::SparsecoreMemoryAllocator", %"class.embeddings::SparsecoreMemoryAllocator"* %6, i32 0, i32 4
+ %9 = load i32, i32* %4, align 4, !tbaa !3
+ %10 = call noundef i32 @_ZN10embeddings4impl15StaticAllocator8AllocateEi(%"class.embeddings::impl::StaticAllocator"* noundef nonnull align 4 dereferenceable(8) %8, i32 noundef %9) #22
+ store i32 %10, i32* %5, align 4, !tbaa !3
+ %11 = getelementptr inbounds %"class.embeddings::SparsecoreMemoryAllocator", %"class.embeddings::SparsecoreMemoryAllocator"* %6, i32 0, i32 1
+ %12 = load <8 x i32> addrspace(201)*, <8 x i32> addrspace(201)** %11, align 4, !tbaa !12
+ %13 = load i32, i32* %5, align 4, !tbaa !3
+ %14 = sdiv i32 %13, 8
+ %15 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %12, i32 %14
+ %16 = bitcast <8 x i32> addrspace(201)* %15 to i8 addrspace(201)*
+ %17 = bitcast i32* %5 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %17) #3
+ ret i8 addrspace(201)* %16
+}
+
+; Function Attrs: alwaysinline
+define linkonce_odr dso_local void @_ZN10embeddings20TileSpmemVectorArrayIjEC2ENS_11PointerBaseEi(%"class.embeddings::TileSpmemVectorArray.2"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::PointerBase"* noundef %1, i32 noundef %2) unnamed_addr #4 comdat align 2 {
+ %4 = alloca %"class.embeddings::TileSpmemVectorArray.2"*, align 4
+ %5 = alloca i32, align 4
+ %6 = alloca %"class.embeddings::PointerBase", align 4
+ store %"class.embeddings::TileSpmemVectorArray.2"* %0, %"class.embeddings::TileSpmemVectorArray.2"** %4, align 4, !tbaa !7
+ store i32 %2, i32* %5, align 4, !tbaa !3
+ %7 = load %"class.embeddings::TileSpmemVectorArray.2"*, %"class.embeddings::TileSpmemVectorArray.2"** %4, align 4
+ %8 = bitcast %"class.embeddings::TileSpmemVectorArray.2"* %7 to %"class.embeddings::ScratchpadArray"*
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %6, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %1) #22
+ %9 = load i32, i32* %5, align 4, !tbaa !3
+ call void @_ZN10embeddings15ScratchpadArrayC2ENS_11PointerBaseEi(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %8, %"class.embeddings::PointerBase"* noundef %6, i32 noundef %9) #22
+ ret void
+}
+
+; Function Attrs: alwaysinline
+define linkonce_odr dso_local void @_ZN10embeddings20TileSpmemVectorArrayIfEC2ENS_11PointerBaseEi(%"class.embeddings::TileSpmemVectorArray.3"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::PointerBase"* noundef %1, i32 noundef %2) unnamed_addr #4 comdat align 2 {
+ %4 = alloca %"class.embeddings::TileSpmemVectorArray.3"*, align 4
+ %5 = alloca i32, align 4
+ %6 = alloca %"class.embeddings::PointerBase", align 4
+ store %"class.embeddings::TileSpmemVectorArray.3"* %0, %"class.embeddings::TileSpmemVectorArray.3"** %4, align 4, !tbaa !7
+ store i32 %2, i32* %5, align 4, !tbaa !3
+ %7 = load %"class.embeddings::TileSpmemVectorArray.3"*, %"class.embeddings::TileSpmemVectorArray.3"** %4, align 4
+ %8 = bitcast %"class.embeddings::TileSpmemVectorArray.3"* %7 to %"class.embeddings::ScratchpadArray"*
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %6, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %1) #22
+ %9 = load i32, i32* %5, align 4, !tbaa !3
+ call void @_ZN10embeddings15ScratchpadArrayC2ENS_11PointerBaseEi(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %8, %"class.embeddings::PointerBase"* noundef %6, i32 noundef %9) #22
+ ret void
+}
+
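+; embeddings::word_size<T*>::bytes() specializations (demangled). Word sizes
+; by address space: 4 bytes for the generic, AS201, AS202, and AS204 pointers;
+; 32 bytes for AS203 (presumably the vector memory).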
+; Function Attrs: alwaysinline mustprogress nounwind
+define linkonce_odr dso_local noundef i32 @_ZN10embeddings9word_sizeIPvE5bytesEv() #13 comdat align 2 {
+ ret i32 4
+}
+
+; Function Attrs: alwaysinline mustprogress nounwind
+define linkonce_odr dso_local noundef i32 @_ZN10embeddings9word_sizeIPU5AS201vE5bytesEv() #13 comdat align 2 {
+ ret i32 4
+}
+
+; Function Attrs: alwaysinline mustprogress nounwind
+define linkonce_odr dso_local noundef i32 @_ZN10embeddings9word_sizeIPU5AS202vE5bytesEv() #13 comdat align 2 {
+ ret i32 4
+}
+
+; Function Attrs: alwaysinline mustprogress nounwind
+define linkonce_odr dso_local noundef i32 @_ZN10embeddings9word_sizeIPU5AS203vE5bytesEv() #13 comdat align 2 {
+ ret i32 32
+}
+
+; Function Attrs: alwaysinline mustprogress nounwind
+define linkonce_odr dso_local noundef i32 @_ZN10embeddings9word_sizeIPU5AS204vE5bytesEv() #13 comdat align 2 {
+ ret i32 4
+}
+
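+; embeddings::impl::StaticAllocator::Allocate(int) (demangled). A simple bump
+; allocator: saves the current offset (field 0), advances it by the requested
+; size, and returns the saved offset.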
+; Function Attrs: alwaysinline mustprogress nounwind
+define linkonce_odr dso_local noundef i32 @_ZN10embeddings4impl15StaticAllocator8AllocateEi(%"class.embeddings::impl::StaticAllocator"* noundef nonnull align 4 dereferenceable(8) %0, i32 noundef %1) #13 comdat align 2 {
+ %3 = alloca %"class.embeddings::impl::StaticAllocator"*, align 4
+ %4 = alloca i32, align 4
+ %5 = alloca i32, align 4
+ store %"class.embeddings::impl::StaticAllocator"* %0, %"class.embeddings::impl::StaticAllocator"** %3, align 4, !tbaa !7
+ store i32 %1, i32* %4, align 4, !tbaa !3
+ %6 = load %"class.embeddings::impl::StaticAllocator"*, %"class.embeddings::impl::StaticAllocator"** %3, align 4
+ %7 = bitcast i32* %5 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #3
+ %8 = getelementptr inbounds %"class.embeddings::impl::StaticAllocator", %"class.embeddings::impl::StaticAllocator"* %6, i32 0, i32 0
+ %9 = load i32, i32* %8, align 4, !tbaa !21
+ store i32 %9, i32* %5, align 4, !tbaa !3
+ %10 = getelementptr inbounds %"class.embeddings::impl::StaticAllocator", %"class.embeddings::impl::StaticAllocator"* %6, i32 0, i32 0
+ %11 = load i32, i32* %10, align 4, !tbaa !21
+ %12 = load i32, i32* %4, align 4, !tbaa !3
+ %13 = add nsw i32 %11, %12
+ %14 = getelementptr inbounds %"class.embeddings::impl::StaticAllocator", %"class.embeddings::impl::StaticAllocator"* %6, i32 0, i32 0
+ store i32 %13, i32* %14, align 4, !tbaa !21
+ %15 = load i32, i32* %5, align 4, !tbaa !3
+ %16 = bitcast i32* %5 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %16) #3
+ ret i32 %15
+}
+
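+; embeddings::PointerBase::PointerBase(BasicType, void*) (demangled), followed
+; by the SCM_Smem and AnyPtr constructors and the BaseArray move constructor.
+; The PointerBase constructor defaults the memory space to SCM_Smem (value 0,
+; presumably sparsecore smem), copies the BasicType, and stores the raw
+; pointer into the AnyPtr union.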
+; Function Attrs: alwaysinline
+define linkonce_odr dso_local void @_ZN10embeddings11PointerBaseC2ENS_9BasicTypeEPv(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %0, %"class.embeddings::BasicType"* noundef %1, i8* noundef %2) unnamed_addr #4 comdat align 2 {
+ %4 = alloca %"class.embeddings::PointerBase"*, align 4
+ %5 = alloca i8*, align 4
+ %6 = alloca %"class.embeddings::SCM_Smem", align 4
+ store %"class.embeddings::PointerBase"* %0, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !7
+ store i8* %2, i8** %5, align 4, !tbaa !7
+ %7 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %4, align 4
+ %8 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %7, i32 0, i32 0
+ %9 = bitcast %"class.embeddings::SCM_Smem"* %6 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %9) #3
+ call void @_ZN10embeddings8SCM_SmemC2Ev(%"class.embeddings::SCM_Smem"* noundef nonnull align 4 dereferenceable(4) %6) #22
+ %10 = bitcast %"class.embeddings::SCM_Smem"* %6 to %"class.embeddings::MemorySpace"*
+ call void @_ZN10embeddings11MemorySpaceC2ERKS0_(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %8, %"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %10) #22
+ %11 = bitcast %"class.embeddings::SCM_Smem"* %6 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %11) #3
+ %12 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %7, i32 0, i32 1
+ call void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %12, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %1) #22
+ %13 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %7, i32 0, i32 2
+ %14 = load i8*, i8** %5, align 4, !tbaa !7
+ call void @_ZN10embeddings11PointerBase6AnyPtrC2EPv(%"union.embeddings::PointerBase::AnyPtr"* noundef nonnull align 4 dereferenceable(4) %13, i8* noundef %14) #22
+ ret void
+}
+
+; Function Attrs: nounwind
+define linkonce_odr dso_local void @_ZN10embeddings8SCM_SmemC2Ev(%"class.embeddings::SCM_Smem"* noundef nonnull align 4 dereferenceable(4) %0) unnamed_addr #12 comdat align 2 {
+ %2 = alloca %"class.embeddings::SCM_Smem"*, align 4
+ store %"class.embeddings::SCM_Smem"* %0, %"class.embeddings::SCM_Smem"** %2, align 4, !tbaa !7
+ %3 = load %"class.embeddings::SCM_Smem"*, %"class.embeddings::SCM_Smem"** %2, align 4
+ %4 = bitcast %"class.embeddings::SCM_Smem"* %3 to %"class.embeddings::MemorySpace"*
+ call void @_ZN10embeddings11MemorySpaceC2ENS_21SparsecoreMemorySpaceE(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %4, i32 noundef 0) #22
+ ret void
+}
+
+; Function Attrs: nounwind
+define linkonce_odr dso_local void @_ZN10embeddings11PointerBase6AnyPtrC2EPv(%"union.embeddings::PointerBase::AnyPtr"* noundef nonnull align 4 dereferenceable(4) %0, i8* noundef %1) unnamed_addr #12 comdat align 2 {
+ %3 = alloca %"union.embeddings::PointerBase::AnyPtr"*, align 4
+ %4 = alloca i8*, align 4
+ store %"union.embeddings::PointerBase::AnyPtr"* %0, %"union.embeddings::PointerBase::AnyPtr"** %3, align 4, !tbaa !7
+ store i8* %1, i8** %4, align 4, !tbaa !7
+ %5 = load %"union.embeddings::PointerBase::AnyPtr"*, %"union.embeddings::PointerBase::AnyPtr"** %3, align 4
+ %6 = bitcast %"union.embeddings::PointerBase::AnyPtr"* %5 to i32**
+ %7 = load i8*, i8** %4, align 4, !tbaa !7
+ %8 = bitcast i8* %7 to i32*
+ store i32* %8, i32** %6, align 4, !tbaa !33
+ ret void
+}
+
+; Function Attrs: inlinehint
+define linkonce_odr dso_local void @_ZN10embeddings9BaseArrayC2EOS0_(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %1) unnamed_addr #7 comdat align 2 {
+ %3 = alloca %"class.embeddings::BaseArray"*, align 4
+ %4 = alloca %"class.embeddings::BaseArray"*, align 4
+ store %"class.embeddings::BaseArray"* %0, %"class.embeddings::BaseArray"** %3, align 4, !tbaa !7
+ store %"class.embeddings::BaseArray"* %1, %"class.embeddings::BaseArray"** %4, align 4, !tbaa !7
+ %5 = load %"class.embeddings::BaseArray"*, %"class.embeddings::BaseArray"** %3, align 4
+ %6 = getelementptr inbounds %"class.embeddings::BaseArray", %"class.embeddings::BaseArray"* %5, i32 0, i32 0
+ %7 = load %"class.embeddings::BaseArray"*, %"class.embeddings::BaseArray"** %4, align 4, !tbaa !7
+ %8 = getelementptr inbounds %"class.embeddings::BaseArray", %"class.embeddings::BaseArray"* %7, i32 0, i32 0
+ call void @_ZN10embeddings11PointerBaseC2EOS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %6, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %8) #22
+ %9 = getelementptr inbounds %"class.embeddings::BaseArray", %"class.embeddings::BaseArray"* %5, i32 0, i32 1
+ %10 = load %"class.embeddings::BaseArray"*, %"class.embeddings::BaseArray"** %4, align 4, !tbaa !7
+ %11 = getelementptr inbounds %"class.embeddings::BaseArray", %"class.embeddings::BaseArray"* %10, i32 0, i32 1
+ %12 = load i32, i32* %11, align 4, !tbaa !29
+ store i32 %12, i32* %9, align 4, !tbaa !29
+ ret void
+}
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local void @_ZNK10embeddings11PointerBase10value_typeEv(%"class.embeddings::BasicType"* noalias sret(%"class.embeddings::BasicType") align 4 %0, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %1) #5 comdat align 2 {
+ %3 = alloca i8*, align 4
+ %4 = alloca %"class.embeddings::PointerBase"*, align 4
+ %5 = bitcast %"class.embeddings::BasicType"* %0 to i8*
+ store i8* %5, i8** %3, align 4
+ store %"class.embeddings::PointerBase"* %1, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !7
+ %6 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %4, align 4
+ %7 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %6, i32 0, i32 1
+ call void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %0, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %7) #22
+ ret void
+}
+
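+; embeddings::RadixSortIteration<int, 4>::RadixSortIteration(ScratchpadArray*)
+; (demangled). Stores 16 (presumably the bucket count for 4-bit digits) into
+; field 0 and the scratchpad pointer into field 1.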
+; Function Attrs: alwaysinline nounwind
+define linkonce_odr dso_local void @_ZN10embeddings18RadixSortIterationIiLi4EEC2EPNS_15ScratchpadArrayE(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %0, %"class.embeddings::ScratchpadArray"* noundef %1) unnamed_addr #9 comdat align 2 {
+ %3 = alloca %"class.embeddings::RadixSortIteration"*, align 4
+ %4 = alloca %"class.embeddings::ScratchpadArray"*, align 4
+ store %"class.embeddings::RadixSortIteration"* %0, %"class.embeddings::RadixSortIteration"** %3, align 4, !tbaa !7
+ store %"class.embeddings::ScratchpadArray"* %1, %"class.embeddings::ScratchpadArray"** %4, align 4, !tbaa !7
+ %5 = load %"class.embeddings::RadixSortIteration"*, %"class.embeddings::RadixSortIteration"** %3, align 4
+ %6 = getelementptr inbounds %"class.embeddings::RadixSortIteration", %"class.embeddings::RadixSortIteration"* %5, i32 0, i32 0
+ store i32 16, i32* %6, align 4, !tbaa !35
+ %7 = getelementptr inbounds %"class.embeddings::RadixSortIteration", %"class.embeddings::RadixSortIteration"* %5, i32 0, i32 1
+ %8 = load %"class.embeddings::ScratchpadArray"*, %"class.embeddings::ScratchpadArray"** %4, align 4, !tbaa !7
+ store %"class.embeddings::ScratchpadArray"* %8, %"class.embeddings::ScratchpadArray"** %7, align 4, !tbaa !37
+ ret void
+}
+
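+; embeddings::RadixSortIteration<int, 4>::ClearBuckets() (demangled). Casts the
+; scratchpad member to TileSpmemVectorArray<int> and stores a zero <8 x i32>
+; into every element, clearing the histogram buckets.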
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local void @_ZN10embeddings18RadixSortIterationIiLi4EE12ClearBucketsEv(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %0) #5 comdat align 2 {
+ %2 = alloca %"class.embeddings::RadixSortIteration"*, align 4
+ %3 = alloca %"class.embeddings::TileSpmemVectorArray", align 4
+ %4 = alloca i32, align 4
+ store %"class.embeddings::RadixSortIteration"* %0, %"class.embeddings::RadixSortIteration"** %2, align 4, !tbaa !7
+ %5 = load %"class.embeddings::RadixSortIteration"*, %"class.embeddings::RadixSortIteration"** %2, align 4
+ %6 = bitcast %"class.embeddings::TileSpmemVectorArray"* %3 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %6) #3
+ %7 = getelementptr inbounds %"class.embeddings::RadixSortIteration", %"class.embeddings::RadixSortIteration"* %5, i32 0, i32 1
+ %8 = load %"class.embeddings::ScratchpadArray"*, %"class.embeddings::ScratchpadArray"** %7, align 4, !tbaa !37
+ %9 = call noundef %"class.embeddings::TileSpmemVectorArray"* @_ZN10embeddings4CastINS_20TileSpmemVectorArrayIiEENS_15ScratchpadArrayEEENS_15cast_retty_implIT_T0_E8ret_typeERKS6_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %8) #22
+ call void @_ZN10embeddings20TileSpmemVectorArrayIiEC2ERKS1_(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %3, %"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %9) #22
+ %10 = bitcast i32* %4 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %10) #3
+ store i32 0, i32* %4, align 4, !tbaa !3
+ br label %11
+
+11: ; preds = %21, %1
+ %12 = load i32, i32* %4, align 4, !tbaa !3
+ %13 = bitcast %"class.embeddings::TileSpmemVectorArray"* %3 to %"class.embeddings::BaseArray"*
+ %14 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %13) #22
+ %15 = icmp slt i32 %12, %14
+ br i1 %15, label %18, label %16
+
+16: ; preds = %11
+ %17 = bitcast i32* %4 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %17) #3
+ br label %24
+
+18: ; preds = %11
+ %19 = load i32, i32* %4, align 4, !tbaa !3
+ %20 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %3, i32 noundef %19) #22
+ store <8 x i32> zeroinitializer, <8 x i32> addrspace(201)* %20, align 32, !tbaa !33
+ br label %21
+
+21: ; preds = %18
+ %22 = load i32, i32* %4, align 4, !tbaa !3
+ %23 = add nsw i32 %22, 1
+ store i32 %23, i32* %4, align 4, !tbaa !3
+ br label %11, !llvm.loop !38
+
+24: ; preds = %16
+ %25 = bitcast %"class.embeddings::TileSpmemVectorArray"* %3 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %25) #3
+ ret void
+}
+
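+; embeddings::RadixSortIteration<int, 4>::HistogramKeys(int,
+; TileSpmemVectorArray<int>) (demangled). The loop body below appears to be
+; unrolled eight ways (one copy per eighth of the key array): each copy loads
+; a <8 x i32> chunk of keys, extracts the current digits with GetDigits, runs
+; VectorUnique, and accumulates counts into the buckets via
+; llvm.tpu.vst.msk.idx.add, with llvm.tpu.loop.parallel markers separating
+; the copies.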
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local void @_ZN10embeddings18RadixSortIterationIiLi4EE13HistogramKeysEiNS_20TileSpmemVectorArrayIiEE(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %0, i32 noundef %1, %"class.embeddings::TileSpmemVectorArray"* noundef %2) #14 comdat align 2 {
+ %4 = alloca %"class.embeddings::RadixSortIteration"*, align 4
+ %5 = alloca i32, align 4
+ %6 = alloca %"class.embeddings::TileSpmemVectorArray", align 4
+ %7 = alloca i32, align 4
+ %8 = alloca i32, align 4
+ %9 = alloca <8 x i32>, align 32
+ %10 = alloca %"struct.embeddings::UniqueResult", align 32
+ %11 = alloca i32, align 4
+ %12 = alloca <8 x i32>, align 32
+ %13 = alloca %"struct.embeddings::UniqueResult", align 32
+ %14 = alloca i32, align 4
+ %15 = alloca <8 x i32>, align 32
+ %16 = alloca %"struct.embeddings::UniqueResult", align 32
+ %17 = alloca i32, align 4
+ %18 = alloca <8 x i32>, align 32
+ %19 = alloca %"struct.embeddings::UniqueResult", align 32
+ %20 = alloca i32, align 4
+ %21 = alloca <8 x i32>, align 32
+ %22 = alloca %"struct.embeddings::UniqueResult", align 32
+ %23 = alloca i32, align 4
+ %24 = alloca <8 x i32>, align 32
+ %25 = alloca %"struct.embeddings::UniqueResult", align 32
+ %26 = alloca i32, align 4
+ %27 = alloca <8 x i32>, align 32
+ %28 = alloca %"struct.embeddings::UniqueResult", align 32
+ %29 = alloca i32, align 4
+ %30 = alloca <8 x i32>, align 32
+ %31 = alloca %"struct.embeddings::UniqueResult", align 32
+ store %"class.embeddings::RadixSortIteration"* %0, %"class.embeddings::RadixSortIteration"** %4, align 4, !tbaa !7
+ store i32 %1, i32* %5, align 4, !tbaa !3
+ %32 = load %"class.embeddings::RadixSortIteration"*, %"class.embeddings::RadixSortIteration"** %4, align 4
+ %33 = bitcast %"class.embeddings::TileSpmemVectorArray"* %6 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %33) #3
+ %34 = getelementptr inbounds %"class.embeddings::RadixSortIteration", %"class.embeddings::RadixSortIteration"* %32, i32 0, i32 1
+ %35 = load %"class.embeddings::ScratchpadArray"*, %"class.embeddings::ScratchpadArray"** %34, align 4, !tbaa !37
+ %36 = call noundef %"class.embeddings::TileSpmemVectorArray"* @_ZN10embeddings4CastINS_20TileSpmemVectorArrayIiEENS_15ScratchpadArrayEEENS_15cast_retty_implIT_T0_E8ret_typeERKS6_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %35) #22
+ call void @_ZN10embeddings20TileSpmemVectorArrayIiEC2ERKS1_(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %6, %"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %36) #22
+ %37 = bitcast i32* %7 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %37) #3
+ store i32 0, i32* %7, align 4, !tbaa !3
+ br label %38
+
+38: ; preds = %295, %3
+ %39 = load i32, i32* %7, align 4, !tbaa !3, !llvm.access.group !39
+ %40 = bitcast %"class.embeddings::TileSpmemVectorArray"* %2 to %"class.embeddings::BaseArray"*
+ %41 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %40) #22, !llvm.access.group !39
+ %42 = sdiv i32 %41, 8
+ %43 = icmp slt i32 %39, %42
+ br i1 %43, label %46, label %44
+
+44: ; preds = %38
+ %45 = bitcast i32* %7 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %45) #3, !llvm.access.group !39
+ br label %298
+
+46: ; preds = %38
+ %47 = bitcast i32* %8 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %47) #3, !llvm.access.group !39
+ %48 = load i32, i32* %7, align 4, !tbaa !3, !llvm.access.group !39
+ %49 = add nsw i32 %48, 0
+ store i32 %49, i32* %8, align 4, !tbaa !3, !llvm.access.group !39
+ %50 = bitcast <8 x i32>* %9 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %50) #3, !llvm.access.group !39
+ %51 = bitcast %"class.embeddings::TileSpmemVectorArray"* %2 to %"class.embeddings::BaseArray"*
+ %52 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %51) #22, !llvm.access.group !39
+ %53 = mul nsw i32 0, %52
+ %54 = sdiv i32 %53, 8
+ %55 = load i32, i32* %8, align 4, !tbaa !3, !llvm.access.group !39
+ %56 = add nsw i32 %54, %55
+ %57 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %2, i32 noundef %56) #22, !llvm.access.group !39
+ %58 = load <8 x i32>, <8 x i32> addrspace(201)* %57, align 32, !tbaa !33, !llvm.access.group !39
+ %59 = load i32, i32* %5, align 4, !tbaa !3, !llvm.access.group !39
+ %60 = call noundef <8 x i32> @_ZN10embeddings18RadixSortIterationIiLi4EE9GetDigitsEDv8_ii(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %32, <8 x i32> noundef %58, i32 noundef %59) #22, !llvm.access.group !39
+ store <8 x i32> %60, <8 x i32>* %9, align 32, !tbaa !33, !llvm.access.group !39
+ %61 = bitcast %"struct.embeddings::UniqueResult"* %10 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 96, i8* %61) #3, !llvm.access.group !39
+ %62 = load <8 x i32>, <8 x i32>* %9, align 32, !tbaa !33, !llvm.access.group !39
+ call void @_ZN10embeddings12VectorUniqueIDv8_iEENS_12UniqueResultES1_T_(%"struct.embeddings::UniqueResult"* sret(%"struct.embeddings::UniqueResult") align 32 %10, <8 x i32> noundef <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> noundef %62) #22, !llvm.access.group !39
+ %63 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %10, i32 0, i32 1
+ %64 = load <8 x i32>, <8 x i32>* %63, align 32, !tbaa !33, !llvm.access.group !39
+ %65 = trunc <8 x i32> %64 to <8 x i1>
+ %66 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %6) #22, !llvm.access.group !39
+ %67 = bitcast %"class.embeddings::TileSpmemVectorArray"* %6 to %"class.embeddings::BaseArray"*
+ %68 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %67) #22, !llvm.access.group !39
+ %69 = mul nsw i32 0, %68
+ %70 = sdiv i32 %69, 8
+ %71 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %66, i32 %70
+ %72 = load <8 x i32>, <8 x i32>* %9, align 32, !tbaa !33, !llvm.access.group !39
+ %73 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %10, i32 0, i32 2
+ %74 = load <8 x i32>, <8 x i32>* %73, align 32, !tbaa !33, !llvm.access.group !39
+ call void @llvm.tpu.vst.msk.idx.add.p201v8i32.v8i32(<8 x i1> %65, <8 x i32> addrspace(201)* %71, <8 x i32> %72, <8 x i32> %74), !llvm.access.group !39
+ %75 = bitcast %"struct.embeddings::UniqueResult"* %10 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 96, i8* %75) #3, !llvm.access.group !39
+ %76 = bitcast <8 x i32>* %9 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %76) #3, !llvm.access.group !39
+ %77 = bitcast i32* %8 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %77) #3, !llvm.access.group !39
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !39
+ %78 = bitcast i32* %11 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %78) #3, !llvm.access.group !39
+ %79 = load i32, i32* %7, align 4, !tbaa !3, !llvm.access.group !39
+ %80 = add nsw i32 %79, 0
+ store i32 %80, i32* %11, align 4, !tbaa !3, !llvm.access.group !39
+ %81 = bitcast <8 x i32>* %12 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %81) #3, !llvm.access.group !39
+ %82 = bitcast %"class.embeddings::TileSpmemVectorArray"* %2 to %"class.embeddings::BaseArray"*
+ %83 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %82) #22, !llvm.access.group !39
+ %84 = mul nsw i32 1, %83
+ %85 = sdiv i32 %84, 8
+ %86 = load i32, i32* %11, align 4, !tbaa !3, !llvm.access.group !39
+ %87 = add nsw i32 %85, %86
+ %88 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %2, i32 noundef %87) #22, !llvm.access.group !39
+ %89 = load <8 x i32>, <8 x i32> addrspace(201)* %88, align 32, !tbaa !33, !llvm.access.group !39
+ %90 = load i32, i32* %5, align 4, !tbaa !3, !llvm.access.group !39
+ %91 = call noundef <8 x i32> @_ZN10embeddings18RadixSortIterationIiLi4EE9GetDigitsEDv8_ii(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %32, <8 x i32> noundef %89, i32 noundef %90) #22, !llvm.access.group !39
+ store <8 x i32> %91, <8 x i32>* %12, align 32, !tbaa !33, !llvm.access.group !39
+ %92 = bitcast %"struct.embeddings::UniqueResult"* %13 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 96, i8* %92) #3, !llvm.access.group !39
+ %93 = load <8 x i32>, <8 x i32>* %12, align 32, !tbaa !33, !llvm.access.group !39
+ call void @_ZN10embeddings12VectorUniqueIDv8_iEENS_12UniqueResultES1_T_(%"struct.embeddings::UniqueResult"* sret(%"struct.embeddings::UniqueResult") align 32 %13, <8 x i32> noundef <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> noundef %93) #22, !llvm.access.group !39
+ %94 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %13, i32 0, i32 1
+ %95 = load <8 x i32>, <8 x i32>* %94, align 32, !tbaa !33, !llvm.access.group !39
+ %96 = trunc <8 x i32> %95 to <8 x i1>
+ %97 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %6) #22, !llvm.access.group !39
+ %98 = bitcast %"class.embeddings::TileSpmemVectorArray"* %6 to %"class.embeddings::BaseArray"*
+ %99 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %98) #22, !llvm.access.group !39
+ %100 = mul nsw i32 1, %99
+ %101 = sdiv i32 %100, 8
+ %102 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %97, i32 %101
+ %103 = load <8 x i32>, <8 x i32>* %12, align 32, !tbaa !33, !llvm.access.group !39
+ %104 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %13, i32 0, i32 2
+ %105 = load <8 x i32>, <8 x i32>* %104, align 32, !tbaa !33, !llvm.access.group !39
+ call void @llvm.tpu.vst.msk.idx.add.p201v8i32.v8i32(<8 x i1> %96, <8 x i32> addrspace(201)* %102, <8 x i32> %103, <8 x i32> %105), !llvm.access.group !39
+ %106 = bitcast %"struct.embeddings::UniqueResult"* %13 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 96, i8* %106) #3, !llvm.access.group !39
+ %107 = bitcast <8 x i32>* %12 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %107) #3, !llvm.access.group !39
+ %108 = bitcast i32* %11 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %108) #3, !llvm.access.group !39
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !39
+ %109 = bitcast i32* %14 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %109) #3, !llvm.access.group !39
+ %110 = load i32, i32* %7, align 4, !tbaa !3, !llvm.access.group !39
+ %111 = add nsw i32 %110, 0
+ store i32 %111, i32* %14, align 4, !tbaa !3, !llvm.access.group !39
+ %112 = bitcast <8 x i32>* %15 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %112) #3, !llvm.access.group !39
+ %113 = bitcast %"class.embeddings::TileSpmemVectorArray"* %2 to %"class.embeddings::BaseArray"*
+ %114 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %113) #22, !llvm.access.group !39
+ %115 = mul nsw i32 2, %114
+ %116 = sdiv i32 %115, 8
+ %117 = load i32, i32* %14, align 4, !tbaa !3, !llvm.access.group !39
+ %118 = add nsw i32 %116, %117
+ %119 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %2, i32 noundef %118) #22, !llvm.access.group !39
+ %120 = load <8 x i32>, <8 x i32> addrspace(201)* %119, align 32, !tbaa !33, !llvm.access.group !39
+ %121 = load i32, i32* %5, align 4, !tbaa !3, !llvm.access.group !39
+ %122 = call noundef <8 x i32> @_ZN10embeddings18RadixSortIterationIiLi4EE9GetDigitsEDv8_ii(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %32, <8 x i32> noundef %120, i32 noundef %121) #22, !llvm.access.group !39
+ store <8 x i32> %122, <8 x i32>* %15, align 32, !tbaa !33, !llvm.access.group !39
+ %123 = bitcast %"struct.embeddings::UniqueResult"* %16 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 96, i8* %123) #3, !llvm.access.group !39
+ %124 = load <8 x i32>, <8 x i32>* %15, align 32, !tbaa !33, !llvm.access.group !39
+ call void @_ZN10embeddings12VectorUniqueIDv8_iEENS_12UniqueResultES1_T_(%"struct.embeddings::UniqueResult"* sret(%"struct.embeddings::UniqueResult") align 32 %16, <8 x i32> noundef <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> noundef %124) #22, !llvm.access.group !39
+ %125 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %16, i32 0, i32 1
+ %126 = load <8 x i32>, <8 x i32>* %125, align 32, !tbaa !33, !llvm.access.group !39
+ %127 = trunc <8 x i32> %126 to <8 x i1>
+ %128 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %6) #22, !llvm.access.group !39
+ %129 = bitcast %"class.embeddings::TileSpmemVectorArray"* %6 to %"class.embeddings::BaseArray"*
+ %130 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %129) #22, !llvm.access.group !39
+ %131 = mul nsw i32 2, %130
+ %132 = sdiv i32 %131, 8
+ %133 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %128, i32 %132
+ %134 = load <8 x i32>, <8 x i32>* %15, align 32, !tbaa !33, !llvm.access.group !39
+ %135 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %16, i32 0, i32 2
+ %136 = load <8 x i32>, <8 x i32>* %135, align 32, !tbaa !33, !llvm.access.group !39
+ call void @llvm.tpu.vst.msk.idx.add.p201v8i32.v8i32(<8 x i1> %127, <8 x i32> addrspace(201)* %133, <8 x i32> %134, <8 x i32> %136), !llvm.access.group !39
+ %137 = bitcast %"struct.embeddings::UniqueResult"* %16 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 96, i8* %137) #3, !llvm.access.group !39
+ %138 = bitcast <8 x i32>* %15 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %138) #3, !llvm.access.group !39
+ %139 = bitcast i32* %14 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %139) #3, !llvm.access.group !39
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !39
+ %140 = bitcast i32* %17 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %140) #3, !llvm.access.group !39
+ %141 = load i32, i32* %7, align 4, !tbaa !3, !llvm.access.group !39
+ %142 = add nsw i32 %141, 0
+ store i32 %142, i32* %17, align 4, !tbaa !3, !llvm.access.group !39
+ %143 = bitcast <8 x i32>* %18 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %143) #3, !llvm.access.group !39
+ %144 = bitcast %"class.embeddings::TileSpmemVectorArray"* %2 to %"class.embeddings::BaseArray"*
+ %145 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %144) #22, !llvm.access.group !39
+ %146 = mul nsw i32 3, %145
+ %147 = sdiv i32 %146, 8
+ %148 = load i32, i32* %17, align 4, !tbaa !3, !llvm.access.group !39
+ %149 = add nsw i32 %147, %148
+ %150 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %2, i32 noundef %149) #22, !llvm.access.group !39
+ %151 = load <8 x i32>, <8 x i32> addrspace(201)* %150, align 32, !tbaa !33, !llvm.access.group !39
+ %152 = load i32, i32* %5, align 4, !tbaa !3, !llvm.access.group !39
+ %153 = call noundef <8 x i32> @_ZN10embeddings18RadixSortIterationIiLi4EE9GetDigitsEDv8_ii(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %32, <8 x i32> noundef %151, i32 noundef %152) #22, !llvm.access.group !39
+ store <8 x i32> %153, <8 x i32>* %18, align 32, !tbaa !33, !llvm.access.group !39
+ %154 = bitcast %"struct.embeddings::UniqueResult"* %19 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 96, i8* %154) #3, !llvm.access.group !39
+ %155 = load <8 x i32>, <8 x i32>* %18, align 32, !tbaa !33, !llvm.access.group !39
+ call void @_ZN10embeddings12VectorUniqueIDv8_iEENS_12UniqueResultES1_T_(%"struct.embeddings::UniqueResult"* sret(%"struct.embeddings::UniqueResult") align 32 %19, <8 x i32> noundef <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> noundef %155) #22, !llvm.access.group !39
+ %156 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %19, i32 0, i32 1
+ %157 = load <8 x i32>, <8 x i32>* %156, align 32, !tbaa !33, !llvm.access.group !39
+ %158 = trunc <8 x i32> %157 to <8 x i1>
+ %159 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %6) #22, !llvm.access.group !39
+ %160 = bitcast %"class.embeddings::TileSpmemVectorArray"* %6 to %"class.embeddings::BaseArray"*
+ %161 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %160) #22, !llvm.access.group !39
+ %162 = mul nsw i32 3, %161
+ %163 = sdiv i32 %162, 8
+ %164 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %159, i32 %163
+ %165 = load <8 x i32>, <8 x i32>* %18, align 32, !tbaa !33, !llvm.access.group !39
+ %166 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %19, i32 0, i32 2
+ %167 = load <8 x i32>, <8 x i32>* %166, align 32, !tbaa !33, !llvm.access.group !39
+ call void @llvm.tpu.vst.msk.idx.add.p201v8i32.v8i32(<8 x i1> %158, <8 x i32> addrspace(201)* %164, <8 x i32> %165, <8 x i32> %167), !llvm.access.group !39
+ %168 = bitcast %"struct.embeddings::UniqueResult"* %19 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 96, i8* %168) #3, !llvm.access.group !39
+ %169 = bitcast <8 x i32>* %18 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %169) #3, !llvm.access.group !39
+ %170 = bitcast i32* %17 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %170) #3, !llvm.access.group !39
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !39
+ %171 = bitcast i32* %20 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %171) #3, !llvm.access.group !39
+ %172 = load i32, i32* %7, align 4, !tbaa !3, !llvm.access.group !39
+ %173 = add nsw i32 %172, 0
+ store i32 %173, i32* %20, align 4, !tbaa !3, !llvm.access.group !39
+ %174 = bitcast <8 x i32>* %21 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %174) #3, !llvm.access.group !39
+ %175 = bitcast %"class.embeddings::TileSpmemVectorArray"* %2 to %"class.embeddings::BaseArray"*
+ %176 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %175) #22, !llvm.access.group !39
+ %177 = mul nsw i32 4, %176
+ %178 = sdiv i32 %177, 8
+ %179 = load i32, i32* %20, align 4, !tbaa !3, !llvm.access.group !39
+ %180 = add nsw i32 %178, %179
+ %181 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %2, i32 noundef %180) #22, !llvm.access.group !39
+ %182 = load <8 x i32>, <8 x i32> addrspace(201)* %181, align 32, !tbaa !33, !llvm.access.group !39
+ %183 = load i32, i32* %5, align 4, !tbaa !3, !llvm.access.group !39
+ %184 = call noundef <8 x i32> @_ZN10embeddings18RadixSortIterationIiLi4EE9GetDigitsEDv8_ii(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %32, <8 x i32> noundef %182, i32 noundef %183) #22, !llvm.access.group !39
+ store <8 x i32> %184, <8 x i32>* %21, align 32, !tbaa !33, !llvm.access.group !39
+ %185 = bitcast %"struct.embeddings::UniqueResult"* %22 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 96, i8* %185) #3, !llvm.access.group !39
+ %186 = load <8 x i32>, <8 x i32>* %21, align 32, !tbaa !33, !llvm.access.group !39
+ call void @_ZN10embeddings12VectorUniqueIDv8_iEENS_12UniqueResultES1_T_(%"struct.embeddings::UniqueResult"* sret(%"struct.embeddings::UniqueResult") align 32 %22, <8 x i32> noundef <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> noundef %186) #22, !llvm.access.group !39
+ %187 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %22, i32 0, i32 1
+ %188 = load <8 x i32>, <8 x i32>* %187, align 32, !tbaa !33, !llvm.access.group !39
+ %189 = trunc <8 x i32> %188 to <8 x i1>
+ %190 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %6) #22, !llvm.access.group !39
+ %191 = bitcast %"class.embeddings::TileSpmemVectorArray"* %6 to %"class.embeddings::BaseArray"*
+ %192 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %191) #22, !llvm.access.group !39
+ %193 = mul nsw i32 4, %192
+ %194 = sdiv i32 %193, 8
+ %195 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %190, i32 %194
+ %196 = load <8 x i32>, <8 x i32>* %21, align 32, !tbaa !33, !llvm.access.group !39
+ %197 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %22, i32 0, i32 2
+ %198 = load <8 x i32>, <8 x i32>* %197, align 32, !tbaa !33, !llvm.access.group !39
+ call void @llvm.tpu.vst.msk.idx.add.p201v8i32.v8i32(<8 x i1> %189, <8 x i32> addrspace(201)* %195, <8 x i32> %196, <8 x i32> %198), !llvm.access.group !39
+ %199 = bitcast %"struct.embeddings::UniqueResult"* %22 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 96, i8* %199) #3, !llvm.access.group !39
+ %200 = bitcast <8 x i32>* %21 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %200) #3, !llvm.access.group !39
+ %201 = bitcast i32* %20 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %201) #3, !llvm.access.group !39
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !39
+ %202 = bitcast i32* %23 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %202) #3, !llvm.access.group !39
+ %203 = load i32, i32* %7, align 4, !tbaa !3, !llvm.access.group !39
+ %204 = add nsw i32 %203, 0
+ store i32 %204, i32* %23, align 4, !tbaa !3, !llvm.access.group !39
+ %205 = bitcast <8 x i32>* %24 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %205) #3, !llvm.access.group !39
+ %206 = bitcast %"class.embeddings::TileSpmemVectorArray"* %2 to %"class.embeddings::BaseArray"*
+ %207 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %206) #22, !llvm.access.group !39
+ %208 = mul nsw i32 5, %207
+ %209 = sdiv i32 %208, 8
+ %210 = load i32, i32* %23, align 4, !tbaa !3, !llvm.access.group !39
+ %211 = add nsw i32 %209, %210
+ %212 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %2, i32 noundef %211) #22, !llvm.access.group !39
+ %213 = load <8 x i32>, <8 x i32> addrspace(201)* %212, align 32, !tbaa !33, !llvm.access.group !39
+ %214 = load i32, i32* %5, align 4, !tbaa !3, !llvm.access.group !39
+ %215 = call noundef <8 x i32> @_ZN10embeddings18RadixSortIterationIiLi4EE9GetDigitsEDv8_ii(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %32, <8 x i32> noundef %213, i32 noundef %214) #22, !llvm.access.group !39
+ store <8 x i32> %215, <8 x i32>* %24, align 32, !tbaa !33, !llvm.access.group !39
+ %216 = bitcast %"struct.embeddings::UniqueResult"* %25 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 96, i8* %216) #3, !llvm.access.group !39
+ %217 = load <8 x i32>, <8 x i32>* %24, align 32, !tbaa !33, !llvm.access.group !39
+ call void @_ZN10embeddings12VectorUniqueIDv8_iEENS_12UniqueResultES1_T_(%"struct.embeddings::UniqueResult"* sret(%"struct.embeddings::UniqueResult") align 32 %25, <8 x i32> noundef <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> noundef %217) #22, !llvm.access.group !39
+ %218 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %25, i32 0, i32 1
+ %219 = load <8 x i32>, <8 x i32>* %218, align 32, !tbaa !33, !llvm.access.group !39
+ %220 = trunc <8 x i32> %219 to <8 x i1>
+ %221 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %6) #22, !llvm.access.group !39
+ %222 = bitcast %"class.embeddings::TileSpmemVectorArray"* %6 to %"class.embeddings::BaseArray"*
+ %223 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %222) #22, !llvm.access.group !39
+ %224 = mul nsw i32 5, %223
+ %225 = sdiv i32 %224, 8
+ %226 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %221, i32 %225
+ %227 = load <8 x i32>, <8 x i32>* %24, align 32, !tbaa !33, !llvm.access.group !39
+ %228 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %25, i32 0, i32 2
+ %229 = load <8 x i32>, <8 x i32>* %228, align 32, !tbaa !33, !llvm.access.group !39
+ call void @llvm.tpu.vst.msk.idx.add.p201v8i32.v8i32(<8 x i1> %220, <8 x i32> addrspace(201)* %226, <8 x i32> %227, <8 x i32> %229), !llvm.access.group !39
+ %230 = bitcast %"struct.embeddings::UniqueResult"* %25 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 96, i8* %230) #3, !llvm.access.group !39
+ %231 = bitcast <8 x i32>* %24 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %231) #3, !llvm.access.group !39
+ %232 = bitcast i32* %23 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %232) #3, !llvm.access.group !39
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !39
+ %233 = bitcast i32* %26 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %233) #3, !llvm.access.group !39
+ %234 = load i32, i32* %7, align 4, !tbaa !3, !llvm.access.group !39
+ %235 = add nsw i32 %234, 0
+ store i32 %235, i32* %26, align 4, !tbaa !3, !llvm.access.group !39
+ %236 = bitcast <8 x i32>* %27 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %236) #3, !llvm.access.group !39
+ %237 = bitcast %"class.embeddings::TileSpmemVectorArray"* %2 to %"class.embeddings::BaseArray"*
+ %238 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %237) #22, !llvm.access.group !39
+ %239 = mul nsw i32 6, %238
+ %240 = sdiv i32 %239, 8
+ %241 = load i32, i32* %26, align 4, !tbaa !3, !llvm.access.group !39
+ %242 = add nsw i32 %240, %241
+ %243 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %2, i32 noundef %242) #22, !llvm.access.group !39
+ %244 = load <8 x i32>, <8 x i32> addrspace(201)* %243, align 32, !tbaa !33, !llvm.access.group !39
+ %245 = load i32, i32* %5, align 4, !tbaa !3, !llvm.access.group !39
+ %246 = call noundef <8 x i32> @_ZN10embeddings18RadixSortIterationIiLi4EE9GetDigitsEDv8_ii(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %32, <8 x i32> noundef %244, i32 noundef %245) #22, !llvm.access.group !39
+ store <8 x i32> %246, <8 x i32>* %27, align 32, !tbaa !33, !llvm.access.group !39
+ %247 = bitcast %"struct.embeddings::UniqueResult"* %28 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 96, i8* %247) #3, !llvm.access.group !39
+ %248 = load <8 x i32>, <8 x i32>* %27, align 32, !tbaa !33, !llvm.access.group !39
+ call void @_ZN10embeddings12VectorUniqueIDv8_iEENS_12UniqueResultES1_T_(%"struct.embeddings::UniqueResult"* sret(%"struct.embeddings::UniqueResult") align 32 %28, <8 x i32> noundef <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> noundef %248) #22, !llvm.access.group !39
+ %249 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %28, i32 0, i32 1
+ %250 = load <8 x i32>, <8 x i32>* %249, align 32, !tbaa !33, !llvm.access.group !39
+ %251 = trunc <8 x i32> %250 to <8 x i1>
+ %252 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %6) #22, !llvm.access.group !39
+ %253 = bitcast %"class.embeddings::TileSpmemVectorArray"* %6 to %"class.embeddings::BaseArray"*
+ %254 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %253) #22, !llvm.access.group !39
+ %255 = mul nsw i32 6, %254
+ %256 = sdiv i32 %255, 8
+ %257 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %252, i32 %256
+ %258 = load <8 x i32>, <8 x i32>* %27, align 32, !tbaa !33, !llvm.access.group !39
+ %259 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %28, i32 0, i32 2
+ %260 = load <8 x i32>, <8 x i32>* %259, align 32, !tbaa !33, !llvm.access.group !39
+ call void @llvm.tpu.vst.msk.idx.add.p201v8i32.v8i32(<8 x i1> %251, <8 x i32> addrspace(201)* %257, <8 x i32> %258, <8 x i32> %260), !llvm.access.group !39
+ %261 = bitcast %"struct.embeddings::UniqueResult"* %28 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 96, i8* %261) #3, !llvm.access.group !39
+ %262 = bitcast <8 x i32>* %27 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %262) #3, !llvm.access.group !39
+ %263 = bitcast i32* %26 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %263) #3, !llvm.access.group !39
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !39
+ %264 = bitcast i32* %29 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %264) #3, !llvm.access.group !39
+ %265 = load i32, i32* %7, align 4, !tbaa !3, !llvm.access.group !39
+ %266 = add nsw i32 %265, 0
+ store i32 %266, i32* %29, align 4, !tbaa !3, !llvm.access.group !39
+ %267 = bitcast <8 x i32>* %30 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %267) #3, !llvm.access.group !39
+ %268 = bitcast %"class.embeddings::TileSpmemVectorArray"* %2 to %"class.embeddings::BaseArray"*
+ %269 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %268) #22, !llvm.access.group !39
+ %270 = mul nsw i32 7, %269
+ %271 = sdiv i32 %270, 8
+ %272 = load i32, i32* %29, align 4, !tbaa !3, !llvm.access.group !39
+ %273 = add nsw i32 %271, %272
+ %274 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %2, i32 noundef %273) #22, !llvm.access.group !39
+ %275 = load <8 x i32>, <8 x i32> addrspace(201)* %274, align 32, !tbaa !33, !llvm.access.group !39
+ %276 = load i32, i32* %5, align 4, !tbaa !3, !llvm.access.group !39
+ %277 = call noundef <8 x i32> @_ZN10embeddings18RadixSortIterationIiLi4EE9GetDigitsEDv8_ii(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %32, <8 x i32> noundef %275, i32 noundef %276) #22, !llvm.access.group !39
+ store <8 x i32> %277, <8 x i32>* %30, align 32, !tbaa !33, !llvm.access.group !39
+ %278 = bitcast %"struct.embeddings::UniqueResult"* %31 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 96, i8* %278) #3, !llvm.access.group !39
+ %279 = load <8 x i32>, <8 x i32>* %30, align 32, !tbaa !33, !llvm.access.group !39
+ call void @_ZN10embeddings12VectorUniqueIDv8_iEENS_12UniqueResultES1_T_(%"struct.embeddings::UniqueResult"* sret(%"struct.embeddings::UniqueResult") align 32 %31, <8 x i32> noundef <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> noundef %279) #22, !llvm.access.group !39
+ %280 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %31, i32 0, i32 1
+ %281 = load <8 x i32>, <8 x i32>* %280, align 32, !tbaa !33, !llvm.access.group !39
+ %282 = trunc <8 x i32> %281 to <8 x i1>
+ %283 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %6) #22, !llvm.access.group !39
+ %284 = bitcast %"class.embeddings::TileSpmemVectorArray"* %6 to %"class.embeddings::BaseArray"*
+ %285 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %284) #22, !llvm.access.group !39
+ %286 = mul nsw i32 7, %285
+ %287 = sdiv i32 %286, 8
+ %288 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %283, i32 %287
+ %289 = load <8 x i32>, <8 x i32>* %30, align 32, !tbaa !33, !llvm.access.group !39
+ %290 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %31, i32 0, i32 2
+ %291 = load <8 x i32>, <8 x i32>* %290, align 32, !tbaa !33, !llvm.access.group !39
+ call void @llvm.tpu.vst.msk.idx.add.p201v8i32.v8i32(<8 x i1> %282, <8 x i32> addrspace(201)* %288, <8 x i32> %289, <8 x i32> %291), !llvm.access.group !39
+ %292 = bitcast %"struct.embeddings::UniqueResult"* %31 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 96, i8* %292) #3, !llvm.access.group !39
+ %293 = bitcast <8 x i32>* %30 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %293) #3, !llvm.access.group !39
+ %294 = bitcast i32* %29 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %294) #3, !llvm.access.group !39
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !39
+ br label %295
+
+295: ; preds = %46
+ %296 = load i32, i32* %7, align 4, !tbaa !3, !llvm.access.group !39
+ %297 = add nsw i32 %296, 1
+ store i32 %297, i32* %7, align 4, !tbaa !3, !llvm.access.group !39
+ br label %38, !llvm.loop !40
+
+298: ; preds = %44
+ %299 = bitcast %"class.embeddings::TileSpmemVectorArray"* %6 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %299) #3
+ ret void
+}
+
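+; Editor note (a reading of the IR below, not an authoritative description):
+; the next helper is embeddings::Cast<TileSpmemVectorArray<int>, ScratchpadArray>;
+; as emitted it simply reloads the ScratchpadArray pointer and reinterprets it
+; as a TileSpmemVectorArray pointer via bitcast, with no runtime check in this IR.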
+; Function Attrs: mustprogress nounwind
+define linkonce_odr dso_local noundef %"class.embeddings::TileSpmemVectorArray"* @_ZN10embeddings4CastINS_20TileSpmemVectorArrayIiEENS_15ScratchpadArrayEEENS_15cast_retty_implIT_T0_E8ret_typeERKS6_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %0) #6 comdat {
+ %2 = alloca %"class.embeddings::ScratchpadArray"*, align 4
+ store %"class.embeddings::ScratchpadArray"* %0, %"class.embeddings::ScratchpadArray"** %2, align 4, !tbaa !7
+ %3 = load %"class.embeddings::ScratchpadArray"*, %"class.embeddings::ScratchpadArray"** %2, align 4, !tbaa !7
+ %4 = bitcast %"class.embeddings::ScratchpadArray"* %3 to %"class.embeddings::TileSpmemVectorArray"*
+ ret %"class.embeddings::TileSpmemVectorArray"* %4
+}
+
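+; Editor note (inferred from the mangled names and intrinsics below; hedged, not
+; authoritative): RadixSortIteration<int, 4>::ScanBuckets() builds a transposed
+; lane index from llvm.tpu.vlaneseq ((lane % 8) * num_buckets + lane / 8), then
+; for each bucket column gathers 8 counts with llvm.tpu.vld.msk.idx, runs
+; embeddings::VectorAddScan over them, adds the carry broadcast from lane 7 of
+; the running total, subtracts each element's originally loaded value, and
+; scatters the result back with llvm.tpu.vst.msk.idx, i.e. it rewrites the
+; bucket table in place as running offsets.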
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local void @_ZN10embeddings18RadixSortIterationIiLi4EE11ScanBucketsEv(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %0) #14 comdat align 2 {
+ %2 = alloca %"class.embeddings::RadixSortIteration"*, align 4
+ %3 = alloca %"class.embeddings::TileSpmemVectorArray", align 4
+ %4 = alloca <8 x i32>, align 32
+ %5 = alloca i32, align 4
+ %6 = alloca <8 x i32>, align 32
+ %7 = alloca <8 x i32>, align 32
+ %8 = alloca <8 x i32>, align 32
+ %9 = alloca <8 x i32>, align 32
+ %10 = alloca i32, align 4
+ %11 = alloca <8 x i32>, align 32
+ %12 = alloca %"struct.embeddings::AddScanResult", align 32
+ %13 = alloca <8 x i32>, align 32
+ store %"class.embeddings::RadixSortIteration"* %0, %"class.embeddings::RadixSortIteration"** %2, align 4, !tbaa !7
+ %14 = load %"class.embeddings::RadixSortIteration"*, %"class.embeddings::RadixSortIteration"** %2, align 4
+ %15 = bitcast %"class.embeddings::TileSpmemVectorArray"* %3 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %15) #3
+ %16 = getelementptr inbounds %"class.embeddings::RadixSortIteration", %"class.embeddings::RadixSortIteration"* %14, i32 0, i32 1
+ %17 = load %"class.embeddings::ScratchpadArray"*, %"class.embeddings::ScratchpadArray"** %16, align 4, !tbaa !37
+ %18 = call noundef %"class.embeddings::TileSpmemVectorArray"* @_ZN10embeddings4CastINS_20TileSpmemVectorArrayIiEENS_15ScratchpadArrayEEENS_15cast_retty_implIT_T0_E8ret_typeERKS6_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %17) #22
+ call void @_ZN10embeddings20TileSpmemVectorArrayIiEC2ERKS1_(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %3, %"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %18) #22
+ %19 = bitcast <8 x i32>* %4 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %19) #3
+ store <8 x i32> zeroinitializer, <8 x i32>* %4, align 32, !tbaa !33
+ %20 = bitcast i32* %5 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %20) #3
+ store i32 8, i32* %5, align 4, !tbaa !3
+ %21 = bitcast <8 x i32>* %6 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %21) #3
+ %22 = call <8 x i32> @llvm.tpu.vlaneseq.v8i32()
+ store <8 x i32> %22, <8 x i32>* %6, align 32, !tbaa !33
+ %23 = bitcast <8 x i32>* %7 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %23) #3
+ %24 = load <8 x i32>, <8 x i32>* %6, align 32, !tbaa !33
+ %25 = srem <8 x i32> %24, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+ store <8 x i32> %25, <8 x i32>* %7, align 32, !tbaa !33
+ %26 = bitcast <8 x i32>* %8 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %26) #3
+ %27 = load <8 x i32>, <8 x i32>* %6, align 32, !tbaa !33
+ %28 = sdiv <8 x i32> %27, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+ store <8 x i32> %28, <8 x i32>* %8, align 32, !tbaa !33
+ %29 = bitcast <8 x i32>* %9 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %29) #3
+ %30 = load <8 x i32>, <8 x i32>* %7, align 32, !tbaa !33
+ %31 = getelementptr inbounds %"class.embeddings::RadixSortIteration", %"class.embeddings::RadixSortIteration"* %14, i32 0, i32 0
+ %32 = load i32, i32* %31, align 4, !tbaa !35
+ %33 = insertelement <8 x i32> poison, i32 %32, i32 0
+ %34 = shufflevector <8 x i32> %33, <8 x i32> poison, <8 x i32> zeroinitializer
+ %35 = mul <8 x i32> %30, %34
+ %36 = load <8 x i32>, <8 x i32>* %8, align 32, !tbaa !33
+ %37 = add <8 x i32> %35, %36
+ store <8 x i32> %37, <8 x i32>* %9, align 32, !tbaa !33
+ %38 = bitcast i32* %10 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %38) #3
+ store i32 0, i32* %10, align 4, !tbaa !3
+ br label %39
+
+39: ; preds = %78, %1
+ %40 = load i32, i32* %10, align 4, !tbaa !3
+ %41 = getelementptr inbounds %"class.embeddings::RadixSortIteration", %"class.embeddings::RadixSortIteration"* %14, i32 0, i32 0
+ %42 = load i32, i32* %41, align 4, !tbaa !35
+ %43 = icmp slt i32 %40, %42
+ br i1 %43, label %46, label %44
+
+44: ; preds = %39
+ %45 = bitcast i32* %10 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %45) #3
+ br label %81
+
+46: ; preds = %39
+ %47 = bitcast <8 x i32>* %11 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %47) #3
+ %48 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %3) #22
+ %49 = load <8 x i32>, <8 x i32>* %9, align 32, !tbaa !33
+ %50 = load i32, i32* %10, align 4, !tbaa !3
+ %51 = insertelement <8 x i32> poison, i32 %50, i32 0
+ %52 = shufflevector <8 x i32> %51, <8 x i32> poison, <8 x i32> zeroinitializer
+ %53 = add <8 x i32> %49, %52
+ %54 = call <8 x i32> @llvm.tpu.vld.msk.idx.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %48, <8 x i32> %53)
+ store <8 x i32> %54, <8 x i32>* %11, align 32, !tbaa !33
+ %55 = bitcast %"struct.embeddings::AddScanResult"* %12 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 64, i8* %55) #3
+ %56 = load <8 x i32>, <8 x i32>* %11, align 32, !tbaa !33
+ call void @_ZN10embeddings13VectorAddScanIDv8_iEENS_13AddScanResultIT_EES1_S3_(%"struct.embeddings::AddScanResult"* sret(%"struct.embeddings::AddScanResult") align 32 %12, <8 x i32> noundef <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> noundef %56) #22
+ %57 = getelementptr inbounds %"struct.embeddings::AddScanResult", %"struct.embeddings::AddScanResult"* %12, i32 0, i32 1
+ %58 = load <8 x i32>, <8 x i32>* %57, align 32, !tbaa !33
+ %59 = load <8 x i32>, <8 x i32>* %4, align 32, !tbaa !33
+ %60 = extractelement <8 x i32> %59, i32 7
+ %61 = insertelement <8 x i32> poison, i32 %60, i32 0
+ %62 = shufflevector <8 x i32> %61, <8 x i32> poison, <8 x i32> zeroinitializer
+ %63 = add <8 x i32> %58, %62
+ store <8 x i32> %63, <8 x i32>* %4, align 32, !tbaa !33
+ %64 = bitcast <8 x i32>* %13 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %64) #3
+ %65 = load <8 x i32>, <8 x i32>* %4, align 32, !tbaa !33
+ %66 = load <8 x i32>, <8 x i32>* %11, align 32, !tbaa !33
+ %67 = sub <8 x i32> %65, %66
+ store <8 x i32> %67, <8 x i32>* %13, align 32, !tbaa !33
+ %68 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %3) #22
+ %69 = load <8 x i32>, <8 x i32>* %9, align 32, !tbaa !33
+ %70 = load i32, i32* %10, align 4, !tbaa !3
+ %71 = insertelement <8 x i32> poison, i32 %70, i32 0
+ %72 = shufflevector <8 x i32> %71, <8 x i32> poison, <8 x i32> zeroinitializer
+ %73 = add <8 x i32> %69, %72
+ %74 = load <8 x i32>, <8 x i32>* %13, align 32, !tbaa !33
+ call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %68, <8 x i32> %73, <8 x i32> %74)
+ %75 = bitcast <8 x i32>* %13 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %75) #3
+ %76 = bitcast %"struct.embeddings::AddScanResult"* %12 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 64, i8* %76) #3
+ %77 = bitcast <8 x i32>* %11 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %77) #3
+ br label %78
+
+78: ; preds = %46
+ %79 = load i32, i32* %10, align 4, !tbaa !3
+ %80 = add nsw i32 %79, 1
+ store i32 %80, i32* %10, align 4, !tbaa !3
+ br label %39, !llvm.loop !46
+
+81: ; preds = %44
+ %82 = bitcast <8 x i32>* %9 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %82) #3
+ %83 = bitcast <8 x i32>* %8 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %83) #3
+ %84 = bitcast <8 x i32>* %7 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %84) #3
+ %85 = bitcast <8 x i32>* %6 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %85) #3
+ %86 = bitcast i32* %5 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %86) #3
+ %87 = bitcast <8 x i32>* %4 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %87) #3
+ %88 = bitcast %"class.embeddings::TileSpmemVectorArray"* %3 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %88) #3
+ ret void
+}
+
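+; Editor note (inferred from the mangled names and intrinsics below; hedged, not
+; authoritative): RadixSortIteration<int, 4>::RankAndPermute<int>() first calls
+; DecrementBuckets(), then iterates over the two input arrays (presumably keys
+; and payloads) in unrolled chunks delimited by llvm.tpu.loop.parallel(). Each
+; chunk extracts digits with GetDigits, derives a lane mask and per-lane offsets
+; from VectorUnique, updates the bucket table with the returning masked
+; scatter-add llvm.tpu.vst.msk.idx.ret.add.np, and uses returned-base + offset
+; as the destination indices when scattering both input vectors into the two
+; output arrays with llvm.tpu.vst.msk.idx.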
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local void @_ZN10embeddings18RadixSortIterationIiLi4EE14RankAndPermuteIiEEvibNS_20TileSpmemVectorArrayIiEENS3_IT_EEPS4_PS6_(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %0, i32 noundef %1, i1 noundef zeroext %2, %"class.embeddings::TileSpmemVectorArray"* noundef %3, %"class.embeddings::TileSpmemVectorArray"* noundef %4, %"class.embeddings::TileSpmemVectorArray"* noundef %5, %"class.embeddings::TileSpmemVectorArray"* noundef %6) #14 comdat align 2 {
+ %8 = alloca %"class.embeddings::RadixSortIteration"*, align 4
+ %9 = alloca i32, align 4
+ %10 = alloca i8, align 1
+ %11 = alloca %"class.embeddings::TileSpmemVectorArray"*, align 4
+ %12 = alloca %"class.embeddings::TileSpmemVectorArray"*, align 4
+ %13 = alloca %"class.embeddings::TileSpmemVectorArray", align 4
+ %14 = alloca i32, align 4
+ %15 = alloca i32, align 4
+ %16 = alloca <8 x i32>, align 32
+ %17 = alloca <8 x i32>, align 32
+ %18 = alloca <8 x i32>, align 32
+ %19 = alloca %"struct.embeddings::UniqueResult", align 32
+ %20 = alloca <8 x i32>, align 32
+ %21 = alloca <8 x i32>, align 32
+ %22 = alloca i32, align 4
+ %23 = alloca <8 x i32>, align 32
+ %24 = alloca <8 x i32>, align 32
+ %25 = alloca <8 x i32>, align 32
+ %26 = alloca %"struct.embeddings::UniqueResult", align 32
+ %27 = alloca <8 x i32>, align 32
+ %28 = alloca <8 x i32>, align 32
+ %29 = alloca i32, align 4
+ %30 = alloca <8 x i32>, align 32
+ %31 = alloca <8 x i32>, align 32
+ %32 = alloca <8 x i32>, align 32
+ %33 = alloca %"struct.embeddings::UniqueResult", align 32
+ %34 = alloca <8 x i32>, align 32
+ %35 = alloca <8 x i32>, align 32
+ %36 = alloca i32, align 4
+ %37 = alloca <8 x i32>, align 32
+ %38 = alloca <8 x i32>, align 32
+ %39 = alloca <8 x i32>, align 32
+ %40 = alloca %"struct.embeddings::UniqueResult", align 32
+ %41 = alloca <8 x i32>, align 32
+ %42 = alloca <8 x i32>, align 32
+ %43 = alloca i32, align 4
+ %44 = alloca <8 x i32>, align 32
+ %45 = alloca <8 x i32>, align 32
+ %46 = alloca <8 x i32>, align 32
+ %47 = alloca %"struct.embeddings::UniqueResult", align 32
+ %48 = alloca <8 x i32>, align 32
+ %49 = alloca <8 x i32>, align 32
+ %50 = alloca i32, align 4
+ %51 = alloca <8 x i32>, align 32
+ %52 = alloca <8 x i32>, align 32
+ %53 = alloca <8 x i32>, align 32
+ %54 = alloca %"struct.embeddings::UniqueResult", align 32
+ %55 = alloca <8 x i32>, align 32
+ %56 = alloca <8 x i32>, align 32
+ %57 = alloca i32, align 4
+ %58 = alloca <8 x i32>, align 32
+ %59 = alloca <8 x i32>, align 32
+ %60 = alloca <8 x i32>, align 32
+ %61 = alloca %"struct.embeddings::UniqueResult", align 32
+ %62 = alloca <8 x i32>, align 32
+ %63 = alloca <8 x i32>, align 32
+ %64 = alloca i32, align 4
+ %65 = alloca <8 x i32>, align 32
+ %66 = alloca <8 x i32>, align 32
+ %67 = alloca <8 x i32>, align 32
+ %68 = alloca %"struct.embeddings::UniqueResult", align 32
+ %69 = alloca <8 x i32>, align 32
+ %70 = alloca <8 x i32>, align 32
+ store %"class.embeddings::RadixSortIteration"* %0, %"class.embeddings::RadixSortIteration"** %8, align 4, !tbaa !7
+ store i32 %1, i32* %9, align 4, !tbaa !3
+ %71 = zext i1 %2 to i8
+ store i8 %71, i8* %10, align 1, !tbaa !47
+ store %"class.embeddings::TileSpmemVectorArray"* %5, %"class.embeddings::TileSpmemVectorArray"** %11, align 4, !tbaa !7
+ store %"class.embeddings::TileSpmemVectorArray"* %6, %"class.embeddings::TileSpmemVectorArray"** %12, align 4, !tbaa !7
+ %72 = load %"class.embeddings::RadixSortIteration"*, %"class.embeddings::RadixSortIteration"** %8, align 4
+ call void @_ZN10embeddings18RadixSortIterationIiLi4EE16DecrementBucketsEv(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %72) #22
+ %73 = bitcast %"class.embeddings::TileSpmemVectorArray"* %13 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %73) #3
+ %74 = getelementptr inbounds %"class.embeddings::RadixSortIteration", %"class.embeddings::RadixSortIteration"* %72, i32 0, i32 1
+ %75 = load %"class.embeddings::ScratchpadArray"*, %"class.embeddings::ScratchpadArray"** %74, align 4, !tbaa !37
+ %76 = call noundef %"class.embeddings::TileSpmemVectorArray"* @_ZN10embeddings4CastINS_20TileSpmemVectorArrayIiEENS_15ScratchpadArrayEEENS_15cast_retty_implIT_T0_E8ret_typeERKS6_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %75) #22
+ call void @_ZN10embeddings20TileSpmemVectorArrayIiEC2ERKS1_(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %13, %"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %76) #22
+ %77 = bitcast i32* %14 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %77) #3
+ store i32 0, i32* %14, align 4, !tbaa !3
+ br label %78
+
+78: ; preds = %583, %7
+ %79 = load i32, i32* %14, align 4, !tbaa !3, !llvm.access.group !49
+ %80 = bitcast %"class.embeddings::TileSpmemVectorArray"* %3 to %"class.embeddings::BaseArray"*
+ %81 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %80) #22, !llvm.access.group !49
+ %82 = sdiv i32 %81, 8
+ %83 = icmp slt i32 %79, %82
+ br i1 %83, label %86, label %84
+
+84: ; preds = %78
+ %85 = bitcast i32* %14 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %85) #3, !llvm.access.group !49
+ br label %586
+
+86: ; preds = %78
+ %87 = bitcast i32* %15 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %87) #3, !llvm.access.group !49
+ %88 = load i32, i32* %14, align 4, !tbaa !3, !llvm.access.group !49
+ %89 = add nsw i32 %88, 0
+ store i32 %89, i32* %15, align 4, !tbaa !3, !llvm.access.group !49
+ %90 = bitcast <8 x i32>* %16 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %90) #3, !llvm.access.group !49
+ %91 = bitcast %"class.embeddings::TileSpmemVectorArray"* %3 to %"class.embeddings::BaseArray"*
+ %92 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %91) #22, !llvm.access.group !49
+ %93 = mul nsw i32 0, %92
+ %94 = sdiv i32 %93, 8
+ %95 = load i32, i32* %15, align 4, !tbaa !3, !llvm.access.group !49
+ %96 = add nsw i32 %94, %95
+ %97 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %3, i32 noundef %96) #22, !llvm.access.group !49
+ %98 = load <8 x i32>, <8 x i32> addrspace(201)* %97, align 32, !tbaa !33, !llvm.access.group !49
+ store <8 x i32> %98, <8 x i32>* %16, align 32, !tbaa !33, !llvm.access.group !49
+ %99 = bitcast <8 x i32>* %17 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %99) #3, !llvm.access.group !49
+ %100 = bitcast %"class.embeddings::TileSpmemVectorArray"* %4 to %"class.embeddings::BaseArray"*
+ %101 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %100) #22, !llvm.access.group !49
+ %102 = mul nsw i32 0, %101
+ %103 = sdiv i32 %102, 8
+ %104 = load i32, i32* %15, align 4, !tbaa !3, !llvm.access.group !49
+ %105 = add nsw i32 %103, %104
+ %106 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %4, i32 noundef %105) #22, !llvm.access.group !49
+ %107 = load <8 x i32>, <8 x i32> addrspace(201)* %106, align 32, !tbaa !33, !llvm.access.group !49
+ store <8 x i32> %107, <8 x i32>* %17, align 32, !tbaa !33, !llvm.access.group !49
+ %108 = bitcast <8 x i32>* %18 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %108) #3, !llvm.access.group !49
+ %109 = load <8 x i32>, <8 x i32>* %16, align 32, !tbaa !33, !llvm.access.group !49
+ %110 = load i32, i32* %9, align 4, !tbaa !3, !llvm.access.group !49
+ %111 = call noundef <8 x i32> @_ZN10embeddings18RadixSortIterationIiLi4EE9GetDigitsEDv8_ii(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %72, <8 x i32> noundef %109, i32 noundef %110) #22, !llvm.access.group !49
+ store <8 x i32> %111, <8 x i32>* %18, align 32, !tbaa !33, !llvm.access.group !49
+ %112 = bitcast %"struct.embeddings::UniqueResult"* %19 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 96, i8* %112) #3, !llvm.access.group !49
+ %113 = load <8 x i32>, <8 x i32>* %18, align 32, !tbaa !33, !llvm.access.group !49
+ call void @_ZN10embeddings12VectorUniqueIDv8_iEENS_12UniqueResultES1_T_(%"struct.embeddings::UniqueResult"* sret(%"struct.embeddings::UniqueResult") align 32 %19, <8 x i32> noundef <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> noundef %113) #22, !llvm.access.group !49
+ %114 = bitcast <8 x i32>* %20 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %114) #3, !llvm.access.group !49
+ %115 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %19, i32 0, i32 2
+ %116 = load <8 x i32>, <8 x i32>* %115, align 32, !tbaa !33, !llvm.access.group !49
+ store <8 x i32> %116, <8 x i32>* %20, align 32, !tbaa !33, !llvm.access.group !49
+ %117 = bitcast <8 x i32>* %21 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %117) #3, !llvm.access.group !49
+ %118 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %19, i32 0, i32 1
+ %119 = load <8 x i32>, <8 x i32>* %118, align 32, !tbaa !33, !llvm.access.group !49
+ %120 = trunc <8 x i32> %119 to <8 x i1>
+ %121 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %13) #22, !llvm.access.group !49
+ %122 = bitcast %"class.embeddings::TileSpmemVectorArray"* %13 to %"class.embeddings::BaseArray"*
+ %123 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %122) #22, !llvm.access.group !49
+ %124 = mul nsw i32 0, %123
+ %125 = sdiv i32 %124, 8
+ %126 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %121, i32 %125
+ %127 = load <8 x i32>, <8 x i32>* %18, align 32, !tbaa !33, !llvm.access.group !49
+ %128 = load <8 x i32>, <8 x i32>* %20, align 32, !tbaa !33, !llvm.access.group !49
+ %129 = call <8 x i32> @llvm.tpu.vst.msk.idx.ret.add.np.v8i32.p201v8i32(<8 x i1> %120, <8 x i32> addrspace(201)* %126, <8 x i32> %127, <8 x i32> %128), !llvm.access.group !49
+ store <8 x i32> %129, <8 x i32>* %21, align 32, !tbaa !33, !llvm.access.group !49
+ %130 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %11, align 4, !tbaa !7, !llvm.access.group !49
+ %131 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %130) #22, !llvm.access.group !49
+ %132 = load <8 x i32>, <8 x i32>* %21, align 32, !tbaa !33, !llvm.access.group !49
+ %133 = load <8 x i32>, <8 x i32>* %20, align 32, !tbaa !33, !llvm.access.group !49
+ %134 = add <8 x i32> %132, %133
+ %135 = load <8 x i32>, <8 x i32>* %16, align 32, !tbaa !33, !llvm.access.group !49
+ call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %131, <8 x i32> %134, <8 x i32> %135), !llvm.access.group !49
+ %136 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %12, align 4, !tbaa !7, !llvm.access.group !49
+ %137 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %136) #22, !llvm.access.group !49
+ %138 = load <8 x i32>, <8 x i32>* %21, align 32, !tbaa !33, !llvm.access.group !49
+ %139 = load <8 x i32>, <8 x i32>* %20, align 32, !tbaa !33, !llvm.access.group !49
+ %140 = add <8 x i32> %138, %139
+ %141 = load <8 x i32>, <8 x i32>* %17, align 32, !tbaa !33, !llvm.access.group !49
+ call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %137, <8 x i32> %140, <8 x i32> %141), !llvm.access.group !49
+ %142 = bitcast <8 x i32>* %21 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %142) #3, !llvm.access.group !49
+ %143 = bitcast <8 x i32>* %20 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %143) #3, !llvm.access.group !49
+ %144 = bitcast %"struct.embeddings::UniqueResult"* %19 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 96, i8* %144) #3, !llvm.access.group !49
+ %145 = bitcast <8 x i32>* %18 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %145) #3, !llvm.access.group !49
+ %146 = bitcast <8 x i32>* %17 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %146) #3, !llvm.access.group !49
+ %147 = bitcast <8 x i32>* %16 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %147) #3, !llvm.access.group !49
+ %148 = bitcast i32* %15 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %148) #3, !llvm.access.group !49
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !49
+ %149 = bitcast i32* %22 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %149) #3, !llvm.access.group !49
+ %150 = load i32, i32* %14, align 4, !tbaa !3, !llvm.access.group !49
+ %151 = add nsw i32 %150, 0
+ store i32 %151, i32* %22, align 4, !tbaa !3, !llvm.access.group !49
+ %152 = bitcast <8 x i32>* %23 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %152) #3, !llvm.access.group !49
+ %153 = bitcast %"class.embeddings::TileSpmemVectorArray"* %3 to %"class.embeddings::BaseArray"*
+ %154 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %153) #22, !llvm.access.group !49
+ %155 = mul nsw i32 1, %154
+ %156 = sdiv i32 %155, 8
+ %157 = load i32, i32* %22, align 4, !tbaa !3, !llvm.access.group !49
+ %158 = add nsw i32 %156, %157
+ %159 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %3, i32 noundef %158) #22, !llvm.access.group !49
+ %160 = load <8 x i32>, <8 x i32> addrspace(201)* %159, align 32, !tbaa !33, !llvm.access.group !49
+ store <8 x i32> %160, <8 x i32>* %23, align 32, !tbaa !33, !llvm.access.group !49
+ %161 = bitcast <8 x i32>* %24 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %161) #3, !llvm.access.group !49
+ %162 = bitcast %"class.embeddings::TileSpmemVectorArray"* %4 to %"class.embeddings::BaseArray"*
+ %163 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %162) #22, !llvm.access.group !49
+ %164 = mul nsw i32 1, %163
+ %165 = sdiv i32 %164, 8
+ %166 = load i32, i32* %22, align 4, !tbaa !3, !llvm.access.group !49
+ %167 = add nsw i32 %165, %166
+ %168 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %4, i32 noundef %167) #22, !llvm.access.group !49
+ %169 = load <8 x i32>, <8 x i32> addrspace(201)* %168, align 32, !tbaa !33, !llvm.access.group !49
+ store <8 x i32> %169, <8 x i32>* %24, align 32, !tbaa !33, !llvm.access.group !49
+ %170 = bitcast <8 x i32>* %25 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %170) #3, !llvm.access.group !49
+ %171 = load <8 x i32>, <8 x i32>* %23, align 32, !tbaa !33, !llvm.access.group !49
+ %172 = load i32, i32* %9, align 4, !tbaa !3, !llvm.access.group !49
+ %173 = call noundef <8 x i32> @_ZN10embeddings18RadixSortIterationIiLi4EE9GetDigitsEDv8_ii(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %72, <8 x i32> noundef %171, i32 noundef %172) #22, !llvm.access.group !49
+ store <8 x i32> %173, <8 x i32>* %25, align 32, !tbaa !33, !llvm.access.group !49
+ %174 = bitcast %"struct.embeddings::UniqueResult"* %26 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 96, i8* %174) #3, !llvm.access.group !49
+ %175 = load <8 x i32>, <8 x i32>* %25, align 32, !tbaa !33, !llvm.access.group !49
+ call void @_ZN10embeddings12VectorUniqueIDv8_iEENS_12UniqueResultES1_T_(%"struct.embeddings::UniqueResult"* sret(%"struct.embeddings::UniqueResult") align 32 %26, <8 x i32> noundef <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> noundef %175) #22, !llvm.access.group !49
+ %176 = bitcast <8 x i32>* %27 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %176) #3, !llvm.access.group !49
+ %177 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %26, i32 0, i32 2
+ %178 = load <8 x i32>, <8 x i32>* %177, align 32, !tbaa !33, !llvm.access.group !49
+ store <8 x i32> %178, <8 x i32>* %27, align 32, !tbaa !33, !llvm.access.group !49
+ %179 = bitcast <8 x i32>* %28 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %179) #3, !llvm.access.group !49
+ %180 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %26, i32 0, i32 1
+ %181 = load <8 x i32>, <8 x i32>* %180, align 32, !tbaa !33, !llvm.access.group !49
+ %182 = trunc <8 x i32> %181 to <8 x i1>
+ %183 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %13) #22, !llvm.access.group !49
+ %184 = bitcast %"class.embeddings::TileSpmemVectorArray"* %13 to %"class.embeddings::BaseArray"*
+ %185 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %184) #22, !llvm.access.group !49
+ %186 = mul nsw i32 1, %185
+ %187 = sdiv i32 %186, 8
+ %188 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %183, i32 %187
+ %189 = load <8 x i32>, <8 x i32>* %25, align 32, !tbaa !33, !llvm.access.group !49
+ %190 = load <8 x i32>, <8 x i32>* %27, align 32, !tbaa !33, !llvm.access.group !49
+ %191 = call <8 x i32> @llvm.tpu.vst.msk.idx.ret.add.np.v8i32.p201v8i32(<8 x i1> %182, <8 x i32> addrspace(201)* %188, <8 x i32> %189, <8 x i32> %190), !llvm.access.group !49
+ store <8 x i32> %191, <8 x i32>* %28, align 32, !tbaa !33, !llvm.access.group !49
+ %192 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %11, align 4, !tbaa !7, !llvm.access.group !49
+ %193 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %192) #22, !llvm.access.group !49
+ %194 = load <8 x i32>, <8 x i32>* %28, align 32, !tbaa !33, !llvm.access.group !49
+ %195 = load <8 x i32>, <8 x i32>* %27, align 32, !tbaa !33, !llvm.access.group !49
+ %196 = add <8 x i32> %194, %195
+ %197 = load <8 x i32>, <8 x i32>* %23, align 32, !tbaa !33, !llvm.access.group !49
+ call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %193, <8 x i32> %196, <8 x i32> %197), !llvm.access.group !49
+ %198 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %12, align 4, !tbaa !7, !llvm.access.group !49
+ %199 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %198) #22, !llvm.access.group !49
+ %200 = load <8 x i32>, <8 x i32>* %28, align 32, !tbaa !33, !llvm.access.group !49
+ %201 = load <8 x i32>, <8 x i32>* %27, align 32, !tbaa !33, !llvm.access.group !49
+ %202 = add <8 x i32> %200, %201
+ %203 = load <8 x i32>, <8 x i32>* %24, align 32, !tbaa !33, !llvm.access.group !49
+ call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %199, <8 x i32> %202, <8 x i32> %203), !llvm.access.group !49
+ %204 = bitcast <8 x i32>* %28 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %204) #3, !llvm.access.group !49
+ %205 = bitcast <8 x i32>* %27 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %205) #3, !llvm.access.group !49
+ %206 = bitcast %"struct.embeddings::UniqueResult"* %26 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 96, i8* %206) #3, !llvm.access.group !49
+ %207 = bitcast <8 x i32>* %25 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %207) #3, !llvm.access.group !49
+ %208 = bitcast <8 x i32>* %24 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %208) #3, !llvm.access.group !49
+ %209 = bitcast <8 x i32>* %23 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %209) #3, !llvm.access.group !49
+ %210 = bitcast i32* %22 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %210) #3, !llvm.access.group !49
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !49
+ %211 = bitcast i32* %29 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %211) #3, !llvm.access.group !49
+ %212 = load i32, i32* %14, align 4, !tbaa !3, !llvm.access.group !49
+ %213 = add nsw i32 %212, 0
+ store i32 %213, i32* %29, align 4, !tbaa !3, !llvm.access.group !49
+ %214 = bitcast <8 x i32>* %30 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %214) #3, !llvm.access.group !49
+ %215 = bitcast %"class.embeddings::TileSpmemVectorArray"* %3 to %"class.embeddings::BaseArray"*
+ %216 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %215) #22, !llvm.access.group !49
+ %217 = mul nsw i32 2, %216
+ %218 = sdiv i32 %217, 8
+ %219 = load i32, i32* %29, align 4, !tbaa !3, !llvm.access.group !49
+ %220 = add nsw i32 %218, %219
+ %221 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %3, i32 noundef %220) #22, !llvm.access.group !49
+ %222 = load <8 x i32>, <8 x i32> addrspace(201)* %221, align 32, !tbaa !33, !llvm.access.group !49
+ store <8 x i32> %222, <8 x i32>* %30, align 32, !tbaa !33, !llvm.access.group !49
+ %223 = bitcast <8 x i32>* %31 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %223) #3, !llvm.access.group !49
+ %224 = bitcast %"class.embeddings::TileSpmemVectorArray"* %4 to %"class.embeddings::BaseArray"*
+ %225 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %224) #22, !llvm.access.group !49
+ %226 = mul nsw i32 2, %225
+ %227 = sdiv i32 %226, 8
+ %228 = load i32, i32* %29, align 4, !tbaa !3, !llvm.access.group !49
+ %229 = add nsw i32 %227, %228
+ %230 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %4, i32 noundef %229) #22, !llvm.access.group !49
+ %231 = load <8 x i32>, <8 x i32> addrspace(201)* %230, align 32, !tbaa !33, !llvm.access.group !49
+ store <8 x i32> %231, <8 x i32>* %31, align 32, !tbaa !33, !llvm.access.group !49
+ %232 = bitcast <8 x i32>* %32 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %232) #3, !llvm.access.group !49
+ %233 = load <8 x i32>, <8 x i32>* %30, align 32, !tbaa !33, !llvm.access.group !49
+ %234 = load i32, i32* %9, align 4, !tbaa !3, !llvm.access.group !49
+ %235 = call noundef <8 x i32> @_ZN10embeddings18RadixSortIterationIiLi4EE9GetDigitsEDv8_ii(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %72, <8 x i32> noundef %233, i32 noundef %234) #22, !llvm.access.group !49
+ store <8 x i32> %235, <8 x i32>* %32, align 32, !tbaa !33, !llvm.access.group !49
+ %236 = bitcast %"struct.embeddings::UniqueResult"* %33 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 96, i8* %236) #3, !llvm.access.group !49
+ %237 = load <8 x i32>, <8 x i32>* %32, align 32, !tbaa !33, !llvm.access.group !49
+ call void @_ZN10embeddings12VectorUniqueIDv8_iEENS_12UniqueResultES1_T_(%"struct.embeddings::UniqueResult"* sret(%"struct.embeddings::UniqueResult") align 32 %33, <8 x i32> noundef <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> noundef %237) #22, !llvm.access.group !49
+ %238 = bitcast <8 x i32>* %34 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %238) #3, !llvm.access.group !49
+ %239 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %33, i32 0, i32 2
+ %240 = load <8 x i32>, <8 x i32>* %239, align 32, !tbaa !33, !llvm.access.group !49
+ store <8 x i32> %240, <8 x i32>* %34, align 32, !tbaa !33, !llvm.access.group !49
+ %241 = bitcast <8 x i32>* %35 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %241) #3, !llvm.access.group !49
+ %242 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %33, i32 0, i32 1
+ %243 = load <8 x i32>, <8 x i32>* %242, align 32, !tbaa !33, !llvm.access.group !49
+ %244 = trunc <8 x i32> %243 to <8 x i1>
+ %245 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %13) #22, !llvm.access.group !49
+ %246 = bitcast %"class.embeddings::TileSpmemVectorArray"* %13 to %"class.embeddings::BaseArray"*
+ %247 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %246) #22, !llvm.access.group !49
+ %248 = mul nsw i32 2, %247
+ %249 = sdiv i32 %248, 8
+ %250 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %245, i32 %249
+ %251 = load <8 x i32>, <8 x i32>* %32, align 32, !tbaa !33, !llvm.access.group !49
+ %252 = load <8 x i32>, <8 x i32>* %34, align 32, !tbaa !33, !llvm.access.group !49
+ %253 = call <8 x i32> @llvm.tpu.vst.msk.idx.ret.add.np.v8i32.p201v8i32(<8 x i1> %244, <8 x i32> addrspace(201)* %250, <8 x i32> %251, <8 x i32> %252), !llvm.access.group !49
+ store <8 x i32> %253, <8 x i32>* %35, align 32, !tbaa !33, !llvm.access.group !49
+ %254 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %11, align 4, !tbaa !7, !llvm.access.group !49
+ %255 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %254) #22, !llvm.access.group !49
+ %256 = load <8 x i32>, <8 x i32>* %35, align 32, !tbaa !33, !llvm.access.group !49
+ %257 = load <8 x i32>, <8 x i32>* %34, align 32, !tbaa !33, !llvm.access.group !49
+ %258 = add <8 x i32> %256, %257
+ %259 = load <8 x i32>, <8 x i32>* %30, align 32, !tbaa !33, !llvm.access.group !49
+ call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %255, <8 x i32> %258, <8 x i32> %259), !llvm.access.group !49
+ %260 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %12, align 4, !tbaa !7, !llvm.access.group !49
+ %261 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %260) #22, !llvm.access.group !49
+ %262 = load <8 x i32>, <8 x i32>* %35, align 32, !tbaa !33, !llvm.access.group !49
+ %263 = load <8 x i32>, <8 x i32>* %34, align 32, !tbaa !33, !llvm.access.group !49
+ %264 = add <8 x i32> %262, %263
+ %265 = load <8 x i32>, <8 x i32>* %31, align 32, !tbaa !33, !llvm.access.group !49
+ call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %261, <8 x i32> %264, <8 x i32> %265), !llvm.access.group !49
+ %266 = bitcast <8 x i32>* %35 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %266) #3, !llvm.access.group !49
+ %267 = bitcast <8 x i32>* %34 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %267) #3, !llvm.access.group !49
+ %268 = bitcast %"struct.embeddings::UniqueResult"* %33 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 96, i8* %268) #3, !llvm.access.group !49
+ %269 = bitcast <8 x i32>* %32 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %269) #3, !llvm.access.group !49
+ %270 = bitcast <8 x i32>* %31 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %270) #3, !llvm.access.group !49
+ %271 = bitcast <8 x i32>* %30 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %271) #3, !llvm.access.group !49
+ %272 = bitcast i32* %29 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %272) #3, !llvm.access.group !49
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !49
+ %273 = bitcast i32* %36 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %273) #3, !llvm.access.group !49
+ %274 = load i32, i32* %14, align 4, !tbaa !3, !llvm.access.group !49
+ %275 = add nsw i32 %274, 0
+ store i32 %275, i32* %36, align 4, !tbaa !3, !llvm.access.group !49
+ %276 = bitcast <8 x i32>* %37 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %276) #3, !llvm.access.group !49
+ %277 = bitcast %"class.embeddings::TileSpmemVectorArray"* %3 to %"class.embeddings::BaseArray"*
+ %278 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %277) #22, !llvm.access.group !49
+ %279 = mul nsw i32 3, %278
+ %280 = sdiv i32 %279, 8
+ %281 = load i32, i32* %36, align 4, !tbaa !3, !llvm.access.group !49
+ %282 = add nsw i32 %280, %281
+ %283 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %3, i32 noundef %282) #22, !llvm.access.group !49
+ %284 = load <8 x i32>, <8 x i32> addrspace(201)* %283, align 32, !tbaa !33, !llvm.access.group !49
+ store <8 x i32> %284, <8 x i32>* %37, align 32, !tbaa !33, !llvm.access.group !49
+ %285 = bitcast <8 x i32>* %38 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %285) #3, !llvm.access.group !49
+ %286 = bitcast %"class.embeddings::TileSpmemVectorArray"* %4 to %"class.embeddings::BaseArray"*
+ %287 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %286) #22, !llvm.access.group !49
+ %288 = mul nsw i32 3, %287
+ %289 = sdiv i32 %288, 8
+ %290 = load i32, i32* %36, align 4, !tbaa !3, !llvm.access.group !49
+ %291 = add nsw i32 %289, %290
+ %292 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %4, i32 noundef %291) #22, !llvm.access.group !49
+ %293 = load <8 x i32>, <8 x i32> addrspace(201)* %292, align 32, !tbaa !33, !llvm.access.group !49
+ store <8 x i32> %293, <8 x i32>* %38, align 32, !tbaa !33, !llvm.access.group !49
+ %294 = bitcast <8 x i32>* %39 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %294) #3, !llvm.access.group !49
+ %295 = load <8 x i32>, <8 x i32>* %37, align 32, !tbaa !33, !llvm.access.group !49
+ %296 = load i32, i32* %9, align 4, !tbaa !3, !llvm.access.group !49
+ %297 = call noundef <8 x i32> @_ZN10embeddings18RadixSortIterationIiLi4EE9GetDigitsEDv8_ii(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %72, <8 x i32> noundef %295, i32 noundef %296) #22, !llvm.access.group !49
+ store <8 x i32> %297, <8 x i32>* %39, align 32, !tbaa !33, !llvm.access.group !49
+ %298 = bitcast %"struct.embeddings::UniqueResult"* %40 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 96, i8* %298) #3, !llvm.access.group !49
+ %299 = load <8 x i32>, <8 x i32>* %39, align 32, !tbaa !33, !llvm.access.group !49
+ call void @_ZN10embeddings12VectorUniqueIDv8_iEENS_12UniqueResultES1_T_(%"struct.embeddings::UniqueResult"* sret(%"struct.embeddings::UniqueResult") align 32 %40, <8 x i32> noundef <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> noundef %299) #22, !llvm.access.group !49
+ %300 = bitcast <8 x i32>* %41 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %300) #3, !llvm.access.group !49
+ %301 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %40, i32 0, i32 2
+ %302 = load <8 x i32>, <8 x i32>* %301, align 32, !tbaa !33, !llvm.access.group !49
+ store <8 x i32> %302, <8 x i32>* %41, align 32, !tbaa !33, !llvm.access.group !49
+ %303 = bitcast <8 x i32>* %42 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %303) #3, !llvm.access.group !49
+ %304 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %40, i32 0, i32 1
+ %305 = load <8 x i32>, <8 x i32>* %304, align 32, !tbaa !33, !llvm.access.group !49
+ %306 = trunc <8 x i32> %305 to <8 x i1>
+ %307 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %13) #22, !llvm.access.group !49
+ %308 = bitcast %"class.embeddings::TileSpmemVectorArray"* %13 to %"class.embeddings::BaseArray"*
+ %309 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %308) #22, !llvm.access.group !49
+ %310 = mul nsw i32 3, %309
+ %311 = sdiv i32 %310, 8
+ %312 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %307, i32 %311
+ %313 = load <8 x i32>, <8 x i32>* %39, align 32, !tbaa !33, !llvm.access.group !49
+ %314 = load <8 x i32>, <8 x i32>* %41, align 32, !tbaa !33, !llvm.access.group !49
+ %315 = call <8 x i32> @llvm.tpu.vst.msk.idx.ret.add.np.v8i32.p201v8i32(<8 x i1> %306, <8 x i32> addrspace(201)* %312, <8 x i32> %313, <8 x i32> %314), !llvm.access.group !49
+ store <8 x i32> %315, <8 x i32>* %42, align 32, !tbaa !33, !llvm.access.group !49
+ %316 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %11, align 4, !tbaa !7, !llvm.access.group !49
+ %317 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %316) #22, !llvm.access.group !49
+ %318 = load <8 x i32>, <8 x i32>* %42, align 32, !tbaa !33, !llvm.access.group !49
+ %319 = load <8 x i32>, <8 x i32>* %41, align 32, !tbaa !33, !llvm.access.group !49
+ %320 = add <8 x i32> %318, %319
+ %321 = load <8 x i32>, <8 x i32>* %37, align 32, !tbaa !33, !llvm.access.group !49
+ call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %317, <8 x i32> %320, <8 x i32> %321), !llvm.access.group !49
+ %322 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %12, align 4, !tbaa !7, !llvm.access.group !49
+ %323 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %322) #22, !llvm.access.group !49
+ %324 = load <8 x i32>, <8 x i32>* %42, align 32, !tbaa !33, !llvm.access.group !49
+ %325 = load <8 x i32>, <8 x i32>* %41, align 32, !tbaa !33, !llvm.access.group !49
+ %326 = add <8 x i32> %324, %325
+ %327 = load <8 x i32>, <8 x i32>* %38, align 32, !tbaa !33, !llvm.access.group !49
+ call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %323, <8 x i32> %326, <8 x i32> %327), !llvm.access.group !49
+ %328 = bitcast <8 x i32>* %42 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %328) #3, !llvm.access.group !49
+ %329 = bitcast <8 x i32>* %41 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %329) #3, !llvm.access.group !49
+ %330 = bitcast %"struct.embeddings::UniqueResult"* %40 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 96, i8* %330) #3, !llvm.access.group !49
+ %331 = bitcast <8 x i32>* %39 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %331) #3, !llvm.access.group !49
+ %332 = bitcast <8 x i32>* %38 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %332) #3, !llvm.access.group !49
+ %333 = bitcast <8 x i32>* %37 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %333) #3, !llvm.access.group !49
+ %334 = bitcast i32* %36 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %334) #3, !llvm.access.group !49
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !49
+ %335 = bitcast i32* %43 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %335) #3, !llvm.access.group !49
+ %336 = load i32, i32* %14, align 4, !tbaa !3, !llvm.access.group !49
+ %337 = add nsw i32 %336, 0
+ store i32 %337, i32* %43, align 4, !tbaa !3, !llvm.access.group !49
+ %338 = bitcast <8 x i32>* %44 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %338) #3, !llvm.access.group !49
+ %339 = bitcast %"class.embeddings::TileSpmemVectorArray"* %3 to %"class.embeddings::BaseArray"*
+ %340 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %339) #22, !llvm.access.group !49
+ %341 = mul nsw i32 4, %340
+ %342 = sdiv i32 %341, 8
+ %343 = load i32, i32* %43, align 4, !tbaa !3, !llvm.access.group !49
+ %344 = add nsw i32 %342, %343
+ %345 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %3, i32 noundef %344) #22, !llvm.access.group !49
+ %346 = load <8 x i32>, <8 x i32> addrspace(201)* %345, align 32, !tbaa !33, !llvm.access.group !49
+ store <8 x i32> %346, <8 x i32>* %44, align 32, !tbaa !33, !llvm.access.group !49
+ %347 = bitcast <8 x i32>* %45 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %347) #3, !llvm.access.group !49
+ %348 = bitcast %"class.embeddings::TileSpmemVectorArray"* %4 to %"class.embeddings::BaseArray"*
+ %349 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %348) #22, !llvm.access.group !49
+ %350 = mul nsw i32 4, %349
+ %351 = sdiv i32 %350, 8
+ %352 = load i32, i32* %43, align 4, !tbaa !3, !llvm.access.group !49
+ %353 = add nsw i32 %351, %352
+ %354 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %4, i32 noundef %353) #22, !llvm.access.group !49
+ %355 = load <8 x i32>, <8 x i32> addrspace(201)* %354, align 32, !tbaa !33, !llvm.access.group !49
+ store <8 x i32> %355, <8 x i32>* %45, align 32, !tbaa !33, !llvm.access.group !49
+ %356 = bitcast <8 x i32>* %46 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %356) #3, !llvm.access.group !49
+ %357 = load <8 x i32>, <8 x i32>* %44, align 32, !tbaa !33, !llvm.access.group !49
+ %358 = load i32, i32* %9, align 4, !tbaa !3, !llvm.access.group !49
+ %359 = call noundef <8 x i32> @_ZN10embeddings18RadixSortIterationIiLi4EE9GetDigitsEDv8_ii(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %72, <8 x i32> noundef %357, i32 noundef %358) #22, !llvm.access.group !49
+ store <8 x i32> %359, <8 x i32>* %46, align 32, !tbaa !33, !llvm.access.group !49
+ %360 = bitcast %"struct.embeddings::UniqueResult"* %47 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 96, i8* %360) #3, !llvm.access.group !49
+ %361 = load <8 x i32>, <8 x i32>* %46, align 32, !tbaa !33, !llvm.access.group !49
+ call void @_ZN10embeddings12VectorUniqueIDv8_iEENS_12UniqueResultES1_T_(%"struct.embeddings::UniqueResult"* sret(%"struct.embeddings::UniqueResult") align 32 %47, <8 x i32> noundef <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> noundef %361) #22, !llvm.access.group !49
+ %362 = bitcast <8 x i32>* %48 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %362) #3, !llvm.access.group !49
+ %363 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %47, i32 0, i32 2
+ %364 = load <8 x i32>, <8 x i32>* %363, align 32, !tbaa !33, !llvm.access.group !49
+ store <8 x i32> %364, <8 x i32>* %48, align 32, !tbaa !33, !llvm.access.group !49
+ %365 = bitcast <8 x i32>* %49 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %365) #3, !llvm.access.group !49
+ %366 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %47, i32 0, i32 1
+ %367 = load <8 x i32>, <8 x i32>* %366, align 32, !tbaa !33, !llvm.access.group !49
+ %368 = trunc <8 x i32> %367 to <8 x i1>
+ %369 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %13) #22, !llvm.access.group !49
+ %370 = bitcast %"class.embeddings::TileSpmemVectorArray"* %13 to %"class.embeddings::BaseArray"*
+ %371 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %370) #22, !llvm.access.group !49
+ %372 = mul nsw i32 4, %371
+ %373 = sdiv i32 %372, 8
+ %374 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %369, i32 %373
+ %375 = load <8 x i32>, <8 x i32>* %46, align 32, !tbaa !33, !llvm.access.group !49
+ %376 = load <8 x i32>, <8 x i32>* %48, align 32, !tbaa !33, !llvm.access.group !49
+ %377 = call <8 x i32> @llvm.tpu.vst.msk.idx.ret.add.np.v8i32.p201v8i32(<8 x i1> %368, <8 x i32> addrspace(201)* %374, <8 x i32> %375, <8 x i32> %376), !llvm.access.group !49
+ store <8 x i32> %377, <8 x i32>* %49, align 32, !tbaa !33, !llvm.access.group !49
+ %378 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %11, align 4, !tbaa !7, !llvm.access.group !49
+ %379 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %378) #22, !llvm.access.group !49
+ %380 = load <8 x i32>, <8 x i32>* %49, align 32, !tbaa !33, !llvm.access.group !49
+ %381 = load <8 x i32>, <8 x i32>* %48, align 32, !tbaa !33, !llvm.access.group !49
+ %382 = add <8 x i32> %380, %381
+ %383 = load <8 x i32>, <8 x i32>* %44, align 32, !tbaa !33, !llvm.access.group !49
+ call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %379, <8 x i32> %382, <8 x i32> %383), !llvm.access.group !49
+ %384 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %12, align 4, !tbaa !7, !llvm.access.group !49
+ %385 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %384) #22, !llvm.access.group !49
+ %386 = load <8 x i32>, <8 x i32>* %49, align 32, !tbaa !33, !llvm.access.group !49
+ %387 = load <8 x i32>, <8 x i32>* %48, align 32, !tbaa !33, !llvm.access.group !49
+ %388 = add <8 x i32> %386, %387
+ %389 = load <8 x i32>, <8 x i32>* %45, align 32, !tbaa !33, !llvm.access.group !49
+ call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %385, <8 x i32> %388, <8 x i32> %389), !llvm.access.group !49
+ %390 = bitcast <8 x i32>* %49 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %390) #3, !llvm.access.group !49
+ %391 = bitcast <8 x i32>* %48 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %391) #3, !llvm.access.group !49
+ %392 = bitcast %"struct.embeddings::UniqueResult"* %47 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 96, i8* %392) #3, !llvm.access.group !49
+ %393 = bitcast <8 x i32>* %46 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %393) #3, !llvm.access.group !49
+ %394 = bitcast <8 x i32>* %45 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %394) #3, !llvm.access.group !49
+ %395 = bitcast <8 x i32>* %44 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %395) #3, !llvm.access.group !49
+ %396 = bitcast i32* %43 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %396) #3, !llvm.access.group !49
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !49
+ %397 = bitcast i32* %50 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %397) #3, !llvm.access.group !49
+ %398 = load i32, i32* %14, align 4, !tbaa !3, !llvm.access.group !49
+ %399 = add nsw i32 %398, 0
+ store i32 %399, i32* %50, align 4, !tbaa !3, !llvm.access.group !49
+ %400 = bitcast <8 x i32>* %51 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %400) #3, !llvm.access.group !49
+ %401 = bitcast %"class.embeddings::TileSpmemVectorArray"* %3 to %"class.embeddings::BaseArray"*
+ %402 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %401) #22, !llvm.access.group !49
+ %403 = mul nsw i32 5, %402
+ %404 = sdiv i32 %403, 8
+ %405 = load i32, i32* %50, align 4, !tbaa !3, !llvm.access.group !49
+ %406 = add nsw i32 %404, %405
+ %407 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %3, i32 noundef %406) #22, !llvm.access.group !49
+ %408 = load <8 x i32>, <8 x i32> addrspace(201)* %407, align 32, !tbaa !33, !llvm.access.group !49
+ store <8 x i32> %408, <8 x i32>* %51, align 32, !tbaa !33, !llvm.access.group !49
+ %409 = bitcast <8 x i32>* %52 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %409) #3, !llvm.access.group !49
+ %410 = bitcast %"class.embeddings::TileSpmemVectorArray"* %4 to %"class.embeddings::BaseArray"*
+ %411 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %410) #22, !llvm.access.group !49
+ %412 = mul nsw i32 5, %411
+ %413 = sdiv i32 %412, 8
+ %414 = load i32, i32* %50, align 4, !tbaa !3, !llvm.access.group !49
+ %415 = add nsw i32 %413, %414
+ %416 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %4, i32 noundef %415) #22, !llvm.access.group !49
+ %417 = load <8 x i32>, <8 x i32> addrspace(201)* %416, align 32, !tbaa !33, !llvm.access.group !49
+ store <8 x i32> %417, <8 x i32>* %52, align 32, !tbaa !33, !llvm.access.group !49
+ %418 = bitcast <8 x i32>* %53 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %418) #3, !llvm.access.group !49
+ %419 = load <8 x i32>, <8 x i32>* %51, align 32, !tbaa !33, !llvm.access.group !49
+ %420 = load i32, i32* %9, align 4, !tbaa !3, !llvm.access.group !49
+ %421 = call noundef <8 x i32> @_ZN10embeddings18RadixSortIterationIiLi4EE9GetDigitsEDv8_ii(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %72, <8 x i32> noundef %419, i32 noundef %420) #22, !llvm.access.group !49
+ store <8 x i32> %421, <8 x i32>* %53, align 32, !tbaa !33, !llvm.access.group !49
+ %422 = bitcast %"struct.embeddings::UniqueResult"* %54 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 96, i8* %422) #3, !llvm.access.group !49
+ %423 = load <8 x i32>, <8 x i32>* %53, align 32, !tbaa !33, !llvm.access.group !49
+ call void @_ZN10embeddings12VectorUniqueIDv8_iEENS_12UniqueResultES1_T_(%"struct.embeddings::UniqueResult"* sret(%"struct.embeddings::UniqueResult") align 32 %54, <8 x i32> noundef <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> noundef %423) #22, !llvm.access.group !49
+ %424 = bitcast <8 x i32>* %55 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %424) #3, !llvm.access.group !49
+ %425 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %54, i32 0, i32 2
+ %426 = load <8 x i32>, <8 x i32>* %425, align 32, !tbaa !33, !llvm.access.group !49
+ store <8 x i32> %426, <8 x i32>* %55, align 32, !tbaa !33, !llvm.access.group !49
+ %427 = bitcast <8 x i32>* %56 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %427) #3, !llvm.access.group !49
+ %428 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %54, i32 0, i32 1
+ %429 = load <8 x i32>, <8 x i32>* %428, align 32, !tbaa !33, !llvm.access.group !49
+ %430 = trunc <8 x i32> %429 to <8 x i1>
+ %431 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %13) #22, !llvm.access.group !49
+ %432 = bitcast %"class.embeddings::TileSpmemVectorArray"* %13 to %"class.embeddings::BaseArray"*
+ %433 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %432) #22, !llvm.access.group !49
+ %434 = mul nsw i32 5, %433
+ %435 = sdiv i32 %434, 8
+ %436 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %431, i32 %435
+ %437 = load <8 x i32>, <8 x i32>* %53, align 32, !tbaa !33, !llvm.access.group !49
+ %438 = load <8 x i32>, <8 x i32>* %55, align 32, !tbaa !33, !llvm.access.group !49
+ %439 = call <8 x i32> @llvm.tpu.vst.msk.idx.ret.add.np.v8i32.p201v8i32(<8 x i1> %430, <8 x i32> addrspace(201)* %436, <8 x i32> %437, <8 x i32> %438), !llvm.access.group !49
+ store <8 x i32> %439, <8 x i32>* %56, align 32, !tbaa !33, !llvm.access.group !49
+ %440 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %11, align 4, !tbaa !7, !llvm.access.group !49
+ %441 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %440) #22, !llvm.access.group !49
+ %442 = load <8 x i32>, <8 x i32>* %56, align 32, !tbaa !33, !llvm.access.group !49
+ %443 = load <8 x i32>, <8 x i32>* %55, align 32, !tbaa !33, !llvm.access.group !49
+ %444 = add <8 x i32> %442, %443
+ %445 = load <8 x i32>, <8 x i32>* %51, align 32, !tbaa !33, !llvm.access.group !49
+ call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %441, <8 x i32> %444, <8 x i32> %445), !llvm.access.group !49
+ %446 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %12, align 4, !tbaa !7, !llvm.access.group !49
+ %447 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %446) #22, !llvm.access.group !49
+ %448 = load <8 x i32>, <8 x i32>* %56, align 32, !tbaa !33, !llvm.access.group !49
+ %449 = load <8 x i32>, <8 x i32>* %55, align 32, !tbaa !33, !llvm.access.group !49
+ %450 = add <8 x i32> %448, %449
+ %451 = load <8 x i32>, <8 x i32>* %52, align 32, !tbaa !33, !llvm.access.group !49
+ call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %447, <8 x i32> %450, <8 x i32> %451), !llvm.access.group !49
+ %452 = bitcast <8 x i32>* %56 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %452) #3, !llvm.access.group !49
+ %453 = bitcast <8 x i32>* %55 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %453) #3, !llvm.access.group !49
+ %454 = bitcast %"struct.embeddings::UniqueResult"* %54 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 96, i8* %454) #3, !llvm.access.group !49
+ %455 = bitcast <8 x i32>* %53 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %455) #3, !llvm.access.group !49
+ %456 = bitcast <8 x i32>* %52 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %456) #3, !llvm.access.group !49
+ %457 = bitcast <8 x i32>* %51 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %457) #3, !llvm.access.group !49
+ %458 = bitcast i32* %50 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %458) #3, !llvm.access.group !49
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !49
+ %459 = bitcast i32* %57 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %459) #3, !llvm.access.group !49
+ %460 = load i32, i32* %14, align 4, !tbaa !3, !llvm.access.group !49
+ %461 = add nsw i32 %460, 0
+ store i32 %461, i32* %57, align 4, !tbaa !3, !llvm.access.group !49
+ %462 = bitcast <8 x i32>* %58 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %462) #3, !llvm.access.group !49
+ %463 = bitcast %"class.embeddings::TileSpmemVectorArray"* %3 to %"class.embeddings::BaseArray"*
+ %464 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %463) #22, !llvm.access.group !49
+ %465 = mul nsw i32 6, %464
+ %466 = sdiv i32 %465, 8
+ %467 = load i32, i32* %57, align 4, !tbaa !3, !llvm.access.group !49
+ %468 = add nsw i32 %466, %467
+ %469 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %3, i32 noundef %468) #22, !llvm.access.group !49
+ %470 = load <8 x i32>, <8 x i32> addrspace(201)* %469, align 32, !tbaa !33, !llvm.access.group !49
+ store <8 x i32> %470, <8 x i32>* %58, align 32, !tbaa !33, !llvm.access.group !49
+ %471 = bitcast <8 x i32>* %59 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %471) #3, !llvm.access.group !49
+ %472 = bitcast %"class.embeddings::TileSpmemVectorArray"* %4 to %"class.embeddings::BaseArray"*
+ %473 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %472) #22, !llvm.access.group !49
+ %474 = mul nsw i32 6, %473
+ %475 = sdiv i32 %474, 8
+ %476 = load i32, i32* %57, align 4, !tbaa !3, !llvm.access.group !49
+ %477 = add nsw i32 %475, %476
+ %478 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %4, i32 noundef %477) #22, !llvm.access.group !49
+ %479 = load <8 x i32>, <8 x i32> addrspace(201)* %478, align 32, !tbaa !33, !llvm.access.group !49
+ store <8 x i32> %479, <8 x i32>* %59, align 32, !tbaa !33, !llvm.access.group !49
+ %480 = bitcast <8 x i32>* %60 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %480) #3, !llvm.access.group !49
+ %481 = load <8 x i32>, <8 x i32>* %58, align 32, !tbaa !33, !llvm.access.group !49
+ %482 = load i32, i32* %9, align 4, !tbaa !3, !llvm.access.group !49
+ %483 = call noundef <8 x i32> @_ZN10embeddings18RadixSortIterationIiLi4EE9GetDigitsEDv8_ii(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %72, <8 x i32> noundef %481, i32 noundef %482) #22, !llvm.access.group !49
+ store <8 x i32> %483, <8 x i32>* %60, align 32, !tbaa !33, !llvm.access.group !49
+ %484 = bitcast %"struct.embeddings::UniqueResult"* %61 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 96, i8* %484) #3, !llvm.access.group !49
+ %485 = load <8 x i32>, <8 x i32>* %60, align 32, !tbaa !33, !llvm.access.group !49
+ call void @_ZN10embeddings12VectorUniqueIDv8_iEENS_12UniqueResultES1_T_(%"struct.embeddings::UniqueResult"* sret(%"struct.embeddings::UniqueResult") align 32 %61, <8 x i32> noundef <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> noundef %485) #22, !llvm.access.group !49
+ %486 = bitcast <8 x i32>* %62 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %486) #3, !llvm.access.group !49
+ %487 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %61, i32 0, i32 2
+ %488 = load <8 x i32>, <8 x i32>* %487, align 32, !tbaa !33, !llvm.access.group !49
+ store <8 x i32> %488, <8 x i32>* %62, align 32, !tbaa !33, !llvm.access.group !49
+ %489 = bitcast <8 x i32>* %63 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %489) #3, !llvm.access.group !49
+ %490 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %61, i32 0, i32 1
+ %491 = load <8 x i32>, <8 x i32>* %490, align 32, !tbaa !33, !llvm.access.group !49
+ %492 = trunc <8 x i32> %491 to <8 x i1>
+ %493 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %13) #22, !llvm.access.group !49
+ %494 = bitcast %"class.embeddings::TileSpmemVectorArray"* %13 to %"class.embeddings::BaseArray"*
+ %495 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %494) #22, !llvm.access.group !49
+ %496 = mul nsw i32 6, %495
+ %497 = sdiv i32 %496, 8
+ %498 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %493, i32 %497
+ %499 = load <8 x i32>, <8 x i32>* %60, align 32, !tbaa !33, !llvm.access.group !49
+ %500 = load <8 x i32>, <8 x i32>* %62, align 32, !tbaa !33, !llvm.access.group !49
+ %501 = call <8 x i32> @llvm.tpu.vst.msk.idx.ret.add.np.v8i32.p201v8i32(<8 x i1> %492, <8 x i32> addrspace(201)* %498, <8 x i32> %499, <8 x i32> %500), !llvm.access.group !49
+ store <8 x i32> %501, <8 x i32>* %63, align 32, !tbaa !33, !llvm.access.group !49
+ %502 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %11, align 4, !tbaa !7, !llvm.access.group !49
+ %503 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %502) #22, !llvm.access.group !49
+ %504 = load <8 x i32>, <8 x i32>* %63, align 32, !tbaa !33, !llvm.access.group !49
+ %505 = load <8 x i32>, <8 x i32>* %62, align 32, !tbaa !33, !llvm.access.group !49
+ %506 = add <8 x i32> %504, %505
+ %507 = load <8 x i32>, <8 x i32>* %58, align 32, !tbaa !33, !llvm.access.group !49
+ call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %503, <8 x i32> %506, <8 x i32> %507), !llvm.access.group !49
+ %508 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %12, align 4, !tbaa !7, !llvm.access.group !49
+ %509 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %508) #22, !llvm.access.group !49
+ %510 = load <8 x i32>, <8 x i32>* %63, align 32, !tbaa !33, !llvm.access.group !49
+ %511 = load <8 x i32>, <8 x i32>* %62, align 32, !tbaa !33, !llvm.access.group !49
+ %512 = add <8 x i32> %510, %511
+ %513 = load <8 x i32>, <8 x i32>* %59, align 32, !tbaa !33, !llvm.access.group !49
+ call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %509, <8 x i32> %512, <8 x i32> %513), !llvm.access.group !49
+ %514 = bitcast <8 x i32>* %63 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %514) #3, !llvm.access.group !49
+ %515 = bitcast <8 x i32>* %62 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %515) #3, !llvm.access.group !49
+ %516 = bitcast %"struct.embeddings::UniqueResult"* %61 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 96, i8* %516) #3, !llvm.access.group !49
+ %517 = bitcast <8 x i32>* %60 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %517) #3, !llvm.access.group !49
+ %518 = bitcast <8 x i32>* %59 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %518) #3, !llvm.access.group !49
+ %519 = bitcast <8 x i32>* %58 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %519) #3, !llvm.access.group !49
+ %520 = bitcast i32* %57 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %520) #3, !llvm.access.group !49
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !49
+ %521 = bitcast i32* %64 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %521) #3, !llvm.access.group !49
+ %522 = load i32, i32* %14, align 4, !tbaa !3, !llvm.access.group !49
+ %523 = add nsw i32 %522, 0
+ store i32 %523, i32* %64, align 4, !tbaa !3, !llvm.access.group !49
+ %524 = bitcast <8 x i32>* %65 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %524) #3, !llvm.access.group !49
+ %525 = bitcast %"class.embeddings::TileSpmemVectorArray"* %3 to %"class.embeddings::BaseArray"*
+ %526 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %525) #22, !llvm.access.group !49
+ %527 = mul nsw i32 7, %526
+ %528 = sdiv i32 %527, 8
+ %529 = load i32, i32* %64, align 4, !tbaa !3, !llvm.access.group !49
+ %530 = add nsw i32 %528, %529
+ %531 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %3, i32 noundef %530) #22, !llvm.access.group !49
+ %532 = load <8 x i32>, <8 x i32> addrspace(201)* %531, align 32, !tbaa !33, !llvm.access.group !49
+ store <8 x i32> %532, <8 x i32>* %65, align 32, !tbaa !33, !llvm.access.group !49
+ %533 = bitcast <8 x i32>* %66 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %533) #3, !llvm.access.group !49
+ %534 = bitcast %"class.embeddings::TileSpmemVectorArray"* %4 to %"class.embeddings::BaseArray"*
+ %535 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %534) #22, !llvm.access.group !49
+ %536 = mul nsw i32 7, %535
+ %537 = sdiv i32 %536, 8
+ %538 = load i32, i32* %64, align 4, !tbaa !3, !llvm.access.group !49
+ %539 = add nsw i32 %537, %538
+ %540 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %4, i32 noundef %539) #22, !llvm.access.group !49
+ %541 = load <8 x i32>, <8 x i32> addrspace(201)* %540, align 32, !tbaa !33, !llvm.access.group !49
+ store <8 x i32> %541, <8 x i32>* %66, align 32, !tbaa !33, !llvm.access.group !49
+ %542 = bitcast <8 x i32>* %67 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %542) #3, !llvm.access.group !49
+ %543 = load <8 x i32>, <8 x i32>* %65, align 32, !tbaa !33, !llvm.access.group !49
+ %544 = load i32, i32* %9, align 4, !tbaa !3, !llvm.access.group !49
+ %545 = call noundef <8 x i32> @_ZN10embeddings18RadixSortIterationIiLi4EE9GetDigitsEDv8_ii(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %72, <8 x i32> noundef %543, i32 noundef %544) #22, !llvm.access.group !49
+ store <8 x i32> %545, <8 x i32>* %67, align 32, !tbaa !33, !llvm.access.group !49
+ %546 = bitcast %"struct.embeddings::UniqueResult"* %68 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 96, i8* %546) #3, !llvm.access.group !49
+ %547 = load <8 x i32>, <8 x i32>* %67, align 32, !tbaa !33, !llvm.access.group !49
+ call void @_ZN10embeddings12VectorUniqueIDv8_iEENS_12UniqueResultES1_T_(%"struct.embeddings::UniqueResult"* sret(%"struct.embeddings::UniqueResult") align 32 %68, <8 x i32> noundef <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> noundef %547) #22, !llvm.access.group !49
+ %548 = bitcast <8 x i32>* %69 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %548) #3, !llvm.access.group !49
+ %549 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %68, i32 0, i32 2
+ %550 = load <8 x i32>, <8 x i32>* %549, align 32, !tbaa !33, !llvm.access.group !49
+ store <8 x i32> %550, <8 x i32>* %69, align 32, !tbaa !33, !llvm.access.group !49
+ %551 = bitcast <8 x i32>* %70 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %551) #3, !llvm.access.group !49
+ %552 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %68, i32 0, i32 1
+ %553 = load <8 x i32>, <8 x i32>* %552, align 32, !tbaa !33, !llvm.access.group !49
+ %554 = trunc <8 x i32> %553 to <8 x i1>
+ %555 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %13) #22, !llvm.access.group !49
+ %556 = bitcast %"class.embeddings::TileSpmemVectorArray"* %13 to %"class.embeddings::BaseArray"*
+ %557 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %556) #22, !llvm.access.group !49
+ %558 = mul nsw i32 7, %557
+ %559 = sdiv i32 %558, 8
+ %560 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %555, i32 %559
+ %561 = load <8 x i32>, <8 x i32>* %67, align 32, !tbaa !33, !llvm.access.group !49
+ %562 = load <8 x i32>, <8 x i32>* %69, align 32, !tbaa !33, !llvm.access.group !49
+ %563 = call <8 x i32> @llvm.tpu.vst.msk.idx.ret.add.np.v8i32.p201v8i32(<8 x i1> %554, <8 x i32> addrspace(201)* %560, <8 x i32> %561, <8 x i32> %562), !llvm.access.group !49
+ store <8 x i32> %563, <8 x i32>* %70, align 32, !tbaa !33, !llvm.access.group !49
+ %564 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %11, align 4, !tbaa !7, !llvm.access.group !49
+ %565 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %564) #22, !llvm.access.group !49
+ %566 = load <8 x i32>, <8 x i32>* %70, align 32, !tbaa !33, !llvm.access.group !49
+ %567 = load <8 x i32>, <8 x i32>* %69, align 32, !tbaa !33, !llvm.access.group !49
+ %568 = add <8 x i32> %566, %567
+ %569 = load <8 x i32>, <8 x i32>* %65, align 32, !tbaa !33, !llvm.access.group !49
+ call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %565, <8 x i32> %568, <8 x i32> %569), !llvm.access.group !49
+ %570 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %12, align 4, !tbaa !7, !llvm.access.group !49
+ %571 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %570) #22, !llvm.access.group !49
+ %572 = load <8 x i32>, <8 x i32>* %70, align 32, !tbaa !33, !llvm.access.group !49
+ %573 = load <8 x i32>, <8 x i32>* %69, align 32, !tbaa !33, !llvm.access.group !49
+ %574 = add <8 x i32> %572, %573
+ %575 = load <8 x i32>, <8 x i32>* %66, align 32, !tbaa !33, !llvm.access.group !49
+ call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %571, <8 x i32> %574, <8 x i32> %575), !llvm.access.group !49
+ %576 = bitcast <8 x i32>* %70 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %576) #3, !llvm.access.group !49
+ %577 = bitcast <8 x i32>* %69 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %577) #3, !llvm.access.group !49
+ %578 = bitcast %"struct.embeddings::UniqueResult"* %68 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 96, i8* %578) #3, !llvm.access.group !49
+ %579 = bitcast <8 x i32>* %67 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %579) #3, !llvm.access.group !49
+ %580 = bitcast <8 x i32>* %66 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %580) #3, !llvm.access.group !49
+ %581 = bitcast <8 x i32>* %65 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %581) #3, !llvm.access.group !49
+ %582 = bitcast i32* %64 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %582) #3, !llvm.access.group !49
+ call void @llvm.tpu.loop.parallel(), !llvm.access.group !49
+ br label %583
+
+583: ; preds = %86
+ %584 = load i32, i32* %14, align 4, !tbaa !3, !llvm.access.group !49
+ %585 = add nsw i32 %584, 1
+ store i32 %585, i32* %14, align 4, !tbaa !3, !llvm.access.group !49
+ br label %78, !llvm.loop !50
+
+586: ; preds = %84
+ %587 = load i8, i8* %10, align 1, !tbaa !47, !range !52
+ %588 = trunc i8 %587 to i1
+ br i1 %588, label %589, label %590
+
+589: ; preds = %586
+ call void @_ZN10embeddings18RadixSortIterationIiLi4EE16IncrementBucketsEv(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %72) #22
+ br label %590
+
+590: ; preds = %589, %586
+ %591 = bitcast %"class.embeddings::TileSpmemVectorArray"* %13 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %591) #3
+ ret void
+}
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %0, i32 noundef %1) #5 comdat align 2 {
+ %3 = alloca %"class.embeddings::TileSpmemVectorArray"*, align 4
+ %4 = alloca i32, align 4
+ store %"class.embeddings::TileSpmemVectorArray"* %0, %"class.embeddings::TileSpmemVectorArray"** %3, align 4, !tbaa !7
+ store i32 %1, i32* %4, align 4, !tbaa !3
+ %5 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %3, align 4
+ %6 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %5) #22
+ %7 = load i32, i32* %4, align 4, !tbaa !3
+ %8 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %6, i32 %7
+ ret <8 x i32> addrspace(201)* %8
+}
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %0) #5 comdat align 2 {
+ %2 = alloca %"class.embeddings::TileSpmemVectorArray"*, align 4
+ %3 = alloca %"class.embeddings::TileSpmemPointer"*, align 4
+ %4 = alloca %"class.embeddings::PointerBase", align 4
+ store %"class.embeddings::TileSpmemVectorArray"* %0, %"class.embeddings::TileSpmemVectorArray"** %2, align 4, !tbaa !7
+ %5 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %2, align 4
+ %6 = bitcast %"class.embeddings::TileSpmemPointer"** %3 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #3
+ %7 = bitcast %"class.embeddings::PointerBase"* %4 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 12, i8* %7) #3
+ %8 = bitcast %"class.embeddings::TileSpmemVectorArray"* %5 to %"class.embeddings::BaseArray"*
+ call void @_ZNK10embeddings9BaseArray7BasePtrEv(%"class.embeddings::PointerBase"* sret(%"class.embeddings::PointerBase") align 4 %4, %"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %8) #22
+ %9 = call noundef %"class.embeddings::TileSpmemPointer"* @_ZN10embeddings4CastINS_16TileSpmemPointerENS_11PointerBaseEEENS_15cast_retty_implIT_T0_E8ret_typeERKS5_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %4) #22
+ %10 = bitcast %"class.embeddings::PointerBase"* %4 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 12, i8* %10) #3
+ store %"class.embeddings::TileSpmemPointer"* %9, %"class.embeddings::TileSpmemPointer"** %3, align 4, !tbaa !7
+ %11 = load %"class.embeddings::TileSpmemPointer"*, %"class.embeddings::TileSpmemPointer"** %3, align 4, !tbaa !7
+ %12 = call noundef i8 addrspace(201)* @_ZNK10embeddings16TileSpmemPointer6RawPtrEv(%"class.embeddings::TileSpmemPointer"* noundef nonnull align 4 dereferenceable(12) %11) #22
+ %13 = bitcast i8 addrspace(201)* %12 to <8 x i32> addrspace(201)*
+ %14 = bitcast %"class.embeddings::TileSpmemPointer"** %3 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %14) #3
+ ret <8 x i32> addrspace(201)* %13
+}
+
+; Function Attrs: mustprogress nounwind
+define linkonce_odr dso_local noundef %"class.embeddings::TileSpmemPointer"* @_ZN10embeddings4CastINS_16TileSpmemPointerENS_11PointerBaseEEENS_15cast_retty_implIT_T0_E8ret_typeERKS5_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %0) #6 comdat {
+ %2 = alloca %"class.embeddings::PointerBase"*, align 4
+ store %"class.embeddings::PointerBase"* %0, %"class.embeddings::PointerBase"** %2, align 4, !tbaa !7
+ %3 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %2, align 4, !tbaa !7
+ %4 = bitcast %"class.embeddings::PointerBase"* %3 to %"class.embeddings::TileSpmemPointer"*
+ ret %"class.embeddings::TileSpmemPointer"* %4
+}
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local void @_ZNK10embeddings9BaseArray7BasePtrEv(%"class.embeddings::PointerBase"* noalias sret(%"class.embeddings::PointerBase") align 4 %0, %"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %1) #5 comdat align 2 {
+ %3 = alloca i8*, align 4
+ %4 = alloca %"class.embeddings::BaseArray"*, align 4
+ %5 = bitcast %"class.embeddings::PointerBase"* %0 to i8*
+ store i8* %5, i8** %3, align 4
+ store %"class.embeddings::BaseArray"* %1, %"class.embeddings::BaseArray"** %4, align 4, !tbaa !7
+ %6 = load %"class.embeddings::BaseArray"*, %"class.embeddings::BaseArray"** %4, align 4
+ %7 = getelementptr inbounds %"class.embeddings::BaseArray", %"class.embeddings::BaseArray"* %6, i32 0, i32 0
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %0, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %7) #22
+ ret void
+}
+
+; Function Attrs: alwaysinline mustprogress nounwind
+define linkonce_odr dso_local noundef i8 addrspace(201)* @_ZNK10embeddings16TileSpmemPointer6RawPtrEv(%"class.embeddings::TileSpmemPointer"* noundef nonnull align 4 dereferenceable(12) %0) #13 comdat align 2 {
+ %2 = alloca %"class.embeddings::TileSpmemPointer"*, align 4
+ store %"class.embeddings::TileSpmemPointer"* %0, %"class.embeddings::TileSpmemPointer"** %2, align 4, !tbaa !7
+ %3 = load %"class.embeddings::TileSpmemPointer"*, %"class.embeddings::TileSpmemPointer"** %2, align 4
+ %4 = bitcast %"class.embeddings::TileSpmemPointer"* %3 to %"class.embeddings::PointerBase"*
+ %5 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %4, i32 0, i32 2
+ %6 = bitcast %"union.embeddings::PointerBase::AnyPtr"* %5 to i32 addrspace(201)**
+ %7 = load i32 addrspace(201)*, i32 addrspace(201)** %6, align 4, !tbaa !33
+ %8 = bitcast i32 addrspace(201)* %7 to i8 addrspace(201)*
+ ret i8 addrspace(201)* %8
+}
+
+; Function Attrs: alwaysinline mustprogress nounwind
+define linkonce_odr dso_local noundef <8 x i32> @_ZN10embeddings18RadixSortIterationIiLi4EE9GetDigitsEDv8_ii(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %0, <8 x i32> noundef %1, i32 noundef %2) #15 comdat align 2 {
+ %4 = alloca %"class.embeddings::RadixSortIteration"*, align 4
+ %5 = alloca <8 x i32>, align 32
+ %6 = alloca i32, align 4
+ store %"class.embeddings::RadixSortIteration"* %0, %"class.embeddings::RadixSortIteration"** %4, align 4, !tbaa !7
+ store <8 x i32> %1, <8 x i32>* %5, align 32, !tbaa !33
+ store i32 %2, i32* %6, align 4, !tbaa !3
+ %7 = load %"class.embeddings::RadixSortIteration"*, %"class.embeddings::RadixSortIteration"** %4, align 4
+ %8 = load <8 x i32>, <8 x i32>* %5, align 32, !tbaa !33
+ %9 = load i32, i32* %6, align 4, !tbaa !3
+ %10 = mul nsw i32 %9, 4
+ %11 = insertelement <8 x i32> poison, i32 %10, i32 0
+ %12 = shufflevector <8 x i32> %11, <8 x i32> poison, <8 x i32> zeroinitializer
+ %13 = ashr <8 x i32> %8, %12
+ %14 = getelementptr inbounds %"class.embeddings::RadixSortIteration", %"class.embeddings::RadixSortIteration"* %7, i32 0, i32 0
+ %15 = load i32, i32* %14, align 4, !tbaa !35
+ %16 = sub nsw i32 %15, 1
+ %17 = insertelement <8 x i32> poison, i32 %16, i32 0
+ %18 = shufflevector <8 x i32> %17, <8 x i32> poison, <8 x i32> zeroinitializer
+ %19 = and <8 x i32> %13, %18
+ ret <8 x i32> %19
+}
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local void @_ZN10embeddings12VectorUniqueIDv8_iEENS_12UniqueResultES1_T_(%"struct.embeddings::UniqueResult"* noalias sret(%"struct.embeddings::UniqueResult") align 32 %0, <8 x i32> noundef %1, <8 x i32> noundef %2) #14 comdat {
+ %4 = alloca <8 x i32>, align 32
+ %5 = alloca <8 x i32>, align 32
+ store <8 x i32> %1, <8 x i32>* %4, align 32, !tbaa !33
+ store <8 x i32> %2, <8 x i32>* %5, align 32, !tbaa !33
+ %6 = load <8 x i32>, <8 x i32>* %4, align 32, !tbaa !33
+ %7 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %0, i32 0, i32 1
+ store <8 x i32> %6, <8 x i32>* %7, align 32, !tbaa !33
+ %8 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %0, i32 0, i32 0
+ %9 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %0, i32 0, i32 1
+ %10 = load <8 x i32>, <8 x i32>* %5, align 32, !tbaa !33
+ %11 = call noundef <8 x i32> @_ZN10embeddings3tpuIDv8_iE6uniqueEPS1_S3_S1_(<8 x i32>* noundef %8, <8 x i32>* noundef %9, <8 x i32> noundef %10) #22
+ %12 = getelementptr inbounds %"struct.embeddings::UniqueResult", %"struct.embeddings::UniqueResult"* %0, i32 0, i32 2
+ store <8 x i32> %11, <8 x i32>* %12, align 32, !tbaa !33
+ ret void
+}
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.tpu.vst.msk.idx.add.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, <8 x i32>) #16
+
+; Function Attrs: nounwind
+declare void @llvm.tpu.loop.parallel() #3
+
+; Function Attrs: alwaysinline mustprogress nounwind
+define linkonce_odr dso_local noundef <8 x i32> @_ZN10embeddings3tpuIDv8_iE6uniqueEPS1_S3_S1_(<8 x i32>* noundef %0, <8 x i32>* noundef %1, <8 x i32> noundef %2) #15 comdat align 2 {
+ %4 = alloca <8 x i32>*, align 4
+ %5 = alloca <8 x i32>*, align 4
+ %6 = alloca <8 x i32>, align 32
+ store <8 x i32>* %0, <8 x i32>** %4, align 4, !tbaa !7
+ store <8 x i32>* %1, <8 x i32>** %5, align 4, !tbaa !7
+ store <8 x i32> %2, <8 x i32>* %6, align 32, !tbaa !33
+ %7 = load <8 x i32>*, <8 x i32>** %5, align 4, !tbaa !7
+ %8 = load <8 x i32>, <8 x i32>* %6, align 32, !tbaa !33
+ %9 = load <8 x i32>, <8 x i32>* %7, align 32
+ %10 = trunc <8 x i32> %9 to <8 x i1>
+ %11 = call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.uniquei(<8 x i1> %10, <8 x i32> %8)
+ %12 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %11, 2
+ %13 = zext <8 x i1> %12 to <8 x i32>
+ store <8 x i32> %13, <8 x i32>* %7, align 32
+ %14 = load <8 x i32>*, <8 x i32>** %4, align 4, !tbaa !7
+ %15 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %11, 0
+ store <8 x i32> %15, <8 x i32>* %14, align 32
+ %16 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %11, 1
+ ret <8 x i32> %16
+}
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.uniquei(<8 x i1>, <8 x i32>) #17
+
+; Function Attrs: nounwind readnone
+declare <8 x i32> @llvm.tpu.vlaneseq.v8i32() #18
+
+; Function Attrs: argmemonly nounwind readonly
+declare <8 x i32> @llvm.tpu.vld.msk.idx.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>) #19
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local void @_ZN10embeddings13VectorAddScanIDv8_iEENS_13AddScanResultIT_EES1_S3_(%"struct.embeddings::AddScanResult"* noalias sret(%"struct.embeddings::AddScanResult") align 32 %0, <8 x i32> noundef %1, <8 x i32> noundef %2) #14 comdat {
+ %4 = alloca <8 x i32>, align 32
+ %5 = alloca <8 x i32>, align 32
+ store <8 x i32> %1, <8 x i32>* %4, align 32, !tbaa !33
+ store <8 x i32> %2, <8 x i32>* %5, align 32, !tbaa !33
+ %6 = load <8 x i32>, <8 x i32>* %4, align 32, !tbaa !33
+ %7 = getelementptr inbounds %"struct.embeddings::AddScanResult", %"struct.embeddings::AddScanResult"* %0, i32 0, i32 0
+ store <8 x i32> %6, <8 x i32>* %7, align 32, !tbaa !33
+ %8 = load <8 x i32>, <8 x i32>* %5, align 32, !tbaa !33
+ %9 = getelementptr inbounds %"struct.embeddings::AddScanResult", %"struct.embeddings::AddScanResult"* %0, i32 0, i32 1
+ store <8 x i32> %8, <8 x i32>* %9, align 32, !tbaa !33
+ %10 = getelementptr inbounds %"struct.embeddings::AddScanResult", %"struct.embeddings::AddScanResult"* %0, i32 0, i32 0
+ %11 = getelementptr inbounds %"struct.embeddings::AddScanResult", %"struct.embeddings::AddScanResult"* %0, i32 0, i32 1
+ call void @_ZN10embeddings3tpuIDv8_iE7addscanEPS1_S3_(<8 x i32>* noundef %10, <8 x i32>* noundef %11) #22
+ ret void
+}
+
+; Function Attrs: argmemonly nounwind willreturn writeonly
+declare void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, <8 x i32>) #20
+
+; Function Attrs: alwaysinline mustprogress nounwind
+define linkonce_odr dso_local void @_ZN10embeddings3tpuIDv8_iE7addscanEPS1_S3_(<8 x i32>* noundef %0, <8 x i32>* noundef %1) #13 comdat align 2 {
+ %3 = alloca <8 x i32>*, align 4
+ %4 = alloca <8 x i32>*, align 4
+ store <8 x i32>* %0, <8 x i32>** %3, align 4, !tbaa !7
+ store <8 x i32>* %1, <8 x i32>** %4, align 4, !tbaa !7
+ %5 = load <8 x i32>*, <8 x i32>** %3, align 4, !tbaa !7
+ %6 = load <8 x i32>*, <8 x i32>** %4, align 4, !tbaa !7
+ %7 = load <8 x i32>, <8 x i32>* %5, align 32
+ %8 = trunc <8 x i32> %7 to <8 x i1>
+ %9 = load <8 x i32>, <8 x i32>* %6, align 32
+ %10 = call { <8 x i32>, <8 x i1> } @llvm.tpu.add.scan1xNi(<8 x i1> %8, <8 x i32> %9)
+ %11 = extractvalue { <8 x i32>, <8 x i1> } %10, 1
+ %12 = zext <8 x i1> %11 to <8 x i32>
+ store <8 x i32> %12, <8 x i32>* %5, align 32
+ %13 = extractvalue { <8 x i32>, <8 x i1> } %10, 0
+ store <8 x i32> %13, <8 x i32>* %6, align 32
+ ret void
+}
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare { <8 x i32>, <8 x i1> } @llvm.tpu.add.scan1xNi(<8 x i1>, <8 x i32>) #17
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local void @_ZN10embeddings18RadixSortIterationIiLi4EE16DecrementBucketsEv(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %0) #5 comdat align 2 {
+ %2 = alloca %"class.embeddings::RadixSortIteration"*, align 4
+ store %"class.embeddings::RadixSortIteration"* %0, %"class.embeddings::RadixSortIteration"** %2, align 4, !tbaa !7
+ %3 = load %"class.embeddings::RadixSortIteration"*, %"class.embeddings::RadixSortIteration"** %2, align 4
+ call void @_ZN10embeddings18RadixSortIterationIiLi4EE12AddToBucketsEi(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %3, i32 noundef -1) #22
+ ret void
+}
+
+; Function Attrs: argmemonly nounwind willreturn
+declare <8 x i32> @llvm.tpu.vst.msk.idx.ret.add.np.v8i32.p201v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, <8 x i32>) #16
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local void @_ZN10embeddings18RadixSortIterationIiLi4EE16IncrementBucketsEv(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %0) #5 comdat align 2 {
+ %2 = alloca %"class.embeddings::RadixSortIteration"*, align 4
+ store %"class.embeddings::RadixSortIteration"* %0, %"class.embeddings::RadixSortIteration"** %2, align 4, !tbaa !7
+ %3 = load %"class.embeddings::RadixSortIteration"*, %"class.embeddings::RadixSortIteration"** %2, align 4
+ call void @_ZN10embeddings18RadixSortIterationIiLi4EE12AddToBucketsEi(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %3, i32 noundef 1) #22
+ ret void
+}
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local void @_ZN10embeddings18RadixSortIterationIiLi4EE12AddToBucketsEi(%"class.embeddings::RadixSortIteration"* noundef nonnull align 4 dereferenceable(8) %0, i32 noundef %1) #5 comdat align 2 {
+ %3 = alloca %"class.embeddings::RadixSortIteration"*, align 4
+ %4 = alloca i32, align 4
+ %5 = alloca %"class.embeddings::TileSpmemVectorArray", align 4
+ %6 = alloca i32, align 4
+ store %"class.embeddings::RadixSortIteration"* %0, %"class.embeddings::RadixSortIteration"** %3, align 4, !tbaa !7
+ store i32 %1, i32* %4, align 4, !tbaa !3
+ %7 = load %"class.embeddings::RadixSortIteration"*, %"class.embeddings::RadixSortIteration"** %3, align 4
+ %8 = bitcast %"class.embeddings::TileSpmemVectorArray"* %5 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %8) #3
+ %9 = getelementptr inbounds %"class.embeddings::RadixSortIteration", %"class.embeddings::RadixSortIteration"* %7, i32 0, i32 1
+ %10 = load %"class.embeddings::ScratchpadArray"*, %"class.embeddings::ScratchpadArray"** %9, align 4, !tbaa !37
+ %11 = call noundef %"class.embeddings::TileSpmemVectorArray"* @_ZN10embeddings4CastINS_20TileSpmemVectorArrayIiEENS_15ScratchpadArrayEEENS_15cast_retty_implIT_T0_E8ret_typeERKS6_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %10) #22
+ call void @_ZN10embeddings20TileSpmemVectorArrayIiEC2ERKS1_(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %5, %"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %11) #22
+ %12 = bitcast i32* %6 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %12) #3
+ store i32 0, i32* %6, align 4, !tbaa !3
+ br label %13
+
+13: ; preds = %28, %2
+ %14 = load i32, i32* %6, align 4, !tbaa !3
+ %15 = bitcast %"class.embeddings::TileSpmemVectorArray"* %5 to %"class.embeddings::BaseArray"*
+ %16 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %15) #22
+ %17 = icmp slt i32 %14, %16
+ br i1 %17, label %20, label %18
+
+18: ; preds = %13
+ %19 = bitcast i32* %6 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %19) #3
+ br label %31
+
+20: ; preds = %13
+ %21 = load i32, i32* %4, align 4, !tbaa !3
+ %22 = insertelement <8 x i32> poison, i32 %21, i32 0
+ %23 = shufflevector <8 x i32> %22, <8 x i32> poison, <8 x i32> zeroinitializer
+ %24 = load i32, i32* %6, align 4, !tbaa !3
+ %25 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %5, i32 noundef %24) #22
+ %26 = load <8 x i32>, <8 x i32> addrspace(201)* %25, align 32, !tbaa !33
+ %27 = add <8 x i32> %26, %23
+ store <8 x i32> %27, <8 x i32> addrspace(201)* %25, align 32, !tbaa !33
+ br label %28
+
+28: ; preds = %20
+ %29 = load i32, i32* %6, align 4, !tbaa !3
+ %30 = add nsw i32 %29, 1
+ store i32 %30, i32* %6, align 4, !tbaa !3
+ br label %13, !llvm.loop !53
+
+31: ; preds = %18
+ %32 = bitcast %"class.embeddings::TileSpmemVectorArray"* %5 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %32) #3
+ ret void
+}
+
+attributes #0 = { mustprogress nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tac-vf" }
+attributes #1 = { mustprogress "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #2 = { argmemonly nocallback nofree nosync nounwind willreturn }
+attributes #3 = { nounwind }
+attributes #4 = { alwaysinline "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #5 = { alwaysinline mustprogress "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #6 = { mustprogress nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #7 = { inlinehint "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #8 = { mustprogress nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-scs-vf" }
+attributes #9 = { alwaysinline nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #10 = { argmemonly nofree nounwind willreturn }
+attributes #11 = { "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #12 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #13 = { alwaysinline mustprogress nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #14 = { alwaysinline mustprogress "frame-pointer"="all" "min-legal-vector-width"="256" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #15 = { alwaysinline mustprogress nounwind "frame-pointer"="all" "min-legal-vector-width"="256" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #16 = { argmemonly nounwind willreturn }
+attributes #17 = { nounwind readnone speculatable willreturn }
+attributes #18 = { nounwind readnone }
+attributes #19 = { argmemonly nounwind readonly }
+attributes #20 = { argmemonly nounwind willreturn writeonly }
+attributes #21 = { alwaysinline nobuiltin "no-builtins" }
+attributes #22 = { nobuiltin "no-builtins" }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"frame-pointer", i32 2}
+!2 = !{!"clang version google3-trunk (a48300aee570f8eea4ec0b03e2d176aab648afb0)"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"int", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C++ TBAA"}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"any pointer", !5, i64 0}
+!9 = !{!10, !8, i64 0}
+!10 = !{!"_ZTSN10embeddings25SparsecoreMemoryAllocatorE", !8, i64 0, !8, i64 4, !8, i64 8, !11, i64 12, !11, i64 20, !11, i64 28}
+!11 = !{!"_ZTSN10embeddings4impl15StaticAllocatorE", !4, i64 0, !4, i64 4}
+!12 = !{!10, !8, i64 4}
+!13 = !{!10, !8, i64 8}
+!14 = !{i64 0, i64 4, !7, i64 0, i64 4, !7, i64 0, i64 4, !7, i64 0, i64 4, !7, i64 0, i64 4, !7}
+!15 = !{!16, !4, i64 0}
+!16 = !{!"_ZTSN10embeddings37RadixSortKeyValueTileSpmemToTileSpmemILi4EEE", !4, i64 0, !8, i64 4, !17, i64 8, !17, i64 24, !17, i64 40, !17, i64 56, !17, i64 72}
+!17 = !{!"_ZTSN10embeddings15ScratchpadArrayE"}
+!18 = !{!16, !8, i64 4}
+!19 = distinct !{!19, !20}
+!20 = !{!"llvm.loop.mustprogress"}
+!21 = !{!11, !4, i64 0}
+!22 = !{!11, !4, i64 4}
+!23 = !{!24, !25, i64 0}
+!24 = !{!"_ZTSN10embeddings11MemorySpaceE", !25, i64 0}
+!25 = !{!"_ZTSN10embeddings21SparsecoreMemorySpaceE", !5, i64 0}
+!26 = !{!27, !28, i64 0}
+!27 = !{!"_ZTSN10embeddings9BasicTypeE", !28, i64 0}
+!28 = !{!"_ZTSN10embeddings19SparsecoreBasicTypeE", !5, i64 0}
+!29 = !{!30, !4, i64 12}
+!30 = !{!"_ZTSN10embeddings9BaseArrayE", !31, i64 0, !4, i64 12}
+!31 = !{!"_ZTSN10embeddings11PointerBaseE", !24, i64 0, !27, i64 4, !5, i64 8}
+!32 = !{!28, !28, i64 0}
+!33 = !{!5, !5, i64 0}
+!34 = !{!25, !25, i64 0}
+!35 = !{!36, !4, i64 0}
+!36 = !{!"_ZTSN10embeddings18RadixSortIterationIiLi4EEE", !4, i64 0, !8, i64 4}
+!37 = !{!36, !8, i64 4}
+!38 = distinct !{!38, !20}
+!39 = distinct !{}
+!40 = distinct !{!40, !20, !41, !42, !43, !44, !45}
+!41 = !{!"llvm.loop.parallel_accesses", !39}
+!42 = !{!"llvm.loop.unroll.disable"}
+!43 = !{!"llvm.loop.vectorize.width", i32 1}
+!44 = !{!"llvm.loop.interleave.count", i32 1}
+!45 = !{!"llvm.loop.vectorize.enable", i1 true}
+!46 = distinct !{!46, !20}
+!47 = !{!48, !48, i64 0}
+!48 = !{!"bool", !5, i64 0}
+!49 = distinct !{}
+!50 = distinct !{!50, !20, !51, !42, !43, !44, !45}
+!51 = !{!"llvm.loop.parallel_accesses", !49}
+!52 = !{i8 0, i8 2}
+!53 = distinct !{!53, !20}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/reaching_defs.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/reaching_defs.ll
new file mode 100644
index 0000000..acd6f30
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/reaching_defs.ll
@@ -0,0 +1,170 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -tail-dup-size=0 -tail-dup-limit=0 \
+; RUN: -disable-block-placement -tpu-latencies=%S/Inputs/long_load.yml -ifcvt-limit=0 -early-ifcvt-limit=0 \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; CHECK-LABEL: straightline:
+; CHECK: vld
+; CHECK: sdelay $0x5
+; CHECK: vadd
+define <8 x float> @straightline(<8 x float> addrspace(201)* %p, <8 x float> %b) {
+ %c = load <8 x float>, <8 x float> addrspace(201)* %p
+ %d = fadd <8 x float> %c, %b
+ ret <8 x float> %d
+}
+
+; CHECK-LABEL: curvedline:
+; CHECK: p[[P:[0-9]+]] = por !p0, !p0
+; CHECK: (pc) = sbr.rel @!p0 .LBB1_2
+; CHECK: s[[S:[0-9]+]] = smov.u32 s1
+; CHECK: v[[R:[0-9]+]] = vld [tilespmem:s[[S]]+$0x0]
+; CHECK: _ = sdelay $0x5
+; CHECK: vadd.f32 v0, v[[R]]
+define <8 x float> @curvedline(<8 x float> addrspace(201)* %p, <8 x float> addrspace(201)* %p2, <8 x float> %b, i1 %q) {
+ br i1 %q, label %t, label %f
+
+t:
+ %c = load <8 x float>, <8 x float> addrspace(201)* %p
+ br label %fallthrough
+
+f:
+ %d = load <8 x float>, <8 x float> addrspace(201)* %p2
+ br label %fallthrough
+
+fallthrough:
+ %e = phi <8 x float> [ %c, %t ], [ %d, %f ]
+ %g = fadd <8 x float> %e, %b
+ ret <8 x float> %g
+}
+
+; CHECK-LABEL: loopy:
+; CHECK: v[[R:[0-9]+]] = vld
+; CHECK: sdelay $0x3
+; CHECK: sbr.rel
+; CHECK: snop
+; CHECK: vmov
+define <8 x float> @loopy(<8 x float> addrspace(201)* %p, <8 x float> addrspace(201)* %p2, <8 x float> %b, i1 %q) {
+entry:
+ %c = load <8 x float>, <8 x float> addrspace(201)* %p
+ br label %loop
+
+loop:
+ %ld = phi <8 x float> [ %c, %entry ], [ %d, %loop ]
+ %acc = phi <8 x float> [ zeroinitializer, %entry ], [ %g, %loop ]
+ %i = phi i32 [ 0, %entry ], [ %inc, %loop ]
+ %d = load <8 x float>, <8 x float> addrspace(201)* %p2
+ %g = fadd <8 x float> %ld, %acc
+ %inc = add i32 %i, 1
+ br i1 %q, label %loop, label %fallthrough
+
+fallthrough:
+ ret <8 x float> %acc
+}
+
+; This subtest additionally checks that the vld is spaced far enough from its use
+; within a bundle, exercising the bundle scheduler's reaching-defs mutation and its
+; ability to insert additional edges at the end of a basic block.
+; CHECK-LABEL: loopy2:
+; CHECK: v[[R:[0-9]+]] = vld{{.*}}
+; CHECK-NEXT: _ = sdelay $0x3 }
+; CHECK-NEXT: v[[R:[0-9]+]] = vimm{{.*}} }
+; CHECK-LABEL: .LBB3_1:
+; CHECK: _ = snop
+; CHECK: v[[R:[0-9]+]] = vimm{{.*}} }
+define <8 x float> @loopy2(<8 x float> addrspace(201)* %p, <8 x float> addrspace(201)* %p2, <8 x float> %b, i1 %q) {
+entry:
+ %c = load <8 x float>, <8 x float> addrspace(201)* %p
+ br label %loop
+
+loop:
+ %ld = phi <8 x float> [ %c, %entry ], [ zeroinitializer, %loop ]
+ %acc = phi <8 x float> [ zeroinitializer, %entry ], [ %g, %loop ]
+ %i = phi i32 [ 0, %entry ], [ %inc, %loop ]
+ %g = fadd <8 x float> %ld, %acc
+ %inc = add i32 %i, 1
+ br i1 %q, label %loop, label %fallthrough
+
+fallthrough:
+ ret <8 x float> %acc
+}
+
+; CHECK-LABEL: loopy3:
+; CHECK: v[[R:[0-9]+]] = vld
+; CHECK: sdelay $0x3
+; CHECK: sbr.rel
+; CHECK: snop
+define <8 x float> @loopy3(<8 x float> addrspace(201)* %p, <8 x float> addrspace(201)* %p2, <8 x float> %b, i1 %q) {
+entry:
+ br label %loop
+
+loop:
+ %ld = phi <8 x float> [ zeroinitializer, %entry ], [ %d, %loop ]
+ %acc = phi <8 x float> [ zeroinitializer, %entry ], [ %g, %loop ]
+ %i = phi i32 [ 0, %entry ], [ %inc, %loop ]
+ %d = load <8 x float>, <8 x float> addrspace(201)* %p2
+ %g = fadd <8 x float> %ld, %acc
+ %inc = add i32 %i, 1
+ br i1 %q, label %loop, label %fallthrough
+
+fallthrough:
+ ret <8 x float> %acc
+}
+
+; CHECK-LABEL: transitive:
+; CHECK: v[[R:[0-9]+]] = vld
+; CHECK: {
+; CHECK: {
+; CHECK-NOT: {
+; CHECK: { _ = snop }
+; CHECK-NEXT: { _ = snop }
+; CHECK: vadd.f32
+define <8 x float> @transitive(<8 x float> addrspace(201)* %p, <8 x float> addrspace(201)* %p2, <8 x float> %b, i1 %q, i1 %q2) {
+entry:
+ br i1 %q, label %t, label %f
+
+t:
+ %c = load <8 x float>, <8 x float> addrspace(201)* %p
+ br label %f
+
+f:
+ %e = phi <8 x float> [ %c, %t ], [ zeroinitializer, %entry ]
+ %d = load <8 x float>, <8 x float> addrspace(201)* %p2
+ br i1 %q2, label %fallthrough, label %end
+
+fallthrough:
+ %g = fadd <8 x float> %e, %b
+ br label %end
+
+end:
+ %h = phi <8 x float> [ %g, %fallthrough ], [ zeroinitializer, %f ]
+ ret <8 x float> %h
+}
+
+; Test that we consider all paths a reaching definition can take.
+; CHECK-LABEL: multiplepath:
+; CHECK: v[[R:[0-9]+]] = vld
+; CHECK: .LBB6_3
+; CHECK: sdelay $0x3
+; CHECK: vadd.f32
+define <8 x float> @multiplepath(<8 x float> addrspace(201)* %p, <8 x float> addrspace(201)* %p2, <8 x float> %b, i1 %q) {
+entry:
+ %c = load <8 x float>, <8 x float> addrspace(201)* %p
+ br i1 %q, label %f1, label %f
+
+f:
+ %d = load <8 x float>, <8 x float> addrspace(201)* %p2
+ store <8 x float> %d, <8 x float> addrspace(201)* %p
+ br label %fallthrough
+
+f1:
+ store <8 x float> %b, <8 x float> addrspace(201)* %p
+ br label %fallthrough
+
+fallthrough:
+ %g = fadd <8 x float> %c, %b
+ ret <8 x float> %g
+}
+
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/read_register_intrinsics_jfc_dfc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/read_register_intrinsics_jfc_dfc.ll
new file mode 100644
index 0000000..1fbc741
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/read_register_intrinsics_jfc_dfc.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -o - %s -mcpu=tensorcore-jf -asm-verbose=false -disable-cgp | FileCheck %s
+; RUN: llc -o - %s -mcpu=tensorcore-df -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+; Test selection of no-operand read register intrinsics.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 @llvm.tpu.rdreg.lcc.lo() nounwind
+declare i32 @llvm.tpu.rdreg.lcc.hi() nounwind
+declare i32 @llvm.tpu.rdreg.gtc.lo() nounwind
+declare i32 @llvm.tpu.rdreg.gtc.hi() nounwind
+declare i32 @llvm.tpu.rdreg.crr.lo() nounwind
+declare i32 @llvm.tpu.rdreg.crr.hi() nounwind
+declare i32 @llvm.tpu.rdreg.btr() nounwind
+declare i32 @llvm.tpu.rdreg.tag() nounwind
+declare i32 @llvm.tpu.rdreg.tm() nounwind
+declare { i32, i32 } @llvm.tpu.read.local.cycle.count() nounwind
+
+define i32 @llc.lo() {
+; CHECK-LABEL: llc.lo:
+; CHECK: { s0 = srdreg.lcclo;
+; CHECK-NEXT: _ = shalt }
+ %v0 = call i32 @llvm.tpu.rdreg.lcc.lo()
+ ret i32 %v0
+}
+define i32 @llc.hi() {
+; CHECK-LABEL: llc.hi:
+; CHECK: { s0 = srdreg.lcchi;
+; CHECK-NEXT: _ = shalt }
+ %v0 = call i32 @llvm.tpu.rdreg.lcc.hi()
+ ret i32 %v0
+}
+define i32 @gtc.lo() {
+; CHECK-LABEL: gtc.lo:
+; CHECK: { s0 = srdreg.gtclo;
+; CHECK-NEXT: _ = shalt }
+ %v0 = call i32 @llvm.tpu.rdreg.gtc.lo()
+ ret i32 %v0
+}
+define i32 @gtc.hi() {
+; CHECK-LABEL: gtc.hi:
+; CHECK: { s0 = srdreg.gtchi;
+; CHECK-NEXT: _ = shalt }
+ %v0 = call i32 @llvm.tpu.rdreg.gtc.hi()
+ ret i32 %v0
+}
+define i32 @crr.lo() {
+; CHECK-LABEL: crr.lo:
+; CHECK: { s0 = srdreg.crrlo;
+; CHECK-NEXT: _ = shalt }
+ %v0 = call i32 @llvm.tpu.rdreg.crr.lo()
+ ret i32 %v0
+}
+define i32 @crr.hi() {
+; CHECK-LABEL: crr.hi:
+; CHECK: { s0 = srdreg.crrhi;
+; CHECK-NEXT: _ = shalt }
+ %v0 = call i32 @llvm.tpu.rdreg.crr.hi()
+ ret i32 %v0
+}
+define i32 @btr() {
+; CHECK-LABEL: btr:
+; CHECK: { s0 = srdreg.btr;
+; CHECK-NEXT: _ = shalt }
+ %v0 = call i32 @llvm.tpu.rdreg.btr()
+ ret i32 %v0
+}
+define i32 @tag() {
+; CHECK-LABEL: tag:
+; CHECK: { s0 = srdreg.tag;
+; CHECK-NEXT: _ = shalt }
+ %v0 = call i32 @llvm.tpu.rdreg.tag()
+ ret i32 %v0
+}
+define i32 @tm() {
+; CHECK-LABEL: tm:
+; CHECK: { s0 = srdreg.tm;
+; CHECK-NEXT: _ = shalt }
+ %v0 = call i32 @llvm.tpu.rdreg.tm()
+ ret i32 %v0
+}
+define i32 @readlcc() {
+; CHECK-LABEL: readlcc:
+; CHECK: { s0 = srdreg.lcclo;
+; CHECK-NEXT: s1 = srdreg.lcchi }
+; CHECK-NEXT: { _ = shalt }
+ %v0 = call { i32, i32 } @llvm.tpu.read.local.cycle.count()
+ %v0ext = extractvalue { i32, i32 } %v0, 0
+ ret i32 %v0ext
+}
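
Aside: the llvm.tpu.read.local.cycle.count intrinsic in the test above returns a { i32, i32 } pair because the counter is exposed as two 32-bit register reads (srdreg.lcclo and srdreg.lcchi). A rough C sketch of how a caller would reassemble the 64-bit value, assuming (as the lo/hi naming suggests) that lcchi holds the upper 32 bits; this is illustrative only and not part of the CL:

#include <stdint.h>

/* Reassemble a 64-bit cycle count from the {lo, hi} pair returned by the
   read-cycle-count intrinsic. */
static uint64_t combine_cycle_count(uint32_t lo, uint32_t hi) {
  return ((uint64_t)hi << 32) | (uint64_t)lo;
}
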
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/read_register_intrinsics_pfc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/read_register_intrinsics_pfc.ll
new file mode 100644
index 0000000..77ee396
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/read_register_intrinsics_pfc.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -o - %s -mcpu=tensorcore-pf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+; Test selection of no-operand read register intrinsics.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 @llvm.tpu.rdreg.lcc.lo() nounwind
+declare i32 @llvm.tpu.rdreg.lcc.hi() nounwind
+declare i32 @llvm.tpu.rdreg.gtc.lo() nounwind
+declare i32 @llvm.tpu.rdreg.gtc.hi() nounwind
+declare i32 @llvm.tpu.rdreg.tag() nounwind
+declare i32 @llvm.tpu.rdreg.tm() nounwind
+declare { i32, i32 } @llvm.tpu.read.local.cycle.count() nounwind
+
+define i32 @llc.lo() {
+; CHECK-LABEL: llc.lo:
+; CHECK: { s0 = srdreg.lcclo;
+; CHECK-NEXT: _ = shalt }
+ %v0 = call i32 @llvm.tpu.rdreg.lcc.lo()
+ ret i32 %v0
+}
+define i32 @llc.hi() {
+; CHECK-LABEL: llc.hi:
+; CHECK: { s0 = srdreg.lcchi;
+; CHECK-NEXT: _ = shalt }
+ %v0 = call i32 @llvm.tpu.rdreg.lcc.hi()
+ ret i32 %v0
+}
+define i32 @gtc.lo() {
+; CHECK-LABEL: gtc.lo:
+; CHECK: { s0 = srdreg.gtclo;
+; CHECK-NEXT: _ = shalt }
+ %v0 = call i32 @llvm.tpu.rdreg.gtc.lo()
+ ret i32 %v0
+}
+define i32 @gtc.hi() {
+; CHECK-LABEL: gtc.hi:
+; CHECK: { s0 = srdreg.gtchi;
+; CHECK-NEXT: _ = shalt }
+ %v0 = call i32 @llvm.tpu.rdreg.gtc.hi()
+ ret i32 %v0
+}
+define i32 @tag() {
+; CHECK-LABEL: tag:
+; CHECK: { s0 = srdreg.tag;
+; CHECK-NEXT: _ = shalt }
+ %v0 = call i32 @llvm.tpu.rdreg.tag()
+ ret i32 %v0
+}
+define i32 @tm() {
+; CHECK-LABEL: tm:
+; CHECK: { s0 = srdreg.tm;
+; CHECK-NEXT: _ = shalt }
+ %v0 = call i32 @llvm.tpu.rdreg.tm()
+ ret i32 %v0
+}
+define i32 @readlcc() {
+; CHECK-LABEL: readlcc:
+; CHECK: { s0 = srdreg.lcclo;
+; CHECK-NEXT: s1 = srdreg.lcchi }
+; CHECK-NEXT: { _ = shalt }
+ %v0 = call { i32, i32 } @llvm.tpu.read.local.cycle.count()
+ %v0ext = extractvalue { i32, i32 } %v0, 0
+ ret i32 %v0ext
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/read_register_intrinsics_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/read_register_intrinsics_sc.ll
new file mode 100644
index 0000000..1715b44
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/read_register_intrinsics_sc.ll
@@ -0,0 +1,205 @@
+; RUN: llc -o - %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+; Test selection of no-operand read register intrinsics.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 @llvm.tpu.rdreg.tag() nounwind
+declare i32 @llvm.tpu.rdreg.yieldreq() nounwind
+declare i32 @llvm.tpu.rdreg.tm() nounwind
+declare i32 @llvm.tpu.rdreg.scid() nounwind
+declare i32 @llvm.tpu.rdreg.tbm() nounwind
+declare i32 @llvm.tpu.rdreg.fsr() nounwind
+declare i32 @llvm.tpu.rdreg.ddr() nounwind
+declare i32 @llvm.tpu.rdreg.dmacrdt() nounwind
+declare i32 @llvm.tpu.rdreg.lcc.lo() nounwind
+declare i32 @llvm.tpu.rdreg.lcc.hi() nounwind
+declare i32 @llvm.tpu.rdreg.gtc.lo() nounwind
+declare i32 @llvm.tpu.rdreg.gtc.hi() nounwind
+declare { i32, i32 } @llvm.tpu.read.global.cycle.count() nounwind
+declare { i32, i32 } @llvm.tpu.read.local.cycle.count() nounwind
+declare void @llvm.tpu.setreg.pdepth(i32)
+declare void @llvm.tpu.setreg.tag(i32)
+declare void @llvm.tpu.setreg.ifvalue(i32)
+declare void @llvm.tpu.setreg.dmacrdt(i32)
+declare void @llvm.tpu.setreg.sflagrange(i32)
+
+define i32 @read_tag() {
+; CHECK-LABEL: read_tag:
+; CHECK: { s0 = srdreg.tag;
+; CHECK-NEXT: _ = shalt }
+ %v0 = call i32 @llvm.tpu.rdreg.tag()
+ ret i32 %v0
+}
+
+define i32 @read_tm() {
+; CHECK-LABEL: read_tm:
+; CHECK: { s0 = srdreg.tm;
+; CHECK-NEXT: _ = shalt }
+ %v0 = call i32 @llvm.tpu.rdreg.tm()
+ ret i32 %v0
+}
+
+define i32 @read_yieldreq() {
+; CHECK-LABEL: read_yieldreq:
+; CHECK: { s0 = srdreg.yieldreq;
+; CHECK-NEXT: _ = shalt }
+ %v0 = call i32 @llvm.tpu.rdreg.yieldreq()
+ ret i32 %v0
+}
+
+define i32 @read_scid() {
+; CHECK-LABEL: read_scid:
+; CHECK: { s0 = srdreg.scid;
+; CHECK-NEXT: _ = shalt }
+ %v0 = call i32 @llvm.tpu.rdreg.scid()
+ ret i32 %v0
+}
+
+define i32 @read_tbm() {
+; CHECK-LABEL: read_tbm:
+; CHECK: { s0 = srdreg.tbm;
+; CHECK-NEXT: _ = shalt }
+ %v0 = call i32 @llvm.tpu.rdreg.tbm()
+ ret i32 %v0
+}
+
+define i32 @read_fsr() {
+; CHECK-LABEL: read_fsr:
+; CHECK: { s0 = srdreg.fsr;
+; CHECK-NEXT: _ = shalt }
+ %v0 = call i32 @llvm.tpu.rdreg.fsr()
+ ret i32 %v0
+}
+
+define i32 @read_ddr() {
+; CHECK-LABEL: read_ddr:
+; CHECK: { s0 = srdreg.ddr;
+; CHECK-NEXT: _ = shalt }
+ %v0 = call i32 @llvm.tpu.rdreg.ddr()
+ ret i32 %v0
+}
+
+define i32 @read_dmacrdt() {
+; CHECK-LABEL: read_dmacrdt:
+; CHECK: { s0 = srdreg.dmacrdt;
+; CHECK-NEXT: _ = shalt }
+ %v0 = call i32 @llvm.tpu.rdreg.dmacrdt()
+ ret i32 %v0
+}
+
+define { i32, i32 } @read_lcc() {
+; CHECK-LABEL: read_lcc:
+; CHECK: { s0 = srdreg.lcclo }
+; CHECK-NEXT: { s1 = srdreg.lcchi
+; CHECK-NEXT: _ = shalt }
+ %lo = call i32 @llvm.tpu.rdreg.lcc.lo()
+ %hi = call i32 @llvm.tpu.rdreg.lcc.hi()
+ %r0 = insertvalue {i32, i32} undef, i32 %lo, 0
+ %r1 = insertvalue {i32, i32} %r0, i32 %hi, 1
+ ret {i32, i32} %r1
+}
+
+define { i32, i32 } @read_gtc() {
+; CHECK-LABEL: read_gtc:
+; CHECK: { s0 = srdreg.gtclo }
+; CHECK-NEXT: { s1 = srdreg.gtchi
+; CHECK-NEXT: _ = shalt }
+ %lo = call i32 @llvm.tpu.rdreg.gtc.lo()
+ %hi = call i32 @llvm.tpu.rdreg.gtc.hi()
+ %r0 = insertvalue {i32, i32} undef, i32 %lo, 0
+ %r1 = insertvalue {i32, i32} %r0, i32 %hi, 1
+ ret {i32, i32} %r1
+}
+
+; CHECK-LABEL: set_pdepth_r
+; CHECK: (pdepth) = ssetpdepth s0
+define void @set_pdepth_r(i32 %a) {
+ call void @llvm.tpu.setreg.pdepth(i32 %a)
+ ret void
+}
+
+; CHECK-LABEL: set_pdepth_i
+; CHECK: (pdepth) = ssetpdepth $0xa
+define void @set_pdepth_i() {
+ call void @llvm.tpu.setreg.pdepth(i32 10)
+ ret void
+}
+
+; CHECK-LABEL: set_tag_r
+; CHECK: (tag) = ssettag s0
+define void @set_tag_r(i32 %a) {
+ call void @llvm.tpu.setreg.tag(i32 %a)
+ ret void
+}
+
+; CHECK-LABEL: set_tag_i
+; CHECK: (tag) = ssettag $0xa
+define void @set_tag_i() {
+ call void @llvm.tpu.setreg.tag(i32 10)
+ ret void
+}
+
+; CHECK-LABEL: set_ifvalue_r
+; CHECK: (ifvalue) = ssetifvalue s0
+define void @set_ifvalue_r(i32 %a) {
+ call void @llvm.tpu.setreg.ifvalue(i32 %a)
+ ret void
+}
+
+; CHECK-LABEL: set_ifvalue_i
+; CHECK: (ifvalue) = ssetifvalue $0xa
+define void @set_ifvalue_i() {
+ call void @llvm.tpu.setreg.ifvalue(i32 10)
+ ret void
+}
+
+; CHECK-LABEL: set_dmacrdt_r
+; CHECK: (dmacrdt) = ssetdmacrdt s0
+define void @set_dmacrdt_r(i32 %a) {
+ call void @llvm.tpu.setreg.dmacrdt(i32 %a)
+ ret void
+}
+
+; CHECK-LABEL: set_dmacrdt_i
+; CHECK: (dmacrdt) = ssetdmacrdt $0xa
+define void @set_dmacrdt_i() {
+ call void @llvm.tpu.setreg.dmacrdt(i32 10)
+ ret void
+}
+
+; CHECK-LABEL: set_sflagrange_r
+; CHECK: (sflagrange) = ssetsflagrange s0
+define void @set_sflagrange_r(i32 %a) {
+ call void @llvm.tpu.setreg.sflagrange(i32 %a)
+ ret void
+}
+
+; CHECK-LABEL: set_sflagrange_i
+; CHECK: (sflagrange) = ssetsflagrange $0xa
+define void @set_sflagrange_i() {
+ call void @llvm.tpu.setreg.sflagrange(i32 10)
+ ret void
+}
+
+; CHECK-LABEL: read_global_cycle_count
+; CHECK: { s0 = srdreg.gtclo;
+; CHECK: s1 = srdreg.gtchi }
+; CHECK: { _ = shalt }
+define i32 @read_global_cycle_count() {
+ %gtc = call { i32, i32 } @llvm.tpu.read.global.cycle.count()
+ %gtc_lo = extractvalue { i32, i32 } %gtc, 0
+ ret i32 %gtc_lo
+}
+
+; CHECK-LABEL: read_local_cycle_count
+; CHECK: { s0 = srdreg.lcclo;
+; CHECK: s1 = srdreg.lcchi }
+; CHECK: { _ = shalt }
+define i32 @read_local_cycle_count() {
+ %lcc = call { i32, i32 } @llvm.tpu.read.local.cycle.count()
+ %lcc_lo = extractvalue { i32, i32 } %lcc, 0
+ ret i32 %lcc_lo
+}
\ No newline at end of file
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/read_register_intrinsics_vfc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/read_register_intrinsics_vfc.ll
new file mode 100644
index 0000000..db38c3e
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/read_register_intrinsics_vfc.ll
@@ -0,0 +1,68 @@
+; RUN: llc -o - %s -mcpu=tensorcore-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+; Test selection of no-operand read register intrinsics.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 @llvm.tpu.rdreg.tag() nounwind
+declare i32 @llvm.tpu.rdreg.yieldreq() nounwind
+declare i32 @llvm.tpu.rdreg.tcid() nounwind
+declare i32 @llvm.tpu.rdreg.tm() nounwind
+declare { i32, i32 } @llvm.tpu.read.local.cycle.count() nounwind
+
+define i32 @tag() {
+; CHECK-LABEL: tag:
+; CHECK: { s0 = srdreg.tag;
+; CHECK-NEXT: _ = shalt }
+ %v0 = call i32 @llvm.tpu.rdreg.tag()
+ ret i32 %v0
+}
+define i32 @tm() {
+; CHECK-LABEL: tm:
+; CHECK: { s0 = srdreg.tm;
+; CHECK-NEXT: _ = shalt }
+ %v0 = call i32 @llvm.tpu.rdreg.tm()
+ ret i32 %v0
+}
+define i32 @yieldreq() {
+; CHECK-LABEL: yieldreq:
+; CHECK: { s0 = srdreg.yieldreq;
+; CHECK-NEXT: _ = shalt }
+ %v0 = call i32 @llvm.tpu.rdreg.yieldreq()
+ ret i32 %v0
+}
+define i32 @tcid() {
+; CHECK-LABEL: tcid:
+; CHECK: { s0 = srdreg.tcid;
+; CHECK-NEXT: _ = shalt }
+ %v0 = call i32 @llvm.tpu.rdreg.tcid()
+ ret i32 %v0
+}
+define i32 @readlcc() {
+; CHECK-LABEL: readlcc:
+; CHECK: { s0 = srdreg.lcclo;
+; CHECK-NEXT: s1 = srdreg.lcchi }
+; CHECK-NEXT: { _ = shalt }
+ %v0 = call { i32, i32 } @llvm.tpu.read.local.cycle.count()
+ %v0ext = extractvalue { i32, i32 } %v0, 0
+ ret i32 %v0ext
+}
+
+; Test selection of set next tag register intrinsics.
+declare void @llvm.tpu.setreg.tag(i32)
+
+; CHECK-LABEL: set_tag_r
+; CHECK: (tag) = ssettag s0
+define void @set_tag_r(i32 %a) {
+ call void @llvm.tpu.setreg.tag(i32 %a)
+ ret void
+}
+
+; CHECK-LABEL: set_tag_i
+; CHECK: (tag) = ssettag $0xa
+define void @set_tag_i() {
+ call void @llvm.tpu.setreg.tag(i32 10)
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/remat.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/remat.ll
new file mode 100644
index 0000000..63cb7ff
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/remat.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -mcpu=tensorcore-pf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; External functions
+declare void @g1(<1024 x i32>)
+
+declare <1024 x i32> @llvm.tpu.vlaneseq() #0
+
+; CHECK-LABEL: remat
+; CHECK: = vlaneseq.u32
+; CHECK: lr = call g1
+; CHECK: = vlaneseq.u32;
+define <1024 x i32> @remat() {
+ %laneseq = call <1024 x i32> @llvm.tpu.vlaneseq()
+ call void @g1(<1024 x i32> %laneseq)
+ ret <1024 x i32> %laneseq
+}
\ No newline at end of file
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/resource_scheduler.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/resource_scheduler.ll
new file mode 100644
index 0000000..9386179
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/resource_scheduler.ll
@@ -0,0 +1,35 @@
+; RUN: opt < %s -S -passes=loop-unroll -mcpu=tensorcore-pf | \
+; RUN: llc -mcpu=tensorcore-pf -asm-verbose=false -disable-cgp -tpu-use-resource-swing-sched=true
+; REQUIRES: tpu
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; Check that we don't spill after unrolling a loop with many add/load/store
+; instructions. Spilling would cause an assert, as we are not setting the spill
+; memory location metadata. The scheduler shouldn't increase register pressure
+; to the point where we need to spill.
+define void @add_store(<1024 x float> addrspace(205)* noalias %in1, <1024 x float> addrspace(205)* noalias %in2, <1024 x float> addrspace(205)* noalias %out) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.09 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds <1024 x float>, <1024 x float> addrspace(205)* %in1, i32 %i.09
+ %0 = load <1024 x float>, <1024 x float> addrspace(205)* %arrayidx
+ %arrayidx1 = getelementptr inbounds <1024 x float>, <1024 x float> addrspace(205)* %in2, i32 %i.09
+ %1 = load <1024 x float>, <1024 x float> addrspace(205)* %arrayidx1
+ %add = fadd <1024 x float> %0, %1
+ %arrayidx2 = getelementptr inbounds <1024 x float>, <1024 x float> addrspace(205)* %out, i32 %i.09
+ store <1024 x float> %add, <1024 x float> addrspace(205)* %arrayidx2
+ %inc = add nuw nsw i32 %i.09, 1
+ %exitcond = icmp eq i32 %inc, 32
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !0
+
+ for.cond.cleanup: ; preds = %for.body
+ ret void
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.unroll.enable"}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/ret.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/ret.ll
new file mode 100644
index 0000000..ca4ee80
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/ret.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -asm-verbose=false -mcpu=sparsecore-tec-vf -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+; Test that passing arguments and returning values works.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; CHECK-LABEL: f_scalar:
+; CHECK: shalt
+define void @f_scalar(i32 %a, float %b, i1 %c) {
+ ret void
+}
+
+; CHECK-LABEL: f_vector:
+; CHECK: shalt
+define <8 x i1> @f_vector(<8 x i32> %a, <8 x float> %b, <8 x i1> %c) {
+ ret <8 x i1> %c
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/s_pred_spill_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/s_pred_spill_sc.ll
new file mode 100644
index 0000000..1490987
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/s_pred_spill_sc.ll
@@ -0,0 +1,305 @@
+; RUN: llc -O2 < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -tpu-opt-spill-to-dreg=false \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+; Same test as sc_pred_spill, plus various sreg/gpr live ranges while predicates
+; are being spilled. The intent was to make the register scavenger spill a gpr
+; in order to spill a predicate register. Even with this stress test, however,
+; there always seems to be a gpr available that had already been spilled, so
+; this test doesn't exercise what was originally intended. We are still keeping
+; it in our regression suite, because it should be the first point of failure
+; if an issue with the register scavenger ever appears.
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+!smem.funcs.spill = !{!0}
+!smem.ranges.spill.start = !{!1}
+!smem.ranges.spill.limit = !{!2}
+!tilespmem.ranges.spill.start = !{!1}
+!tilespmem.ranges.spill.limit = !{!2}
+
+!0 = !{void (<8 x i32> addrspace(201)*)* @spill_pred_to_smem}
+!1 = !{i32 100}
+!2 = !{i32 200}
+
+; Function Attrs: nounwind readnone
+declare <8 x i32> @llvm.tpu.vlaneseq() #0
+declare i32* @llvm.tpu.inttoptr.p0i32(i32)
+declare <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32) nounwind
+attributes #0 = { nounwind readnone }
+attributes #1 = { optnone noinline }
+
+@unknownglob1 = global i32 zeroinitializer
+@unknownglob2 = global i32 zeroinitializer
+@unknownglob3 = global i32 zeroinitializer
+@unknownglob4 = global i32 zeroinitializer
+@unknownglob5 = global i32 zeroinitializer
+@unknownglob6 = global i32 zeroinitializer
+@unknownglob7 = global i32 zeroinitializer
+@unknownglob8 = global i32 zeroinitializer
+@unknownglob9 = global i32 zeroinitializer
+@unknownglob10 = global i32 zeroinitializer
+@unknownglob11 = global i32 zeroinitializer
+@unknownglob12 = global i32 zeroinitializer
+@unknownglob13 = global i32 zeroinitializer
+@unknownglob14 = global i32 zeroinitializer
+@unknownglob15 = global i32 zeroinitializer
+@unknownglob16 = global i32 zeroinitializer
+@unknownglob17 = global i32 zeroinitializer
+@unknownglob18 = global i32 zeroinitializer
+@unknownglob19 = global i32 zeroinitializer
+@unknownglob20 = global i32 zeroinitializer
+@unknownglob21 = global i32 zeroinitializer
+@unknownglob22 = global i32 zeroinitializer
+@unknownglob23 = global i32 zeroinitializer
+@unknownglob24 = global i32 zeroinitializer
+
+; CHECK-LABEL: spill_pred_to_smem:
+; CHECK: s[[#sp:]] = simm.s32 @!p[[#pr:]] $0x0
+; CHECK: s[[#sp]] = simm.s32 @p[[#pr]] $0x1
+; CHECK: [smem:$0x9f] = sst s[[#sp]]
+; CHECK: s[[#sf:]] = sld [smem:$0x9f]
+; CHECK: p{{[0-9]+}} = seq.s32 s[[#sf]], $0x1;
+define void @spill_pred_to_smem(<8 x i32> addrspace(201)* %unknownptr) #1 {
+entry:
+ %mem0 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 0)
+ %mem1 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 1)
+ %mem2 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 2)
+ %mem3 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 3)
+ %mem4 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 4)
+ %mem5 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 5)
+ %mem6 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 6)
+ %mem7 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 7)
+ %mem8 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 8)
+ %mem9 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 9)
+ %mem10 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 10)
+ %mem11 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 11)
+ %mem12 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 12)
+ %mem13 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 13)
+ %mem14 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 14)
+ %mem15 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 15)
+ %mem16 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 16)
+ %mem17 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 17)
+ %mem18 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 18)
+ %mem19 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 19)
+ %mem20 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 20)
+ %mem21 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 21)
+ %mem22 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 22)
+ %mem23 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 23)
+ %mem24 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 24)
+ %mem25 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 25)
+ %mem26 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 26)
+ %mem27 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 27)
+ %mem28 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 28)
+ %mem29 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 29)
+ %mem30 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 30)
+ %mem31 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 31)
+ %mem32 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 32)
+ %mem33 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 33)
+ %mem34 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 34)
+ %mem35 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 35)
+ %mem36 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 36)
+ %mem37 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 37)
+ %mem38 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 38)
+ %mem39 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 39)
+ %mem40 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 40)
+ %sval0 = load i32, i32* %mem0
+ %sval1 = load i32, i32* %mem1
+ %sval2 = load i32, i32* %mem2
+ %sval3 = load i32, i32* %mem3
+ %sval4 = load i32, i32* %mem4
+ %sval5 = load i32, i32* %mem5
+ %sval6 = load i32, i32* %mem6
+ %sval7 = load i32, i32* %mem7
+ %sval8 = load i32, i32* %mem8
+ %sval9 = load i32, i32* %mem9
+ %sval10 = load i32, i32* %mem10
+ %sval11 = load i32, i32* %mem11
+ %sval12 = load i32, i32* %mem12
+ %sval13 = load i32, i32* %mem13
+ %sval14 = load i32, i32* %mem14
+ %sval15 = load i32, i32* %mem15
+ %sval16 = load i32, i32* %mem16
+ %sval17 = load i32, i32* %mem17
+ %sval18 = load i32, i32* %mem18
+ %sval19 = load i32, i32* %mem19
+ %sval20 = load i32, i32* %mem20
+ %sval21 = load i32, i32* %mem21
+ %sval22 = load i32, i32* %mem22
+ %sval23 = load i32, i32* %mem23
+ %sval24 = load i32, i32* %mem24
+ %sval25 = load i32, i32* %mem25
+ %sval26 = load i32, i32* %mem26
+ %sval27 = load i32, i32* %mem27
+ %sval28 = load i32, i32* %mem28
+ %sval29 = load i32, i32* %mem29
+ %sval30 = load i32, i32* %mem30
+ %sval31 = load i32, i32* %mem31
+ %sval32 = load i32, i32* %mem32
+ %sval33 = load i32, i32* %mem33
+ %sval34 = load i32, i32* %mem34
+ %sval35 = load i32, i32* %mem35
+ %sval36 = load i32, i32* %mem36
+ %sval37 = load i32, i32* %mem37
+ %sval38 = load i32, i32* %mem38
+ %sval39 = load i32, i32* %mem39
+ %sval40 = load i32, i32* %mem40
+
+ %laneseq = call <8 x i32> @llvm.tpu.vlaneseq()
+ %splatinsert = insertelement <8 x i32> undef, i32 127, i32 0
+ %splat = shufflevector <8 x i32> %splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
+ %base_rec = and <8 x i32> %laneseq, %splat
+ %zero = xor i32 1, 1
+
+ %unknownval1 = load i32, i32* @unknownglob1
+ %unknownval2 = load i32, i32* @unknownglob2
+ %unknownval3 = load i32, i32* @unknownglob3
+ %unknownval4 = load i32, i32* @unknownglob4
+ %unknownval5 = load i32, i32* @unknownglob5
+ %unknownval6 = load i32, i32* @unknownglob6
+ %unknownval7 = load i32, i32* @unknownglob7
+ %unknownval8 = load i32, i32* @unknownglob8
+ %unknownval9 = load i32, i32* @unknownglob9
+ %unknownval10 = load i32, i32* @unknownglob10
+ %unknownval11 = load i32, i32* @unknownglob11
+ %unknownval12 = load i32, i32* @unknownglob12
+ %unknownval13 = load i32, i32* @unknownglob13
+ %unknownval14 = load i32, i32* @unknownglob14
+ %unknownval15 = load i32, i32* @unknownglob15
+ %unknownval16 = load i32, i32* @unknownglob16
+ %unknownval17 = load i32, i32* @unknownglob17
+ %unknownval18 = load i32, i32* @unknownglob18
+ %unknownval19 = load i32, i32* @unknownglob19
+ %unknownval20 = load i32, i32* @unknownglob20
+ %unknownval21 = load i32, i32* @unknownglob21
+ %unknownval22 = load i32, i32* @unknownglob22
+ %unknownval23 = load i32, i32* @unknownglob23
+ %unknownval24 = load i32, i32* @unknownglob24
+
+ %cmp1 = icmp eq i32 %zero, %unknownval1
+ %cmp2 = icmp eq i32 %zero, %unknownval2
+ %cmp3 = icmp eq i32 %zero, %unknownval3
+ %cmp4 = icmp eq i32 %zero, %unknownval4
+ %cmp5 = icmp eq i32 %zero, %unknownval5
+ %cmp6 = icmp eq i32 %zero, %unknownval6
+ %cmp7 = icmp eq i32 %zero, %unknownval7
+ %cmp8 = icmp eq i32 %zero, %unknownval8
+ %cmp9 = icmp eq i32 %zero, %unknownval9
+ %cmp10 = icmp eq i32 %zero, %unknownval10
+ %cmp11 = icmp eq i32 %zero, %unknownval11
+ %cmp12 = icmp eq i32 %zero, %unknownval12
+ %cmp13 = icmp eq i32 %zero, %unknownval13
+ %cmp14 = icmp eq i32 %zero, %unknownval14
+ %cmp15 = icmp eq i32 %zero, %unknownval15
+ %cmp16 = icmp eq i32 %zero, %unknownval16
+ %cmp17 = icmp eq i32 %zero, %unknownval17
+ %cmp18 = icmp eq i32 %zero, %unknownval18
+ %cmp19 = icmp eq i32 %zero, %unknownval19
+ %cmp20 = icmp eq i32 %zero, %unknownval20
+ %cmp21 = icmp eq i32 %zero, %unknownval21
+ %cmp22 = icmp eq i32 %zero, %unknownval22
+ %cmp23 = icmp eq i32 %zero, %unknownval23
+ %cmp24 = icmp eq i32 %zero, %unknownval24
+
+ %unknownval = load <8 x i32>, <8 x i32> addrspace(201)* %unknownptr
+ %sel1 = select i1 %cmp1, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel2 = select i1 %cmp2, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel3 = select i1 %cmp3, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel4 = select i1 %cmp4, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel5 = select i1 %cmp5, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel6 = select i1 %cmp6, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel7 = select i1 %cmp7, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel8 = select i1 %cmp8, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel9 = select i1 %cmp9, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel10 = select i1 %cmp10, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel11 = select i1 %cmp11, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel12 = select i1 %cmp12, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel13 = select i1 %cmp13, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel14 = select i1 %cmp14, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel15 = select i1 %cmp15, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel16 = select i1 %cmp16, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel17 = select i1 %cmp17, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel18 = select i1 %cmp18, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel19 = select i1 %cmp19, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel20 = select i1 %cmp20, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel21 = select i1 %cmp21, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel22 = select i1 %cmp22, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel23 = select i1 %cmp23, <8 x i32> %unknownval, <8 x i32> %base_rec
+ %sel24 = select i1 %cmp24, <8 x i32> %unknownval, <8 x i32> %base_rec
+
+ %add1 = add <8 x i32> %sel1, %sel1
+ %add2 = add <8 x i32> %sel2, %add1
+ %add3 = add <8 x i32> %sel3, %add2
+ %add4 = add <8 x i32> %sel4, %add3
+ %add5 = add <8 x i32> %sel5, %add4
+ %add6 = add <8 x i32> %sel6, %add5
+ %add7 = add <8 x i32> %sel7, %add6
+ %add8 = add <8 x i32> %sel8, %add7
+ %add9 = add <8 x i32> %sel9, %add8
+ %add10 = add <8 x i32> %sel10, %add9
+ %add11 = add <8 x i32> %sel11, %add10
+ %add12 = add <8 x i32> %sel12, %add11
+ %add13 = add <8 x i32> %sel13, %add12
+ %add14 = add <8 x i32> %sel14, %add13
+ %add15 = add <8 x i32> %sel15, %add14
+ %add16 = add <8 x i32> %sel16, %add15
+ %add17 = add <8 x i32> %sel17, %add16
+ %add18 = add <8 x i32> %sel18, %add17
+ %add19 = add <8 x i32> %sel19, %add18
+ %add20 = add <8 x i32> %sel20, %add19
+ %add21 = add <8 x i32> %sel21, %add20
+ %add22 = add <8 x i32> %sel22, %add21
+ %add23 = add <8 x i32> %sel23, %add22
+ %add24 = add <8 x i32> %sel24, %add23
+
+ %result = sub <8 x i32> %add24, %base_rec
+
+ %result_addr = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 0)
+
+ %addsval0 = add i32 %sval0, %sval1
+ %addsval1 = add i32 %addsval0, %sval2
+ %addsval2 = add i32 %addsval1, %sval3
+ %addsval3 = add i32 %addsval2, %sval4
+ %addsval4 = add i32 %addsval3, %sval5
+ %addsval5 = add i32 %addsval4, %sval6
+ %addsval6 = add i32 %addsval5, %sval7
+ %addsval7 = add i32 %addsval6, %sval8
+ %addsval8 = add i32 %addsval7, %sval9
+ %addsval9 = add i32 %addsval8, %sval10
+ %addsval10 = add i32 %addsval9, %sval11
+ %addsval11 = add i32 %addsval10, %sval12
+ %addsval12 = add i32 %addsval11, %sval13
+ %addsval13 = add i32 %addsval12, %sval14
+ %addsval14 = add i32 %addsval13, %sval15
+ %addsval15 = add i32 %addsval14, %sval16
+ %addsval16 = add i32 %addsval15, %sval17
+ %addsval17 = add i32 %addsval16, %sval18
+ %addsval18 = add i32 %addsval17, %sval19
+ %addsval19 = add i32 %addsval18, %sval20
+ %addsval20 = add i32 %addsval19, %sval21
+ %addsval21 = add i32 %addsval20, %sval22
+ %addsval22 = add i32 %addsval21, %sval23
+ %addsval23 = add i32 %addsval22, %sval24
+ %addsval24 = add i32 %addsval23, %sval25
+ %addsval25 = add i32 %addsval24, %sval26
+ %addsval26 = add i32 %addsval25, %sval27
+ %addsval27 = add i32 %addsval26, %sval28
+ %addsval28 = add i32 %addsval27, %sval29
+ %addsval29 = add i32 %addsval28, %sval30
+ %addsval30 = add i32 %addsval29, %sval31
+ %addsval31 = add i32 %addsval30, %sval32
+ %addsval32 = add i32 %addsval31, %sval33
+ %addsval33 = add i32 %addsval32, %sval34
+ %addsval34 = add i32 %addsval33, %sval35
+ %addsval35 = add i32 %addsval34, %sval36
+ %addsval36 = add i32 %addsval35, %sval37
+ %addsval37 = add i32 %addsval36, %sval38
+ %addsval38 = add i32 %addsval37, %sval39
+ %addsval39 = add i32 %addsval38, %sval40
+
+ %total_result = insertelement <8 x i32> %result, i32 %addsval39, i32 1
+ store <8 x i32> %total_result, <8 x i32> addrspace(201)* %result_addr
+
+ ret void
+}
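
For reference, the CHECK lines above encode the predicate spill sequence: the predicate is first materialized into a gpr as 0 or 1 (the two predicated simm.s32 instructions), stored to smem, and on reload turned back into a predicate by comparing against 1. A rough C sketch of that round trip, illustrative only and not part of the CL:

#include <stdbool.h>
#include <stdint.h>

static int32_t spill_slot;  /* stands in for the [smem:$0x9f] slot */

/* Materialize the predicate as 0/1 and store it (simm.s32 @!p / @p + sst). */
static void spill_pred(bool p) { spill_slot = p ? 1 : 0; }

/* Reload and rebuild the predicate with a compare against 1 (sld + seq.s32). */
static bool reload_pred(void) { return spill_slot == 1; }
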
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar.ll
new file mode 100644
index 0000000..d858733
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar.ll
@@ -0,0 +1,90 @@
+; RUN: llc < %s -mcpu=tensorcore-vf -asm-verbose=false -tpu-skip-fast-opt | FileCheck %s
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -tpu-skip-fast-opt | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; sadd.ov.s32
+declare i32 @llvm.tpu.sadd.ov(i32,i32) readnone
+
+; CHECK-LABEL: sadd_ov_r:
+; CHECK: s{{[0-9]+}} = sadd.ov.s32 s{{[0-9]+}}, s{{[0-9]+}}
+define i32 @sadd_ov_r(i32 %y, i32 %x) {
+ %res = call i32 @llvm.tpu.sadd.ov(i32 %y, i32 %x)
+ ret i32 %res
+}
+
+; CHECK-LABEL: sadd_ov_i:
+; CHECK: s{{[0-9]+}} = sadd.ov.s32 $0xd, s{{[0-9]+}}
+define i32 @sadd_ov_i(i32 %x) {
+ %res = call i32 @llvm.tpu.sadd.ov(i32 %x, i32 13)
+ ret i32 %res
+}
+
+; CHECK-LABEL: sadd_ov_i_commutative:
+; CHECK: s{{[0-9]+}} = simm.s32 $0xd
+; CHECK: s{{[0-9]+}} = sadd.ov.s32 s{{[0-9]+}}, s{{[0-9]+}}
+define i32 @sadd_ov_i_commutative(i32 %x) {
+ %res = call i32 @llvm.tpu.sadd.ov(i32 13, i32 %x)
+ ret i32 %res
+}
+
+; ssub.ov.s32
+declare i32 @llvm.tpu.ssub.ov(i32,i32) readnone
+
+; CHECK-LABEL: ssub_ov_r:
+; CHECK: s{{[0-9]+}} = ssub.ov.s32 s{{[0-9]+}}, s{{[0-9]+}}
+define i32 @ssub_ov_r(i32 %y, i32 %x) {
+ %res = call i32 @llvm.tpu.ssub.ov(i32 %y, i32 %x)
+ ret i32 %res
+}
+
+; CHECK-LABEL: ssub_ov_i:
+; CHECK: s{{[0-9]+}} = simm.s32 $0xd
+; CHECK: s{{[0-9]+}} = ssub.ov.s32 s{{[0-9]+}}, s{{[0-9]+}}
+define i32 @ssub_ov_i(i32 %x) {
+ %res = call i32 @llvm.tpu.ssub.ov(i32 %x, i32 13)
+ ret i32 %res
+}
+
+; CHECK-LABEL: ssub_ov_i_commutative:
+; CHECK: s{{[0-9]+}} = ssub.ov.s32 $0xd, s{{[0-9]+}}
+define i32 @ssub_ov_i_commutative(i32 %x) {
+ %res = call i32 @llvm.tpu.ssub.ov(i32 13, i32 %x)
+ ret i32 %res
+}
+
+; sshla.ov.s32
+declare i32 @llvm.tpu.sshla.ov(i32,i32) readnone
+
+; CHECK-LABEL: sshla_ov_r:
+; CHECK: s{{[0-9]+}} = sshla.ov.s32 s{{[0-9]+}}, s{{[0-9]+}}
+define i32 @sshla_ov_r(i32 %y, i32 %x) {
+ %res = call i32 @llvm.tpu.sshla.ov(i32 %y, i32 %x)
+ ret i32 %res
+}
+
+; CHECK-LABEL: sshla_ov_i:
+; CHECK: s{{[0-9]+}} = sshla.ov.s32 s{{[0-9]+}}, $0xd
+define i32 @sshla_ov_i(i32 %x) {
+ %res = call i32 @llvm.tpu.sshla.ov(i32 %x, i32 13)
+ ret i32 %res
+}
+
+; smulhi.u32
+declare i32 @llvm.tpu.smulhi(i32,i32) readnone
+
+; CHECK-LABEL: smulhi_r:
+; CHECK: s{{[0-9]+}} = smulhi.u32 s{{[0-9]+}}, s{{[0-9]+}}
+define i32 @smulhi_r(i32 %y, i32 %x) {
+ %res = call i32 @llvm.tpu.smulhi(i32 %y, i32 %x)
+ ret i32 %res
+}
+
+; CHECK-LABEL: smulhi_i:
+; CHECK: s{{[0-9]+}} = smulhi.u32 $0xd, s{{[0-9]+}}
+define i32 @smulhi_i(i32 %x) {
+ %res = call i32 @llvm.tpu.smulhi(i32 %x, i32 13)
+ ret i32 %res
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_compare.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_compare.ll
new file mode 100644
index 0000000..89c435a
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_compare.ll
@@ -0,0 +1,253 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-SC
+; RUN: llc < %s -mcpu=tensorcore-vf -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-VF
+; RUN: llc < %s -mcpu=tensorcore-pf -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-PF
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; CHECK-LABEL: cmpeq:
+; CHECK: p{{[0-9]+}} = seq.s32 s{{[0-9]+}}, s{{[0-9]+}}
+define i1 @cmpeq(i32 %x, i32 %y) {
+ %a = icmp eq i32 %x, %y
+ ret i1 %a
+}
+
+; CHECK-LABEL: cmpne:
+; CHECK: p{{[0-9]+}} = sne.s32 s{{[0-9]+}}, $0x2a
+define i1 @cmpne(i32 %x, i32 %y) {
+ %a = icmp ne i32 %x, 42
+ ret i1 %a
+}
+
+; CHECK-LABEL: cmpgt:
+; CHECK: p{{[0-9]+}} = sgt.s32 s{{[0-9]+}}, $0x2a
+define i1 @cmpgt(i32 %x, i32 %y) {
+ %a = icmp sgt i32 %x, 42
+ ret i1 %a
+}
+
+; CHECK-LABEL: cmpge:
+; CHECK: p{{[0-9]+}} = sge.s32 s{{[0-9]+}}, s{{[0-9]+}}
+define i1 @cmpge(i32 %x, i32 %y) {
+ %a = icmp sge i32 %x, %y
+ ret i1 %a
+}
+
+; CHECK-LABEL: cmplt:
+; CHECK: p{{[0-9]+}} = slt.s32 s{{[0-9]+}}, $0x2a
+define i1 @cmplt(i32 %x, i32 %y) {
+ %a = icmp slt i32 %x, 42
+ ret i1 %a
+}
+
+; CHECK-LABEL: cmpult:
+; CHECK-PF-DAG: p[[z:[0-9]+]] = slt.s32 s0, s1
+; CHECK-PF-DAG: s[[x:[0-9]+]] = sxor.u32 s1, s0
+; CHECK-PF-DAG: p[[y:[0-9]+]] = slt.s32 s[[x]], $0x0
+; CHECK-PF-NEXT: p0 = por !p[[y]], !p[[y]]
+; CHECK-PF-NEXT: p0 = por @!p[[z]] p[[y]], p[[y]]
+; CHECK-SC: p[[z:[0-9]+]] = slt.u32 s0, s1
+; CHECK-VF: p[[z:[0-9]+]] = slt.u32 s0, s1
+define i1 @cmpult(i32 %x, i32 %y) {
+ %a = icmp ult i32 %x, %y
+ ret i1 %a
+}
+
+; CHECK-LABEL: cmpule:
+; CHECK-PF-DAG: p[[z:[0-9]+]] = sle.s32 s0, s1
+; CHECK-PF-DAG: s[[x:[0-9]+]] = sxor.u32 s1, s0
+; CHECK-PF-DAG: p[[y:[0-9]+]] = slt.s32 s[[x]], $0x0
+; CHECK-PF-NEXT: p0 = por !p[[y]], !p[[y]]
+; CHECK-PF-NEXT: p0 = por @!p[[z]] p[[y]], p[[y]]
+; CHECK-SC: p[[z:[0-9]+]] = sle.u32 s0, s1
+; CHECK-VF: p[[z:[0-9]+]] = sle.u32 s0, s1
+define i1 @cmpule(i32 %x, i32 %y) {
+ %a = icmp ule i32 %x, %y
+ ret i1 %a
+}
+
+; CHECK-LABEL: cmpugt:
+; CHECK-PF-DAG: p[[z:[0-9]+]] = sgt.s32 s0, s1
+; CHECK-PF-DAG: s[[x:[0-9]+]] = sxor.u32 s1, s0
+; CHECK-PF-DAG: p[[y:[0-9]+]] = slt.s32 s[[x]], $0x0
+; CHECK-PF-NEXT: p0 = por !p[[y]], !p[[y]]
+; CHECK-PF-NEXT: p0 = por @!p[[z]] p[[y]], p[[y]]
+; CHECK-SC: p[[z:[0-9]+]] = sgt.u32 s0, s1
+; CHECK-VF: p[[z:[0-9]+]] = sgt.u32 s0, s1
+define i1 @cmpugt(i32 %x, i32 %y) {
+ %a = icmp ugt i32 %x, %y
+ ret i1 %a
+}
+
+; CHECK-LABEL: cmpuge:
+; CHECK-PF-DAG: p[[z:[0-9]+]] = sge.s32 s0, s1
+; CHECK-PF-DAG: s[[x:[0-9]+]] = sxor.u32 s1, s0
+; CHECK-PF-DAG: p[[y:[0-9]+]] = slt.s32 s[[x]], $0x0
+; CHECK-PF-NEXT: p0 = por !p[[y]], !p[[y]]
+; CHECK-PF-NEXT: p0 = por @!p[[z]] p[[y]], p[[y]]
+; CHECK-SC: p[[z:[0-9]+]] = sge.u32 s0, s1
+; CHECK-VF: p[[z:[0-9]+]] = sge.u32 s0, s1
+define i1 @cmpuge(i32 %x, i32 %y) {
+ %a = icmp uge i32 %x, %y
+ ret i1 %a
+}
+
+; CHECK-LABEL: cmple:
+; CHECK: p{{[0-9]+}} = sle.s32 s{{[0-9]+}}, s{{[0-9]+}}
+define i1 @cmple(i32 %x, i32 %y) {
+ %a = icmp sle i32 %x, %y
+ ret i1 %a
+}
+
+; CHECK-LABEL: por:
+; CHECK: p{{[0-9]+}} = por p{{[0-9]+}}, p{{[0-9]+}}
+define i1 @por(i32 %x, i32 %y, i32 %z) {
+ %a = icmp sle i32 %x, %y
+ %b = icmp sge i32 %x, %z
+ %c = or i1 %a, %b
+ ret i1 %c
+}
+
+; CHECK-LABEL: fcmple:
+; CHECK: p{{[0-9]+}} = sle.f32 s{{[0-9]+}}, s{{[0-9]+}}
+define i1 @fcmple(float %x, float %y) {
+ %a = fcmp ole float %x, %y
+ ret i1 %a
+}
+
+; CHECK-LABEL: fcmplt:
+; CHECK: p{{[0-9]+}} = slt.f32 s{{[0-9]+}}, s{{[0-9]+}}
+define i1 @fcmplt(float %x, float %y) {
+ %a = fcmp olt float %x, %y
+ ret i1 %a
+}
+
+; CHECK-LABEL: fcmpge:
+; CHECK: p{{[0-9]+}} = sge.f32 s{{[0-9]+}}, s{{[0-9]+}}
+define i1 @fcmpge(float %x, float %y) {
+ %a = fcmp oge float %x, %y
+ ret i1 %a
+}
+
+; CHECK-LABEL: fcmpgt:
+; CHECK: p{{[0-9]+}} = sgt.f32 s{{[0-9]+}}, s{{[0-9]+}}
+define i1 @fcmpgt(float %x, float %y) {
+ %a = fcmp ogt float %x, %y
+ ret i1 %a
+}
+
+; CHECK-LABEL: fcmpeq:
+; CHECK: p{{[0-9]+}} = seq.f32 s{{[0-9]+}}, s{{[0-9]+}}
+define i1 @fcmpeq(float %x, float %y) {
+ %a = fcmp oeq float %x, %y
+ ret i1 %a
+}
+
+; CHECK-LABEL: fcmpeq_unordered:
+; CHECK: p[[p0:[0-9]+]] = slt.f32 s0, $42.0
+; CHECK: p[[p1:[0-9]+]] = sgt.f32 s0, $42.0
+; CHECK: p[[p2:[0-9]+]] = por p[[p1]], p[[p0]]
+; CHECK: p{{[0-9]+}} = por !p[[p2]], !p[[p2]]
+define i1 @fcmpeq_unordered(float %x, float %y) {
+ %a = fcmp ueq float %x, 42.0
+ ret i1 %a
+}
+
+; CHECK-LABEL: fcmpne:
+; CHECK: p[[p0:[0-9]+]] = slt.f32 s0, $42.0
+; CHECK: p[[p1:[0-9]+]] = sgt.f32 s0, $42.0
+; CHECK: p{{[0-9]+}} = por p[[p1]], p[[p0]]
+define i1 @fcmpne(float %x, float %y) {
+ %a = fcmp one float %x, 42.0
+ ret i1 %a
+}
+
+; CHECK-LABEL: fcmpne_unordered:
+; CHECK: p{{[0-9]+}} = sne.f32 s{{[0-9]+}}, $42.0
+define i1 @fcmpne_unordered(float %x, float %y) {
+ %a = fcmp une float %x, 42.0
+ ret i1 %a
+}
+
+declare i1 @llvm.tpu.weird.f32(float) readnone
+
+; CHECK-LABEL: weird:
+; CHECK: p{{[0-9]+}} = sweird.f32 s{{[0-9]+}}
+define i1 @weird(float %x, float %y) {
+ %a = call i1 @llvm.tpu.weird.f32(float %x)
+ ret i1 %a
+}
+
+declare i1 @llvm.tpu.addcarry(i32, i32) nounwind readnone
+
+; CHECK-LABEL: addcarry:
+; CHECK: p{{[0-9]+}} = sc.u32 s{{[0-9]+}}, s{{[0-9]+}}
+define i1 @addcarry(i32 %x, i32 %y) {
+ %a = call i1 @llvm.tpu.addcarry(i32 %x, i32 %y)
+ ret i1 %a
+}
+
+; Test that setcc of i1 doesn't crash and gets promoted
+; CHECK-LABEL: setcc:
+; CHECK: s{{[0-9]+}} = sand.u32 $0x1, s{{[0-9]+}}
+; CHECK-SC: p{{[0-9]+}} = seq.s32 s{{[0-9]+}}, $0x0
+; CHECK-VF: p{{[0-9]+}} = seq.s32 s{{[0-9]+}}, $0x1
+define void @setcc(i32 %x, i1 %c) {
+ entry:
+ br label %for.body
+
+ for.body: ; preds = %if.end, %entry
+ %indc.0150 = phi i32 [ 0, %entry ], [ %x, %if.end ]
+ %mul = and i32 %indc.0150, 1
+ %cmp7 = icmp eq i32 %mul, 0
+ br i1 %cmp7, label %if.end, label %if.then
+
+ if.then: ; preds = %for.body
+ ret void
+
+ if.end: ; preds = %for.body
+ br i1 %c, label %for.body, label %if.then
+}
+
+; Check that we don't promote the flag to i32.
+; DAGCombine used to promote it to i32.
+; CHECK-LABEL: setcc2:
+; CHECK-NOT: simm.s32
+define void @setcc2(i32 %x, i1 %c, i32* %ptr) {
+ entry:
+ br i1 %c, label %if.then, label %if.end
+
+ if.then:
+ store i32 %x, i32* %ptr
+ br label %if.end
+
+ if.end:
+ ret void
+}
+
+; Test that setcc gets legalized correctly.
+; CHECK-LABEL: setcc_ueq_f:
+; CHECK-DAG: p[[p0:[0-9]+]] = slt.f32 s0, s1
+; CHECK-DAG: p[[p1:[0-9]+]] = sgt.f32 s0, s1
+; CHECK-DAG: s{{[0-9]+}} = simm.f32 $1.0
+; CHECK-DAG: s{{[0-9]+}} = simm.s32 @!p[[p0]]
+; CHECK-DAG: s{{[0-9]+}} = simm.s32 @p[[p1]]
+define float @setcc_ueq_f(float %x, float %y) #0 {
+entry:
+ %a = fcmp ueq float %x, %y
+ %r = select i1 %a, float 42.0, float 1.0
+ ret float %r
+}
+
+; CHECK-LABEL: setcc_ugt_f:
+; CHECK: sle.f32 s0, s1
+define float @setcc_ugt_f(float %x, float %y) #0 {
+entry:
+ %a = fcmp ugt float %x, %y
+ %r = select i1 %a, float 42.0, float 1.0
+ ret float %r
+}
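
The CHECK-PF patterns in cmpult/cmpule/cmpugt/cmpuge above rely on the standard identity for deriving an unsigned comparison from signed compares: the unsigned result equals the signed result XOR'd with whether the sign bits of the operands differ. A small C sketch of the ult case (an illustration of the identity, not code from the CL):

#include <stdbool.h>
#include <stdint.h>

/* ult(x, y) built from signed compares only, mirroring the PF expansion:
   slt.s32 gives the signed order, sxor.u32 followed by slt.s32 ..., $0x0 tests
   whether the sign bits differ, and the two predicated por's form the XOR. */
static bool ult_from_signed(int32_t x, int32_t y) {
  bool signed_lt = x < y;
  bool signs_differ = ((x ^ y) < 0);
  return signed_lt ^ signs_differ;
}
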
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_f32.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_f32.ll
new file mode 100644
index 0000000..50a0707
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_f32.ll
@@ -0,0 +1,149 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; RUN: llc < %s -mcpu=tensorcore-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+; Test that basic 32-bit floating point operations codegen as expected.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare float @llvm.maximum.f32(float, float) readnone
+declare float @llvm.minimum.f32(float, float) readnone
+declare i32 @llvm.tpu.cvt.fptosi.i32.f32(float) readnone
+
+; CHECK-LABEL: add32rr:
+; CHECK: s{{[0-9]+}} = sadd.f32 s{{[0-9]+}}, s{{[0-9]+}}
+define float @add32rr(float %x, float %y) {
+ %a = fadd float %x, %y
+ ret float %a
+}
+
+; CHECK-LABEL: add32ri:
+; CHECK: s{{[0-9]+}} = sadd.f32 $42.0, s{{[0-9]+}}
+define float @add32ri(float %x, float %y) {
+ %a = fadd float %x, 42.0
+ ret float %a
+}
+
+; CHECK-LABEL: sub32rr:
+; CHECK: s{{[0-9]+}} = ssub.f32 s{{[0-9]+}}, s{{[0-9]+}}
+define float @sub32rr(float %x, float %y) {
+ %a = fsub float %x, %y
+ ret float %a
+}
+
+; CHECK-LABEL: sub32ir:
+; CHECK: s{{[0-9]+}} = ssub.f32 $42.0, s{{[0-9]+}}
+define float @sub32ir(float %x, float %y) {
+ %a = fsub float 42.0, %y
+ ret float %a
+}
+
+
+; CHECK-LABEL: mul32rr:
+; CHECK: s{{[0-9]+}} = smul.f32 s{{[0-9]+}}, s{{[0-9]+}}
+define float @mul32rr(float %x, float %y) {
+ %a = fmul float %x, %y
+ ret float %a
+}
+
+; CHECK-LABEL: fmax32rr:
+; CHECK: s{{[0-9]+}} = smax.f32 s{{[0-9]+}}, s{{[0-9]+}}
+define float @fmax32rr(float %x, float %y) {
+ %a = call float @llvm.maximum.f32(float %x, float %y) readnone
+ ret float %a
+}
+
+; CHECK-LABEL: fmin32rr:
+; CHECK: s{{[0-9]+}} = smin.f32 s{{[0-9]+}}, s{{[0-9]+}}
+define float @fmin32rr(float %x, float %y) {
+ %a = call float @llvm.minimum.f32(float %x, float %y) readnone
+ ret float %a
+}
+
+; CHECK-LABEL: fptosi32:
+; CHECK: s{{[0-9]+}} = scvt.f32.s32 s{{[0-9]+}}
+define i32 @fptosi32(float %x, float %y) {
+ %a = fptosi float %x to i32
+ ret i32 %a
+}
+
+; CHECK-LABEL: fptosi32r:
+; CHECK: s{{[0-9]+}} = scvt.f32.s32 s{{[0-9]+}}
+define i32 @fptosi32r(float %x) {
+ %a = call i32 @llvm.tpu.cvt.fptosi.i32.f32(float %x)
+ ret i32 %a
+}
+
+; CHECK-LABEL: fptosi32i:
+; CHECK: s[[x:[0-9]+]] = simm.f32 $1.0
+; CHECK: s{{[0-9]+}} = scvt.f32.s32 s[[x]]
+define i32 @fptosi32i() {
+ %a = call i32 @llvm.tpu.cvt.fptosi.i32.f32(float 1.0)
+ ret i32 %a
+}
+
+; CHECK-LABEL: sitofp32rr:
+; CHECK: s{{[0-9]+}} = scvt.s32.f32 s{{[0-9]+}}
+define float @sitofp32rr(i32 %x) {
+ %a = sitofp i32 %x to float
+ ret float %a
+}
+
+; CHECK-LABEL: select_imm_f:
+; CHECK: s0 = simm.s32 @!p0 $0x3f800000;
+define float @select_imm_f(i1 %c, float %x) {
+ %r = select i1 %c, float %x, float 1.0
+ ret float %r
+}
+
+; Make sure select_cc doesn't crash.
+; CHECK-LABEL: select_cc_f:
+; CHECK: [[x:p[0-9]+]] = sgt.s32 s0, $0x1
+; CHECK: s{{[0-9]+}} = smov.u32 @![[x]] s{{[0-9]+}}
+define float @select_cc_f(i32 %s, float %x, float %y) {
+ %c = icmp sgt i32 %s, 1
+ %r = select i1 %c, float %x, float %y
+ ret float %r
+}
+
+; CHECK-LABEL: fneg:
+; CHECK: s{{[0-9]+}} = sxor.u32 $-0x80000000, s{{[0-9]+}}
+define float @fneg(float %x) {
+ %a = fsub float -0.0, %x
+ ret float %a
+}
+
+; CHECK-LABEL: uitofp_i1:
+; CHECK: s0 = simm.s32 $0x3f800000
+; CHECK: s0 = simm.s32 @!p0 $0x0
+define float @uitofp_i1(i1 %x) {
+ %a = uitofp i1 %x to float
+ ret float %a
+}
+
+; CHECK-LABEL: sitofp_i1:
+; CHECK: s0 = simm.s32 $-0x40800000
+; CHECK: s0 = simm.s32 @!p0 $0x0
+define float @sitofp_i1(i1 %x) {
+ %a = sitofp i1 %x to float
+ ret float %a
+}
+
+declare float @llvm.floor.f32(float %v)
+
+; CHECK-LABEL: ffloor32_32:
+; CHECK: s{{[0-9]+}} = sfloor.f32 s{{[0-9]+}}
+define float @ffloor32_32(float %v) {
+ %a = call float @llvm.floor.f32(float %v)
+ ret float %a
+}
+
+declare float @llvm.ceil.f32(float %v)
+
+; CHECK-LABEL: fceil32_32:
+; CHECK: s{{[0-9]+}} = sceil.f32 s{{[0-9]+}}
+define float @fceil32_32(float %v) {
+ %a = call float @llvm.ceil.f32(float %v)
+ ret float %a
+}
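
The fneg test above expects an integer sxor.u32 with $-0x80000000, i.e. negation is performed by flipping the IEEE-754 sign bit rather than with a floating-point subtract. A C sketch of the same bit trick (illustrative only, not from the CL):

#include <stdint.h>
#include <string.h>

/* Negate a float by XOR'ing its sign bit, matching the sxor.u32 pattern. */
static float fneg_bits(float x) {
  uint32_t bits;
  memcpy(&bits, &x, sizeof bits);
  bits ^= 0x80000000u;
  memcpy(&x, &bits, sizeof x);
  return x;
}
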
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_f32_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_f32_sc.ll
new file mode 100644
index 0000000..b0c8141
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_f32_sc.ll
@@ -0,0 +1,73 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; RUN: llc < %s -mcpu=sparsecore-tec-gl -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+; Test that basic 32-bit floating point operations codegen as expected.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; CHECK-LABEL: fdiv_rr
+; CHECK: { v[[v0:[0-9]+]] = vmov s1 }
+; CHECK: { (erf) = vrcp.f32 v[[v0]];
+; CHECK: { v[[v1:[0-9]+]] = vpop (erf) }
+; CHECK: { (v2sf) = vpush v[[v1]], $0x0;
+; CHECK: { s[[s0:[0-9]+]] = spop (v2sf) }
+; CHECK: { s{{[0-9]+}} = smul.f32 s[[s0]], s0;
+define float @fdiv_rr(float %i, float %d) {
+ %r = fdiv float %i, %d
+ ret float %r
+}
+
+; CHECK-LABEL: fdiv_ri
+; CHECK: { v[[v0:[0-9]+]] = vimm.f32 $42.0 }
+; CHECK: { (erf) = vrcp.f32 v[[v0]];
+; CHECK: { v[[v1:[0-9]+]] = vpop (erf) }
+; CHECK: { (v2sf) = vpush v[[v1]], $0x0;
+; CHECK: { s[[s0:[0-9]+]] = spop (v2sf) }
+; CHECK: { s{{[0-9]+}} = smul.f32 s[[s0]], s0;
+define float @fdiv_ri(float %i) {
+ %r = fdiv float %i, 42.0
+ ret float %r
+}
+
+; CHECK-LABEL: fdiv_ir
+; CHECK: { v[[v0:[0-9]+]] = vmov s0 }
+; CHECK: { (erf) = vrcp.f32 v[[v0]];
+; CHECK: { v[[v1:[0-9]+]] = vpop (erf) }
+; CHECK: { (v2sf) = vpush v[[v1]], $0x0;
+; CHECK: { s[[s0:[0-9]+]] = spop (v2sf) }
+; CHECK: { s{{[0-9]+}} = smul.f32 $42.0, s[[s0]];
+define float @fdiv_ir(float %i) {
+ %r = fdiv float 42.0, %i
+ ret float %r
+}
+
+declare float @llvm.fabs.f32(float %x) readnone
+
+; CHECK-LABEL: absf:
+; CHECK: s{{[0-9]+}} = sand.u32 $0x7fffffff, s0
+define float @absf(float %x) {
+ %a = call float @llvm.fabs.f32(float %x) readnone
+ ret float %a
+}
+
+declare float @llvm.copysign.f32(float, float) readnone
+
+; CHECK-LABEL: copysign_opt:
+; CHECK-NOT: and
+; CHECK-NOT: or
+; CHECK: shalt
+define float @copysign_opt(float %x) {
+ %a = call float @llvm.copysign.f32(float %x, float %x) readnone
+ ret float %a
+}
+
+; CHECK-LABEL: copysign:
+; CHECK: s[[s0:[0-9]+]] = sand.u32 $-0x80000000, s1
+; CHECK: s[[s1:[0-9]+]] = sand.u32 $0x7fffffff, s0
+; CHECK: s{{[0-9]+}} = sor.u32 s[[s0]], s[[s1]]
+define float @scopysign(float %x, float %y) {
+ %a = call float @llvm.copysign.f32(float %x, float %y) readnone
+ ret float %a
+}
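
Similarly, the absf and copysign tests above lower to plain integer bit operations: fabs clears the sign bit (sand.u32 $0x7fffffff), and copysign combines the sign bit of one operand with the magnitude bits of the other (two sand's and an sor). A C sketch of both identities, given as an illustration rather than code from the CL:

#include <stdint.h>
#include <string.h>

static uint32_t f32_bits(float f) { uint32_t b; memcpy(&b, &f, sizeof b); return b; }
static float bits_f32(uint32_t b) { float f; memcpy(&f, &b, sizeof f); return f; }

/* fabs: clear the sign bit. */
static float fabs_bits(float x) { return bits_f32(f32_bits(x) & 0x7fffffffu); }

/* copysign: sign bit of y OR'd with the magnitude bits of x. */
static float copysign_bits(float x, float y) {
  return bits_f32((f32_bits(y) & 0x80000000u) | (f32_bits(x) & 0x7fffffffu));
}
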
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_i1.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_i1.ll
new file mode 100644
index 0000000..038bbb1
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_i1.ll
@@ -0,0 +1,51 @@
+; RUN: llc < %s -mcpu=tensorcore-pf -asm-verbose=false -disable-cgp | FileCheck %s
+; RUN: llc < %s -mcpu=tensorcore-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; CHECK-LABEL: and1:
+; CHECK: [[x:p[0-9]+]] = por !p0, !p1
+; CHECK: p0 = por ![[x]], ![[x]]
+define i1 @and1(i1 %x, i1 %y) {
+ %a = and i1 %x, %y
+ ret i1 %a
+}
+
+; CHECK-LABEL: or1:
+; CHECK: p0 = por p0, p1
+define i1 @or1(i1 %x, i1 %y) {
+ %a = or i1 %x, %y
+ ret i1 %a
+}
+
+; CHECK-LABEL: nand1:
+; CHECK: p0 = por !p0, !p1
+define i1 @nand1(i1 %x, i1 %y) {
+ %a = and i1 %x, %y
+ %na = xor i1 %a, 1
+ ret i1 %na
+}
+
+; CHECK-LABEL: xor1:
+; CHECK: [[x:p[0-9]+]] = por !p1, !p1
+; CHECK: [[x]] = por @!p0 p1, p1
+define i1 @xor1(i1 %x, i1 %y) {
+ %a = xor i1 %x, %y
+ ret i1 %a
+}
+
+; CHECK-LABEL: pnot_xor:
+; CHECK: p0 = por !p0, !p0
+define i1 @pnot_xor(i1 %i) {
+ %r = xor i1 %i, -1
+ ret i1 %r
+}
+
+; CHECK-LABEL: pnot_add:
+; CHECK: p0 = por !p0, !p0
+define i1 @pnot_add(i1 %i) {
+ %r = add i1 %i, -1
+ ret i1 %r
+}
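
The tests above synthesize i1 logic from the (negatable) predicate por: AND uses De Morgan's law, XOR uses a predicated select, and NOT is a por of the operand's negation with itself. A short C sketch of those identities (illustrative only, not part of the CL):

#include <stdbool.h>

/* a AND b as !((!a) | (!b)), matching the two-por sequence in and1. */
static bool and_from_por(bool a, bool b) { return !((!a) | (!b)); }

/* a XOR b as a ? !b : b, matching the predicated por sequence in xor1. */
static bool xor_from_por(bool a, bool b) { return a ? !b : b; }
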
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_i32.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_i32.ll
new file mode 100644
index 0000000..c9568de
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_i32.ll
@@ -0,0 +1,317 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp -tpu-skip-fast-opt \
+; RUN: -instcombine-max-iterations=0 | FileCheck %s --check-prefixes=CHECK,CHECK-DFPLUS
+; REQUIRES: tpu
+
+; Test that basic 32-bit integer operations codegen as expected.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.ctlz.i32(i32, i1) #1
+declare void @llvm.tpu.nop() inaccessiblememonly
+declare i32 @llvm.tpu.tileid() #1
+declare i32 @llvm.tpu.shll.i32(i32, i32) #1
+declare i32 @llvm.tpu.shrl.i32(i32, i32) #1
+declare i32 @llvm.tpu.shra.i32(i32, i32) #1
+
+; CHECK-LABEL: add32rr:
+; CHECK: s{{[0-9]+}} = sadd.s32 s{{[0-9]+}}, s{{[0-9]+}}
+define i32 @add32rr(i32 %x, i32 %y) {
+ %a = add i32 %x, %y
+ ret i32 %a
+}
+
+; CHECK-LABEL: add32ri:
+; CHECK: s{{[0-9]+}} = sadd.s32 $0x2a, s{{[0-9]+}}
+define i32 @add32ri(i32 %x) {
+ %a = add i32 %x, 42
+ ret i32 %a
+}
+
+; CHECK-LABEL: sub32:
+; CHECK: s{{[0-9]+}} = ssub.s32 s{{[0-9]+}}, s{{[0-9]+}}
+define i32 @sub32(i32 %x, i32 %y) {
+ %a = sub i32 %x, %y
+ ret i32 %a
+}
+
+; CHECK-LABEL: sub32ir:
+; CHECK: s{{[0-9]+}} = ssub.s32 $0x4, s{{[0-9]+}}
+define i32 @sub32ir(i32 %x, i32 %y) {
+ %a = sub i32 4, %y
+ ret i32 %a
+}
+
+; CHECK-LABEL: and32:
+; CHECK: s{{[0-9]+}} = sand.u32 s{{[0-9]+}}, s{{[0-9]+}}
+define i32 @and32(i32 %x, i32 %y) {
+ %a = and i32 %x, %y
+ ret i32 %a
+}
+
+; CHECK-LABEL: or32:
+; CHECK: s{{[0-9]+}} = sor.u32 s{{[0-9]+}}, s{{[0-9]+}}
+define i32 @or32(i32 %x, i32 %y) {
+ %a = or i32 %x, %y
+ ret i32 %a
+}
+
+; CHECK-LABEL: xor32:
+; CHECK: s{{[0-9]+}} = sxor.u32 s{{[0-9]+}}, s{{[0-9]+}}
+define i32 @xor32(i32 %x, i32 %y) {
+ %a = xor i32 %x, %y
+ ret i32 %a
+}
+
+; CHECK-LABEL: mul32:
+; CHECK-JF: [[REG0:s[0-9]+]] = sshrl.u32 s1, $0x18
+; CHECK-JF: [[REG1:s[0-9]+]] = sshrl.u32 s0, $0x18
+; CHECK-JF-DAG: [[REG3:s[0-9]+]] = smul.u24 [[REG0]], s0
+; CHECK-JF-DAG: [[REG4:s[0-9]+]] = smul.u24 s1, [[REG1]]
+; CHECK-JF-DAG: [[REG5:s[0-9]+]] = sadd.s32 [[REG3]], [[REG4]]
+; CHECK-JF-DAG: [[REG6:s[0-9]+]] = smul.u24 s1, s0;
+; CHECK-JF-DAG: [[REG7:s[0-9]+]] = sshll.u32 [[REG5]], $0x18 }
+; CHECK-JF: s{{[0-9]+}} = sadd.s32 [[REG7]], [[REG6]];
+; CHECK-JF-NEXT: shalt
+; CHECK-DFPLUS: s{{[0-9]+}} = smul.u32 s{{[0-9]+}}, s{{[0-9]+}}
+; CHECK-DFPLUS-NEXT: shalt
+define i32 @mul32(i32 %x, i32 %y) {
+ %a = mul i32 %x, %y
+ ret i32 %a
+}
+
+; Test that when we can prove that the upper 8 bits are 0 we don't
+; emit the big expansion.
+; CHECK-LABEL: mul24opt:
+; CHECK: s{{[0-9]+}} = sand.u32 $0xffffff, s{{[0-9]+}}
+; CHECK: s{{[0-9]+}} = sand.u32 $0xffffff, s{{[0-9]+}}
+; CHECK-JF: s{{[0-9]+}} = smul.u24 s{{[0-9]+}}, s{{[0-9]+}}
+; CHECK-DFPLUS: s{{[0-9]+}} = smul.u32 s{{[0-9]+}}, s{{[0-9]+}}
+define i32 @mul24opt(i32 %x, i32 %y) {
+ %andx = and i32 %x, 16777215
+ %andy = and i32 %y, 16777215
+ %a = mul i32 %andx, %andy
+ ret i32 %a
+}
+
+; Test optimized mul32 expansion for when only the lhs is 24-bit
+; CHECK-LABEL: mul24opt_lhs_only:
+; CHECK-JF: [[REG0:s[0-9]+]] = sand.u32 $0xffffff, s0
+; CHECK-JF: [[REG1:s[0-9]+]] = sshrl.u32 s1, $0x18
+; CHECK-JF-DAG: [[REG2:s[0-9]+]] = smul.u24 [[REG1]], [[REG0]]
+; CHECK-JF-DAG: [[REG3:s[0-9]+]] = smul.u24 s1, [[REG0]]
+; CHECK-JF-DAG: [[REG4:s[0-9]+]] = sshll.u32 [[REG2]], $0x18
+; CHECK-JF: s{{[0-9]+}} = sadd.s32 [[REG4]], [[REG3]]
+; CHECK-JF-NEXT: shalt
+; CHECK-DFPLUS: s{{[0-9]+}} = smul.u32 s{{[0-9]+}}, s{{[0-9]+}}
+; CHECK-DFPLUS-NEXT: shalt
+define i32 @mul24opt_lhs_only(i32 %x, i32 %y) {
+ %andx = and i32 %x, 16777215
+ %a = mul i32 %andx, %y
+ ret i32 %a
+}
+
+; Test optimized mul32 expansion for when only the rhs is 24-bit
+; CHECK-LABEL: mul24opt_rhs_only:
+; CHECK-JF: [[REG0:s[0-9]+]] = sand.u32 $0xffffff, s1
+; CHECK-JF: [[REG1:s[0-9]+]] = sshrl.u32 s0, $0x18
+; CHECK-JF-DAG: [[REG2:s[0-9]+]] = smul.u24 [[REG0]], [[REG1]]
+; CHECK-JF-DAG: [[REG3:s[0-9]+]] = smul.u24 [[REG0]], s0
+; CHECK-JF-DAG: [[REG4:s[0-9]+]] = sshll.u32 [[REG2]], $0x18
+; CHECK-JF: s{{[0-9]+}} = sadd.s32 [[REG4]], [[REG3]];
+; CHECK-JF-NEXT: shalt
+; CHECK-DFPLUS: s{{[0-9]+}} = smul.u32 s{{[0-9]+}}, s{{[0-9]+}}
+; CHECK-DFPLUS-NEXT: shalt
+define i32 @mul24opt_rhs_only(i32 %x, i32 %y) {
+ %andy = and i32 %y, 16777215
+ %a = mul i32 %x, %andy
+ ret i32 %a
+}
+
+; CHECK-LABEL: shl32:
+; CHECK: s{{[0-9]+}} = sshll.u32 s{{[0-9]+}}, s{{[0-9]+}}
+define i32 @shl32(i32 %x, i32 %y) {
+ %a = shl i32 %x, %y
+ ret i32 %a
+}
+
+; CHECK-LABEL: lshr32:
+; CHECK: s{{[0-9]+}} = sshrl.u32 s{{[0-9]+}}, s{{[0-9]+}}
+define i32 @lshr32(i32 %x, i32 %y) {
+ %a = lshr i32 %x, %y
+ ret i32 %a
+}
+
+; CHECK-LABEL: ashr32:
+; CHECK: s{{[0-9]+}} = sshra.s32 s{{[0-9]+}}, s{{[0-9]+}}
+define i32 @ashr32(i32 %x, i32 %y) {
+ %a = ashr i32 %x, %y
+ ret i32 %a
+}
+
+; CHECK-LABEL: int_shl32:
+; CHECK: s{{[0-9]+}} = sshll.u32 s{{[0-9]+}}, s{{[0-9]+}}
+define i32 @int_shl32(i32 %x, i32 %y) {
+ %a = call i32 @llvm.tpu.shll.i32(i32 %x, i32 %y)
+ ret i32 %a
+}
+
+; CHECK-LABEL: int_imm_shl32:
+; CHECK: s{{[0-9]+}} = sshll.u32 s{{[0-9]+}}, $0x20
+define i32 @int_imm_shl32(i32 %x) {
+ %a = call i32 @llvm.tpu.shll.i32(i32 %x, i32 32)
+ ret i32 %a
+}
+
+; CHECK-LABEL: int_lshr32:
+; CHECK: s{{[0-9]+}} = sshrl.u32 s{{[0-9]+}}, s{{[0-9]+}}
+define i32 @int_lshr32(i32 %x, i32 %y) {
+ %a = call i32 @llvm.tpu.shrl.i32(i32 %x, i32 %y)
+ ret i32 %a
+}
+
+; CHECK-LABEL: int_imm_lshr32:
+; CHECK: s{{[0-9]+}} = sshrl.u32 s{{[0-9]+}}, $0x20
+define i32 @int_imm_lshr32(i32 %x) {
+ %a = call i32 @llvm.tpu.shrl.i32(i32 %x, i32 32)
+ ret i32 %a
+}
+
+; CHECK-LABEL: int_ashr32:
+; CHECK: s{{[0-9]+}} = sshra.s32 s{{[0-9]+}}, s{{[0-9]+}}
+define i32 @int_ashr32(i32 %x, i32 %y) {
+ %a = call i32 @llvm.tpu.shra.i32(i32 %x, i32 %y)
+ ret i32 %a
+}
+
+; CHECK-LABEL: int_imm_ashr32:
+; CHECK: s{{[0-9]+}} = sshra.s32 s{{[0-9]+}}, $0x20
+define i32 @int_imm_ashr32(i32 %x, i32 %y) {
+ %a = call i32 @llvm.tpu.shra.i32(i32 %x, i32 32)
+ ret i32 %a
+}
+
+; CHECK-LABEL: int_pat_shl32:
+; CHECK: s{{[0-9]+}} = sshll.u32 s{{[0-9]+}}, s{{[0-9]+}}
+; CHECK-NEXT: shalt
+define i32 @int_pat_shl32(i32 %x, i32 %y) {
+ %a = icmp ult i32 %y, 32
+ %b = shl i32 %x, %y
+ %c = select i1 %a, i32 %b, i32 0
+ ret i32 %c
+}
+
+; CHECK-LABEL: int_pat_lshr32:
+; CHECK: s{{[0-9]+}} = sshrl.u32 s{{[0-9]+}}, s{{[0-9]+}}
+; CHECK-NEXT: shalt
+define i32 @int_pat_lshr32(i32 %x, i32 %y) {
+ %a = icmp ult i32 %y, 32
+ %b = lshr i32 %x, %y
+ %c = select i1 %a, i32 %b, i32 0
+ ret i32 %c
+}
+
+; CHECK-LABEL: int_pat_ashr32:
+; CHECK: s{{[0-9]+}} = sshra.s32 s{{[0-9]+}}, s{{[0-9]+}}
+; CHECK-NEXT: shalt
+define i32 @int_pat_ashr32(i32 %x, i32 %y) {
+ %a = icmp ult i32 %y, 31
+ %b = select i1 %a, i32 %y, i32 31
+ %c = lshr i32 %x, %b
+ ret i32 %c
+}
+
+; CHECK-LABEL: clz32:
+; CHECK: s{{[0-9]+}} = sclz.u32 s0
+define i32 @clz32(i32 %x) {
+ %a = call i32 @llvm.ctlz.i32(i32 %x, i1 false)
+ ret i32 %a
+}
+
+; CHECK-LABEL: mov32:
+; CHECK: s{{[0-9]+}} = smov.u32 s1
+define i32 @mov32(i32 %x, i32 %y) {
+ ret i32 %y
+}
+
+; CHECK-LABEL: sfence:
+; CHECK: _ = sfence
+define void @sfence() {
+ fence seq_cst
+ ret void
+}
+
+; CHECK-LABEL: nop:
+; CHECK: _ = snop
+define void @nop() {
+ call void @llvm.tpu.nop() inaccessiblememonly
+ ret void
+}
+
+; CHECK-LABEL: predtrunc:
+; CHECK: [[x:s[0-9]+]] = sand.u32 $0x1, s0
+; CHECK: p0 = seq.s32 [[x]], $0x1
+define i1 @predtrunc(i32 %x) {
+ %y = trunc i32 %x to i1
+ ret i1 %y
+}
+
+; CHECK-LABEL: predzext:
+; CHECK: s0 = simm.s32 $0x1
+; CHECK: s0 = simm.s32 @!p0 $0x0
+define i32 @predzext(i1 %x) {
+ %y = zext i1 %x to i32
+ ret i32 %y
+}
+
+; CHECK-LABEL: predsext:
+; CHECK: s0 = simm.s32 $-0x1
+; CHECK: s0 = simm.s32 @!p0 $0x0
+define i32 @predsext(i1 %x) {
+ %y = sext i1 %x to i32
+ ret i32 %y
+}
+
+; CHECK-LABEL: tileid:
+; CHECK: s0 = stileid.u32
+define i32 @tileid() {
+ %x = call i32 @llvm.tpu.tileid() readnone
+ ret i32 %x
+}
+
+; CHECK-LABEL: seli32:
+; CHECK: p{{[0-9]+}} = sne.s32 s0, $0x0
+; CHECK: s0 = simm.s32 $-0x1
+; CHECK: s0 = simm.s32 @!p{{[0-9]+}} $0x0
+define i32 @seli32(i32 %a) {
+ %b = icmp ne i32 %a, 0
+ %c = select i1 %b, i32 -1, i32 0
+ ret i32 %c
+}
+
+; CHECK-LABEL: clz32_const_0:
+; CHECK: s{{[0-9]+}} = simm.s32 $0x20
+define i32 @clz32_const_0(i32 %x) {
+ %z = xor i32 %x, %x
+ %a = call i32 @llvm.ctlz.i32(i32 %z, i1 false)
+ ret i32 %a
+}
+
+; CHECK-LABEL: clz32_const_1:
+; CHECK: s{{[0-9]+}} = simm.s32 $0x1f
+define i32 @clz32_const_1(i32 %x) {
+ %z = xor i32 %x, %x
+ %s = add i32 %z, 1
+ %a = call i32 @llvm.ctlz.i32(i32 %s, i1 false)
+ ret i32 %a
+}
+
+; CHECK-LABEL: clz32_const_32:
+; CHECK: s{{[0-9]+}} = simm.s32 $0x0
+define i32 @clz32_const_32(i32 %x) {
+ %z = xor i32 %x, %x
+ %s = add i32 %z, 4294967295
+ %a = call i32 @llvm.ctlz.i32(i32 %s, i1 false)
+ ret i32 %a
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_i32_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_i32_sc.ll
new file mode 100644
index 0000000..c1b0c14
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_i32_sc.ll
@@ -0,0 +1,117 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+; RUN: llc < %s -mcpu=sparsecore-tec-gl -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+; RUN: llc < %s -mcpu=tensorcore-vf -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+; Test that basic 32-bit integer operations codegen as expected.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 @llvm.umax(i32, i32) readnone
+declare i32 @llvm.umin(i32, i32) readnone
+declare i32 @llvm.ctlz.i32(i32) readnone
+declare i32 @llvm.abs.i32(i32 %x, i1) readnone
+
+; CHECK-LABEL: umax32rr:
+; CHECK: s{{[0-9]+}} = smax.u32 s{{[0-9]+}}, s{{[0-9]+}}
+define i32 @umax32rr(i32 %x, i32 %y) {
+ %r = call i32 @llvm.umax(i32 %x, i32 %y) readnone
+ ret i32 %r
+}
+
+; CHECK-LABEL: umax32ri:
+; CHECK: s{{[0-9]+}} = smax.u32 s{{[0-9]+}}, $0x5
+define i32 @umax32ri(i32 %x) {
+ %r = call i32 @llvm.umax(i32 %x, i32 5) readnone
+ ret i32 %r
+}
+
+; CHECK-LABEL: umax32ir:
+; CHECK: s{{[0-9]+}} = smax.u32 s{{[0-9]+}}, $0x5
+define i32 @umax32ir(i32 %y) {
+ %r = call i32 @llvm.umax(i32 5, i32 %y) readnone
+ ret i32 %r
+}
+
+; CHECK-LABEL: umin32rr:
+; CHECK: s{{[0-9]+}} = smin.u32 s{{[0-9]+}}, s{{[0-9]+}}
+define i32 @umin32rr(i32 %x, i32 %y) {
+ %r = call i32 @llvm.umin(i32 %x, i32 %y) readnone
+ ret i32 %r
+}
+
+; CHECK-LABEL: umin32ri:
+; CHECK: s{{[0-9]+}} = smin.u32 s{{[0-9]+}}, $0x10
+define i32 @umin32ri(i32 %x) {
+ %r = call i32 @llvm.umin(i32 %x, i32 16) readnone
+ ret i32 %r
+}
+
+; CHECK-LABEL: umin32ir:
+; CHECK: s{{[0-9]+}} = smin.u32 s{{[0-9]+}}, $0x10
+define i32 @umin32ir(i32 %y) {
+ %r = call i32 @llvm.umin(i32 16, i32 %y) readnone
+ ret i32 %r
+}
+
+; CHECK-LABEL: umax32rr_select:
+; CHECK: s{{[0-9]+}} = smax.u32 s{{[0-9]+}}, s{{[0-9]+}}
+define i32 @umax32rr_select(i32 %x, i32 %y) {
+ %p = icmp ult i32 %x, %y
+ %r = select i1 %p, i32 %y, i32 %x
+ ret i32 %r
+}
+
+; CHECK-LABEL: umin32rr_select:
+; CHECK: s{{[0-9]+}} = smin.u32 s{{[0-9]+}}, s{{[0-9]+}}
+define i32 @umin32rr_select(i32 %x, i32 %y) {
+ %p = icmp ult i32 %x, %y
+ %r = select i1 %p, i32 %x, i32 %y
+ ret i32 %r
+}
+
+; TODO(hgreving): see comment at MIN/MAX in TPUInstrInfo.td,
+; replacing by e.g. 256 should also lead to umax.
+; CHECK-LABEL: umax32ri_select:
+; CHECK: s{{[0-9]+}} = smax.u32 s{{[0-9]+}}, $0x7f
+define i32 @umax32ri_select(i32 %x) {
+ %p = icmp ult i32 %x, 127
+ %r = select i1 %p, i32 127, i32 %x
+ ret i32 %r
+}
+
+; TODO(hgreving): see comment at MIN/MAX in TPUInstrInfo.td,
+; replacing by e.g. 256 should also lead to umin.
+; CHECK-LABEL: umin32ri_select:
+; CHECK: s{{[0-9]+}} = smin.u32 s{{[0-9]+}}, $0x7f
+define i32 @umin32ri_select(i32 %x) {
+ %p = icmp ult i32 %x, 127
+ %r = select i1 %p, i32 %x, i32 127
+ ret i32 %r
+}
+
+; Ensure we are able to pack eligible scalar ops into the scalar misc slot
+; CHECK-LABEL: pack_smisc_alu:
+; CHECK: { s{{[0-9]+}} = smin.u32 s{{[0-9]+}}, s{{[0-9]+}}
+; CHECK-NEXT: s{{[0-9]+}} = smax.u32 s{{[0-9]+}}, s{{[0-9]+}}
+; CHECK-NEXT: s{{[0-9]+}} = sclz.u32 s{{[0-9]+}} }
+define i32 @pack_smisc_alu(i32 %x, i32 %y, i32* %ptr1, i32* %ptr2, i32* %ptr3) {
+ %min = call i32 @llvm.umin(i32 %x, i32 %y) readnone
+ %max = call i32 @llvm.umax(i32 %x, i32 %y) readnone
+ %clz = call i32 @llvm.ctlz.i32(i32 %x)
+ %v1 = add i32 %min, %max
+ %v2 = add i32 %max, %clz
+ %v3 = add i32 %v1, %v2
+ ret i32 %v3
+}
+
+; CHECK-LABEL: absi:
+; CHECK: s[[s0:[0-9]+]] = ssub.s32 $0x0, s0
+; CHECK: s{{[0-9]+}} = smin.u32 s0, s[[s0]]
+define i32 @absi(i32 %x) {
+ %a = call i32 @llvm.abs.i32(i32 %x, i1 0) readnone
+ ret i32 %a
+}
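+
+; The unsigned-min trick above works because for x >= 0 the negation 0 - x
+; wraps to a large unsigned value (or 0), so umin(x, -x) returns x, while for
+; x < 0 it is x that becomes large when reinterpreted as unsigned, so umin
+; returns -x = |x|.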
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_ldst.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_ldst.ll
new file mode 100644
index 0000000..ef9cdde
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_ldst.ll
@@ -0,0 +1,184 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -tpu-allow-global-offset-for-test | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+%struct.S = type { i32, i32 }
+
+; CHECK-LABEL: sldi:
+; CHECK: s0 = sld [smem:s0+$0x0]
+define i32 @sldi(i32* %a) {
+ %b = load i32, i32* %a
+ ret i32 %b
+}
+
+; CHECK-LABEL: sldi2:
+; CHECK: s0 = sld [smem:s0+$0x1]
+define i32 @sldi2(i32* %a) {
+ %addr = getelementptr i32, i32* %a, i32 1
+ %b = load i32, i32* %addr
+ ret i32 %b
+}
+
+; CHECK-LABEL: sldi2_neg:
+; CHECK: s0 = sld [smem:s0+$-0x1]
+define i32 @sldi2_neg(i32* %a) {
+ %addr = getelementptr i32, i32* %a, i32 -1
+ %b = load i32, i32* %addr
+ ret i32 %b
+}
+
+; CHECK-LABEL: ssti:
+; CHECK: [smem:s1] = sst s0
+define void @ssti(i32 %a, i32* %b) {
+ store i32 %a, i32* %b
+ ret void
+}
+
+; CHECK-LABEL: ssti2:
+; CHECK: [[x:s[0-9]+]] = sadd.s32 $0x1, s1
+; CHECK: [smem:[[x]]] = sst s0
+define void @ssti2(i32 %a, i32* %b) {
+ %addr = getelementptr i32, i32* %b, i32 1
+ store i32 %a, i32* %addr
+ ret void
+}
+
+; CHECK-LABEL: sstf2:
+; CHECK: [[x:s[0-9]+]] = sadd.s32 $0x1, s1
+; CHECK: [smem:[[x]]] = sst s0
+define void @sstf2(float %a, float* %b) {
+ %addr = getelementptr float, float* %b, i32 1
+ store float %a, float* %addr
+ ret void
+}
+
+; CHECK-LABEL: sst_gep:
+; CHECK: s1 = sadd.s32 s2, s1
+; CHECK: [smem:s1] = sst s0
+define void @sst_gep(float %a, float* %b, i32 %c) {
+ %addr = getelementptr float, float* %b, i32 %c
+ store float %a, float* %addr
+ ret void
+}
+
+; CHECK-LABEL: sld_struct:
+; CHECK: s0 = sld [smem:s0+$0x1]
+define i32 @sld_struct(%struct.S* %s) {
+ %p = getelementptr inbounds %struct.S, %struct.S* %s, i32 0, i32 1
+ %l = load i32, i32* %p, align 4
+ ret i32 %l
+}
+
+; CHECK-LABEL: sld_roff:
+; CHECK: s0 = sld [smem:s0+s1]
+define i32 @sld_roff(i32* %s, i32 %off) {
+ %p = getelementptr i32, i32* %s, i32 %off
+ %l = load i32, i32* %p, align 4
+ ret i32 %l
+}
+
+@garr = global [2 x i32] [i32 42, i32 42]
+
+; CHECK-LABEL: sld_global1:
+; CHECK: s0 = sld [smem:garr+1]
+define i32 @sld_global1() {
+ %p = getelementptr inbounds [2 x i32], [2 x i32]* @garr, i32 0, i32 1
+ %l = load i32, i32* %p, align 4
+ ret i32 %l
+}
+
+; CHECK-LABEL: sld_global0:
+; CHECK: s0 = sld [smem:garr]
+define i32 @sld_global0() {
+ %p = getelementptr inbounds [2 x i32], [2 x i32]* @garr, i32 0, i32 0
+ %l = load i32, i32* %p, align 4
+ ret i32 %l
+}
+
+; CHECK-LABEL: loadimmadd:
+; CHECK: s0 = sld [smem:$0x4]
+define i32 @loadimmadd(float %a) {
+ %l = load i32, i32* inttoptr (i32 4 to i32*), align 4
+ ret i32 %l
+}
+
+; CHECK-LABEL: sst_gep_bitcast:
+; CHECK: [[x:s[0-9]+]] = sadd.s32 $0x2, s1
+; CHECK: [smem:[[x]]] = sst s0
+define void @sst_gep_bitcast(float %a, float* %b) {
+ %bc = bitcast float* %b to i8*
+ %addr = getelementptr i8, i8* %bc, i32 8
+ %bc2 = bitcast i8* %addr to float*
+ store float %a, float* %bc2
+ ret void
+}
+
+@gfloat = global float 1.0
+
+; CHECK-LABEL: sst_global:
+; CHECK: [smem:gfloat] = sst s0
+define void @sst_global(float %a) {
+ store float %a, float* @gfloat
+ ret void
+}
+
+; CHECK-LABEL: storeimmadd:
+; CHECK: [smem:$0x1] = sst s0
+define void @storeimmadd(float %a) {
+ store float %a, float* inttoptr (i32 1 to float*)
+ ret void
+}
+
+@gint = global i32 1
+
+; CHECK-LABEL: gep_CE:
+; CHECK: s0 = sld [smem:gint+2]
+define float @gep_CE() {
+ %1 = load float, float* bitcast (i32* getelementptr inbounds (i32, i32* @gint, i32 2) to float*), align 4
+ ret float %1
+}
+
+; CHECK-LABEL: gep_CE_Phi:
+; CHECK: simm.s32 gint+3
+; CHECK: simm.s32 @!p0 gint+2
+define float @gep_CE_Phi(i1 %0) {
+ br i1 %0, label %2, label %3
+
+2: ; preds = %1
+ br label %3
+
+3: ; preds = %1, %2
+ %phi = phi float* [ bitcast (i32* getelementptr inbounds (i32, i32* @gint, i64 2) to float*), %2 ], [ bitcast (i32* getelementptr inbounds (i32, i32* @gint, i64 3) to float*), %1 ]
+ %res = load float, float* %phi, align 4
+ ret float %res
+}
+
+; Test that DAG selection doesn't generate a 16-bit store, as we don't
+; natively support it.
+; CHECK-LABEL: reduced_store:
+; CHECK: = sld [smem:$0xa1]
+; CHECK: = sand.u32
+; CHECK: = sor.u32
+; CHECK: [smem:$0xa1] = sst
+define void @reduced_store(){
+entry:
+ %0 = load i32, i32* inttoptr (i32 161 to i32*), align 4
+ %1 = and i32 %0, -65281
+ %2 = or i32 %1, 13568
+ store i32 %2, i32* inttoptr (i32 161 to i32*), align 4
+ ret void
+}
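+
+; For reference: -65281 is 0xffff00ff and 13568 is 0x3500, so the and/or pair
+; above rewrites bits [8,15] of the word at smem address 0xa1 (161) to 0x35,
+; i.e. the sub-word store is legalized into a full-word read-modify-write.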
+
+; Test that we have the correct latency for sld.
+; CHECK-LABEL: sld_latency:
+; CHECK: = sld [smem:$0xa1]
+; CHECK: = sadd.s32
+define i32 @sld_latency(){
+entry:
+ %0 = load i32, i32* inttoptr (i32 161 to i32*), align 4
+ %1 = add i32 %0, 1
+ ret i32 %1
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_ldst_encoded.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_ldst_encoded.ll
new file mode 100644
index 0000000..3a9557e
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_ldst_encoded.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -tpu-encode-mcinst-bundles -tpu-print-sc-mcinst-encodings \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; CHECK-LABEL: ldi:
+; CHECK: s0 = sld [smem:s0+$0x0 (embed encoding 48)]
+define i32 @ldi(i32* %a) {
+ %r = load i32, i32* %a
+ ret i32 %r
+}
+
+; CHECK-LABEL: ldtf2:
+; CHECK: s0 = sld [smem:s0+$0x1 (embed encoding 49)]
+define float @ldtf2(float* %a) {
+ %addr = getelementptr float, float* %a, i32 1
+ %r = load float, float* %addr
+ ret float %r
+}
+
+; CHECK-LABEL: vaddrs:
+; CHECK: vadd.f32 s0 (vs0), v0
+define <8 x float> @vaddrs(<8 x float> %a, float %b) {
+ %c = insertelement <8 x float> undef, float %b, i32 0
+ %d = shufflevector <8 x float> %c, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %e = fadd <8 x float> %a, %d
+ ret <8 x float> %e
+}
+
+; CHECK-LABEL: vaddrs_dbl:
+; CHECK: { v0 = vadd.s32 s0 (vs0), v0;
+; CHECK: v1 = vadd.s32 s0 (vs0), v1 }
+define <8 x i32> @vaddrs_dbl(<8 x i32> %a, <8 x i32> %z, i32 %b) {
+ %c = insertelement <8 x i32> undef, i32 %b, i32 0
+ %d = shufflevector <8 x i32> %c, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %e = add <8 x i32> %a, %d
+ %f = add <8 x i32> %z, %d
+ %g = or <8 x i32> %e, %f
+ ret <8 x i32> %g
+}
+
+; CHECK-LABEL: vaddrs_dbl2:
+; CHECK: { v0 = vadd.s32 s0 (vs1), v0;
+; CHECK: v1 = vadd.s32 s1 (vs0), v1 }
+define <8 x i32> @vaddrs_dbl2(<8 x i32> %a, <8 x i32> %z, i32 %b, i32 %y) {
+ %c = insertelement <8 x i32> undef, i32 %b, i32 0
+ %d = shufflevector <8 x i32> %c, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %y1 = insertelement <8 x i32> undef, i32 %y, i32 0
+ %y2 = shufflevector <8 x i32> %y1, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %e = add <8 x i32> %a, %d
+ %f = add <8 x i32> %z, %y2
+ %g = or <8 x i32> %e, %f
+ ret <8 x i32> %g
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_sc.ll
new file mode 100644
index 0000000..e74084f
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_sc.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare void @llvm.tpu.sc.sint(i32) readnone
+
+; CHECK-LABEL: sint_r:
+; CHECK: _ = sint s{{[0-9]+}}
+define void @sint_r(i32 %type) {
+ call void @llvm.tpu.sc.sint(i32 %type)
+ ret void
+}
+
+; CHECK-LABEL: sint_i:
+; CHECK: _ = sint $0xd
+define void @sint_i(i32 %x) {
+ call void @llvm.tpu.sc.sint(i32 13)
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_vf.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_vf.ll
new file mode 100644
index 0000000..b0a36bc
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalar_vf.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -mcpu=tensorcore-vf -asm-verbose=false -tpu-skip-fast-opt | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; read sync flags
+@g = addrspace(204) global i32 42
+declare i32 @llvm.tpu.syncpamov(i32 addrspace(204)*)
+
+; CHECK-LABEL: syncpamov_i
+; CHECK: (sfrf) = vsyncpamov [sflag:g]
+; CHECK: _ = vdelay $0x1
+; CHECK: s0 = spop (sfrf)
+define i32 @syncpamov_i() {
+ %c = call i32 @llvm.tpu.syncpamov(i32 addrspace(204)* @g)
+ ret i32 %c
+}
+
+; CHECK-LABEL: syncpamov_r
+; CHECK: (sfrf) = vsyncpamov [sflag:s0]
+; CHECK: _ = vdelay $0x1
+; CHECK: s0 = spop (sfrf)
+define i32 @syncpamov_r(i32 addrspace(204)* %a) {
+ %c = call i32 @llvm.tpu.syncpamov(i32 addrspace(204)* %a)
+ ret i32 %c
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalarize.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalarize.ll
new file mode 100644
index 0000000..d4baef8
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scalarize.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -mcpu=tensorcore-pf -asm-verbose=false -tpu-skip-fast-opt | FileCheck %s
+; REQUIRES: tpu
+
+; Test that we convert vector instructions to scalar if possible.
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; CHECK-LABEL: shlvi:
+; CHECK: s{{[0-9]+}} = sshll.u32 s0, $0xa
+; CHECK: v{{[0-9]+}} = vmov s{{[0-9]+}}
+define <1024 x i32> @shlvi(i32 %x) {
+ %x1 = insertelement <1024 x i32> undef, i32 %x, i32 0
+ %v0 = insertelement <1024 x i32> zeroinitializer, i32 10, i32 0
+ %y = shl <1024 x i32> %x1, %v0
+ ret <1024 x i32> %y
+}
+
+; CHECK-LABEL: addvv:
+; CHECK: s{{[0-9]+}} = sadd.s32 s1, s0
+; CHECK: v{{[0-9]+}} = vmov s{{[0-9]+}}
+define <1024 x i32> @addvv(i32 %x, i32 %y) {
+ %x1 = insertelement <1024 x i32> undef, i32 %x, i32 0
+ %y1 = insertelement <1024 x i32> undef, i32 %y, i32 0
+ %res = add <1024 x i32> %x1, %y1
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: vector_chain:
+; CHECK: sshll.u32
+; CHECK: sshll.u32
+; CHECK: sxor.u32
+; CHECK: sxor.u32
+define <1024 x i32> @vector_chain(i32 %x, i32 %y, i32 %z) {
+ %splat1 = insertelement <1024 x i32> undef, i32 %x, i32 0
+ %v0 = insertelement <1024 x i32> zeroinitializer, i32 10, i32 0
+ %a = shl <1024 x i32> %splat1, %v0
+ %splat2 = insertelement <1024 x i32> undef, i32 %y, i32 0
+ %v1 = insertelement <1024 x i32> zeroinitializer, i32 15, i32 0
+ %b = shl <1024 x i32> %splat2, %v1
+ %splat3 = insertelement <1024 x i32> undef, i32 %z, i32 0
+ %c = xor <1024 x i32> %a, %splat3
+ %d = xor <1024 x i32> %c, %b
+ ret <1024 x i32> %d
+}
\ No newline at end of file
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scoped_alloc_error_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scoped_alloc_error_sc.ll
new file mode 100644
index 0000000..44b1c1f
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scoped_alloc_error_sc.ll
@@ -0,0 +1,100 @@
+; RUN: opt -S -O2 -mcpu=sparsecore-tec-vf -tpu-fatal-mem-alloc-error=false < %s 2>&1 \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+; Test that scoped memory allocation sizes are checked properly for sparsecore-tec-vf.
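+; The requested sizes below (2049 smem words, 131073 tilespmem words, 2097153
+; spmem words) are presumably each one word beyond the corresponding memory's
+; capacity on this core, so every case should trigger the overflow diagnostic.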
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32* @llvm.tpu.alloca.smem(i32)
+declare i32 addrspace(201)* @llvm.tpu.alloca.tilespmem(i32)
+declare i32 addrspace(202)* @llvm.tpu.alloca.spmem(i32)
+declare i32 addrspace(203)* @llvm.tpu.allocate.hbm(i32, i32)
+declare i32 addrspace(204)* @llvm.tpu.allocate.sflag(i32, i32)
+declare i32 addrspace(203)* @llvm.tpu.allocate.dyn.hbm(i32, i32)
+declare i32 addrspace(211)* @llvm.tpu.addrspacecast.scs.p204i32(i32 addrspace(204)*)
+declare void @llvm.tpu.dma.hbm.to.hbm.sc.simple(i32 addrspace(211)*, i32 addrspace(203)*, i32 addrspace(203)*, i32, i32) argmemonly nounwind
+
+; CHECK: Scoped allocation overflow.
+define void @scoped_allocation_overflow_sparsecore_tec_smem(i32 %a) {
+entry:
+ %mem = call i32* @llvm.tpu.alloca.smem(i32 2049)
+ %arrayidx = getelementptr inbounds i32, i32* %mem, i32 %a
+ store i32 0, i32* %arrayidx, align 4
+ ret void
+}
+
+; CHECK: Scoped allocation overflow.
+define void @scoped_allocation_overflow_sparsecore_tec_tilespmem(i32 %a) {
+entry:
+ %mem = call i32 addrspace(201)* @llvm.tpu.alloca.tilespmem(i32 131073)
+ %memc = bitcast i32 addrspace(201)* %mem to <8 x i32> addrspace(201)*
+ %arrayidx = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %memc, i32 %a
+ store <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <8 x i32> addrspace(201)* %arrayidx, align 32
+ ret void
+}
+
+; CHECK: Scoped allocation overflow.
+define void @scoped_allocation_overflow_sparsecore_tec_spmem(i32 %a) {
+entry:
+ %mem = call i32 addrspace(202)* @llvm.tpu.alloca.spmem(i32 2097153)
+ %memc = bitcast i32 addrspace(202)* %mem to <8 x i32> addrspace(202)*
+ %arrayidx = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(202)* %memc, i32 %a
+ store <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <8 x i32> addrspace(202)* %arrayidx, align 32
+ ret void
+}
+
+; CHECK: Scoped allocation overflow.
+define void @scoped_allocation_overflow_sparsecore_tec_tilespmem_spmem(i32 %a, i32 %b) {
+entry:
+ %mem = call i32 addrspace(201)* @llvm.tpu.alloca.tilespmem(i32 131073)
+ %memc = bitcast i32 addrspace(201)* %mem to <8 x i32> addrspace(201)*
+ %arrayidx1 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %memc, i32 %b
+ store <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <8 x i32> addrspace(201)* %arrayidx1, align 32
+ ret void
+}
+
+; Maximum TEC sflag address is 32
+; CHECK: Bump pointer base metadata sflag.start.alloca too large
+define void @bump_allocation_invalid_offset(i32 %x) {
+entry:
+ ret void
+}
+
+; CHECK: Scoped allocation overflow.
+define void @allocate_hbm_overflow() {
+entry:
+ %0 = call i32 addrspace(204)* @llvm.tpu.allocate.sflag(i32 1, i32 5)
+ %1 = call i32 addrspace(203)* @llvm.tpu.allocate.hbm(i32 8, i32 64)
+ %2 = call i32 addrspace(203)* @llvm.tpu.allocate.hbm(i32 8, i32 4278190080)
+ %dst_sflag = tail call i32 addrspace(211)* @llvm.tpu.addrspacecast.scs.p204i32(i32 addrspace(204)* %0)
+ call void @llvm.tpu.dma.hbm.to.hbm.sc.simple(i32 addrspace(211)* %dst_sflag, i32 addrspace(203)* %1, i32 addrspace(203)* %2, i32 8, i32 0)
+ ret void
+}
+
+; CHECK: Scoped allocation overflow.
+define void @allocate_dyn_hbm_overflow() {
+entry:
+ %0 = call i32 addrspace(204)* @llvm.tpu.allocate.sflag(i32 1, i32 5)
+ %1 = call i32 addrspace(203)* @llvm.tpu.allocate.hbm(i32 8, i32 64)
+ %2 = call i32 addrspace(203)* @llvm.tpu.allocate.dyn.hbm(i32 8, i32 4278190080)
+ %dst_sflag = tail call i32 addrspace(211)* @llvm.tpu.addrspacecast.scs.p204i32(i32 addrspace(204)* %0)
+ call void @llvm.tpu.dma.hbm.to.hbm.sc.simple(i32 addrspace(211)* %dst_sflag, i32 addrspace(203)* %1, i32 addrspace(203)* %2, i32 8, i32 0)
+ ret void
+}
+
+
+!sflag.funcs.alloca = !{!1, !2, !3, !4, !5, !6, !7}
+!sflag.start.alloca = !{!0, !0, !0, !0, !128, !0, !0}
+
+!1 = !{void (i32)* @scoped_allocation_overflow_sparsecore_tec_smem}
+!2 = !{void (i32)* @scoped_allocation_overflow_sparsecore_tec_tilespmem}
+!3 = !{void (i32)* @scoped_allocation_overflow_sparsecore_tec_spmem}
+!4 = !{void (i32, i32)* @scoped_allocation_overflow_sparsecore_tec_tilespmem_spmem}
+!5 = !{void (i32)* @bump_allocation_invalid_offset}
+!6 = !{void ()* @allocate_hbm_overflow}
+!7 = !{void ()* @allocate_dyn_hbm_overflow}
+
+!0 = !{i32 0}
+!128 = !{i32 128}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scoped_alloc_error_tc_jf.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scoped_alloc_error_tc_jf.ll
new file mode 100644
index 0000000..cebaeed
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scoped_alloc_error_tc_jf.ll
@@ -0,0 +1,28 @@
+; RUN: opt -S -O2 -mcpu=tensorcore-jf -tpu-fatal-mem-alloc-error=false < %s 2>&1 \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+; Test that smem and vmem allocation sizes are checked properly for tensorcore-jf.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32* @llvm.tpu.alloca.smem(i32)
+declare <1024 x i32> addrspace(205)* @llvm.tpu.allocate.vmem(i32, i32)
+
+; CHECK: Scoped allocation overflow.
+define void @scoped_allocation_overflow_tensorcore_jf_smem(i32 %a) {
+ %mem = call i32* @llvm.tpu.alloca.smem(i32 4193)
+ %arrayidx = getelementptr inbounds i32, i32* %mem, i32 %a
+ store i32 0, i32* %arrayidx, align 4
+ ret void
+}
+
+; CHECK: Scoped allocation overflow.
+define void @scoped_allocation_overflow_tensorcore_jf_vmem(i32 %a) {
+ %mem = call <1024 x i32> addrspace(205)* @llvm.tpu.allocate.vmem(i32 65537, i32 0)
+ %arrayidx = getelementptr inbounds <1024 x i32>, <1024 x i32> addrspace(205)* %mem, i32 %a
+ ; We don't write the store here for readability, due to the long vector type.
+ ; The test works without it.
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scoped_alloc_error_tc_pf.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scoped_alloc_error_tc_pf.ll
new file mode 100644
index 0000000..25bd28f
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scoped_alloc_error_tc_pf.ll
@@ -0,0 +1,28 @@
+; RUN: opt -S -O2 -mcpu=tensorcore-pf -tpu-fatal-mem-alloc-error=false < %s 2>&1 \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+; Test that smem and vmem allocation sizes are checked properly for tensorcore-pf.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32* @llvm.tpu.alloca.smem(i32)
+declare <1024 x i32> addrspace(205)* @llvm.tpu.allocate.vmem(i32, i32)
+
+; CHECK: Scoped allocation overflow.
+define void @scoped_allocation_overflow_tensorcore_pf_smem(i32 %a) {
+ %mem = call i32* @llvm.tpu.alloca.smem(i32 262145)
+ %arrayidx = getelementptr inbounds i32, i32* %mem, i32 %a
+ store i32 0, i32* %arrayidx, align 4
+ ret void
+}
+
+; CHECK: Scoped allocation overflow.
+define void @scoped_allocation_overflow_tensorcore_pf_vmem(i32 %a) {
+ %mem = call <1024 x i32> addrspace(205)* @llvm.tpu.allocate.vmem(i32 65537, i32 0)
+ %arrayidx = getelementptr inbounds <1024 x i32>, <1024 x i32> addrspace(205)* %mem, i32 %a
+ ; We don't write the store here for readability, due to the long vector type.
+ ; The test works without it.
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scoped_alloc_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scoped_alloc_sc.ll
new file mode 100644
index 0000000..b700640
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scoped_alloc_sc.ll
@@ -0,0 +1,674 @@
+; RUN: opt -S -O2 -mcpu=sparsecore-tec-vf < %s \
+; RUN: | llc -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32* @llvm.tpu.alloca.smem(i32)
+declare i32* @llvm.tpu.allocate.smem(i32, i32)
+declare i32 addrspace(201)* @llvm.tpu.alloca.tilespmem(i32)
+declare i32 addrspace(201)* @llvm.tpu.allocate.tilespmem(i32, i32)
+declare i32 addrspace(202)* @llvm.tpu.alloca.spmem(i32)
+declare i32 addrspace(202)* @llvm.tpu.allocate.spmem(i32, i32)
+declare i32 addrspace(208)* @llvm.tpu.allocate.dreg(i32, i32)
+declare i32 addrspace(203)* @llvm.tpu.allocate.hbm(i32, i32)
+declare i32 addrspace(204)* @llvm.tpu.alloca.sflag(i32)
+declare i32 addrspace(204)* @llvm.tpu.allocate.sflag(i32, i32)
+declare i32 addrspace(210)* @llvm.tpu.allocate.sflag.other(i32, i32)
+declare i32 addrspace(211)* @llvm.tpu.allocate.sflag.any(i32, i32)
+declare i32 addrspace(212)* @llvm.tpu.allocate.smem.any(i32, i32)
+declare i32 addrspace(213)* @llvm.tpu.allocate.hbm.any(i32, i32)
+declare i32 addrspace(214)* @llvm.tpu.allocate.timem(i32, i32)
+declare i32 addrspace(203)* @llvm.tpu.allocate.dyn.hbm(i32, i32)
+declare i32 addrspace(216)* @llvm.tpu.allocate.dyn.iova(i32, i32)
+declare i32 addrspace(216)* @llvm.tpu.allocate.iova(i32, i32)
+declare void @llvm.tpu.end.allocation.scope()
+declare void @llvm.tpu.dma.descriptor(i32 *) nounwind
+declare i32 @llvm.tpu.ptrtoint.pi32(i32*)
+declare i32* @llvm.tpu.inttoptr.pi32(i32)
+declare void @llvm.tpu.sst(i32, i32 addrspace(209)*, i32)
+declare i32 addrspace(211)* @llvm.tpu.addrspacecast.scs(i32 addrspace(204)*)
+declare void @llvm.tpu.dma.hbm.to.hbm.sc.simple(i32 addrspace(211)*, i32 addrspace(203)*, i32 addrspace(203)*, i32, i32) argmemonly nounwind
+
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-vf" }
+
+; Test that the compiler is able to resolve the aliasing and re-order all the
+; loads before the store.
+; CHECK-LABEL: buffer:
+; CHECK: sld
+; CHECK: sld
+; CHECK: sld
+; CHECK: sadd.s32
+; CHECK: sadd.s32
+; CHECK: sst
+; CHECK: sst
+; CHECK: _ = shalt
+define void @buffer(i32 %a) #2 {
+entry:
+ %0 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %1 = call i32* @llvm.tpu.alloca.smem(i32 4)
+ %2 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %arrayidx = getelementptr inbounds i32, i32* %0, i32 0
+ %3 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %3, 2
+ %arrayidx1 = getelementptr inbounds i32, i32* %1, i32 %a
+ store i32 %add, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %0, i32 1
+ %4 = load i32, i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %0, i32 2
+ %5 = load i32, i32* %arrayidx3, align 4
+ %add4 = add nsw i32 %4, %5
+ %arrayidx5 = getelementptr inbounds i32, i32* %2, i32 2
+ store i32 %add4, i32* %arrayidx5, align 4
+ ret void
+}
+
+; Same test as above, but with multiple address spaces. Test that the compiler
+; is able to resolve the aliasing and re-order all the loads before the store.
+; A spmem allocation has been added to model how spmem is used; unlike in this
+; test, we currently expect only the scs core to use it, not the tec.
+; CHECK-LABEL: buffer_multiaddr:
+; CHECK: sld
+; CHECK: sld
+; CHECK: sld
+; CHECK: vadd.s32
+; CHECK: vbroadcast
+; CHECK: sadd.s32
+; CHECK: vst
+; CHECK: sst
+; CHECK: _ = shalt
+define void @buffer_multiaddr(i32 %a) #2 {
+entry:
+ %0 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %1 = call i32 addrspace(201)* @llvm.tpu.alloca.tilespmem(i32 32)
+ %b1 = bitcast i32 addrspace(201)* %1 to <8 x i32> addrspace(201)*
+ %2 = call i32 addrspace(202)* @llvm.tpu.alloca.spmem(i32 32)
+ %b2 = bitcast i32 addrspace(202)* %2 to <8 x i32> addrspace(202)*
+ %3 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %arrayidx = getelementptr inbounds i32, i32* %0, i32 0
+ %4 = load i32, i32* %arrayidx, align 4
+ %5 = insertelement <8 x i32> undef, i32 %4, i32 0
+ %6 = shufflevector <8 x i32> %5, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %add = add nsw <8 x i32> %6, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ %arrayidx1 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %b1, i32 %a
+ store <8 x i32> %add, <8 x i32> addrspace(201)* %arrayidx1, align 32
+ %arrayidx2 = getelementptr inbounds i32, i32* %0, i32 1
+ %spaddr = ptrtoint <8 x i32> addrspace(202)* %b2 to i32
+ store i32 %spaddr, i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %0, i32 2
+ %7 = load i32, i32* %arrayidx3, align 4
+ %arrayidx4 = getelementptr inbounds i32, i32* %0, i32 3
+ %8 = load i32, i32* %arrayidx4, align 4
+ %add4 = add nsw i32 %7, %8
+ %arrayidx5 = getelementptr inbounds i32, i32* %3, i32 2
+ store i32 %add4, i32* %arrayidx5, align 4
+ ret void
+}
+
+; Test that loads and stores in allocations that alias each other cannot be
+; re-ordered. Allocation %7 aliases %0, %1 and %2 since it covers the range [5, 14].
+; CHECK-LABEL: EndScope:
+; CHECK: sld
+; CHECK: sld
+; CHECK: sld
+; CHECK: sst
+; CHECK: sst
+; CHECK: sld
+; CHECK: sld
+; CHECK: sld
+; CHECK: sst
+; CHECK: sst
+; CHECK: _ = shalt
+define dso_local void @EndScope(i32 %off1, i32 %off2, i32 %off3, i32 %off4) #2 {
+entry:
+ %0 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %1 = call i32* @llvm.tpu.alloca.smem(i32 4)
+ %2 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %arrayidx = getelementptr inbounds i32, i32* %0, i32 0
+ %3 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %3, 2
+ %arrayidx1 = getelementptr inbounds i32, i32* %1, i32 %off1
+ store i32 %add, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %0, i32 1
+ %4 = load i32, i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %0, i32 2
+ %5 = load i32, i32* %arrayidx3, align 4
+ %add4 = add nsw i32 %4, %5
+ %arrayidx5 = getelementptr inbounds i32, i32* %2, i32 %off2
+ store i32 %add4, i32* %arrayidx5, align 4
+ call void @llvm.tpu.end.allocation.scope()
+ %6 = call i32* @llvm.tpu.alloca.smem(i32 4)
+ %7 = call i32* @llvm.tpu.alloca.smem(i32 10)
+ %8 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %arrayidx9 = getelementptr inbounds i32, i32* %7, i32 0
+ %9 = load i32, i32* %arrayidx9, align 4
+ %add10 = add nsw i32 %9, 2
+ %arrayidx11 = getelementptr inbounds i32, i32* %6, i32 %off3
+ store i32 %add10, i32* %arrayidx11, align 4
+ %arrayidx12 = getelementptr inbounds i32, i32* %7, i32 1
+ %10 = load i32, i32* %arrayidx12, align 4
+ %arrayidx13 = getelementptr inbounds i32, i32* %7, i32 2
+ %11 = load i32, i32* %arrayidx13, align 4
+ %add14 = add nsw i32 %10, %11
+ %arrayidx15 = getelementptr inbounds i32, i32* %8, i32 %off4
+ store i32 %add14, i32* %arrayidx15, align 4
+ call void @llvm.tpu.end.allocation.scope()
+ ret void
+}
+
+; Test that loads/stores across different scopes don't alias if their ranges
+; don't overlap. Here %7 doesn't alias any other allocation.
+; CHECK-LABEL: NoAliasAcrossScope:
+; CHECK: sld
+; CHECK: sld
+; CHECK: sld
+; CHECK: sld
+; CHECK: sld
+; CHECK: sld
+; CHECK: sst
+; CHECK: sst
+; CHECK: sst
+; CHECK: sst
+; CHECK: _ = shalt
+define dso_local void @NoAliasAcrossScope(i32 %off1, i32 %off2, i32 %off3, i32 %off4) #2 {
+entry:
+ %0 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %1 = call i32* @llvm.tpu.alloca.smem(i32 4)
+ %2 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %arrayidx = getelementptr inbounds i32, i32* %0, i32 0
+ %3 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %3, 2
+ %arrayidx1 = getelementptr inbounds i32, i32* %1, i32 %off1
+ store i32 %add, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %0, i32 1
+ %4 = load i32, i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %0, i32 2
+ %5 = load i32, i32* %arrayidx3, align 4
+ %add4 = add nsw i32 %4, %5
+ %arrayidx5 = getelementptr inbounds i32, i32* %2, i32 %off2
+ store i32 %add4, i32* %arrayidx5, align 4
+ call void @llvm.tpu.end.allocation.scope()
+ %6 = call i32* @llvm.tpu.alloca.smem(i32 20)
+ %7 = call i32* @llvm.tpu.alloca.smem(i32 4)
+ %8 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %arrayidx9 = getelementptr inbounds i32, i32* %7, i32 0
+ %9 = load i32, i32* %arrayidx9, align 4
+ %add10 = add nsw i32 %9, 2
+ %arrayidx11 = getelementptr inbounds i32, i32* %6, i32 %off3
+ store i32 %add10, i32* %arrayidx11, align 4
+ %arrayidx12 = getelementptr inbounds i32, i32* %7, i32 1
+ %10 = load i32, i32* %arrayidx12, align 4
+ %arrayidx13 = getelementptr inbounds i32, i32* %7, i32 2
+ %11 = load i32, i32* %arrayidx13, align 4
+ %add14 = add nsw i32 %10, %11
+ %arrayidx15 = getelementptr inbounds i32, i32* %8, i32 %off4
+ store i32 %add14, i32* %arrayidx15, align 4
+ call void @llvm.tpu.end.allocation.scope()
+ ret void
+}
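+
+; Assuming the scoped allocator packs buffers back to back from the same base
+; in each scope, the second scope places %6, %7 and %8 at relative words
+; [0, 20), [20, 24) and [24, 29); since the first scope only used [0, 14),
+; %7 stays clear of every first-scope buffer.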
+
+; Same test as the first function, but with fixed, pre-assigned offsets. Make
+; sure the loads are reordered before the stores.
+; CHECK-LABEL: buffer_prefix:
+; CHECK: sld
+; CHECK: sld
+; CHECK: sld
+; CHECK: sadd.s32
+; CHECK: sadd.s32
+; CHECK: sst
+; CHECK: sst
+; CHECK: _ = shalt
+define void @buffer_prefix(i32 %a) #2 {
+entry:
+ %0 = call i32* @llvm.tpu.allocate.smem(i32 5, i32 0)
+ %1 = call i32* @llvm.tpu.allocate.smem(i32 4, i32 5)
+ %2 = call i32* @llvm.tpu.allocate.smem(i32 5, i32 9)
+ %arrayidx = getelementptr inbounds i32, i32* %0, i32 0
+ %3 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %3, 2
+ %arrayidx1 = getelementptr inbounds i32, i32* %1, i32 %a
+ store i32 %add, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %0, i32 1
+ %4 = load i32, i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %0, i32 2
+ %5 = load i32, i32* %arrayidx3, align 4
+ %add4 = add nsw i32 %4, %5
+ %arrayidx5 = getelementptr inbounds i32, i32* %2, i32 2
+ store i32 %add4, i32* %arrayidx5, align 4
+ ret void
+}
+
+; Same test as above but with multiple address spaces. Make sure the loads
+; are reordered before the stores across the different address spaces.
+; CHECK-LABEL: buffer_prefix_multiaddr:
+; CHECK: sld
+; CHECK: sld
+; CHECK: sld
+; CHECK: vadd.s32
+; CHECK: sadd.s32
+; CHECK: vbroadcast
+; CHECK: sst
+; CHECK: vst
+; CHECK: _ = shalt
+define void @buffer_prefix_multiaddr(i32 %a) #2 {
+entry:
+ %0 = call i32* @llvm.tpu.allocate.smem(i32 5, i32 0)
+ %1 = call i32 addrspace(201)* @llvm.tpu.allocate.tilespmem(i32 32, i32 5)
+ %b1 = bitcast i32 addrspace(201)* %1 to <8 x i32> addrspace(201)*
+ %2 = call i32* @llvm.tpu.allocate.smem(i32 5, i32 9)
+ %arrayidx = getelementptr inbounds i32, i32* %0, i32 0
+ %3 = load i32, i32* %arrayidx, align 4
+ %4 = insertelement <8 x i32> undef, i32 %3, i32 0
+ %5 = shufflevector <8 x i32> %4, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %add = add nsw <8 x i32> %5, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ %arrayidx1 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %b1, i32 %a
+ store <8 x i32> %add, <8 x i32> addrspace(201)* %arrayidx1, align 32
+ %arrayidx2 = getelementptr inbounds i32, i32* %0, i32 1
+ %6 = load i32, i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %0, i32 2
+ %7 = load i32, i32* %arrayidx3, align 4
+ %add4 = add nsw i32 %6, %7
+ %arrayidx5 = getelementptr inbounds i32, i32* %2, i32 2
+ store i32 %add4, i32* %arrayidx5, align 4
+ ret void
+}
+
+; Test that loads and stores in allocations that alias each other cannot be
+; re-ordered. Allocation %7 aliases %0, %1 and %2 since it covers the range [5, 14].
+; CHECK-LABEL: TwoScopes:
+; CHECK: sld
+; CHECK: sld
+; CHECK: sld
+; CHECK: sst
+; CHECK: sst
+; CHECK: sld
+; CHECK: sld
+; CHECK: sld
+; CHECK: sst
+; CHECK: sst
+; CHECK: _ = shalt
+define dso_local void @TwoScopes(i32 %off1, i32 %off2, i32 %off3, i32 %off4) #2 {
+entry:
+ %0 = call i32* @llvm.tpu.allocate.smem(i32 5, i32 0)
+ %1 = call i32* @llvm.tpu.allocate.smem(i32 4, i32 5)
+ %2 = call i32* @llvm.tpu.allocate.smem(i32 5, i32 9)
+ %arrayidx = getelementptr inbounds i32, i32* %0, i32 0
+ %3 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %3, 2
+ %arrayidx1 = getelementptr inbounds i32, i32* %1, i32 %off1
+ store i32 %add, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %0, i32 1
+ %4 = load i32, i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %0, i32 2
+ %5 = load i32, i32* %arrayidx3, align 4
+ %add4 = add nsw i32 %4, %5
+ %arrayidx5 = getelementptr inbounds i32, i32* %2, i32 %off2
+ store i32 %add4, i32* %arrayidx5, align 4
+ %6 = call i32* @llvm.tpu.allocate.smem(i32 4, i32 0)
+ %7 = call i32* @llvm.tpu.allocate.smem(i32 10, i32 4)
+ %8 = call i32* @llvm.tpu.allocate.smem(i32 5, i32 14)
+ %arrayidx9 = getelementptr inbounds i32, i32* %7, i32 0
+ %9 = load i32, i32* %arrayidx9, align 4
+ %add10 = add nsw i32 %9, 2
+ %arrayidx11 = getelementptr inbounds i32, i32* %6, i32 %off3
+ store i32 %add10, i32* %arrayidx11, align 4
+ %arrayidx12 = getelementptr inbounds i32, i32* %7, i32 1
+ %10 = load i32, i32* %arrayidx12, align 4
+ %arrayidx13 = getelementptr inbounds i32, i32* %7, i32 2
+ %11 = load i32, i32* %arrayidx13, align 4
+ %add14 = add nsw i32 %10, %11
+ %arrayidx15 = getelementptr inbounds i32, i32* %8, i32 %off4
+ store i32 %add14, i32* %arrayidx15, align 4
+ ret void
+}
+
+; Test that (v)ld/stores in allocations from different address spaces can be
+; re-ordered even if their offsets would alias were they in the same address
+; space. Allocation %6 would alias %0, %1 and %2 if it were in the same address space.
+; CHECK-LABEL: TwoScopes_multiaddr:
+; CHECK: vld
+; CHECK: vld
+; CHECK: sld
+; CHECK: vld
+; CHECK: sld
+; CHECK: sld
+; CHECK: sst
+; CHECK: vst
+; CHECK: sst
+; CHECK: vst
+; CHECK: _ = shalt
+define dso_local void @TwoScopes_multiaddr(i32 %off1, i32 %off2, i32 %off3, i32 %off4) #2 {
+entry:
+ %0 = call i32* @llvm.tpu.allocate.smem(i32 5, i32 0)
+ %1 = call i32* @llvm.tpu.allocate.smem(i32 4, i32 5)
+ %2 = call i32* @llvm.tpu.allocate.smem(i32 5, i32 9)
+ %arrayidx = getelementptr inbounds i32, i32* %0, i32 0
+ %3 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %3, 2
+ %arrayidx1 = getelementptr inbounds i32, i32* %1, i32 %off1
+ store i32 %add, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %0, i32 1
+ %4 = load i32, i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %0, i32 2
+ %5 = load i32, i32* %arrayidx3, align 4
+ %add4 = add nsw i32 %4, %5
+ %arrayidx5 = getelementptr inbounds i32, i32* %2, i32 %off2
+ store i32 %add4, i32* %arrayidx5, align 4
+ %6 = call i32 addrspace(201)* @llvm.tpu.allocate.tilespmem(i32 32, i32 0)
+ %7 = call i32 addrspace(201)* @llvm.tpu.allocate.tilespmem(i32 33, i32 32)
+ %8 = call i32 addrspace(201)* @llvm.tpu.allocate.tilespmem(i32 49, i32 55)
+ %b6 = bitcast i32 addrspace(201)* %6 to <8 x i32> addrspace(201)*
+ %b7 = bitcast i32 addrspace(201)* %7 to <8 x i32> addrspace(201)*
+ %b8 = bitcast i32 addrspace(201)* %8 to <8 x i32> addrspace(201)*
+ %arrayidx9 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %b7, i32 0
+ %9 = load <8 x i32>, <8 x i32> addrspace(201)* %arrayidx9, align 32
+ %add10 = add nsw <8 x i32> %9, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ %arrayidx11 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %b6, i32 %off3
+ store <8 x i32> %add10, <8 x i32> addrspace(201)* %arrayidx11, align 32
+ %arrayidx12 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %b7, i32 1
+ %10 = load <8 x i32>, <8 x i32> addrspace(201)* %arrayidx12, align 32
+ %arrayidx13 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %b7, i32 2
+ %11 = load <8 x i32>, <8 x i32> addrspace(201)* %arrayidx13, align 32
+ %add14 = add nsw <8 x i32> %10, %11
+ %arrayidx15 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %b8, i32 %off4
+ store <8 x i32> %add14, <8 x i32> addrspace(201)* %arrayidx15, align 32
+ ret void
+}
+
+; Make sure we mark as noalias only intrinsics marked as IntrArgMemOnly.
+; Here the DMA can alias other allocations even though it only depends on
+; %desc, because this intrinsic can access memory that is not based on its
+; arguments.
+; CHECK-LABEL: buffer_dma:
+; CHECK: dma.desc
+; CHECK: sld
+; CHECK: sld
+; CHECK: sld
+; CHECK: sadd.s32
+; CHECK: sadd.s32
+; CHECK: sst
+; CHECK: sst
+; CHECK: _ = shalt
+define void @buffer_dma(i32 %a) #2 {
+entry:
+ %0 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %1 = call i32* @llvm.tpu.alloca.smem(i32 4)
+ %2 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %desc = call i32* @llvm.tpu.alloca.smem(i32 5)
+ call void @llvm.tpu.dma.descriptor(i32* %desc)
+ %arrayidx = getelementptr inbounds i32, i32* %0, i32 0
+ %3 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %3, 2
+ %arrayidx1 = getelementptr inbounds i32, i32* %1, i32 %a
+ store i32 %add, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %0, i32 1
+ %4 = load i32, i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %0, i32 2
+ %5 = load i32, i32* %arrayidx3, align 4
+ %add4 = add nsw i32 %4, %5
+ %arrayidx5 = getelementptr inbounds i32, i32* %2, i32 2
+ store i32 %add4, i32* %arrayidx5, align 4
+ ret void
+}
+
+; Test that the compiler can handle extra cast in the IR.
+; CHECK-LABEL: buffer_cast:
+; CHECK: sld
+; CHECK: sld
+; CHECK: sld
+; CHECK: sadd.s32
+; CHECK: sadd.s32
+; CHECK: sst
+; CHECK: sst
+; CHECK: _ = shalt
+define void @buffer_cast(i32 %a) #2 {
+entry:
+ %0 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %1 = call i32* @llvm.tpu.alloca.smem(i32 4)
+ %2 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %arrayidx = getelementptr inbounds i32, i32* %0, i32 0
+ %3 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %3, 2
+ %arrayidx1 = getelementptr inbounds i32, i32* %1, i32 %a
+ store i32 %add, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %0, i32 1
+ %4 = load i32, i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %0, i32 2
+ %5 = load i32, i32* %arrayidx3, align 4
+ %add4 = add nsw i32 %4, %5
+ %c1 = bitcast i32* %2 to float*
+ %c2 = getelementptr float, float* %c1, i32 0
+ %c3 = bitcast float* %c2 to i32*
+ %c4 = bitcast i32* %c3 to float*
+ %c5 = getelementptr float, float* %c4, i32 120
+ %c6 = bitcast float* %c5 to i32*
+ %c7 = call i32 @llvm.tpu.ptrtoint.pi32(i32* %c6)
+ %c8 = call i32* @llvm.tpu.inttoptr.pi32(i32 %c7)
+ %c9 = bitcast i32* %c8 to float*
+ %c10 = getelementptr float, float* %c9, i32 0
+ %c11 = bitcast float* %c10 to i32*
+ %arrayidx5 = getelementptr inbounds i32, i32* %c11, i32 2
+ store i32 %add4, i32* %arrayidx5, align 4
+ ret void
+}
+
+; Test that the compiler can allocate, read and write dregs.
+; CHECK-LABEL: allocate_dreg:
+; CHECK: rddreg
+; CHECK: wrdreg
+; CHECK: wrdreg
+; CHECK: _ = shalt
+define void @allocate_dreg() #2 {
+entry:
+ %0 = call i32 addrspace(208)* @llvm.tpu.allocate.dreg(i32 3, i32 0)
+ %1 = getelementptr inbounds i32, i32 addrspace(208)* %0, i32 0
+ %2 = load i32, i32 addrspace(208)* %1, align 4
+ %3 = add nsw i32 %2, 7
+ %4 = getelementptr inbounds i32, i32 addrspace(208)* %0, i32 0
+ store i32 %3, i32 addrspace(208)* %4, align 4
+ %5 = getelementptr inbounds i32, i32 addrspace(208)* %0, i32 1
+ store i32 11, i32 addrspace(208)* %5, align 4
+ ret void
+}
+
+; Test that the compiler is able to use the metadata as a bump pointer
+; offset, and that the fixed allocations don't overlap the scoped ones
+; and crash the compiler.
+; CHECK-LABEL: buffer_mix:
+; CHECK: _ = shalt
+define void @buffer_mix(i32 %a) #2 {
+entry:
+ %0 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %1 = call i32* @llvm.tpu.alloca.smem(i32 4)
+ %2 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %3 = call i32* @llvm.tpu.allocate.smem(i32 1, i32 4)
+ %4 = call i32* @llvm.tpu.allocate.smem(i32 5, i32 1)
+ %arrayidx = getelementptr inbounds i32, i32* %0, i32 0
+ %5 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %5, 2
+ %arrayidx1 = getelementptr inbounds i32, i32* %1, i32 %a
+ store i32 %add, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %0, i32 1
+ %6 = load i32, i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %0, i32 2
+ %7 = load i32, i32* %arrayidx3, align 4
+ %add4 = add nsw i32 %6, %7
+ %arrayidx5 = getelementptr inbounds i32, i32* %4, i32 3
+ store i32 %add4, i32* %arrayidx5, align 4
+ %arrayidx4 = getelementptr inbounds i32, i32* %0, i32 3
+ %8 = load i32, i32* %arrayidx4, align 4
+ %add5 = add nsw i32 %add4, %8
+ %arrayidx6 = getelementptr inbounds i32, i32* %3, i32 1
+ store i32 %add5, i32* %arrayidx6, align 4
+ ret void
+}
+
+; Test that the compiler can allocate sflag and HBM with an offset.
+; CHECK-LABEL: allocate_hbm_and_sflag:
+; CHECK: s[[SRC:[0-9]+]] = simm.s32 $0x40
+; CHECK: s[[SFLAG:[0-9]+]] = simm.s32 $0x5
+; CHECK: s[[DST:[0-9]+]] = simm.s32 $0x30
+; CHECK: [hbm:s[[DST]]], [sflag:s[[SFLAG]]] = dma.local [hbm:s[[SRC]]], $0x8
+define void @allocate_hbm_and_sflag() #2 {
+entry:
+ %0 = call i32 addrspace(204)* @llvm.tpu.allocate.sflag(i32 1, i32 5)
+ %1 = call i32 addrspace(203)* @llvm.tpu.allocate.hbm(i32 8, i32 64)
+ %2 = call i32 addrspace(203)* @llvm.tpu.allocate.hbm(i32 8, i32 48)
+ %dst_sflag = tail call i32 addrspace(211)* @llvm.tpu.addrspacecast.scs(i32 addrspace(204)* %0)
+ call void @llvm.tpu.dma.hbm.to.hbm.sc.simple(i32 addrspace(211)* %dst_sflag, i32 addrspace(203)* %1, i32 addrspace(203)* %2, i32 8, i32 0)
+ ret void
+}
+
+; Test that the compiler can stack-allocate an sflag.
+; CHECK-LABEL: alloca_sflag:
+; CHECK: s{{[0-9]+}} = simm.s32 $0x{{[0-9]+}}
+define i32 addrspace(204)* @alloca_sflag() {
+entry:
+ %0 = call i32 addrspace(204)* @llvm.tpu.alloca.sflag(i32 1)
+ ret i32 addrspace(204)* %0
+}
+
+; Test that the compiler can allocate the "other" core's sflag with an offset.
+; CHECK-LABEL: allocate_other_sflag:
+; CHECK: s{{[0-9]+}} = simm.s32 $0x5
+define i32 addrspace(210)* @allocate_other_sflag() #2 {
+entry:
+ %0 = call i32 addrspace(210)* @llvm.tpu.allocate.sflag.other(i32 1, i32 5)
+ ret i32 addrspace(210)* %0
+}
+
+; Test that the compiler can allocate the "any" remote sflag with an offset.
+; CHECK-LABEL: allocate_any_sflag:
+; CHECK: s{{[0-9]+}} = simm.s32 $0x5
+define i32 addrspace(211)* @allocate_any_sflag() #2 {
+entry:
+ %0 = call i32 addrspace(211)* @llvm.tpu.allocate.sflag.any(i32 1, i32 5)
+ ret i32 addrspace(211)* %0
+}
+
+; Test that the compiler can allocate the "any" remote hbm with an offset.
+; CHECK-LABEL: allocate_any_hbm:
+; CHECK: s{{[0-9]+}} = simm.s32 $0x64
+define i32 addrspace(213)* @allocate_any_hbm() #2 {
+entry:
+ %0 = call i32 addrspace(213)* @llvm.tpu.allocate.hbm.any(i32 1, i32 100)
+ ret i32 addrspace(213)* %0
+}
+
+; Test that the compiler can allocate the "any" remote smem with an offset.
+; CHECK-LABEL: allocate_any_smem:
+; CHECK: s{{[0-9]+}} = simm.s32 $0xa
+define i32 addrspace(212)* @allocate_any_smem() #2 {
+entry:
+ %0 = call i32 addrspace(212)* @llvm.tpu.allocate.smem.any(i32 1, i32 10)
+ ret i32 addrspace(212)* %0
+}
+
+; Test that the compiler can allocate on timem from scs.
+; CHECK-LABEL: allocate_timem_from_scs:
+; CHECK: s{{[0-9]+}} = simm.s32 $0x5
+define i32 addrspace(214)* @allocate_timem_from_scs() #0 {
+entry:
+ %0 = call i32 addrspace(214)* @llvm.tpu.allocate.timem(i32 1, i32 5)
+ ret i32 addrspace(214)* %0
+}
+
+; Test that the compiler can allocate on timem from tec.
+; CHECK-LABEL: allocate_timem_from_tec:
+; CHECK: s{{[0-9]+}} = simm.s32 $0x5
+define i32 addrspace(214)* @allocate_timem_from_tec() #2 {
+entry:
+ %0 = call i32 addrspace(214)* @llvm.tpu.allocate.timem(i32 1, i32 5)
+ ret i32 addrspace(214)* %0
+}
+
+; Test that the compiler can use dynamic HBM allocation.
+; CHECK-LABEL: allocate_dyn_hbm:
+; CHECK-NEXT: s{{[0-9]+}} = sadd.s32 $0x3, s{{[0-9]+}}
+; CHECK-NEXT: _ = shalt
+define i32 @allocate_dyn_hbm(i32 %a) #2 {
+entry:
+ %0 = call i32 addrspace(203)* @llvm.tpu.allocate.dyn.hbm(i32 10, i32 %a)
+ %1 = ptrtoint i32 addrspace(203)* %0 to i32
+ %2 = add i32 %1, 3
+ ret i32 %2
+}
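+
+; The dynamic allocation presumably lowers to its runtime offset argument %a,
+; so the ptrtoint plus add folds into the single sadd.s32 checked above.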
+
+; This simply tests that we can allocate something again after the
+; dynamic allocation above.
+; CHECK-LABEL: allocate_any_smem_again:
+; CHECK: s{{[0-9]+}} = simm.s32 $0xa
+define i32 addrspace(212)* @allocate_any_smem_again() #2 {
+entry:
+ %0 = call i32 addrspace(212)* @llvm.tpu.allocate.smem.any(i32 1, i32 10)
+ ret i32 addrspace(212)* %0
+}
+
+; CHECK-LABEL: allocate_iova:
+; CHECK: s{{[0-9]+}} = simm.s32 $0x80
+define i32 addrspace(216)* @allocate_iova() #2 {
+entry:
+ %0 = call i32 addrspace(216)* @llvm.tpu.allocate.iova(i32 1, i32 128)
+ ret i32 addrspace(216)* %0
+}
+
+; Test that the compiler can use dynamic iova allocation.
+; CHECK-LABEL: allocate_dyn_iova:
+; CHECK-NEXT: s{{[0-9]+}} = sadd.s32 $0x3, s{{[0-9]+}}
+; CHECK-NEXT: _ = shalt
+define i32 @allocate_dyn_iova(i32 %a) #2 {
+entry:
+ %0 = call i32 addrspace(216)* @llvm.tpu.allocate.dyn.iova(i32 10, i32 %a)
+ %1 = ptrtoint i32 addrspace(216)* %0 to i32
+ %2 = add i32 %1, 3
+ ret i32 %2
+}
+
+!smem.funcs.alloca = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25}
+!smem.start.alloca = !{!128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !80, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128}
+!tilespmem.funcs.alloca = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25}
+!tilespmem.start.alloca = !{!128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128}
+!spmem.funcs.alloca = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25}
+!spmem.start.alloca = !{!128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128}
+!hbm.funcs.alloca = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25}
+!hbm.start.alloca = !{!128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128, !128}
+!sflag.funcs.alloca = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25}
+!sflag.start.alloca = !{!32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !80, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32}
+!sflagother.funcs.alloca = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25}
+!sflagother.start.alloca = !{!32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32, !32}
+
+!0 = !{void (i32)* @buffer}
+!1 = !{void (i32)* @buffer_multiaddr}
+!2 = !{void (i32, i32, i32, i32)* @EndScope}
+!3 = !{void (i32, i32, i32, i32)* @NoAliasAcrossScope}
+!4 = !{void (i32)* @buffer_prefix}
+!5 = !{void (i32)* @buffer_prefix_multiaddr}
+!6 = !{void (i32, i32, i32, i32)* @TwoScopes}
+!7 = !{void (i32, i32, i32, i32)* @TwoScopes_multiaddr}
+!8 = !{void (i32)* @buffer_dma}
+!9 = !{void (i32)* @buffer_cast}
+!10 = !{void ()* @allocate_dreg}
+!11 = !{void (i32)* @buffer_mix}
+!14 = !{void ()* @allocate_hbm_and_sflag}
+!15 = !{i32 addrspace(204)* ()* @alloca_sflag}
+!16 = !{i32 addrspace(210)* ()* @allocate_other_sflag}
+!17 = !{i32 addrspace(211)* ()* @allocate_any_sflag}
+!18 = !{i32 addrspace(213)* ()* @allocate_any_hbm}
+!19 = !{i32 addrspace(212)* ()* @allocate_any_smem}
+!20 = !{i32 addrspace(214)* ()* @allocate_timem_from_scs}
+!21 = !{i32 addrspace(214)* ()* @allocate_timem_from_tec}
+!22 = !{i32 (i32)* @allocate_dyn_hbm}
+!23 = !{i32 addrspace(212)* ()* @allocate_any_smem_again}
+!24 = !{i32 addrspace(216)* ()* @allocate_iova}
+!25 = !{i32 (i32)* @allocate_dyn_iova}
+
+; Not perfect naming, but more readable as constants than !n, !{n+1}, etc.
+!80 = !{i32 8}
+!160 = !{i32 16}
+!32 = !{i32 32}
+!128 = !{i32 128}
+
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scoped_alloc_spill_limits.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scoped_alloc_spill_limits.ll
new file mode 100644
index 0000000..d4632cf
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scoped_alloc_spill_limits.ll
@@ -0,0 +1,63 @@
+; RUN: opt < %s -S -O2 -mcpu=tensorcore-jf | FileCheck %s
+; REQUIRES: tpu
+
+; This test runs on tensorcore-jf, but applies to all targets. It checks that
+; the tpu-mem-alloc pass sets the spill-range metadata.
+
+; CHECK: !smem.funcs.spill = !{!0, !1, !2}
+; CHECK: !smem.ranges.spill.start = !{!3, !4, !5}
+; CHECK: !smem.ranges.spill.limit = !{!6, !6, !6}
+; CHECK: !tilespmem.funcs.spill = !{!0, !1, !2}
+; CHECK: !tilespmem.ranges.spill.start = !{!7, !7, !7}
+; CHECK: !tilespmem.ranges.spill.limit = !{!7, !7, !7}
+; CHECK: !vmem.funcs.spill = !{!0, !1, !2}
+; CHECK: !vmem.ranges.spill.start = !{!8, !9, !10}
+; CHECK: !vmem.ranges.spill.limit = !{!11, !11, !11}
+; CHECK: !3 = !{i32 15}
+; CHECK: !4 = !{i32 17}
+; CHECK: !5 = !{i32 16}
+; CHECK: !6 = !{i32 4192}
+; CHECK: !7 = !{i32 0}
+; CHECK: !8 = !{i32 26}
+; CHECK: !9 = !{i32 24}
+; CHECK: !10 = !{i32 25}
+; CHECK: !11 = !{i32 65536}
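+; The per-function spill start values match each function's scoped allocation
+; sizes (15/17/16 for smem, 26/24/25 for vmem), i.e. spilling presumably begins
+; right after the scoped buffers. The limits 4192 and 65536 presumably reflect
+; the smem and vmem capacities, consistent with the 4193-word smem and
+; 65537-entry vmem requests that overflow in scoped_alloc_error_tc_jf.ll.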
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32* @llvm.tpu.allocate.smem(i32, i32)
+declare <1024 x i32> addrspace(205)* @llvm.tpu.allocate.vmem(i32, i32)
+
+define void @scoped_alocate_gen_spill_limits1(i32 %a) {
+ %mem0 = call i32* @llvm.tpu.allocate.smem(i32 15, i32 0)
+ %arrayidx0 = getelementptr inbounds i32, i32* %mem0, i32 %a
+ store i32 0, i32* %arrayidx0, align 4
+ %mem1 = call <1024 x i32> addrspace(205)* @llvm.tpu.allocate.vmem(i32 26, i32 0)
+ %arrayidx1 = getelementptr inbounds <1024 x i32>, <1024 x i32> addrspace(205)* %mem1, i32 %a
+ ; We intentionally omit the store here for readability, since the vector type
+ ; is long; the test passes without it.
+ ret void
+}
+
+define void @scoped_allocate_gen_spill_limits2(i32 %a) {
+ %mem0 = call i32* @llvm.tpu.allocate.smem(i32 17, i32 0)
+ %arrayidx0 = getelementptr inbounds i32, i32* %mem0, i32 %a
+ store i32 0, i32* %arrayidx0, align 4
+ %mem1 = call <1024 x i32> addrspace(205)* @llvm.tpu.allocate.vmem(i32 24, i32 0)
+ %arrayidx1 = getelementptr inbounds <1024 x i32>, <1024 x i32> addrspace(205)* %mem1, i32 %a
+ ; We intentionally omit the store here for readability, since the vector type
+ ; is long; the test passes without it.
+ ret void
+}
+
+define void @scoped_allocate_gen_spill_limits3(i32 %a) {
+ %mem0 = call i32* @llvm.tpu.allocate.smem(i32 16, i32 0)
+ %arrayidx0 = getelementptr inbounds i32, i32* %mem0, i32 %a
+ store i32 0, i32* %arrayidx0, align 4
+ %mem1 = call <1024 x i32> addrspace(205)* @llvm.tpu.allocate.vmem(i32 25, i32 0)
+ %arrayidx1 = getelementptr inbounds <1024 x i32>, <1024 x i32> addrspace(205)* %mem1, i32 %a
+ ; We intentionally omit the store here for readability, since the vector type
+ ; is long; the test passes without it.
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scoped_multiple_alloc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scoped_multiple_alloc.ll
new file mode 100644
index 0000000..2260393
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scoped_multiple_alloc.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -mcpu=tensorcore-jf -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32) nounwind
+
+; Test that we support loads and stores that belong to multiple alias scopes.
+; CHECK-LABEL: loadafterstorealias:
+; CHECK: [vmem:$0x2] = vst
+; CHECK: vdelay $0x4
+; CHECK: vld [vmem:$0x1]
+define <1024 x i32> @loadafterstorealias() {
+ %ptr0 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 2)
+ %ptr1 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 1)
+ store <1024 x i32> zeroinitializer, <1024 x i32> addrspace(205)* %ptr0, !alias.scope !4, !noalias !5
+ %res = load <1024 x i32>, <1024 x i32> addrspace(205)* %ptr1, !alias.scope !6, !noalias !7
+ ret <1024 x i32> %res
+}
+
+; Test that if the noalias metadata is missing, we conservatively assume the
+; access may alias all the scopes.
+; CHECK-LABEL: no_noalias:
+; CHECK: [vmem:$0x2] = vst
+; CHECK: vdelay $0x4
+; CHECK: vld [vmem:$0x1]
+define <1024 x i32> @no_noalias() {
+ %ptr0 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 2)
+ %ptr1 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 1)
+ store <1024 x i32> zeroinitializer, <1024 x i32> addrspace(205)* %ptr0, !alias.scope !4
+ %res = load <1024 x i32>, <1024 x i32> addrspace(205)* %ptr1, !alias.scope !6
+ ret <1024 x i32> %res
+}
+
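+; !0 is the alias domain, !1-!3 are distinct scopes within it, and !4-!7 are
+; the scope lists attached to the accesses above.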
+!0 = distinct !{!0, !"loadafterstorealias"}
+!1 = distinct !{!1, !0, !"alloc"}
+!2 = distinct !{!2, !0, !"alloc"}
+!3 = distinct !{!3, !0, !"alloc"}
+
+!4 = !{!1}
+!5 = !{!2, !3}
+!6 = !{!1, !2}
+!7 = !{!3}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scs_scoped_alloc_error_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scs_scoped_alloc_error_sc.ll
new file mode 100644
index 0000000..00dfc36
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scs_scoped_alloc_error_sc.ll
@@ -0,0 +1,55 @@
+; RUN: opt -S -O2 -mcpu=sparsecore-scs-vf -tpu-fatal-mem-alloc-error=false < %s 2>&1 \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+; Test that scoped allocation sizes (smem, dreg, sflag, hbm, spmem) are
+; checked properly for sparsecore-scs-vf.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 addrspace(0)* @llvm.tpu.alloca.smem(i32)
+declare i32 addrspace(208)* @llvm.tpu.alloca.dreg(i32)
+declare i32 addrspace(204)* @llvm.tpu.alloca.sflag(i32)
+declare i32 addrspace(203)* @llvm.tpu.alloca.hbm(i32)
+declare i32 addrspace(202)* @llvm.tpu.alloca.spmem(i32)
+
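+; Each function below requests an allocation larger than the corresponding
+; memory can hold (apparently one element past the limit), which should trigger
+; the overflow diagnostic.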
+; CHECK: Scoped allocation overflow.
+define void @scoped_allocation_overflow_sparsecore_scs_smem(i32 %a) {
+ %mem = call i32 addrspace(0)* @llvm.tpu.alloca.smem(i32 16385)
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(0)* %mem, i32 %a
+ store i32 0, i32 addrspace(0)* %arrayidx, align 4
+ ret void
+}
+
+; CHECK: Scoped allocation overflow.
+define void @scoped_allocation_overflow_sparsecore_scs_dreg(i32 %a) {
+ %mem = call i32 addrspace(208)* @llvm.tpu.alloca.dreg(i32 33)
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(208)* %mem, i32 %a
+ store i32 0, i32 addrspace(208)* %arrayidx, align 4
+ ret void
+}
+
+; CHECK: Scoped allocation overflow.
+define void @scoped_allocation_overflow_sparsecore_scs_sflag(i32 %a) {
+ %mem = call i32 addrspace(204)* @llvm.tpu.alloca.sflag(i32 7169)
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(204)* %mem, i32 %a
+ store i32 0, i32 addrspace(204)* %arrayidx, align 4
+ ret void
+}
+
+; CHECK: Scoped allocation overflow.
+define void @scoped_allocation_overflow_sparsecore_scs_hbm(i32 %a) {
+ %mem = call i32 addrspace(203)* @llvm.tpu.alloca.hbm(i32 3221225473)
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(203)* %mem, i32 %a
+ store i32 0, i32 addrspace(203)* %arrayidx, align 4
+ ret void
+}
+
+; CHECK: Scoped allocation overflow.
+define void @scoped_allocation_overflow_sparsecore_scs_spmem(i32 %a) {
+ %mem = call i32 addrspace(202)* @llvm.tpu.alloca.spmem(i32 2097153)
+ %memc = bitcast i32 addrspace(202)* %mem to <8 x i32> addrspace(202)*
+ %arrayidx = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(202)* %memc, i32 %a
+ store <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <8 x i32> addrspace(202)* %arrayidx, align 32
+ ret void
+}
\ No newline at end of file
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scs_trampoline.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scs_trampoline.ll
new file mode 100644
index 0000000..fa51c56
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/scs_trampoline.ll
@@ -0,0 +1,78 @@
+; RUN: opt -S -O2 -mcpu=sparsecore-scs-vf < %s | \
+; RUN: llc -mcpu=sparsecore-scs-vf -asm-verbose=false -disable-cgp \
+; RUN: -tpu-opt-spill-to-dreg=false -tpu-fatal-verifier-error=false 2>&1 \
+; RUN: | FileCheck %s
+
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; This tests the fastcc call semantics on Sparsecore.
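+; The fastcc callees here are declared noreturn and take no arguments, so the
+; calls lower to a bare "_ = call" with no link register use; regular calls go
+; through lr and save/restore live values around the call (see the CHECK lines
+; below).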
+
+declare fastcc void @fast_task() nounwind noreturn
+declare fastcc void @fast_task_arg(i32 %a) nounwind noreturn
+declare void @task() nounwind
+declare void @task_arg(i32 %a) nounwind
+
+; This check belongs to scs_arg, but will show up before the asm
+; in the compiler output.
+; CHECK: No arguments supported on TPU fastcc.
+
+; Tests that we can lower a fastcc call.
+
+; CHECK-LABEL: scs:
+; CHECK: { _ = call fast_task }
+define void @scs() #0 section ".text.scs" {
+entry:
+ tail call fastcc void @fast_task()
+ ret void
+}
+
+; Tests that we don't accept any arguments on fastcc calls.
+
+; CHECK-LABEL: scs_arg:
+; See check above.
+define void @scs_arg(i32 %a) #0 section ".text.scs" {
+entry:
+ tail call fastcc void @fast_task_arg(i32 %a)
+ ret void
+}
+
+; Tests that code after noreturn fastcc calls is dead.
+
+; CHECK-LABEL: scs_ret_noreturn:
+; CHECK-NOT: sadd
+; CHECK: { _ = call fast_task }
+define i32 @scs_ret_noreturn(i32 %a) #0 section ".text.scs" {
+entry:
+ %r = add i32 %a, 5
+ tail call fastcc void @fast_task()
+ ret i32 %r
+}
+
+; Tests that code after regular calls is not dead.
+
+; CHECK-LABEL: scs_ret_return:
+; CHECK: { s0 = sadd.s32 $0x5, s0 }
+; CHECK: { [smem:$0x3fff] = sst s0 }
+; CHECK: { lr = call task }
+; CHECK: { s0 = sld [smem:$0x3fff]
+define i32 @scs_ret_return(i32 %a) #0 section ".text.scs" {
+entry:
+ %r = add i32 %a, 5
+ tail call void @task()
+ ret i32 %r
+}
+
+; Tests that we can pass arguments to regular (returning) calls.
+
+; CHECK-LABEL: scs_ret_return_arg:
+; CHECK: { lr = call task_arg }
+define void @scs_ret_return_arg(i32 %a) #0 section ".text.scs" {
+entry:
+ tail call void @task_arg(i32 %a)
+ ret void
+}
+
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/segmented_pathological_super_pass_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/segmented_pathological_super_pass_sc.ll
new file mode 100644
index 0000000..7e824ba
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/segmented_pathological_super_pass_sc.ll
@@ -0,0 +1,1401 @@
+; RUN: opt -S -O2 -mcpu=sparsecore-tec-vf < %s \
+; RUN: | llc -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -tpu-pipeliner-strategy=swingslack -tpu-fixed-vregs=32-63 \
+; RUN: -tpu-enable-pipeliner-super-pass -tpu-pipeliner-annotate-for-testing \
+; RUN: -enable-pre-spill -debug-only=tpu-loop-analysis -tpu-enable-loop-analysis \
+; RUN: -improve-prolog-epilog-aa=false 2>&1 | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; CHECK: Post-RA pipelined loop bb.12 (from bb.2): II=33
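+; (II is the initiation interval, in cycles, of the software-pipelined loop.)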
+
+%"class.embeddings::PointerBase" = type { %"class.embeddings::MemorySpace", %"class.embeddings::BasicType", %"union.embeddings::PointerBase::AnyPtr" }
+%"class.embeddings::MemorySpace" = type { i32 }
+%"class.embeddings::BasicType" = type { i32 }
+%"union.embeddings::PointerBase::AnyPtr" = type { i32* }
+%"class.embeddings::TileSpmemVectorArray" = type { %"class.embeddings::ScratchpadArray" }
+%"class.embeddings::ScratchpadArray" = type { %"class.embeddings::BaseArray" }
+%"class.embeddings::BaseArray" = type { %"class.embeddings::PointerBase", i32 }
+%"class.embeddings::TileSpmemVectorArray.0" = type { %"class.embeddings::ScratchpadArray" }
+%"class.embeddings::SegmentedReduce" = type { i8 }
+%"class.embeddings::TileSpmemPointer" = type { %"class.embeddings::PointerBase" }
+%"struct.embeddings::SortResult" = type { <8 x i32>, <8 x i32>, <8 x i32> }
+%"struct.embeddings::SortResult.1" = type { <8 x i32>, <8 x i32>, <8 x float> }
+%"class.embeddings::SCTY_V8S32" = type { %"class.embeddings::BasicType" }
+%"class.embeddings::SCM_TileSpmem" = type { %"class.embeddings::MemorySpace" }
+%"class.embeddings::SCTY_V8F32" = type { %"class.embeddings::BasicType" }
+
+$_ZN10embeddings14PointerFactoryIPU5AS201Dv8_iE6CreateES3_ = comdat any
+
+$_ZN10embeddings14PointerFactoryIPU5AS201Dv8_fE6CreateES3_ = comdat any
+
+$_ZN10embeddings11PointerBaseC2ERKS0_ = comdat any
+
+$_ZN10embeddings20TileSpmemVectorArrayIiEC2ENS_11PointerBaseEi = comdat any
+
+$_ZN10embeddings20TileSpmemVectorArrayIfEC2ENS_11PointerBaseEi = comdat any
+
+$_ZN10embeddings15SegmentedReduceC2Ev = comdat any
+
+$_ZN10embeddings15SegmentedReduce7ComputeEiNS_20TileSpmemVectorArrayIiEES2_NS1_IfEES3_PS3_ = comdat any
+
+$_ZN10embeddings20TileSpmemVectorArrayIiEC2ERKS1_ = comdat any
+
+$_ZN10embeddings20TileSpmemVectorArrayIfEC2ERKS1_ = comdat any
+
+$_ZN10embeddings11MemorySpaceC2ERKS0_ = comdat any
+
+$_ZN10embeddings9BasicTypeC2ERKS0_ = comdat any
+
+$_ZN10embeddings20TileSpmemVectorArrayIiEclEi = comdat any
+
+$_ZN10embeddings20TileSpmemVectorArrayIfEclEi = comdat any
+
+$_ZN10embeddings10VectorSortIDv8_iS1_EENS_10SortResultIT_T0_EES1_S3_S4_NS_12SortOrderingE = comdat any
+
+$_ZN10embeddings10VectorSortIDv8_iDv8_fEENS_10SortResultIT_T0_EES1_S4_S5_NS_12SortOrderingE = comdat any
+
+$_ZNK10embeddings9BaseArray8ElementsEv = comdat any
+
+$_ZNK10embeddings20TileSpmemVectorArrayIfE4BaseEv = comdat any
+
+$_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv = comdat any
+
+$_ZN10embeddings4CastINS_16TileSpmemPointerENS_11PointerBaseEEENS_15cast_retty_implIT_T0_E8ret_typeERKS5_ = comdat any
+
+$_ZNK10embeddings9BaseArray7BasePtrEv = comdat any
+
+$_ZNK10embeddings16TileSpmemPointer6RawPtrEv = comdat any
+
+$_ZN10embeddings3tpuIDv8_iE8sortascdEPS1_S3_S3_ = comdat any
+
+$_ZN10embeddings3tpuIDv8_iE8sortdscdEPS1_S3_S3_ = comdat any
+
+$_ZN10embeddings3tpuIDv8_iE8sortascdEPS1_S3_PDv8_f = comdat any
+
+$_ZN10embeddings3tpuIDv8_iE8sortdscdEPS1_S3_PDv8_f = comdat any
+
+$_ZN10embeddings15ScratchpadArrayC2ERKS0_ = comdat any
+
+$_ZN10embeddings9BaseArrayC2ERKS0_ = comdat any
+
+$_ZN10embeddings15ScratchpadArrayC2ENS_11PointerBaseEi = comdat any
+
+$_ZN10embeddings9BaseArrayC2ENS_11PointerBaseEi = comdat any
+
+$_ZN10embeddings11ToBasicTypeIDv8_iE10basic_typeEv = comdat any
+
+$_ZN10embeddings16TileSpmemPointerC2EPU5AS201vNS_9BasicTypeE = comdat any
+
+$_ZN10embeddings11PointerBaseC2EOS0_ = comdat any
+
+$_ZN10embeddings10SCTY_V8S32C2Ev = comdat any
+
+$_ZN10embeddings9BasicTypeC2ENS_19SparsecoreBasicTypeE = comdat any
+
+$_ZN10embeddings11PointerBaseC2ENS_9BasicTypeEPU5AS201v = comdat any
+
+$_ZN10embeddings13SCM_TileSpmemC2Ev = comdat any
+
+$_ZN10embeddings11PointerBase6AnyPtrC2EPU5AS201v = comdat any
+
+$_ZN10embeddings11MemorySpaceC2ENS_21SparsecoreMemorySpaceE = comdat any
+
+$_ZN10embeddings11ToBasicTypeIDv8_fE10basic_typeEv = comdat any
+
+$_ZN10embeddings10SCTY_V8F32C2Ev = comdat any
+
+@__sc_scs_entry = dso_local alias i32, bitcast (void ()* @scs to i32*)
+@__sc_tile_access_entry = dso_local alias i32, bitcast (void ()* @tile_access to i32*)
+@__sc_tile_execute_entry = dso_local alias i32, bitcast (void ()* @tile_execute to i32*)
+
+; Function Attrs: mustprogress nounwind
+define dso_local void @tile_access() #0 section ".text.tile_access" {
+ ret void
+}
+
+; Function Attrs: mustprogress
+define dso_local void @tile_execute() #1 section ".text.tile_execute" {
+ %1 = alloca i32, align 4
+ %2 = alloca i32, align 4
+ %3 = alloca i32, align 4
+ %4 = alloca i32, align 4
+ %5 = alloca %"class.embeddings::PointerBase", align 4
+ %6 = alloca %"class.embeddings::PointerBase", align 4
+ %7 = alloca %"class.embeddings::PointerBase", align 4
+ %8 = alloca %"class.embeddings::PointerBase", align 4
+ %9 = alloca %"class.embeddings::PointerBase", align 4
+ %10 = alloca %"class.embeddings::TileSpmemVectorArray", align 4
+ %11 = alloca %"class.embeddings::PointerBase", align 4
+ %12 = alloca %"class.embeddings::TileSpmemVectorArray", align 4
+ %13 = alloca %"class.embeddings::PointerBase", align 4
+ %14 = alloca %"class.embeddings::TileSpmemVectorArray.0", align 4
+ %15 = alloca %"class.embeddings::PointerBase", align 4
+ %16 = alloca i32, align 4
+ %17 = alloca %"class.embeddings::TileSpmemVectorArray.0", align 4
+ %18 = alloca %"class.embeddings::PointerBase", align 4
+ %19 = alloca %"class.embeddings::TileSpmemVectorArray.0", align 4
+ %20 = alloca %"class.embeddings::PointerBase", align 4
+ %21 = alloca %"class.embeddings::SegmentedReduce", align 1
+ %22 = alloca %"class.embeddings::TileSpmemVectorArray", align 4
+ %23 = alloca %"class.embeddings::TileSpmemVectorArray", align 4
+ %24 = alloca %"class.embeddings::TileSpmemVectorArray.0", align 4
+ %25 = alloca %"class.embeddings::TileSpmemVectorArray.0", align 4
+ %26 = bitcast i32* %1 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %26) #7
+ %27 = call noundef i32 @_ZN12_GLOBAL__N_19ParameterEi(i32 noundef 0) #18
+ store i32 %27, i32* %1, align 4, !tbaa !3
+ %28 = bitcast i32* %2 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %28) #7
+ %29 = call noundef i32 @_ZN12_GLOBAL__N_19ParameterEi(i32 noundef 1) #18
+ store i32 %29, i32* %2, align 4, !tbaa !3
+ %30 = bitcast i32* %3 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %30) #7
+ %31 = call noundef i32 @_ZN12_GLOBAL__N_19ParameterEi(i32 noundef 2) #18
+ store i32 %31, i32* %3, align 4, !tbaa !3
+ %32 = bitcast i32* %4 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %32) #7
+ %33 = call noundef i32 @_ZN12_GLOBAL__N_19ParameterEi(i32 noundef 3) #18
+ store i32 %33, i32* %4, align 4, !tbaa !3
+ %34 = bitcast %"class.embeddings::PointerBase"* %5 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 12, i8* %34) #7
+ %35 = call noundef i32 @_ZN12_GLOBAL__N_19ParameterEi(i32 noundef 4) #18
+ %36 = inttoptr i32 %35 to <8 x i32> addrspace(201)*
+ call void @_ZN10embeddings14PointerFactoryIPU5AS201Dv8_iE6CreateES3_(%"class.embeddings::PointerBase"* sret(%"class.embeddings::PointerBase") align 4 %5, <8 x i32> addrspace(201)* noundef %36) #18
+ %37 = bitcast %"class.embeddings::PointerBase"* %6 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 12, i8* %37) #7
+ %38 = call noundef i32 @_ZN12_GLOBAL__N_19ParameterEi(i32 noundef 5) #18
+ %39 = inttoptr i32 %38 to <8 x i32> addrspace(201)*
+ call void @_ZN10embeddings14PointerFactoryIPU5AS201Dv8_iE6CreateES3_(%"class.embeddings::PointerBase"* sret(%"class.embeddings::PointerBase") align 4 %6, <8 x i32> addrspace(201)* noundef %39) #18
+ %40 = bitcast %"class.embeddings::PointerBase"* %7 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 12, i8* %40) #7
+ %41 = call noundef i32 @_ZN12_GLOBAL__N_19ParameterEi(i32 noundef 6) #18
+ %42 = inttoptr i32 %41 to <8 x float> addrspace(201)*
+ call void @_ZN10embeddings14PointerFactoryIPU5AS201Dv8_fE6CreateES3_(%"class.embeddings::PointerBase"* sret(%"class.embeddings::PointerBase") align 4 %7, <8 x float> addrspace(201)* noundef %42) #18
+ %43 = bitcast %"class.embeddings::PointerBase"* %8 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 12, i8* %43) #7
+ %44 = call noundef i32 @_ZN12_GLOBAL__N_19ParameterEi(i32 noundef 7) #18
+ %45 = inttoptr i32 %44 to <8 x float> addrspace(201)*
+ call void @_ZN10embeddings14PointerFactoryIPU5AS201Dv8_fE6CreateES3_(%"class.embeddings::PointerBase"* sret(%"class.embeddings::PointerBase") align 4 %8, <8 x float> addrspace(201)* noundef %45) #18
+ %46 = bitcast %"class.embeddings::PointerBase"* %9 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 12, i8* %46) #7
+ %47 = call noundef i32 @_ZN12_GLOBAL__N_19ParameterEi(i32 noundef 8) #18
+ %48 = inttoptr i32 %47 to <8 x float> addrspace(201)*
+ call void @_ZN10embeddings14PointerFactoryIPU5AS201Dv8_fE6CreateES3_(%"class.embeddings::PointerBase"* sret(%"class.embeddings::PointerBase") align 4 %9, <8 x float> addrspace(201)* noundef %48) #18
+ %49 = bitcast %"class.embeddings::TileSpmemVectorArray"* %10 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %49) #7
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %11, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %5) #18
+ %50 = load i32, i32* %3, align 4, !tbaa !3
+ %51 = sdiv i32 %50, 8
+ call void @_ZN10embeddings20TileSpmemVectorArrayIiEC2ENS_11PointerBaseEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %10, %"class.embeddings::PointerBase"* noundef %11, i32 noundef %51) #18
+ %52 = bitcast %"class.embeddings::TileSpmemVectorArray"* %12 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %52) #7
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %13, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %6) #18
+ %53 = load i32, i32* %3, align 4, !tbaa !3
+ %54 = sdiv i32 %53, 8
+ call void @_ZN10embeddings20TileSpmemVectorArrayIiEC2ENS_11PointerBaseEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %12, %"class.embeddings::PointerBase"* noundef %13, i32 noundef %54) #18
+ %55 = bitcast %"class.embeddings::TileSpmemVectorArray.0"* %14 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %55) #7
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %15, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %7) #18
+ %56 = load i32, i32* %3, align 4, !tbaa !3
+ %57 = sdiv i32 %56, 8
+ call void @_ZN10embeddings20TileSpmemVectorArrayIfEC2ENS_11PointerBaseEi(%"class.embeddings::TileSpmemVectorArray.0"* noundef nonnull align 4 dereferenceable(16) %14, %"class.embeddings::PointerBase"* noundef %15, i32 noundef %57) #18
+ %58 = bitcast i32* %16 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %58) #7
+ %59 = load i32, i32* %3, align 4, !tbaa !3
+ %60 = load i32, i32* %1, align 4, !tbaa !3
+ %61 = add nsw i32 %60, 0
+ %62 = mul nsw i32 %59, %61
+ store i32 %62, i32* %16, align 4, !tbaa !3
+ %63 = bitcast %"class.embeddings::TileSpmemVectorArray.0"* %17 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %63) #7
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %18, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %8) #18
+ %64 = load i32, i32* %16, align 4, !tbaa !3
+ %65 = sdiv i32 %64, 8
+ call void @_ZN10embeddings20TileSpmemVectorArrayIfEC2ENS_11PointerBaseEi(%"class.embeddings::TileSpmemVectorArray.0"* noundef nonnull align 4 dereferenceable(16) %17, %"class.embeddings::PointerBase"* noundef %18, i32 noundef %65) #18
+ %66 = bitcast %"class.embeddings::TileSpmemVectorArray.0"* %19 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 16, i8* %66) #7
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %20, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %9) #18
+ %67 = load i32, i32* %2, align 4, !tbaa !3
+ %68 = load i32, i32* %1, align 4, !tbaa !3
+ %69 = mul nsw i32 %67, %68
+ %70 = sdiv i32 %69, 8
+ call void @_ZN10embeddings20TileSpmemVectorArrayIfEC2ENS_11PointerBaseEi(%"class.embeddings::TileSpmemVectorArray.0"* noundef nonnull align 4 dereferenceable(16) %19, %"class.embeddings::PointerBase"* noundef %20, i32 noundef %70) #18
+ %71 = call i32 addrspace(201)* @llvm.tpu.alloca.tilespmem(i32 32768)
+ %72 = bitcast %"class.embeddings::SegmentedReduce"* %21 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 1, i8* %72) #7
+ call void @_ZN10embeddings15SegmentedReduceC2Ev(%"class.embeddings::SegmentedReduce"* noundef nonnull align 1 dereferenceable(1) %21) #18
+ call void @_ZN10embeddings20TileSpmemVectorArrayIiEC2ERKS1_(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %22, %"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %10) #18
+ call void @_ZN10embeddings20TileSpmemVectorArrayIiEC2ERKS1_(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %23, %"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %12) #18
+ call void @_ZN10embeddings20TileSpmemVectorArrayIfEC2ERKS1_(%"class.embeddings::TileSpmemVectorArray.0"* noundef nonnull align 4 dereferenceable(16) %24, %"class.embeddings::TileSpmemVectorArray.0"* noundef nonnull align 4 dereferenceable(16) %14) #18
+ call void @_ZN10embeddings20TileSpmemVectorArrayIfEC2ERKS1_(%"class.embeddings::TileSpmemVectorArray.0"* noundef nonnull align 4 dereferenceable(16) %25, %"class.embeddings::TileSpmemVectorArray.0"* noundef nonnull align 4 dereferenceable(16) %17) #18
+ call void @_ZN10embeddings15SegmentedReduce7ComputeEiNS_20TileSpmemVectorArrayIiEES2_NS1_IfEES3_PS3_(%"class.embeddings::SegmentedReduce"* noundef nonnull align 1 dereferenceable(1) %21, i32 noundef 8, %"class.embeddings::TileSpmemVectorArray"* noundef %22, %"class.embeddings::TileSpmemVectorArray"* noundef %23, %"class.embeddings::TileSpmemVectorArray.0"* noundef %24, %"class.embeddings::TileSpmemVectorArray.0"* noundef %25, %"class.embeddings::TileSpmemVectorArray.0"* noundef %19) #18
+ call void @_ZN12_GLOBAL__N_16ReturnEii(i32 noundef 1, i32 noundef 0) #18
+ %73 = bitcast %"class.embeddings::SegmentedReduce"* %21 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 1, i8* %73) #7
+ %74 = bitcast %"class.embeddings::TileSpmemVectorArray.0"* %19 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %74) #7
+ %75 = bitcast %"class.embeddings::TileSpmemVectorArray.0"* %17 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %75) #7
+ %76 = bitcast i32* %16 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %76) #7
+ %77 = bitcast %"class.embeddings::TileSpmemVectorArray.0"* %14 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %77) #7
+ %78 = bitcast %"class.embeddings::TileSpmemVectorArray"* %12 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %78) #7
+ %79 = bitcast %"class.embeddings::TileSpmemVectorArray"* %10 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 16, i8* %79) #7
+ %80 = bitcast %"class.embeddings::PointerBase"* %9 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 12, i8* %80) #7
+ %81 = bitcast %"class.embeddings::PointerBase"* %8 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 12, i8* %81) #7
+ %82 = bitcast %"class.embeddings::PointerBase"* %7 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 12, i8* %82) #7
+ %83 = bitcast %"class.embeddings::PointerBase"* %6 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 12, i8* %83) #7
+ %84 = bitcast %"class.embeddings::PointerBase"* %5 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 12, i8* %84) #7
+ %85 = bitcast i32* %4 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %85) #7
+ %86 = bitcast i32* %3 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %86) #7
+ %87 = bitcast i32* %2 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %87) #7
+ %88 = bitcast i32* %1 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %88) #7
+ ret void
+}
+
+; Function Attrs: argmemonly nocallback nofree nosync nounwind willreturn
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #2
+
+; Function Attrs: mustprogress nounwind
+define internal noundef i32 @_ZN12_GLOBAL__N_19ParameterEi(i32 noundef %0) #3 {
+ %2 = alloca i32, align 4
+ %3 = alloca i32*, align 4
+ store i32 %0, i32* %2, align 4, !tbaa !3
+ %4 = bitcast i32** %3 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #7
+ %5 = load i32, i32* %2, align 4, !tbaa !3
+ %6 = add nsw i32 256, %5
+ %7 = inttoptr i32 %6 to i32*
+ store i32* %7, i32** %3, align 4, !tbaa !7
+ %8 = load i32*, i32** %3, align 4, !tbaa !7
+ %9 = load i32, i32* %8, align 4, !tbaa !3
+ %10 = bitcast i32** %3 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %10) #7
+ ret i32 %9
+}
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local void @_ZN10embeddings14PointerFactoryIPU5AS201Dv8_iE6CreateES3_(%"class.embeddings::PointerBase"* noalias sret(%"class.embeddings::PointerBase") align 4 %0, <8 x i32> addrspace(201)* noundef %1) #4 comdat align 2 {
+ %3 = alloca i8*, align 4
+ %4 = alloca <8 x i32> addrspace(201)*, align 4
+ %5 = alloca %"class.embeddings::TileSpmemPointer", align 4
+ %6 = alloca %"class.embeddings::BasicType", align 4
+ %7 = bitcast %"class.embeddings::PointerBase"* %0 to i8*
+ store i8* %7, i8** %3, align 4
+ store <8 x i32> addrspace(201)* %1, <8 x i32> addrspace(201)** %4, align 4, !tbaa !7
+ %8 = bitcast %"class.embeddings::TileSpmemPointer"* %5 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 12, i8* %8) #7
+ %9 = load <8 x i32> addrspace(201)*, <8 x i32> addrspace(201)** %4, align 4, !tbaa !7
+ %10 = bitcast <8 x i32> addrspace(201)* %9 to i8 addrspace(201)*
+ call void @_ZN10embeddings11ToBasicTypeIDv8_iE10basic_typeEv(%"class.embeddings::BasicType"* sret(%"class.embeddings::BasicType") align 4 %6) #19
+ call void @_ZN10embeddings16TileSpmemPointerC2EPU5AS201vNS_9BasicTypeE(%"class.embeddings::TileSpmemPointer"* noundef nonnull align 4 dereferenceable(12) %5, i8 addrspace(201)* noundef %10, %"class.embeddings::BasicType"* noundef %6) #19
+ %11 = bitcast %"class.embeddings::TileSpmemPointer"* %5 to %"class.embeddings::PointerBase"*
+ call void @_ZN10embeddings11PointerBaseC2EOS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %0, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %11) #19
+ %12 = bitcast %"class.embeddings::TileSpmemPointer"* %5 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 12, i8* %12) #7
+ ret void
+}
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local void @_ZN10embeddings14PointerFactoryIPU5AS201Dv8_fE6CreateES3_(%"class.embeddings::PointerBase"* noalias sret(%"class.embeddings::PointerBase") align 4 %0, <8 x float> addrspace(201)* noundef %1) #4 comdat align 2 {
+ %3 = alloca i8*, align 4
+ %4 = alloca <8 x float> addrspace(201)*, align 4
+ %5 = alloca %"class.embeddings::TileSpmemPointer", align 4
+ %6 = alloca %"class.embeddings::BasicType", align 4
+ %7 = bitcast %"class.embeddings::PointerBase"* %0 to i8*
+ store i8* %7, i8** %3, align 4
+ store <8 x float> addrspace(201)* %1, <8 x float> addrspace(201)** %4, align 4, !tbaa !7
+ %8 = bitcast %"class.embeddings::TileSpmemPointer"* %5 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 12, i8* %8) #7
+ %9 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** %4, align 4, !tbaa !7
+ %10 = bitcast <8 x float> addrspace(201)* %9 to i8 addrspace(201)*
+ call void @_ZN10embeddings11ToBasicTypeIDv8_fE10basic_typeEv(%"class.embeddings::BasicType"* sret(%"class.embeddings::BasicType") align 4 %6) #19
+ call void @_ZN10embeddings16TileSpmemPointerC2EPU5AS201vNS_9BasicTypeE(%"class.embeddings::TileSpmemPointer"* noundef nonnull align 4 dereferenceable(12) %5, i8 addrspace(201)* noundef %10, %"class.embeddings::BasicType"* noundef %6) #19
+ %11 = bitcast %"class.embeddings::TileSpmemPointer"* %5 to %"class.embeddings::PointerBase"*
+ call void @_ZN10embeddings11PointerBaseC2EOS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %0, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %11) #19
+ %12 = bitcast %"class.embeddings::TileSpmemPointer"* %5 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 12, i8* %12) #7
+ ret void
+}
+
+; Function Attrs: inlinehint
+define linkonce_odr dso_local void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %0, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %1) unnamed_addr #5 comdat align 2 {
+ %3 = alloca %"class.embeddings::PointerBase"*, align 4
+ %4 = alloca %"class.embeddings::PointerBase"*, align 4
+ store %"class.embeddings::PointerBase"* %0, %"class.embeddings::PointerBase"** %3, align 4, !tbaa !7
+ store %"class.embeddings::PointerBase"* %1, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !7
+ %5 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %3, align 4
+ %6 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %5, i32 0, i32 0
+ %7 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !7
+ %8 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %7, i32 0, i32 0
+ call void @_ZN10embeddings11MemorySpaceC2ERKS0_(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %6, %"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %8) #19
+ %9 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %5, i32 0, i32 1
+ %10 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !7
+ %11 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %10, i32 0, i32 1
+ call void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %9, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %11) #19
+ %12 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %5, i32 0, i32 2
+ %13 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !7
+ %14 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %13, i32 0, i32 2
+ %15 = bitcast %"union.embeddings::PointerBase::AnyPtr"* %12 to i8*
+ %16 = bitcast %"union.embeddings::PointerBase::AnyPtr"* %14 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %15, i8* align 4 %16, i32 4, i1 false), !tbaa.struct !9
+ ret void
+}
+
+; Function Attrs: alwaysinline
+define linkonce_odr dso_local void @_ZN10embeddings20TileSpmemVectorArrayIiEC2ENS_11PointerBaseEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::PointerBase"* noundef %1, i32 noundef %2) unnamed_addr #6 comdat align 2 {
+ %4 = alloca %"class.embeddings::TileSpmemVectorArray"*, align 4
+ %5 = alloca i32, align 4
+ %6 = alloca %"class.embeddings::PointerBase", align 4
+ store %"class.embeddings::TileSpmemVectorArray"* %0, %"class.embeddings::TileSpmemVectorArray"** %4, align 4, !tbaa !7
+ store i32 %2, i32* %5, align 4, !tbaa !3
+ %7 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %4, align 4
+ %8 = bitcast %"class.embeddings::TileSpmemVectorArray"* %7 to %"class.embeddings::ScratchpadArray"*
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %6, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %1) #19
+ %9 = load i32, i32* %5, align 4, !tbaa !3
+ call void @_ZN10embeddings15ScratchpadArrayC2ENS_11PointerBaseEi(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %8, %"class.embeddings::PointerBase"* noundef %6, i32 noundef %9) #19
+ ret void
+}
+
+; Function Attrs: alwaysinline
+define linkonce_odr dso_local void @_ZN10embeddings20TileSpmemVectorArrayIfEC2ENS_11PointerBaseEi(%"class.embeddings::TileSpmemVectorArray.0"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::PointerBase"* noundef %1, i32 noundef %2) unnamed_addr #6 comdat align 2 {
+ %4 = alloca %"class.embeddings::TileSpmemVectorArray.0"*, align 4
+ %5 = alloca i32, align 4
+ %6 = alloca %"class.embeddings::PointerBase", align 4
+ store %"class.embeddings::TileSpmemVectorArray.0"* %0, %"class.embeddings::TileSpmemVectorArray.0"** %4, align 4, !tbaa !7
+ store i32 %2, i32* %5, align 4, !tbaa !3
+ %7 = load %"class.embeddings::TileSpmemVectorArray.0"*, %"class.embeddings::TileSpmemVectorArray.0"** %4, align 4
+ %8 = bitcast %"class.embeddings::TileSpmemVectorArray.0"* %7 to %"class.embeddings::ScratchpadArray"*
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %6, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %1) #19
+ %9 = load i32, i32* %5, align 4, !tbaa !3
+ call void @_ZN10embeddings15ScratchpadArrayC2ENS_11PointerBaseEi(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %8, %"class.embeddings::PointerBase"* noundef %6, i32 noundef %9) #19
+ ret void
+}
+
+; Function Attrs: nounwind
+declare i32 addrspace(201)* @llvm.tpu.alloca.tilespmem(i32) #7
+
+; Function Attrs: alwaysinline nounwind
+define linkonce_odr dso_local void @_ZN10embeddings15SegmentedReduceC2Ev(%"class.embeddings::SegmentedReduce"* noundef nonnull align 1 dereferenceable(1) %0) unnamed_addr #8 comdat align 2 {
+ %2 = alloca %"class.embeddings::SegmentedReduce"*, align 4
+ store %"class.embeddings::SegmentedReduce"* %0, %"class.embeddings::SegmentedReduce"** %2, align 4, !tbaa !7
+ %3 = load %"class.embeddings::SegmentedReduce"*, %"class.embeddings::SegmentedReduce"** %2, align 4
+ ret void
+}
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local void @_ZN10embeddings15SegmentedReduce7ComputeEiNS_20TileSpmemVectorArrayIiEES2_NS1_IfEES3_PS3_(%"class.embeddings::SegmentedReduce"* noundef nonnull align 1 dereferenceable(1) %0, i32 noundef %1, %"class.embeddings::TileSpmemVectorArray"* noundef %2, %"class.embeddings::TileSpmemVectorArray"* noundef %3, %"class.embeddings::TileSpmemVectorArray.0"* noundef %4, %"class.embeddings::TileSpmemVectorArray.0"* noundef %5, %"class.embeddings::TileSpmemVectorArray.0"* noundef %6) #9 comdat align 2 {
+ %8 = alloca %"class.embeddings::SegmentedReduce"*, align 4
+ %9 = alloca i32, align 4
+ %10 = alloca %"class.embeddings::TileSpmemVectorArray.0"*, align 4
+ %11 = alloca <8 x i32>, align 32
+ %12 = alloca <8 x i32>, align 32
+ %13 = alloca <8 x float>, align 32
+ %14 = alloca %"struct.embeddings::SortResult", align 32
+ %15 = alloca <8 x i32>, align 32
+ %16 = alloca <8 x i32>, align 32
+ %17 = alloca %"struct.embeddings::SortResult.1", align 32
+ %18 = alloca <8 x float>, align 32
+ %19 = alloca i32, align 4
+ %20 = alloca i32, align 4
+ %21 = alloca i32, align 4
+ %22 = alloca <8 x i32>, align 32
+ %23 = alloca <8 x i32>, align 32
+ %24 = alloca <8 x float>, align 32
+ %25 = alloca %"struct.embeddings::SortResult", align 32
+ %26 = alloca <8 x i32>, align 32
+ %27 = alloca <8 x i32>, align 32
+ %28 = alloca %"struct.embeddings::SortResult.1", align 32
+ %29 = alloca <8 x float>, align 32
+ %30 = alloca i32, align 4
+ %31 = alloca <8 x i32>, align 32
+ %32 = alloca <8 x float>, align 32
+ %33 = alloca <8 x float>, align 32
+ %34 = alloca <8 x i32>, align 32
+ %35 = alloca <8 x i32>, align 32
+ store %"class.embeddings::SegmentedReduce"* %0, %"class.embeddings::SegmentedReduce"** %8, align 4, !tbaa !7
+ store i32 %1, i32* %9, align 4, !tbaa !3
+ store %"class.embeddings::TileSpmemVectorArray.0"* %6, %"class.embeddings::TileSpmemVectorArray.0"** %10, align 4, !tbaa !7
+ %36 = load %"class.embeddings::SegmentedReduce"*, %"class.embeddings::SegmentedReduce"** %8, align 4
+ %37 = bitcast <8 x i32>* %11 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %37) #7
+ %38 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %2, i32 noundef 0) #19
+ %39 = load <8 x i32>, <8 x i32> addrspace(201)* %38, align 32, !tbaa !10
+ store <8 x i32> %39, <8 x i32>* %11, align 32, !tbaa !10
+ %40 = bitcast <8 x i32>* %12 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %40) #7
+ %41 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %3, i32 noundef 0) #19
+ %42 = load <8 x i32>, <8 x i32> addrspace(201)* %41, align 32, !tbaa !10
+ store <8 x i32> %42, <8 x i32>* %12, align 32, !tbaa !10
+ %43 = bitcast <8 x float>* %13 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %43) #7
+ %44 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray.0"* noundef nonnull align 4 dereferenceable(16) %4, i32 noundef 0) #19
+ %45 = load <8 x float>, <8 x float> addrspace(201)* %44, align 32, !tbaa !10
+ store <8 x float> %45, <8 x float>* %13, align 32, !tbaa !10
+ %46 = bitcast %"struct.embeddings::SortResult"* %14 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 96, i8* %46) #7
+ %47 = load <8 x i32>, <8 x i32>* %11, align 32, !tbaa !10
+ %48 = load <8 x i32>, <8 x i32>* %12, align 32, !tbaa !10
+ call void @_ZN10embeddings10VectorSortIDv8_iS1_EENS_10SortResultIT_T0_EES1_S3_S4_NS_12SortOrderingE(%"struct.embeddings::SortResult"* sret(%"struct.embeddings::SortResult") align 32 %14, <8 x i32> noundef <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> noundef %47, <8 x i32> noundef %48, i32 noundef 0) #19
+ %49 = bitcast <8 x i32>* %15 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %49) #7
+ %50 = getelementptr inbounds %"struct.embeddings::SortResult", %"struct.embeddings::SortResult"* %14, i32 0, i32 1
+ %51 = load <8 x i32>, <8 x i32>* %50, align 32, !tbaa !10
+ store <8 x i32> %51, <8 x i32>* %15, align 32, !tbaa !10
+ %52 = bitcast <8 x i32>* %16 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %52) #7
+ %53 = getelementptr inbounds %"struct.embeddings::SortResult", %"struct.embeddings::SortResult"* %14, i32 0, i32 2
+ %54 = load <8 x i32>, <8 x i32>* %53, align 32, !tbaa !10
+ store <8 x i32> %54, <8 x i32>* %16, align 32, !tbaa !10
+ %55 = bitcast %"struct.embeddings::SortResult.1"* %17 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 96, i8* %55) #7
+ %56 = load <8 x i32>, <8 x i32>* %11, align 32, !tbaa !10
+ %57 = load <8 x float>, <8 x float>* %13, align 32, !tbaa !10
+ call void @_ZN10embeddings10VectorSortIDv8_iDv8_fEENS_10SortResultIT_T0_EES1_S4_S5_NS_12SortOrderingE(%"struct.embeddings::SortResult.1"* sret(%"struct.embeddings::SortResult.1") align 32 %17, <8 x i32> noundef <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> noundef %56, <8 x float> noundef %57, i32 noundef 0) #19
+ %58 = bitcast <8 x float>* %18 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %58) #7
+ %59 = getelementptr inbounds %"struct.embeddings::SortResult.1", %"struct.embeddings::SortResult.1"* %17, i32 0, i32 2
+ %60 = load <8 x float>, <8 x float>* %59, align 32, !tbaa !10
+ store <8 x float> %60, <8 x float>* %18, align 32, !tbaa !10
+ %61 = bitcast i32* %19 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %61) #7
+ store i32 0, i32* %19, align 4, !tbaa !3
+ br label %62
+
+62: ; preds = %180, %7
+ %63 = load i32, i32* %19, align 4, !tbaa !3, !llvm.access.group !11
+ %64 = bitcast %"class.embeddings::TileSpmemVectorArray"* %2 to %"class.embeddings::BaseArray"*
+ %65 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %64) #19, !llvm.access.group !11
+ %66 = icmp slt i32 %63, %65
+ br i1 %66, label %69, label %67
+
+67: ; preds = %62
+ store i32 2, i32* %20, align 4
+ %68 = bitcast i32* %19 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %68) #7, !llvm.access.group !11
+ br label %183
+
+69: ; preds = %62
+ %70 = bitcast i32* %21 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %70) #7, !llvm.access.group !11
+ %71 = load i32, i32* %19, align 4, !tbaa !3, !llvm.access.group !11
+ %72 = bitcast %"class.embeddings::TileSpmemVectorArray"* %2 to %"class.embeddings::BaseArray"*
+ %73 = call noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %72) #19, !llvm.access.group !11
+ %74 = sub nsw i32 %73, 1
+ %75 = icmp eq i32 %71, %74
+ br i1 %75, label %76, label %78
+
+76: ; preds = %69
+ %77 = load i32, i32* %19, align 4, !tbaa !3, !llvm.access.group !11
+ br label %81
+
+78: ; preds = %69
+ %79 = load i32, i32* %19, align 4, !tbaa !3, !llvm.access.group !11
+ %80 = add nsw i32 %79, 1
+ br label %81
+
+81: ; preds = %78, %76
+ %82 = phi i32 [ %77, %76 ], [ %80, %78 ]
+ store i32 %82, i32* %21, align 4, !tbaa !3, !llvm.access.group !11
+ %83 = bitcast <8 x i32>* %22 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %83) #7, !llvm.access.group !11
+ %84 = load i32, i32* %21, align 4, !tbaa !3, !llvm.access.group !11
+ %85 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %2, i32 noundef %84) #19, !llvm.access.group !11
+ %86 = load <8 x i32>, <8 x i32> addrspace(201)* %85, align 32, !tbaa !10, !llvm.access.group !11
+ store <8 x i32> %86, <8 x i32>* %22, align 32, !tbaa !10, !llvm.access.group !11
+ %87 = bitcast <8 x i32>* %23 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %87) #7, !llvm.access.group !11
+ %88 = load i32, i32* %21, align 4, !tbaa !3, !llvm.access.group !11
+ %89 = call noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %3, i32 noundef %88) #19, !llvm.access.group !11
+ %90 = load <8 x i32>, <8 x i32> addrspace(201)* %89, align 32, !tbaa !10, !llvm.access.group !11
+ store <8 x i32> %90, <8 x i32>* %23, align 32, !tbaa !10, !llvm.access.group !11
+ %91 = bitcast <8 x float>* %24 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %91) #7, !llvm.access.group !11
+ %92 = load i32, i32* %21, align 4, !tbaa !3, !llvm.access.group !11
+ %93 = call noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray.0"* noundef nonnull align 4 dereferenceable(16) %4, i32 noundef %92) #19, !llvm.access.group !11
+ %94 = load <8 x float>, <8 x float> addrspace(201)* %93, align 32, !tbaa !10, !llvm.access.group !11
+ store <8 x float> %94, <8 x float>* %24, align 32, !tbaa !10, !llvm.access.group !11
+ %95 = bitcast %"struct.embeddings::SortResult"* %25 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 96, i8* %95) #7, !llvm.access.group !11
+ %96 = load <8 x i32>, <8 x i32>* %22, align 32, !tbaa !10, !llvm.access.group !11
+ %97 = load <8 x i32>, <8 x i32>* %23, align 32, !tbaa !10, !llvm.access.group !11
+ call void @_ZN10embeddings10VectorSortIDv8_iS1_EENS_10SortResultIT_T0_EES1_S3_S4_NS_12SortOrderingE(%"struct.embeddings::SortResult"* sret(%"struct.embeddings::SortResult") align 32 %25, <8 x i32> noundef <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> noundef %96, <8 x i32> noundef %97, i32 noundef 0) #19, !llvm.access.group !11
+ %98 = bitcast <8 x i32>* %26 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %98) #7, !llvm.access.group !11
+ %99 = getelementptr inbounds %"struct.embeddings::SortResult", %"struct.embeddings::SortResult"* %25, i32 0, i32 1
+ %100 = load <8 x i32>, <8 x i32>* %99, align 32, !tbaa !10, !llvm.access.group !11
+ store <8 x i32> %100, <8 x i32>* %26, align 32, !tbaa !10, !llvm.access.group !11
+ %101 = bitcast <8 x i32>* %27 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %101) #7, !llvm.access.group !11
+ %102 = getelementptr inbounds %"struct.embeddings::SortResult", %"struct.embeddings::SortResult"* %25, i32 0, i32 2
+ %103 = load <8 x i32>, <8 x i32>* %102, align 32, !tbaa !10, !llvm.access.group !11
+ store <8 x i32> %103, <8 x i32>* %27, align 32, !tbaa !10, !llvm.access.group !11
+ %104 = bitcast %"struct.embeddings::SortResult.1"* %28 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 96, i8* %104) #7, !llvm.access.group !11
+ %105 = load <8 x i32>, <8 x i32>* %22, align 32, !tbaa !10, !llvm.access.group !11
+ %106 = load <8 x float>, <8 x float>* %24, align 32, !tbaa !10, !llvm.access.group !11
+ call void @_ZN10embeddings10VectorSortIDv8_iDv8_fEENS_10SortResultIT_T0_EES1_S4_S5_NS_12SortOrderingE(%"struct.embeddings::SortResult.1"* sret(%"struct.embeddings::SortResult.1") align 32 %28, <8 x i32> noundef <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> noundef %105, <8 x float> noundef %106, i32 noundef 0) #19, !llvm.access.group !11
+ %107 = bitcast <8 x float>* %29 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %107) #7, !llvm.access.group !11
+ %108 = getelementptr inbounds %"struct.embeddings::SortResult.1", %"struct.embeddings::SortResult.1"* %28, i32 0, i32 2
+ %109 = load <8 x float>, <8 x float>* %108, align 32, !tbaa !10, !llvm.access.group !11
+ store <8 x float> %109, <8 x float>* %29, align 32, !tbaa !10, !llvm.access.group !11
+ %110 = bitcast i32* %30 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %110) #7, !llvm.access.group !11
+ store i32 0, i32* %30, align 4, !tbaa !3, !llvm.access.group !11
+ br label %111
+
+111: ; preds = %164, %81
+ %112 = load i32, i32* %30, align 4, !tbaa !3, !llvm.access.group !11
+ %113 = load i32, i32* %9, align 4, !tbaa !3, !llvm.access.group !11
+ %114 = icmp slt i32 %112, %113
+ br i1 %114, label %117, label %115
+
+115: ; preds = %111
+ store i32 5, i32* %20, align 4
+ %116 = bitcast i32* %30 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %116) #7, !llvm.access.group !11
+ br label %167
+
+117: ; preds = %111
+ %118 = bitcast <8 x i32>* %31 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %118) #7, !llvm.access.group !11
+ %119 = load <8 x i32>, <8 x i32>* %16, align 32, !tbaa !10, !llvm.access.group !11
+ %120 = load i32, i32* %9, align 4, !tbaa !3, !llvm.access.group !11
+ %121 = insertelement <8 x i32> poison, i32 %120, i32 0
+ %122 = shufflevector <8 x i32> %121, <8 x i32> poison, <8 x i32> zeroinitializer
+ %123 = mul <8 x i32> %119, %122
+ %124 = load i32, i32* %30, align 4, !tbaa !3, !llvm.access.group !11
+ %125 = insertelement <8 x i32> poison, i32 %124, i32 0
+ %126 = shufflevector <8 x i32> %125, <8 x i32> poison, <8 x i32> zeroinitializer
+ %127 = add <8 x i32> %123, %126
+ store <8 x i32> %127, <8 x i32>* %31, align 32, !tbaa !10, !llvm.access.group !11
+ %128 = bitcast <8 x float>* %32 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %128) #7, !llvm.access.group !11
+ %129 = call noundef <8 x float> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIfE4BaseEv(%"class.embeddings::TileSpmemVectorArray.0"* noundef nonnull align 4 dereferenceable(16) %5) #19, !llvm.access.group !11
+ %130 = load <8 x i32>, <8 x i32>* %31, align 32, !tbaa !10, !llvm.access.group !11
+ %131 = call <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %129, <8 x i32> %130), !llvm.access.group !11
+ store <8 x float> %131, <8 x float>* %32, align 32, !tbaa !10, !llvm.access.group !11
+ %132 = bitcast <8 x float>* %33 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %132) #7, !llvm.access.group !11
+ %133 = load <8 x float>, <8 x float>* %32, align 32, !tbaa !10, !llvm.access.group !11
+ %134 = load <8 x float>, <8 x float>* %18, align 32, !tbaa !10, !llvm.access.group !11
+ %135 = fmul <8 x float> %133, %134
+ store <8 x float> %135, <8 x float>* %33, align 32, !tbaa !10, !llvm.access.group !11
+ %136 = bitcast <8 x i32>* %34 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %136) #7, !llvm.access.group !11
+ %137 = load <8 x i32>, <8 x i32>* %15, align 32, !tbaa !10, !llvm.access.group !11
+ %138 = load <8 x float>, <8 x float>* %33, align 32, !llvm.access.group !11
+ %139 = call { <8 x float>, <8 x i1> } @llvm.tpu.deprecated.segreduce.addf(<8 x i32> %137, <8 x float> %138)
+ %140 = extractvalue { <8 x float>, <8 x i1> } %139, 0
+ store <8 x float> %140, <8 x float>* %33, align 32, !llvm.access.group !11
+ %141 = extractvalue { <8 x float>, <8 x i1> } %139, 1
+ %142 = zext <8 x i1> %141 to <8 x i32>
+ store <8 x i32> %142, <8 x i32>* %34, align 32, !llvm.access.group !11
+ %143 = bitcast <8 x i32>* %35 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 32, i8* %143) #7, !llvm.access.group !11
+ %144 = load <8 x i32>, <8 x i32>* %15, align 32, !tbaa !10, !llvm.access.group !11
+ %145 = load i32, i32* %9, align 4, !tbaa !3, !llvm.access.group !11
+ %146 = insertelement <8 x i32> poison, i32 %145, i32 0
+ %147 = shufflevector <8 x i32> %146, <8 x i32> poison, <8 x i32> zeroinitializer
+ %148 = mul <8 x i32> %144, %147
+ %149 = load i32, i32* %30, align 4, !tbaa !3, !llvm.access.group !11
+ %150 = insertelement <8 x i32> poison, i32 %149, i32 0
+ %151 = shufflevector <8 x i32> %150, <8 x i32> poison, <8 x i32> zeroinitializer
+ %152 = add <8 x i32> %148, %151
+ store <8 x i32> %152, <8 x i32>* %35, align 32, !tbaa !10, !llvm.access.group !11
+ %153 = load <8 x i32>, <8 x i32>* %34, align 32, !tbaa !10, !llvm.access.group !11
+ %154 = trunc <8 x i32> %153 to <8 x i1>
+ %155 = load %"class.embeddings::TileSpmemVectorArray.0"*, %"class.embeddings::TileSpmemVectorArray.0"** %10, align 4, !tbaa !7, !llvm.access.group !11
+ %156 = call noundef <8 x float> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIfE4BaseEv(%"class.embeddings::TileSpmemVectorArray.0"* noundef nonnull align 4 dereferenceable(16) %155) #19, !llvm.access.group !11
+ %157 = load <8 x i32>, <8 x i32>* %35, align 32, !tbaa !10, !llvm.access.group !11
+ %158 = load <8 x float>, <8 x float>* %33, align 32, !tbaa !10, !llvm.access.group !11
+ call void @llvm.tpu.vst.msk.idx.add.p201v8f32.v8f32(<8 x i1> %154, <8 x float> addrspace(201)* %156, <8 x i32> %157, <8 x float> %158), !llvm.access.group !11
+ %159 = bitcast <8 x i32>* %35 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %159) #7, !llvm.access.group !11
+ %160 = bitcast <8 x i32>* %34 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %160) #7, !llvm.access.group !11
+ %161 = bitcast <8 x float>* %33 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %161) #7, !llvm.access.group !11
+ %162 = bitcast <8 x float>* %32 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %162) #7, !llvm.access.group !11
+ %163 = bitcast <8 x i32>* %31 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %163) #7, !llvm.access.group !11
+ br label %164
+
+164: ; preds = %117
+ %165 = load i32, i32* %30, align 4, !tbaa !3, !llvm.access.group !11
+ %166 = add nsw i32 %165, 1
+ store i32 %166, i32* %30, align 4, !tbaa !3, !llvm.access.group !11
+ br label %111, !llvm.loop !12
+
+167: ; preds = %115
+ %168 = load <8 x i32>, <8 x i32>* %26, align 32, !tbaa !10, !llvm.access.group !11
+ store <8 x i32> %168, <8 x i32>* %15, align 32, !tbaa !10, !llvm.access.group !11
+ %169 = load <8 x i32>, <8 x i32>* %27, align 32, !tbaa !10, !llvm.access.group !11
+ store <8 x i32> %169, <8 x i32>* %16, align 32, !tbaa !10, !llvm.access.group !11
+ %170 = load <8 x float>, <8 x float>* %29, align 32, !tbaa !10, !llvm.access.group !11
+ store <8 x float> %170, <8 x float>* %18, align 32, !tbaa !10, !llvm.access.group !11
+ %171 = bitcast <8 x float>* %29 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %171) #7, !llvm.access.group !11
+ %172 = bitcast %"struct.embeddings::SortResult.1"* %28 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 96, i8* %172) #7, !llvm.access.group !11
+ %173 = bitcast <8 x i32>* %27 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %173) #7, !llvm.access.group !11
+ %174 = bitcast <8 x i32>* %26 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %174) #7, !llvm.access.group !11
+ %175 = bitcast %"struct.embeddings::SortResult"* %25 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 96, i8* %175) #7, !llvm.access.group !11
+ %176 = bitcast <8 x float>* %24 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %176) #7, !llvm.access.group !11
+ %177 = bitcast <8 x i32>* %23 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %177) #7, !llvm.access.group !11
+ %178 = bitcast <8 x i32>* %22 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %178) #7, !llvm.access.group !11
+ %179 = bitcast i32* %21 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %179) #7, !llvm.access.group !11
+ br label %180
+
+180: ; preds = %167
+ %181 = load i32, i32* %19, align 4, !tbaa !3, !llvm.access.group !11
+ %182 = add nsw i32 %181, 1
+ store i32 %182, i32* %19, align 4, !tbaa !3, !llvm.access.group !11
+ br label %62, !llvm.loop !15
+
+183: ; preds = %67
+ %184 = bitcast <8 x float>* %18 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %184) #7
+ %185 = bitcast %"struct.embeddings::SortResult.1"* %17 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 96, i8* %185) #7
+ %186 = bitcast <8 x i32>* %16 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %186) #7
+ %187 = bitcast <8 x i32>* %15 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %187) #7
+ %188 = bitcast %"struct.embeddings::SortResult"* %14 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 96, i8* %188) #7
+ %189 = bitcast <8 x float>* %13 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %189) #7
+ %190 = bitcast <8 x i32>* %12 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %190) #7
+ %191 = bitcast <8 x i32>* %11 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 32, i8* %191) #7
+ ret void
+}
+
+; Function Attrs: inlinehint
+define linkonce_odr dso_local void @_ZN10embeddings20TileSpmemVectorArrayIiEC2ERKS1_(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %1) unnamed_addr #5 comdat align 2 {
+ %3 = alloca %"class.embeddings::TileSpmemVectorArray"*, align 4
+ %4 = alloca %"class.embeddings::TileSpmemVectorArray"*, align 4
+ store %"class.embeddings::TileSpmemVectorArray"* %0, %"class.embeddings::TileSpmemVectorArray"** %3, align 4, !tbaa !7
+ store %"class.embeddings::TileSpmemVectorArray"* %1, %"class.embeddings::TileSpmemVectorArray"** %4, align 4, !tbaa !7
+ %5 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %3, align 4
+ %6 = bitcast %"class.embeddings::TileSpmemVectorArray"* %5 to %"class.embeddings::ScratchpadArray"*
+ %7 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %4, align 4, !tbaa !7
+ %8 = bitcast %"class.embeddings::TileSpmemVectorArray"* %7 to %"class.embeddings::ScratchpadArray"*
+ call void @_ZN10embeddings15ScratchpadArrayC2ERKS0_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %6, %"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %8) #19
+ ret void
+}
+
+; Function Attrs: inlinehint
+define linkonce_odr dso_local void @_ZN10embeddings20TileSpmemVectorArrayIfEC2ERKS1_(%"class.embeddings::TileSpmemVectorArray.0"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::TileSpmemVectorArray.0"* noundef nonnull align 4 dereferenceable(16) %1) unnamed_addr #5 comdat align 2 {
+ %3 = alloca %"class.embeddings::TileSpmemVectorArray.0"*, align 4
+ %4 = alloca %"class.embeddings::TileSpmemVectorArray.0"*, align 4
+ store %"class.embeddings::TileSpmemVectorArray.0"* %0, %"class.embeddings::TileSpmemVectorArray.0"** %3, align 4, !tbaa !7
+ store %"class.embeddings::TileSpmemVectorArray.0"* %1, %"class.embeddings::TileSpmemVectorArray.0"** %4, align 4, !tbaa !7
+ %5 = load %"class.embeddings::TileSpmemVectorArray.0"*, %"class.embeddings::TileSpmemVectorArray.0"** %3, align 4
+ %6 = bitcast %"class.embeddings::TileSpmemVectorArray.0"* %5 to %"class.embeddings::ScratchpadArray"*
+ %7 = load %"class.embeddings::TileSpmemVectorArray.0"*, %"class.embeddings::TileSpmemVectorArray.0"** %4, align 4, !tbaa !7
+ %8 = bitcast %"class.embeddings::TileSpmemVectorArray.0"* %7 to %"class.embeddings::ScratchpadArray"*
+ call void @_ZN10embeddings15ScratchpadArrayC2ERKS0_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %6, %"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %8) #19
+ ret void
+}
+
+; Function Attrs: mustprogress nounwind
+define internal void @_ZN12_GLOBAL__N_16ReturnEii(i32 noundef %0, i32 noundef %1) #3 {
+ %3 = alloca i32, align 4
+ %4 = alloca i32, align 4
+ %5 = alloca i32*, align 4
+ store i32 %0, i32* %3, align 4, !tbaa !3
+ store i32 %1, i32* %4, align 4, !tbaa !3
+ %6 = bitcast i32** %5 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #7
+ %7 = load i32, i32* %4, align 4, !tbaa !3
+ %8 = add nsw i32 256, %7
+ %9 = inttoptr i32 %8 to i32*
+ store i32* %9, i32** %5, align 4, !tbaa !7
+ %10 = load i32, i32* %3, align 4, !tbaa !3
+ %11 = load i32*, i32** %5, align 4, !tbaa !7
+ store i32 %10, i32* %11, align 4, !tbaa !3
+ %12 = bitcast i32** %5 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %12) #7
+ ret void
+}
+
+; Function Attrs: argmemonly nocallback nofree nosync nounwind willreturn
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #2
+
+; Function Attrs: mustprogress nounwind
+define dso_local void @scs() #10 section ".text.scs" {
+ ret void
+}
+
+; Function Attrs: alwaysinline nounwind
+define linkonce_odr dso_local void @_ZN10embeddings11MemorySpaceC2ERKS0_(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %0, %"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %1) unnamed_addr #8 comdat align 2 {
+ %3 = alloca %"class.embeddings::MemorySpace"*, align 4
+ %4 = alloca %"class.embeddings::MemorySpace"*, align 4
+ store %"class.embeddings::MemorySpace"* %0, %"class.embeddings::MemorySpace"** %3, align 4, !tbaa !7
+ store %"class.embeddings::MemorySpace"* %1, %"class.embeddings::MemorySpace"** %4, align 4, !tbaa !7
+ %5 = load %"class.embeddings::MemorySpace"*, %"class.embeddings::MemorySpace"** %3, align 4
+ %6 = load %"class.embeddings::MemorySpace"*, %"class.embeddings::MemorySpace"** %4, align 4, !tbaa !7
+ %7 = getelementptr inbounds %"class.embeddings::MemorySpace", %"class.embeddings::MemorySpace"* %6, i32 0, i32 0
+ %8 = load i32, i32* %7, align 4, !tbaa !21
+ %9 = getelementptr inbounds %"class.embeddings::MemorySpace", %"class.embeddings::MemorySpace"* %5, i32 0, i32 0
+ store i32 %8, i32* %9, align 4, !tbaa !21
+ ret void
+}
+
+; Function Attrs: alwaysinline nounwind
+define linkonce_odr dso_local void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %0, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %1) unnamed_addr #8 comdat align 2 {
+ %3 = alloca %"class.embeddings::BasicType"*, align 4
+ %4 = alloca %"class.embeddings::BasicType"*, align 4
+ store %"class.embeddings::BasicType"* %0, %"class.embeddings::BasicType"** %3, align 4, !tbaa !7
+ store %"class.embeddings::BasicType"* %1, %"class.embeddings::BasicType"** %4, align 4, !tbaa !7
+ %5 = load %"class.embeddings::BasicType"*, %"class.embeddings::BasicType"** %3, align 4
+ %6 = load %"class.embeddings::BasicType"*, %"class.embeddings::BasicType"** %4, align 4, !tbaa !7
+ %7 = getelementptr inbounds %"class.embeddings::BasicType", %"class.embeddings::BasicType"* %6, i32 0, i32 0
+ %8 = load i32, i32* %7, align 4, !tbaa !24
+ %9 = getelementptr inbounds %"class.embeddings::BasicType", %"class.embeddings::BasicType"* %5, i32 0, i32 0
+ store i32 %8, i32* %9, align 4, !tbaa !24
+ ret void
+}
+
+; Function Attrs: argmemonly nofree nounwind willreturn
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg) #11
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local noundef align 32 dereferenceable(32) <8 x i32> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIiEclEi(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %0, i32 noundef %1) #4 comdat align 2 {
+ %3 = alloca %"class.embeddings::TileSpmemVectorArray"*, align 4
+ %4 = alloca i32, align 4
+ store %"class.embeddings::TileSpmemVectorArray"* %0, %"class.embeddings::TileSpmemVectorArray"** %3, align 4, !tbaa !7
+ store i32 %1, i32* %4, align 4, !tbaa !3
+ %5 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %3, align 4
+ %6 = call noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %5) #19
+ %7 = load i32, i32* %4, align 4, !tbaa !3
+ %8 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %6, i32 %7
+ ret <8 x i32> addrspace(201)* %8
+}
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local noundef align 32 dereferenceable(32) <8 x float> addrspace(201)* @_ZN10embeddings20TileSpmemVectorArrayIfEclEi(%"class.embeddings::TileSpmemVectorArray.0"* noundef nonnull align 4 dereferenceable(16) %0, i32 noundef %1) #4 comdat align 2 {
+ %3 = alloca %"class.embeddings::TileSpmemVectorArray.0"*, align 4
+ %4 = alloca i32, align 4
+ store %"class.embeddings::TileSpmemVectorArray.0"* %0, %"class.embeddings::TileSpmemVectorArray.0"** %3, align 4, !tbaa !7
+ store i32 %1, i32* %4, align 4, !tbaa !3
+ %5 = load %"class.embeddings::TileSpmemVectorArray.0"*, %"class.embeddings::TileSpmemVectorArray.0"** %3, align 4
+ %6 = call noundef <8 x float> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIfE4BaseEv(%"class.embeddings::TileSpmemVectorArray.0"* noundef nonnull align 4 dereferenceable(16) %5) #19
+ %7 = load i32, i32* %4, align 4, !tbaa !3
+ %8 = getelementptr inbounds <8 x float>, <8 x float> addrspace(201)* %6, i32 %7
+ ret <8 x float> addrspace(201)* %8
+}
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local void @_ZN10embeddings10VectorSortIDv8_iS1_EENS_10SortResultIT_T0_EES1_S3_S4_NS_12SortOrderingE(%"struct.embeddings::SortResult"* noalias sret(%"struct.embeddings::SortResult") align 32 %0, <8 x i32> noundef %1, <8 x i32> noundef %2, <8 x i32> noundef %3, i32 noundef %4) #9 comdat {
+ %6 = alloca <8 x i32>, align 32
+ %7 = alloca <8 x i32>, align 32
+ %8 = alloca <8 x i32>, align 32
+ %9 = alloca i32, align 4
+ store <8 x i32> %1, <8 x i32>* %6, align 32, !tbaa !10
+ store <8 x i32> %2, <8 x i32>* %7, align 32, !tbaa !10
+ store <8 x i32> %3, <8 x i32>* %8, align 32, !tbaa !10
+ store i32 %4, i32* %9, align 4, !tbaa !27
+ %10 = load <8 x i32>, <8 x i32>* %6, align 32, !tbaa !10
+ %11 = getelementptr inbounds %"struct.embeddings::SortResult", %"struct.embeddings::SortResult"* %0, i32 0, i32 0
+ store <8 x i32> %10, <8 x i32>* %11, align 32, !tbaa !10
+ %12 = load <8 x i32>, <8 x i32>* %7, align 32, !tbaa !10
+ %13 = getelementptr inbounds %"struct.embeddings::SortResult", %"struct.embeddings::SortResult"* %0, i32 0, i32 1
+ store <8 x i32> %12, <8 x i32>* %13, align 32, !tbaa !10
+ %14 = load <8 x i32>, <8 x i32>* %8, align 32, !tbaa !10
+ %15 = getelementptr inbounds %"struct.embeddings::SortResult", %"struct.embeddings::SortResult"* %0, i32 0, i32 2
+ store <8 x i32> %14, <8 x i32>* %15, align 32, !tbaa !10
+ %16 = load i32, i32* %9, align 4, !tbaa !27
+ %17 = icmp eq i32 %16, 0
+ br i1 %17, label %18, label %22
+
+18: ; preds = %5
+ %19 = getelementptr inbounds %"struct.embeddings::SortResult", %"struct.embeddings::SortResult"* %0, i32 0, i32 0
+ %20 = getelementptr inbounds %"struct.embeddings::SortResult", %"struct.embeddings::SortResult"* %0, i32 0, i32 1
+ %21 = getelementptr inbounds %"struct.embeddings::SortResult", %"struct.embeddings::SortResult"* %0, i32 0, i32 2
+ call void @_ZN10embeddings3tpuIDv8_iE8sortascdEPS1_S3_S3_(<8 x i32>* noundef %19, <8 x i32>* noundef %20, <8 x i32>* noundef %21) #19
+ br label %26
+
+22: ; preds = %5
+ %23 = getelementptr inbounds %"struct.embeddings::SortResult", %"struct.embeddings::SortResult"* %0, i32 0, i32 0
+ %24 = getelementptr inbounds %"struct.embeddings::SortResult", %"struct.embeddings::SortResult"* %0, i32 0, i32 1
+ %25 = getelementptr inbounds %"struct.embeddings::SortResult", %"struct.embeddings::SortResult"* %0, i32 0, i32 2
+ call void @_ZN10embeddings3tpuIDv8_iE8sortdscdEPS1_S3_S3_(<8 x i32>* noundef %23, <8 x i32>* noundef %24, <8 x i32>* noundef %25) #19
+ br label %26
+
+26: ; preds = %22, %18
+ ret void
+}
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local void @_ZN10embeddings10VectorSortIDv8_iDv8_fEENS_10SortResultIT_T0_EES1_S4_S5_NS_12SortOrderingE(%"struct.embeddings::SortResult.1"* noalias sret(%"struct.embeddings::SortResult.1") align 32 %0, <8 x i32> noundef %1, <8 x i32> noundef %2, <8 x float> noundef %3, i32 noundef %4) #9 comdat {
+ %6 = alloca <8 x i32>, align 32
+ %7 = alloca <8 x i32>, align 32
+ %8 = alloca <8 x float>, align 32
+ %9 = alloca i32, align 4
+ store <8 x i32> %1, <8 x i32>* %6, align 32, !tbaa !10
+ store <8 x i32> %2, <8 x i32>* %7, align 32, !tbaa !10
+ store <8 x float> %3, <8 x float>* %8, align 32, !tbaa !10
+ store i32 %4, i32* %9, align 4, !tbaa !27
+ %10 = load <8 x i32>, <8 x i32>* %6, align 32, !tbaa !10
+ %11 = getelementptr inbounds %"struct.embeddings::SortResult.1", %"struct.embeddings::SortResult.1"* %0, i32 0, i32 0
+ store <8 x i32> %10, <8 x i32>* %11, align 32, !tbaa !10
+ %12 = load <8 x i32>, <8 x i32>* %7, align 32, !tbaa !10
+ %13 = getelementptr inbounds %"struct.embeddings::SortResult.1", %"struct.embeddings::SortResult.1"* %0, i32 0, i32 1
+ store <8 x i32> %12, <8 x i32>* %13, align 32, !tbaa !10
+ %14 = load <8 x float>, <8 x float>* %8, align 32, !tbaa !10
+ %15 = getelementptr inbounds %"struct.embeddings::SortResult.1", %"struct.embeddings::SortResult.1"* %0, i32 0, i32 2
+ store <8 x float> %14, <8 x float>* %15, align 32, !tbaa !10
+ %16 = load i32, i32* %9, align 4, !tbaa !27
+ %17 = icmp eq i32 %16, 0
+ br i1 %17, label %18, label %22
+
+18: ; preds = %5
+ %19 = getelementptr inbounds %"struct.embeddings::SortResult.1", %"struct.embeddings::SortResult.1"* %0, i32 0, i32 0
+ %20 = getelementptr inbounds %"struct.embeddings::SortResult.1", %"struct.embeddings::SortResult.1"* %0, i32 0, i32 1
+ %21 = getelementptr inbounds %"struct.embeddings::SortResult.1", %"struct.embeddings::SortResult.1"* %0, i32 0, i32 2
+ call void @_ZN10embeddings3tpuIDv8_iE8sortascdEPS1_S3_PDv8_f(<8 x i32>* noundef %19, <8 x i32>* noundef %20, <8 x float>* noundef %21) #19
+ br label %26
+
+22: ; preds = %5
+ %23 = getelementptr inbounds %"struct.embeddings::SortResult.1", %"struct.embeddings::SortResult.1"* %0, i32 0, i32 0
+ %24 = getelementptr inbounds %"struct.embeddings::SortResult.1", %"struct.embeddings::SortResult.1"* %0, i32 0, i32 1
+ %25 = getelementptr inbounds %"struct.embeddings::SortResult.1", %"struct.embeddings::SortResult.1"* %0, i32 0, i32 2
+ call void @_ZN10embeddings3tpuIDv8_iE8sortdscdEPS1_S3_PDv8_f(<8 x i32>* noundef %23, <8 x i32>* noundef %24, <8 x float>* noundef %25) #19
+ br label %26
+
+26: ; preds = %22, %18
+ ret void
+}
+
+; Function Attrs: alwaysinline mustprogress nounwind
+define linkonce_odr dso_local noundef i32 @_ZNK10embeddings9BaseArray8ElementsEv(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %0) #12 comdat align 2 {
+ %2 = alloca %"class.embeddings::BaseArray"*, align 4
+ store %"class.embeddings::BaseArray"* %0, %"class.embeddings::BaseArray"** %2, align 4, !tbaa !7
+ %3 = load %"class.embeddings::BaseArray"*, %"class.embeddings::BaseArray"** %2, align 4
+ %4 = getelementptr inbounds %"class.embeddings::BaseArray", %"class.embeddings::BaseArray"* %3, i32 0, i32 1
+ %5 = load i32, i32* %4, align 4, !tbaa !29
+ ret i32 %5
+}
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local noundef <8 x float> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIfE4BaseEv(%"class.embeddings::TileSpmemVectorArray.0"* noundef nonnull align 4 dereferenceable(16) %0) #4 comdat align 2 {
+ %2 = alloca %"class.embeddings::TileSpmemVectorArray.0"*, align 4
+ %3 = alloca %"class.embeddings::TileSpmemPointer"*, align 4
+ %4 = alloca %"class.embeddings::PointerBase", align 4
+ store %"class.embeddings::TileSpmemVectorArray.0"* %0, %"class.embeddings::TileSpmemVectorArray.0"** %2, align 4, !tbaa !7
+ %5 = load %"class.embeddings::TileSpmemVectorArray.0"*, %"class.embeddings::TileSpmemVectorArray.0"** %2, align 4
+ %6 = bitcast %"class.embeddings::TileSpmemPointer"** %3 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #7
+ %7 = bitcast %"class.embeddings::PointerBase"* %4 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 12, i8* %7) #7
+ %8 = bitcast %"class.embeddings::TileSpmemVectorArray.0"* %5 to %"class.embeddings::BaseArray"*
+ call void @_ZNK10embeddings9BaseArray7BasePtrEv(%"class.embeddings::PointerBase"* sret(%"class.embeddings::PointerBase") align 4 %4, %"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %8) #19
+ %9 = call noundef %"class.embeddings::TileSpmemPointer"* @_ZN10embeddings4CastINS_16TileSpmemPointerENS_11PointerBaseEEENS_15cast_retty_implIT_T0_E8ret_typeERKS5_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %4) #19
+ %10 = bitcast %"class.embeddings::PointerBase"* %4 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 12, i8* %10) #7
+ store %"class.embeddings::TileSpmemPointer"* %9, %"class.embeddings::TileSpmemPointer"** %3, align 4, !tbaa !7
+ %11 = load %"class.embeddings::TileSpmemPointer"*, %"class.embeddings::TileSpmemPointer"** %3, align 4, !tbaa !7
+ %12 = call noundef i8 addrspace(201)* @_ZNK10embeddings16TileSpmemPointer6RawPtrEv(%"class.embeddings::TileSpmemPointer"* noundef nonnull align 4 dereferenceable(12) %11) #19
+ %13 = bitcast i8 addrspace(201)* %12 to <8 x float> addrspace(201)*
+ %14 = bitcast %"class.embeddings::TileSpmemPointer"** %3 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %14) #7
+ ret <8 x float> addrspace(201)* %13
+}
+
+; Function Attrs: argmemonly nounwind readonly
+declare <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>) #13
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare { <8 x float>, <8 x i1> } @llvm.tpu.deprecated.segreduce.addf(<8 x i32>, <8 x float>) #14
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.tpu.vst.msk.idx.add.p201v8f32.v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>, <8 x float>) #15
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local noundef <8 x i32> addrspace(201)* @_ZNK10embeddings20TileSpmemVectorArrayIiE4BaseEv(%"class.embeddings::TileSpmemVectorArray"* noundef nonnull align 4 dereferenceable(16) %0) #4 comdat align 2 {
+ %2 = alloca %"class.embeddings::TileSpmemVectorArray"*, align 4
+ %3 = alloca %"class.embeddings::TileSpmemPointer"*, align 4
+ %4 = alloca %"class.embeddings::PointerBase", align 4
+ store %"class.embeddings::TileSpmemVectorArray"* %0, %"class.embeddings::TileSpmemVectorArray"** %2, align 4, !tbaa !7
+ %5 = load %"class.embeddings::TileSpmemVectorArray"*, %"class.embeddings::TileSpmemVectorArray"** %2, align 4
+ %6 = bitcast %"class.embeddings::TileSpmemPointer"** %3 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #7
+ %7 = bitcast %"class.embeddings::PointerBase"* %4 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 12, i8* %7) #7
+ %8 = bitcast %"class.embeddings::TileSpmemVectorArray"* %5 to %"class.embeddings::BaseArray"*
+ call void @_ZNK10embeddings9BaseArray7BasePtrEv(%"class.embeddings::PointerBase"* sret(%"class.embeddings::PointerBase") align 4 %4, %"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %8) #19
+ %9 = call noundef %"class.embeddings::TileSpmemPointer"* @_ZN10embeddings4CastINS_16TileSpmemPointerENS_11PointerBaseEEENS_15cast_retty_implIT_T0_E8ret_typeERKS5_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %4) #19
+ %10 = bitcast %"class.embeddings::PointerBase"* %4 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 12, i8* %10) #7
+ store %"class.embeddings::TileSpmemPointer"* %9, %"class.embeddings::TileSpmemPointer"** %3, align 4, !tbaa !7
+ %11 = load %"class.embeddings::TileSpmemPointer"*, %"class.embeddings::TileSpmemPointer"** %3, align 4, !tbaa !7
+ %12 = call noundef i8 addrspace(201)* @_ZNK10embeddings16TileSpmemPointer6RawPtrEv(%"class.embeddings::TileSpmemPointer"* noundef nonnull align 4 dereferenceable(12) %11) #19
+ %13 = bitcast i8 addrspace(201)* %12 to <8 x i32> addrspace(201)*
+ %14 = bitcast %"class.embeddings::TileSpmemPointer"** %3 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %14) #7
+ ret <8 x i32> addrspace(201)* %13
+}
+
+; Function Attrs: mustprogress nounwind
+define linkonce_odr dso_local noundef %"class.embeddings::TileSpmemPointer"* @_ZN10embeddings4CastINS_16TileSpmemPointerENS_11PointerBaseEEENS_15cast_retty_implIT_T0_E8ret_typeERKS5_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %0) #3 comdat {
+ %2 = alloca %"class.embeddings::PointerBase"*, align 4
+ store %"class.embeddings::PointerBase"* %0, %"class.embeddings::PointerBase"** %2, align 4, !tbaa !7
+ %3 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %2, align 4, !tbaa !7
+ %4 = bitcast %"class.embeddings::PointerBase"* %3 to %"class.embeddings::TileSpmemPointer"*
+ ret %"class.embeddings::TileSpmemPointer"* %4
+}
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local void @_ZNK10embeddings9BaseArray7BasePtrEv(%"class.embeddings::PointerBase"* noalias sret(%"class.embeddings::PointerBase") align 4 %0, %"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %1) #4 comdat align 2 {
+ %3 = alloca i8*, align 4
+ %4 = alloca %"class.embeddings::BaseArray"*, align 4
+ %5 = bitcast %"class.embeddings::PointerBase"* %0 to i8*
+ store i8* %5, i8** %3, align 4
+ store %"class.embeddings::BaseArray"* %1, %"class.embeddings::BaseArray"** %4, align 4, !tbaa !7
+ %6 = load %"class.embeddings::BaseArray"*, %"class.embeddings::BaseArray"** %4, align 4
+ %7 = getelementptr inbounds %"class.embeddings::BaseArray", %"class.embeddings::BaseArray"* %6, i32 0, i32 0
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %0, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %7) #19
+ ret void
+}
+
+; Function Attrs: alwaysinline mustprogress nounwind
+define linkonce_odr dso_local noundef i8 addrspace(201)* @_ZNK10embeddings16TileSpmemPointer6RawPtrEv(%"class.embeddings::TileSpmemPointer"* noundef nonnull align 4 dereferenceable(12) %0) #12 comdat align 2 {
+ %2 = alloca %"class.embeddings::TileSpmemPointer"*, align 4
+ store %"class.embeddings::TileSpmemPointer"* %0, %"class.embeddings::TileSpmemPointer"** %2, align 4, !tbaa !7
+ %3 = load %"class.embeddings::TileSpmemPointer"*, %"class.embeddings::TileSpmemPointer"** %2, align 4
+ %4 = bitcast %"class.embeddings::TileSpmemPointer"* %3 to %"class.embeddings::PointerBase"*
+ %5 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %4, i32 0, i32 2
+ %6 = bitcast %"union.embeddings::PointerBase::AnyPtr"* %5 to i32 addrspace(201)**
+ %7 = load i32 addrspace(201)*, i32 addrspace(201)** %6, align 4, !tbaa !10
+ %8 = bitcast i32 addrspace(201)* %7 to i8 addrspace(201)*
+ ret i8 addrspace(201)* %8
+}
+
+; Function Attrs: alwaysinline mustprogress nounwind
+define linkonce_odr dso_local void @_ZN10embeddings3tpuIDv8_iE8sortascdEPS1_S3_S3_(<8 x i32>* noundef %0, <8 x i32>* noundef %1, <8 x i32>* noundef %2) #12 comdat align 2 {
+ %4 = alloca <8 x i32>*, align 4
+ %5 = alloca <8 x i32>*, align 4
+ %6 = alloca <8 x i32>*, align 4
+ store <8 x i32>* %0, <8 x i32>** %4, align 4, !tbaa !7
+ store <8 x i32>* %1, <8 x i32>** %5, align 4, !tbaa !7
+ store <8 x i32>* %2, <8 x i32>** %6, align 4, !tbaa !7
+ %7 = load <8 x i32>*, <8 x i32>** %4, align 4, !tbaa !7
+ %8 = load <8 x i32>*, <8 x i32>** %5, align 4, !tbaa !7
+ %9 = load <8 x i32>*, <8 x i32>** %6, align 4, !tbaa !7
+ %10 = load <8 x i32>, <8 x i32>* %7, align 32
+ %11 = trunc <8 x i32> %10 to <8 x i1>
+ %12 = load <8 x i32>, <8 x i32>* %8, align 32
+ %13 = load <8 x i32>, <8 x i32>* %9, align 32
+ %14 = call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> %11, <8 x i32> %12, <8 x i32> %13)
+ %15 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %14, 2
+ %16 = zext <8 x i1> %15 to <8 x i32>
+ store <8 x i32> %16, <8 x i32>* %7, align 32
+ %17 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %14, 0
+ store <8 x i32> %17, <8 x i32>* %8, align 32
+ %18 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %14, 1
+ store <8 x i32> %18, <8 x i32>* %9, align 32
+ ret void
+}
+
+; Function Attrs: alwaysinline mustprogress nounwind
+define linkonce_odr dso_local void @_ZN10embeddings3tpuIDv8_iE8sortdscdEPS1_S3_S3_(<8 x i32>* noundef %0, <8 x i32>* noundef %1, <8 x i32>* noundef %2) #12 comdat align 2 {
+ %4 = alloca <8 x i32>*, align 4
+ %5 = alloca <8 x i32>*, align 4
+ %6 = alloca <8 x i32>*, align 4
+ store <8 x i32>* %0, <8 x i32>** %4, align 4, !tbaa !7
+ store <8 x i32>* %1, <8 x i32>** %5, align 4, !tbaa !7
+ store <8 x i32>* %2, <8 x i32>** %6, align 4, !tbaa !7
+ %7 = load <8 x i32>*, <8 x i32>** %4, align 4, !tbaa !7
+ %8 = load <8 x i32>*, <8 x i32>** %5, align 4, !tbaa !7
+ %9 = load <8 x i32>*, <8 x i32>** %6, align 4, !tbaa !7
+ %10 = load <8 x i32>, <8 x i32>* %7, align 32
+ %11 = trunc <8 x i32> %10 to <8 x i1>
+ %12 = load <8 x i32>, <8 x i32>* %8, align 32
+ %13 = load <8 x i32>, <8 x i32>* %9, align 32
+ %14 = call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.dscdi.v8i32(<8 x i1> %11, <8 x i32> %12, <8 x i32> %13)
+ %15 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %14, 2
+ %16 = zext <8 x i1> %15 to <8 x i32>
+ store <8 x i32> %16, <8 x i32>* %7, align 32
+ %17 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %14, 0
+ store <8 x i32> %17, <8 x i32>* %8, align 32
+ %18 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %14, 1
+ store <8 x i32> %18, <8 x i32>* %9, align 32
+ ret void
+}
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1>, <8 x i32>, <8 x i32>) #14
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.dscdi.v8i32(<8 x i1>, <8 x i32>, <8 x i32>) #14
+
+; Function Attrs: alwaysinline mustprogress nounwind
+define linkonce_odr dso_local void @_ZN10embeddings3tpuIDv8_iE8sortascdEPS1_S3_PDv8_f(<8 x i32>* noundef %0, <8 x i32>* noundef %1, <8 x float>* noundef %2) #12 comdat align 2 {
+ %4 = alloca <8 x i32>*, align 4
+ %5 = alloca <8 x i32>*, align 4
+ %6 = alloca <8 x float>*, align 4
+ store <8 x i32>* %0, <8 x i32>** %4, align 4, !tbaa !7
+ store <8 x i32>* %1, <8 x i32>** %5, align 4, !tbaa !7
+ store <8 x float>* %2, <8 x float>** %6, align 4, !tbaa !7
+ %7 = load <8 x i32>*, <8 x i32>** %4, align 4, !tbaa !7
+ %8 = load <8 x i32>*, <8 x i32>** %5, align 4, !tbaa !7
+ %9 = load <8 x float>*, <8 x float>** %6, align 4, !tbaa !7
+ %10 = load <8 x i32>, <8 x i32>* %7, align 32
+ %11 = trunc <8 x i32> %10 to <8 x i1>
+ %12 = load <8 x i32>, <8 x i32>* %8, align 32
+ %13 = load <8 x float>, <8 x float>* %9, align 32
+ %14 = call { <8 x i32>, <8 x float>, <8 x i1> } @llvm.tpu.sort.ascdi.v8f32(<8 x i1> %11, <8 x i32> %12, <8 x float> %13)
+ %15 = extractvalue { <8 x i32>, <8 x float>, <8 x i1> } %14, 2
+ %16 = zext <8 x i1> %15 to <8 x i32>
+ store <8 x i32> %16, <8 x i32>* %7, align 32
+ %17 = extractvalue { <8 x i32>, <8 x float>, <8 x i1> } %14, 0
+ store <8 x i32> %17, <8 x i32>* %8, align 32
+ %18 = extractvalue { <8 x i32>, <8 x float>, <8 x i1> } %14, 1
+ store <8 x float> %18, <8 x float>* %9, align 32
+ ret void
+}
+
+; Function Attrs: alwaysinline mustprogress nounwind
+define linkonce_odr dso_local void @_ZN10embeddings3tpuIDv8_iE8sortdscdEPS1_S3_PDv8_f(<8 x i32>* noundef %0, <8 x i32>* noundef %1, <8 x float>* noundef %2) #12 comdat align 2 {
+ %4 = alloca <8 x i32>*, align 4
+ %5 = alloca <8 x i32>*, align 4
+ %6 = alloca <8 x float>*, align 4
+ store <8 x i32>* %0, <8 x i32>** %4, align 4, !tbaa !7
+ store <8 x i32>* %1, <8 x i32>** %5, align 4, !tbaa !7
+ store <8 x float>* %2, <8 x float>** %6, align 4, !tbaa !7
+ %7 = load <8 x i32>*, <8 x i32>** %4, align 4, !tbaa !7
+ %8 = load <8 x i32>*, <8 x i32>** %5, align 4, !tbaa !7
+ %9 = load <8 x float>*, <8 x float>** %6, align 4, !tbaa !7
+ %10 = load <8 x i32>, <8 x i32>* %7, align 32
+ %11 = trunc <8 x i32> %10 to <8 x i1>
+ %12 = load <8 x i32>, <8 x i32>* %8, align 32
+ %13 = load <8 x float>, <8 x float>* %9, align 32
+ %14 = call { <8 x i32>, <8 x float>, <8 x i1> } @llvm.tpu.sort.dscdi.v8f32(<8 x i1> %11, <8 x i32> %12, <8 x float> %13)
+ %15 = extractvalue { <8 x i32>, <8 x float>, <8 x i1> } %14, 2
+ %16 = zext <8 x i1> %15 to <8 x i32>
+ store <8 x i32> %16, <8 x i32>* %7, align 32
+ %17 = extractvalue { <8 x i32>, <8 x float>, <8 x i1> } %14, 0
+ store <8 x i32> %17, <8 x i32>* %8, align 32
+ %18 = extractvalue { <8 x i32>, <8 x float>, <8 x i1> } %14, 1
+ store <8 x float> %18, <8 x float>* %9, align 32
+ ret void
+}
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare { <8 x i32>, <8 x float>, <8 x i1> } @llvm.tpu.sort.ascdi.v8f32(<8 x i1>, <8 x i32>, <8 x float>) #14
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare { <8 x i32>, <8 x float>, <8 x i1> } @llvm.tpu.sort.dscdi.v8f32(<8 x i1>, <8 x i32>, <8 x float>) #14
+
+; Function Attrs: inlinehint
+define linkonce_odr dso_local void @_ZN10embeddings15ScratchpadArrayC2ERKS0_(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %1) unnamed_addr #5 comdat align 2 {
+ %3 = alloca %"class.embeddings::ScratchpadArray"*, align 4
+ %4 = alloca %"class.embeddings::ScratchpadArray"*, align 4
+ store %"class.embeddings::ScratchpadArray"* %0, %"class.embeddings::ScratchpadArray"** %3, align 4, !tbaa !7
+ store %"class.embeddings::ScratchpadArray"* %1, %"class.embeddings::ScratchpadArray"** %4, align 4, !tbaa !7
+ %5 = load %"class.embeddings::ScratchpadArray"*, %"class.embeddings::ScratchpadArray"** %3, align 4
+ %6 = bitcast %"class.embeddings::ScratchpadArray"* %5 to %"class.embeddings::BaseArray"*
+ %7 = load %"class.embeddings::ScratchpadArray"*, %"class.embeddings::ScratchpadArray"** %4, align 4, !tbaa !7
+ %8 = bitcast %"class.embeddings::ScratchpadArray"* %7 to %"class.embeddings::BaseArray"*
+ call void @_ZN10embeddings9BaseArrayC2ERKS0_(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %6, %"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %8) #19
+ ret void
+}
+
+; Function Attrs: inlinehint
+define linkonce_odr dso_local void @_ZN10embeddings9BaseArrayC2ERKS0_(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %1) unnamed_addr #5 comdat align 2 {
+ %3 = alloca %"class.embeddings::BaseArray"*, align 4
+ %4 = alloca %"class.embeddings::BaseArray"*, align 4
+ store %"class.embeddings::BaseArray"* %0, %"class.embeddings::BaseArray"** %3, align 4, !tbaa !7
+ store %"class.embeddings::BaseArray"* %1, %"class.embeddings::BaseArray"** %4, align 4, !tbaa !7
+ %5 = load %"class.embeddings::BaseArray"*, %"class.embeddings::BaseArray"** %3, align 4
+ %6 = getelementptr inbounds %"class.embeddings::BaseArray", %"class.embeddings::BaseArray"* %5, i32 0, i32 0
+ %7 = load %"class.embeddings::BaseArray"*, %"class.embeddings::BaseArray"** %4, align 4, !tbaa !7
+ %8 = getelementptr inbounds %"class.embeddings::BaseArray", %"class.embeddings::BaseArray"* %7, i32 0, i32 0
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %6, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %8) #19
+ %9 = getelementptr inbounds %"class.embeddings::BaseArray", %"class.embeddings::BaseArray"* %5, i32 0, i32 1
+ %10 = load %"class.embeddings::BaseArray"*, %"class.embeddings::BaseArray"** %4, align 4, !tbaa !7
+ %11 = getelementptr inbounds %"class.embeddings::BaseArray", %"class.embeddings::BaseArray"* %10, i32 0, i32 1
+ %12 = load i32, i32* %11, align 4, !tbaa !29
+ store i32 %12, i32* %9, align 4, !tbaa !29
+ ret void
+}
+
+; Function Attrs: alwaysinline
+define linkonce_odr dso_local void @_ZN10embeddings15ScratchpadArrayC2ENS_11PointerBaseEi(%"class.embeddings::ScratchpadArray"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::PointerBase"* noundef %1, i32 noundef %2) unnamed_addr #6 comdat align 2 {
+ %4 = alloca %"class.embeddings::ScratchpadArray"*, align 4
+ %5 = alloca i32, align 4
+ %6 = alloca %"class.embeddings::PointerBase", align 4
+ store %"class.embeddings::ScratchpadArray"* %0, %"class.embeddings::ScratchpadArray"** %4, align 4, !tbaa !7
+ store i32 %2, i32* %5, align 4, !tbaa !3
+ %7 = load %"class.embeddings::ScratchpadArray"*, %"class.embeddings::ScratchpadArray"** %4, align 4
+ %8 = bitcast %"class.embeddings::ScratchpadArray"* %7 to %"class.embeddings::BaseArray"*
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %6, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %1) #19
+ %9 = load i32, i32* %5, align 4, !tbaa !3
+ call void @_ZN10embeddings9BaseArrayC2ENS_11PointerBaseEi(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %8, %"class.embeddings::PointerBase"* noundef %6, i32 noundef %9) #19
+ ret void
+}
+
+; Function Attrs: alwaysinline
+define linkonce_odr dso_local void @_ZN10embeddings9BaseArrayC2ENS_11PointerBaseEi(%"class.embeddings::BaseArray"* noundef nonnull align 4 dereferenceable(16) %0, %"class.embeddings::PointerBase"* noundef %1, i32 noundef %2) unnamed_addr #6 comdat align 2 {
+ %4 = alloca %"class.embeddings::BaseArray"*, align 4
+ %5 = alloca i32, align 4
+ store %"class.embeddings::BaseArray"* %0, %"class.embeddings::BaseArray"** %4, align 4, !tbaa !7
+ store i32 %2, i32* %5, align 4, !tbaa !3
+ %6 = load %"class.embeddings::BaseArray"*, %"class.embeddings::BaseArray"** %4, align 4
+ %7 = getelementptr inbounds %"class.embeddings::BaseArray", %"class.embeddings::BaseArray"* %6, i32 0, i32 0
+ call void @_ZN10embeddings11PointerBaseC2ERKS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %7, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %1) #19
+ %8 = getelementptr inbounds %"class.embeddings::BaseArray", %"class.embeddings::BaseArray"* %6, i32 0, i32 1
+ %9 = load i32, i32* %5, align 4, !tbaa !3
+ store i32 %9, i32* %8, align 4, !tbaa !29
+ ret void
+}
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local void @_ZN10embeddings11ToBasicTypeIDv8_iE10basic_typeEv(%"class.embeddings::BasicType"* noalias sret(%"class.embeddings::BasicType") align 4 %0) #4 comdat align 2 {
+ %2 = alloca i8*, align 4
+ %3 = alloca %"class.embeddings::SCTY_V8S32", align 4
+ %4 = bitcast %"class.embeddings::BasicType"* %0 to i8*
+ store i8* %4, i8** %2, align 4
+ %5 = bitcast %"class.embeddings::SCTY_V8S32"* %3 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #7
+ call void @_ZN10embeddings10SCTY_V8S32C2Ev(%"class.embeddings::SCTY_V8S32"* noundef nonnull align 4 dereferenceable(4) %3) #19
+ %6 = bitcast %"class.embeddings::SCTY_V8S32"* %3 to %"class.embeddings::BasicType"*
+ call void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %0, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %6) #19
+ %7 = bitcast %"class.embeddings::SCTY_V8S32"* %3 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %7) #7
+ ret void
+}
+
+; Function Attrs: alwaysinline
+define linkonce_odr dso_local void @_ZN10embeddings16TileSpmemPointerC2EPU5AS201vNS_9BasicTypeE(%"class.embeddings::TileSpmemPointer"* noundef nonnull align 4 dereferenceable(12) %0, i8 addrspace(201)* noundef %1, %"class.embeddings::BasicType"* noundef %2) unnamed_addr #6 comdat align 2 {
+ %4 = alloca %"class.embeddings::TileSpmemPointer"*, align 4
+ %5 = alloca i8 addrspace(201)*, align 4
+ %6 = alloca %"class.embeddings::BasicType", align 4
+ store %"class.embeddings::TileSpmemPointer"* %0, %"class.embeddings::TileSpmemPointer"** %4, align 4, !tbaa !7
+ store i8 addrspace(201)* %1, i8 addrspace(201)** %5, align 4, !tbaa !7
+ %7 = load %"class.embeddings::TileSpmemPointer"*, %"class.embeddings::TileSpmemPointer"** %4, align 4
+ %8 = bitcast %"class.embeddings::TileSpmemPointer"* %7 to %"class.embeddings::PointerBase"*
+ call void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %6, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %2) #19
+ %9 = load i8 addrspace(201)*, i8 addrspace(201)** %5, align 4, !tbaa !7
+ call void @_ZN10embeddings11PointerBaseC2ENS_9BasicTypeEPU5AS201v(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %8, %"class.embeddings::BasicType"* noundef %6, i8 addrspace(201)* noundef %9) #19
+ ret void
+}
+
+; Function Attrs: inlinehint
+define linkonce_odr dso_local void @_ZN10embeddings11PointerBaseC2EOS0_(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %0, %"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %1) unnamed_addr #5 comdat align 2 {
+ %3 = alloca %"class.embeddings::PointerBase"*, align 4
+ %4 = alloca %"class.embeddings::PointerBase"*, align 4
+ store %"class.embeddings::PointerBase"* %0, %"class.embeddings::PointerBase"** %3, align 4, !tbaa !7
+ store %"class.embeddings::PointerBase"* %1, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !7
+ %5 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %3, align 4
+ %6 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %5, i32 0, i32 0
+ %7 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !7
+ %8 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %7, i32 0, i32 0
+ call void @_ZN10embeddings11MemorySpaceC2ERKS0_(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %6, %"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %8) #19
+ %9 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %5, i32 0, i32 1
+ %10 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !7
+ %11 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %10, i32 0, i32 1
+ call void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %9, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %11) #19
+ %12 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %5, i32 0, i32 2
+ %13 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !7
+ %14 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %13, i32 0, i32 2
+ %15 = bitcast %"union.embeddings::PointerBase::AnyPtr"* %12 to i8*
+ %16 = bitcast %"union.embeddings::PointerBase::AnyPtr"* %14 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %15, i8* align 4 %16, i32 4, i1 false), !tbaa.struct !9
+ ret void
+}
+
+define linkonce_odr dso_local void @_ZN10embeddings10SCTY_V8S32C2Ev(%"class.embeddings::SCTY_V8S32"* noundef nonnull align 4 dereferenceable(4) %0) unnamed_addr #16 comdat align 2 {
+ %2 = alloca %"class.embeddings::SCTY_V8S32"*, align 4
+ store %"class.embeddings::SCTY_V8S32"* %0, %"class.embeddings::SCTY_V8S32"** %2, align 4, !tbaa !7
+ %3 = load %"class.embeddings::SCTY_V8S32"*, %"class.embeddings::SCTY_V8S32"** %2, align 4
+ %4 = bitcast %"class.embeddings::SCTY_V8S32"* %3 to %"class.embeddings::BasicType"*
+ call void @_ZN10embeddings9BasicTypeC2ENS_19SparsecoreBasicTypeE(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %4, i32 noundef 3) #19
+ ret void
+}
+
+; Function Attrs: alwaysinline nounwind
+define linkonce_odr dso_local void @_ZN10embeddings9BasicTypeC2ENS_19SparsecoreBasicTypeE(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %0, i32 noundef %1) unnamed_addr #8 comdat align 2 {
+ %3 = alloca %"class.embeddings::BasicType"*, align 4
+ %4 = alloca i32, align 4
+ store %"class.embeddings::BasicType"* %0, %"class.embeddings::BasicType"** %3, align 4, !tbaa !7
+ store i32 %1, i32* %4, align 4, !tbaa !32
+ %5 = load %"class.embeddings::BasicType"*, %"class.embeddings::BasicType"** %3, align 4
+ %6 = getelementptr inbounds %"class.embeddings::BasicType", %"class.embeddings::BasicType"* %5, i32 0, i32 0
+ %7 = load i32, i32* %4, align 4, !tbaa !32
+ store i32 %7, i32* %6, align 4, !tbaa !24
+ ret void
+}
+
+; Function Attrs: alwaysinline
+define linkonce_odr dso_local void @_ZN10embeddings11PointerBaseC2ENS_9BasicTypeEPU5AS201v(%"class.embeddings::PointerBase"* noundef nonnull align 4 dereferenceable(12) %0, %"class.embeddings::BasicType"* noundef %1, i8 addrspace(201)* noundef %2) unnamed_addr #6 comdat align 2 {
+ %4 = alloca %"class.embeddings::PointerBase"*, align 4
+ %5 = alloca i8 addrspace(201)*, align 4
+ %6 = alloca %"class.embeddings::SCM_TileSpmem", align 4
+ store %"class.embeddings::PointerBase"* %0, %"class.embeddings::PointerBase"** %4, align 4, !tbaa !7
+ store i8 addrspace(201)* %2, i8 addrspace(201)** %5, align 4, !tbaa !7
+ %7 = load %"class.embeddings::PointerBase"*, %"class.embeddings::PointerBase"** %4, align 4
+ %8 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %7, i32 0, i32 0
+ %9 = bitcast %"class.embeddings::SCM_TileSpmem"* %6 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %9) #7
+ call void @_ZN10embeddings13SCM_TileSpmemC2Ev(%"class.embeddings::SCM_TileSpmem"* noundef nonnull align 4 dereferenceable(4) %6) #19
+ %10 = bitcast %"class.embeddings::SCM_TileSpmem"* %6 to %"class.embeddings::MemorySpace"*
+ call void @_ZN10embeddings11MemorySpaceC2ERKS0_(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %8, %"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %10) #19
+ %11 = bitcast %"class.embeddings::SCM_TileSpmem"* %6 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %11) #7
+ %12 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %7, i32 0, i32 1
+ call void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %12, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %1) #19
+ %13 = getelementptr inbounds %"class.embeddings::PointerBase", %"class.embeddings::PointerBase"* %7, i32 0, i32 2
+ %14 = load i8 addrspace(201)*, i8 addrspace(201)** %5, align 4, !tbaa !7
+ call void @_ZN10embeddings11PointerBase6AnyPtrC2EPU5AS201v(%"union.embeddings::PointerBase::AnyPtr"* noundef nonnull align 4 dereferenceable(4) %13, i8 addrspace(201)* noundef %14) #19
+ ret void
+}
+
+define linkonce_odr dso_local void @_ZN10embeddings13SCM_TileSpmemC2Ev(%"class.embeddings::SCM_TileSpmem"* noundef nonnull align 4 dereferenceable(4) %0) unnamed_addr #16 comdat align 2 {
+ %2 = alloca %"class.embeddings::SCM_TileSpmem"*, align 4
+ store %"class.embeddings::SCM_TileSpmem"* %0, %"class.embeddings::SCM_TileSpmem"** %2, align 4, !tbaa !7
+ %3 = load %"class.embeddings::SCM_TileSpmem"*, %"class.embeddings::SCM_TileSpmem"** %2, align 4
+ %4 = bitcast %"class.embeddings::SCM_TileSpmem"* %3 to %"class.embeddings::MemorySpace"*
+ call void @_ZN10embeddings11MemorySpaceC2ENS_21SparsecoreMemorySpaceE(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %4, i32 noundef 1) #19
+ ret void
+}
+
+; Function Attrs: nounwind
+define linkonce_odr dso_local void @_ZN10embeddings11PointerBase6AnyPtrC2EPU5AS201v(%"union.embeddings::PointerBase::AnyPtr"* noundef nonnull align 4 dereferenceable(4) %0, i8 addrspace(201)* noundef %1) unnamed_addr #17 comdat align 2 {
+ %3 = alloca %"union.embeddings::PointerBase::AnyPtr"*, align 4
+ %4 = alloca i8 addrspace(201)*, align 4
+ store %"union.embeddings::PointerBase::AnyPtr"* %0, %"union.embeddings::PointerBase::AnyPtr"** %3, align 4, !tbaa !7
+ store i8 addrspace(201)* %1, i8 addrspace(201)** %4, align 4, !tbaa !7
+ %5 = load %"union.embeddings::PointerBase::AnyPtr"*, %"union.embeddings::PointerBase::AnyPtr"** %3, align 4
+ %6 = bitcast %"union.embeddings::PointerBase::AnyPtr"* %5 to i32 addrspace(201)**
+ %7 = load i8 addrspace(201)*, i8 addrspace(201)** %4, align 4, !tbaa !7
+ %8 = bitcast i8 addrspace(201)* %7 to i32 addrspace(201)*
+ store i32 addrspace(201)* %8, i32 addrspace(201)** %6, align 4, !tbaa !10
+ ret void
+}
+
+; Function Attrs: alwaysinline nounwind
+define linkonce_odr dso_local void @_ZN10embeddings11MemorySpaceC2ENS_21SparsecoreMemorySpaceE(%"class.embeddings::MemorySpace"* noundef nonnull align 4 dereferenceable(4) %0, i32 noundef %1) unnamed_addr #8 comdat align 2 {
+ %3 = alloca %"class.embeddings::MemorySpace"*, align 4
+ %4 = alloca i32, align 4
+ store %"class.embeddings::MemorySpace"* %0, %"class.embeddings::MemorySpace"** %3, align 4, !tbaa !7
+ store i32 %1, i32* %4, align 4, !tbaa !33
+ %5 = load %"class.embeddings::MemorySpace"*, %"class.embeddings::MemorySpace"** %3, align 4
+ %6 = getelementptr inbounds %"class.embeddings::MemorySpace", %"class.embeddings::MemorySpace"* %5, i32 0, i32 0
+ %7 = load i32, i32* %4, align 4, !tbaa !33
+ store i32 %7, i32* %6, align 4, !tbaa !21
+ ret void
+}
+
+; Function Attrs: alwaysinline mustprogress
+define linkonce_odr dso_local void @_ZN10embeddings11ToBasicTypeIDv8_fE10basic_typeEv(%"class.embeddings::BasicType"* noalias sret(%"class.embeddings::BasicType") align 4 %0) #4 comdat align 2 {
+ %2 = alloca i8*, align 4
+ %3 = alloca %"class.embeddings::SCTY_V8F32", align 4
+ %4 = bitcast %"class.embeddings::BasicType"* %0 to i8*
+ store i8* %4, i8** %2, align 4
+ %5 = bitcast %"class.embeddings::SCTY_V8F32"* %3 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #7
+ call void @_ZN10embeddings10SCTY_V8F32C2Ev(%"class.embeddings::SCTY_V8F32"* noundef nonnull align 4 dereferenceable(4) %3) #19
+ %6 = bitcast %"class.embeddings::SCTY_V8F32"* %3 to %"class.embeddings::BasicType"*
+ call void @_ZN10embeddings9BasicTypeC2ERKS0_(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %0, %"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %6) #19
+ %7 = bitcast %"class.embeddings::SCTY_V8F32"* %3 to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* %7) #7
+ ret void
+}
+
+; Function Attrs: nounwind
+define linkonce_odr dso_local void @_ZN10embeddings10SCTY_V8F32C2Ev(%"class.embeddings::SCTY_V8F32"* noundef nonnull align 4 dereferenceable(4) %0) unnamed_addr #17 comdat align 2 {
+ %2 = alloca %"class.embeddings::SCTY_V8F32"*, align 4
+ store %"class.embeddings::SCTY_V8F32"* %0, %"class.embeddings::SCTY_V8F32"** %2, align 4, !tbaa !7
+ %3 = load %"class.embeddings::SCTY_V8F32"*, %"class.embeddings::SCTY_V8F32"** %2, align 4
+ %4 = bitcast %"class.embeddings::SCTY_V8F32"* %3 to %"class.embeddings::BasicType"*
+ call void @_ZN10embeddings9BasicTypeC2ENS_19SparsecoreBasicTypeE(%"class.embeddings::BasicType"* noundef nonnull align 4 dereferenceable(4) %4, i32 noundef 5) #19
+ ret void
+}
+
+attributes #0 = { mustprogress nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tac-vf" }
+attributes #1 = { mustprogress "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #2 = { argmemonly nocallback nofree nosync nounwind willreturn }
+attributes #3 = { mustprogress nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #4 = { alwaysinline mustprogress "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #5 = { inlinehint "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #6 = { alwaysinline "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #7 = { nounwind }
+attributes #8 = { alwaysinline nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #9 = { alwaysinline mustprogress "frame-pointer"="all" "min-legal-vector-width"="256" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #10 = { mustprogress nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-scs-vf" }
+attributes #11 = { argmemonly nofree nounwind willreturn }
+attributes #12 = { alwaysinline mustprogress nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #13 = { argmemonly nounwind readonly }
+attributes #14 = { nounwind readnone speculatable willreturn }
+attributes #15 = { argmemonly nounwind willreturn }
+attributes #16 = { "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #17 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" }
+attributes #18 = { alwaysinline nobuiltin "no-builtins" }
+attributes #19 = { nobuiltin "no-builtins" }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"frame-pointer", i32 2}
+!2 = !{!"clang version google3-trunk (18b9c4637099f6ed5414d8778de8c773291a9cf9)"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"int", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C++ TBAA"}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"any pointer", !5, i64 0}
+!9 = !{i64 0, i64 4, !7, i64 0, i64 4, !7, i64 0, i64 4, !7, i64 0, i64 4, !7, i64 0, i64 4, !7}
+!10 = !{!5, !5, i64 0}
+!11 = distinct !{}
+!12 = distinct !{!12, !13, !14}
+!13 = !{!"llvm.loop.mustprogress"}
+!14 = !{!"llvm.loop.unroll.enable"}
+!15 = distinct !{!15, !13, !16, !17, !18, !19, !20}
+!16 = !{!"llvm.loop.parallel_accesses", !11}
+!17 = !{!"llvm.loop.unroll.disable"}
+!18 = !{!"llvm.loop.vectorize.width", i32 1}
+!19 = !{!"llvm.loop.interleave.count", i32 1}
+!20 = !{!"llvm.loop.vectorize.enable", i1 true}
+!21 = !{!22, !23, i64 0}
+!22 = !{!"_ZTSN10embeddings11MemorySpaceE", !23, i64 0}
+!23 = !{!"_ZTSN10embeddings21SparsecoreMemorySpaceE", !5, i64 0}
+!24 = !{!25, !26, i64 0}
+!25 = !{!"_ZTSN10embeddings9BasicTypeE", !26, i64 0}
+!26 = !{!"_ZTSN10embeddings19SparsecoreBasicTypeE", !5, i64 0}
+!27 = !{!28, !28, i64 0}
+!28 = !{!"_ZTSN10embeddings12SortOrderingE", !5, i64 0}
+!29 = !{!30, !4, i64 12}
+!30 = !{!"_ZTSN10embeddings9BaseArrayE", !31, i64 0, !4, i64 12}
+!31 = !{!"_ZTSN10embeddings11PointerBaseE", !22, i64 0, !25, i64 4, !5, i64 8}
+!32 = !{!26, !26, i64 0}
+!33 = !{!23, !23, i64 0}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sfence_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sfence_sc.ll
new file mode 100644
index 0000000..5f21495
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sfence_sc.ll
@@ -0,0 +1,77 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -instcombine-max-iterations=0 | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare void @llvm.tpu.sfence.sel(i32)
+declare void @llvm.tpu.sfence.scmf()
+declare void @llvm.tpu.sfence.stream.spmem()
+declare void @llvm.tpu.sfence.stream.hbm()
+declare void @llvm.tpu.sfence.imem()
+
+; CHECK-LABEL: sfence_test
+; CHECK: _ = sfence
+; CHECK-NEXT: _ = sfence.sel s0
+; CHECK-NEXT: _ = sfence.sel $0xf
+; CHECK-NEXT: _ = sfence.scmf
+; CHECK-NEXT: _ = sfence.stream.spmem
+; CHECK-NEXT: _ = sfence.stream.hbm
+; CHECK-NEXT: _ = sfence
+define void @sfence_test(i32 %a) {
+ fence seq_cst
+ call void @llvm.tpu.sfence.sel(i32 %a)
+ call void @llvm.tpu.sfence.sel(i32 15)
+ call void @llvm.tpu.sfence.scmf()
+ call void @llvm.tpu.sfence.stream.spmem()
+ call void @llvm.tpu.sfence.stream.hbm()
+ call void @llvm.tpu.sfence.imem()
+ ret void
+}
+
+declare <8 x i32> @llvm.tpu.vld.msk.v8i32.p201v8i32(<8 x i1>, <8 x i32> addrspace(201)*)
+
+; CHECK-LABEL: sfence_vld_test:
+; CHECK: vld
+; CHECK: sfence
+; CHECK: sfence
+; CHECK: sfence
+; CHECK: sfence
+define <8 x i32> @sfence_vld_test(<8 x i1> %m, <8 x i32> addrspace(201)* %b) {
+ %r = tail call <8 x i32> @llvm.tpu.vld.msk.v8i32.p201v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %b)
+ fence seq_cst
+ fence seq_cst
+ fence seq_cst
+ fence seq_cst
+ ret <8 x i32> %r
+}
+
+; CHECK-LABEL: sfence_sld_test:
+; CHECK: sld
+; CHECK: sfence
+; CHECK: sfence
+; CHECK: sfence
+; CHECK: sfence
+define i32 @sfence_sld_test(i32* %a) {
+ %r = load i32, i32* %a
+ fence seq_cst
+ fence seq_cst
+ fence seq_cst
+ fence seq_cst
+ ret i32 %r
+}
+
+; Tests that an sfence is not bundled with another sfence. The hardware
+; allows such bundles, but the compiler does not emit them.
+
+; CHECK-LABEL: sfence_bundle:
+; CHECK: { _ = sfence
+; CHECK: { _ = sfence
+; CHECK: { _ = sfence
+define void @sfence_bundle() {
+ fence seq_cst
+ fence seq_cst
+ fence seq_cst
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sflag_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sflag_sc.ll
new file mode 100644
index 0000000..e8dab89
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sflag_sc.ll
@@ -0,0 +1,460 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -opaque-pointers | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32) nounwind
+declare void @llvm.tpu.syncset.done(i32 addrspace(204)*, i32)
+declare void @llvm.tpu.syncset.remote(i32 addrspace(211)*, i32, i32, i32)
+declare void @llvm.tpu.syncset.remote.done(i32 addrspace(211)*, i32, i32, i32, i32)
+
+; CHECK-LABEL: syncset_rr:
+; CHECK: [sflag:s0] = ssyncset.s32 s1
+define void @syncset_rr(i32 addrspace(204)* %x, i32 %y) {
+ store i32 %y, i32 addrspace(204)* %x
+ ret void
+}
+
+; CHECK-LABEL: syncset_ri_i32:
+; CHECK: [sflag:s0] = ssyncset.s32 $0x100000
+define void @syncset_ri_i32(i32 addrspace(204)* %x) {
+ store i32 1048576, i32 addrspace(204)* %x
+ ret void
+}
+
+; CHECK-LABEL: syncset_ri_i16:
+; CHECK: [sflag:s0] = ssyncset.s32 $0x20
+define void @syncset_ri_i16(i32 addrspace(204)* %x) {
+ store i32 32, i32 addrspace(204)* %x
+ ret void
+}
+
+; CHECK-LABEL: syncset_done_rr:
+; CHECK: [sflag:s0] = ssyncset.done s1
+define void @syncset_done_rr(i32 addrspace(204)* %x, i32 %y) {
+ call void @llvm.tpu.syncset.done(i32 addrspace(204)* %x, i32 %y)
+ ret void
+}
+
+; CHECK-LABEL: syncset_done_ri:
+; CHECK: [sflag:s0] = ssyncset.done $0x20
+define void @syncset_done_ri(i32 addrspace(204)* %x) {
+ call void @llvm.tpu.syncset.done(i32 addrspace(204)* %x, i32 32)
+ ret void
+}
+
+; CHECK-LABEL: syncset_other_rr:
+; CHECK: [sflag:s0] = ssyncset.other.s32 s1
+define void @syncset_other_rr(i32 addrspace(210)* %x, i32 %y) {
+ store i32 %y, i32 addrspace(210)* %x
+ ret void
+}
+
+; CHECK-LABEL: syncset_other_ri_i32:
+; CHECK: [sflag:s0] = ssyncset.other.s32 $0x100000
+define void @syncset_other_ri_i32(i32 addrspace(210)* %x) {
+ store i32 1048576, i32 addrspace(210)* %x
+ ret void
+}
+
+; CHECK-LABEL: syncset_other_ri_i16:
+; CHECK: [sflag:s0] = ssyncset.other.s32 $0x20
+define void @syncset_other_ri_i16(i32 addrspace(210)* %x) {
+ store i32 32, i32 addrspace(210)* %x
+ ret void
+}
+
+declare void @llvm.tpu.syncset.other.done(i32 addrspace(210)*, i32)
+
+; CHECK-LABEL: syncset_other_done_rr:
+; CHECK: [sflag:s0] = ssyncset.other.done s1
+define void @syncset_other_done_rr(i32 addrspace(210)* %x, i32 %y) {
+ call void @llvm.tpu.syncset.other.done(i32 addrspace(210)* %x, i32 %y)
+ ret void
+}
+
+; CHECK-LABEL: syncset_other_done_ri:
+; CHECK: [sflag:s0] = ssyncset.other.done $0x20
+define void @syncset_other_done_ri(i32 addrspace(210)* %x) {
+ call void @llvm.tpu.syncset.other.done(i32 addrspace(210)* %x, i32 32)
+ ret void
+}
+
+declare void @llvm.tpu.syncset.both(i32 addrspace(204)*, i32 addrspace(210)*, i32)
+declare void @llvm.tpu.syncset.both.done(i32 addrspace(204)*, i32 addrspace(210)*, i32)
+
+; CHECK-LABEL: syncset_both_rr:
+; CHECK: s1 = sshll.u32 s1, $0x10
+; CHECK: s0 = sor.u32 s1, s0
+; CHECK: [sflag:s0], [sflag:s0] = ssyncset.both.s32 s2;
+define void @syncset_both_rr(i32 addrspace(204)* %x, i32 addrspace(210)* %y, i32 %z) {
+ call void @llvm.tpu.syncset.both(i32 addrspace(204)* %x, i32 addrspace(210)* %y, i32 %z)
+ ret void
+}
+
+; CHECK-LABEL: syncset_both_ri_i32:
+; CHECK: s1 = sshll.u32 s1, $0x10
+; CHECK: s0 = sor.u32 s1, s0
+; CHECK: [sflag:s0], [sflag:s0] = ssyncset.both.s32 $0x100000;
+define void @syncset_both_ri_i32(i32 addrspace(204)* %x, i32 addrspace(210)* %y) {
+ call void @llvm.tpu.syncset.both(i32 addrspace(204)* %x, i32 addrspace(210)* %y, i32 1048576)
+ ret void
+}
+
+; CHECK-LABEL: syncset_both_ri_i16:
+; CHECK: s1 = sshll.u32 s1, $0x10
+; CHECK: s0 = sor.u32 s1, s0
+; CHECK: [sflag:s0], [sflag:s0] = ssyncset.both.s32 $0x20;
+define void @syncset_both_ri_i16(i32 addrspace(204)* %x, i32 addrspace(210)* %y) {
+ call void @llvm.tpu.syncset.both(i32 addrspace(204)* %x, i32 addrspace(210)* %y, i32 32)
+ ret void
+}
+
+; CHECK-LABEL: syncset_both_done_rr:
+; CHECK: s1 = sshll.u32 s1, $0x10
+; CHECK: s0 = sor.u32 s1, s0
+; CHECK: [sflag:s0], [sflag:s0] = ssyncset.both.done s2
+define void @syncset_both_done_rr(i32 addrspace(204)* %x, i32 addrspace(210)* %y, i32 %z) {
+ call void @llvm.tpu.syncset.both.done(i32 addrspace(204)* %x, i32 addrspace(210)* %y, i32 %z)
+ ret void
+}
+
+; CHECK-LABEL: syncset_both_done_ri:
+; CHECK: s1 = sshll.u32 s1, $0x10
+; CHECK: s0 = sor.u32 s1, s0
+; CHECK: [sflag:s0], [sflag:s0] = ssyncset.both.done $0x20
+define void @syncset_both_done_ri(i32 addrspace(204)* %x, i32 addrspace(210)* %y) {
+ call void @llvm.tpu.syncset.both.done(i32 addrspace(204)* %x, i32 addrspace(210)* %y, i32 32)
+ ret void
+}
+
+@g = addrspace(204) global i32 42
+
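+; The *_no_* tests check that the flag address is first materialized into a
+; register with simm.s32; these scalar sync ops do not appear to have an
+; immediate flag-address operand form.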
+; CHECK-LABEL: syncset_no_ii_global:
+; CHECK: s0 = simm.s32 g
+; CHECK-NEXT: [sflag:s0] = ssyncset.s32 $0x20
+define void @syncset_no_ii_global(i32 %y) {
+ store i32 32, i32 addrspace(204)* @g
+ ret void
+}
+
+; CHECK-LABEL: syncset_no_ir_global:
+; CHECK: s1 = simm.s32 g
+; CHECK-NEXT: [sflag:s1] = ssyncset.s32 s0
+define void @syncset_no_ir_global(i32 %y) {
+ store i32 %y, i32 addrspace(204)* @g
+ ret void
+}
+
+; CHECK-LABEL: syncset_no_ii:
+; CHECK: s0 = simm.s32 $0x1f
+; CHECK-NEXT: [sflag:s0] = ssyncset.s32 $0x20
+define void @syncset_no_ii(i32 %y) {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ store i32 32, i32 addrspace(204)* %s
+ ret void
+}
+
+; CHECK-LABEL: syncset_no_ir:
+; CHECK: s1 = simm.s32 $0x1f
+; CHECK-NEXT: [sflag:s1] = ssyncset.s32 s0
+define void @syncset_no_ir(i32 %y) {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ store i32 %y, i32 addrspace(204)* %s
+ ret void
+}
+
+declare void @llvm.tpu.syncadd(i32 addrspace(204)*, i32)
+declare void @llvm.tpu.syncadd.other(i32 addrspace(210)*, i32)
+declare void @llvm.tpu.syncadd.both(i32 addrspace(204)*, i32 addrspace(210)*, i32)
+declare void @llvm.tpu.syncadd.tile(i32 addrspace(217)*, i32)
+
+; CHECK-LABEL: syncadd_rr
+; CHECK: [sflag:s0] = ssyncadd.s32 s1
+define void @syncadd_rr(i32 addrspace(204)* %a, i32 %b) {
+ call void @llvm.tpu.syncadd(i32 addrspace(204)* %a, i32 %b)
+ ret void
+}
+
+; CHECK-LABEL: syncadd_ri_32
+; CHECK: [sflag:s0] = ssyncadd.s32 $0x100000
+define void @syncadd_ri_32(i32 addrspace(204)* %a) {
+ call void @llvm.tpu.syncadd(i32 addrspace(204)* %a, i32 1048576)
+ ret void
+}
+
+; CHECK-LABEL: syncadd_ri_16
+; CHECK: [sflag:s0] = ssyncadd.s32 $0x20
+define void @syncadd_ri_16(i32 addrspace(204)* %a) {
+ call void @llvm.tpu.syncadd(i32 addrspace(204)* %a, i32 32)
+ ret void
+}
+
+; CHECK-LABEL: syncadd_other_rr
+; CHECK: [sflag:s0] = ssyncadd.other.s32 s1
+define void @syncadd_other_rr(i32 addrspace(210)* %a, i32 %b) {
+ call void @llvm.tpu.syncadd.other(i32 addrspace(210)* %a, i32 %b)
+ ret void
+}
+
+; CHECK-LABEL: syncadd_other_ri_32
+; CHECK: [sflag:s0] = ssyncadd.other.s32 $0x100000
+define void @syncadd_other_ri_32(i32 addrspace(210)* %a) {
+ call void @llvm.tpu.syncadd.other(i32 addrspace(210)* %a, i32 1048576)
+ ret void
+}
+
+; CHECK-LABEL: syncadd_other_ri_16
+; CHECK: [sflag:s0] = ssyncadd.other.s32 $0x20
+define void @syncadd_other_ri_16(i32 addrspace(210)* %a) {
+ call void @llvm.tpu.syncadd.other(i32 addrspace(210)* %a, i32 32)
+ ret void
+}
+
+; CHECK-LABEL: syncadd_both_rr
+; CHECK: s1 = sshll.u32 s1, $0x10
+; CHECK: s0 = sor.u32 s1, s0
+; CHECK: [sflag:s0], [sflag:s0] = ssyncadd.both.s32 s2
+define void @syncadd_both_rr(i32 addrspace(204)* %a, i32 addrspace(210)* %b, i32 %c) {
+ call void @llvm.tpu.syncadd.both(i32 addrspace(204)* %a, i32 addrspace(210)* %b, i32 %c)
+ ret void
+}
+
+; CHECK-LABEL: syncadd_both_ri_32
+; CHECK: s1 = sshll.u32 s1, $0x10
+; CHECK: s0 = sor.u32 s1, s0
+; CHECK: [sflag:s0], [sflag:s0] = ssyncadd.both.s32 $0x100000
+define void @syncadd_both_ri_32(i32 addrspace(204)* %a, i32 addrspace(210)* %b) {
+ call void @llvm.tpu.syncadd.both(i32 addrspace(204)* %a, i32 addrspace(210)* %b, i32 1048576)
+ ret void
+}
+
+; CHECK-LABEL: syncadd_both_ri_16
+; CHECK: s1 = sshll.u32 s1, $0x10
+; CHECK: s0 = sor.u32 s1, s0
+; CHECK: [sflag:s0], [sflag:s0] = ssyncadd.both.s32 $0x20
+define void @syncadd_both_ri_16(i32 addrspace(204)* %a, i32 addrspace(210)* %b) {
+ call void @llvm.tpu.syncadd.both(i32 addrspace(204)* %a, i32 addrspace(210)* %b, i32 32)
+ ret void
+}
+
+; CHECK-LABEL: syncadd_no_ii
+; CHECK: s0 = simm.s32 $0x1f
+; CHECK-NEXT: [sflag:s0] = ssyncadd.s32 $0x10
+define void @syncadd_no_ii() {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.syncadd(i32 addrspace(204)* %s, i32 16)
+ ret void
+}
+
+; CHECK-LABEL: syncadd_no_ir
+; CHECK: s1 = simm.s32 $0x1f
+; CHECK-NEXT: [sflag:s1] = ssyncadd.s32 s0;
+define void @syncadd_no_ir(i32 %y) {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.syncadd(i32 addrspace(204)* %s, i32 %y)
+ ret void
+}
+
+; CHECK-LABEL: syncadd_no_ii_global
+; CHECK: s0 = simm.s32 g
+; CHECK-NEXT: [sflag:s0] = ssyncadd.s32 $0x10
+define void @syncadd_no_ii_global() {
+ call void @llvm.tpu.syncadd(i32 addrspace(204)* @g, i32 16)
+ ret void
+}
+
+; CHECK-LABEL: syncadd_no_ir_global
+; CHECK: s1 = simm.s32 g
+; CHECK-NEXT: [sflag:s1] = ssyncadd.s32 s0;
+define void @syncadd_no_ir_global(i32 %y) {
+ call void @llvm.tpu.syncadd(i32 addrspace(204)* @g, i32 %y)
+ ret void
+}
+
+; CHECK-LABEL: syncadd_tile_rr
+; CHECK: [sflag:s0] = ssyncadd.tile.s32 s1
+define void @syncadd_tile_rr(i32 addrspace(217)* %a, i32 %b) {
+ call void @llvm.tpu.syncadd.tile(i32 addrspace(217)* %a, i32 %b)
+ ret void
+}
+
+; CHECK-LABEL: syncadd_tile_ri_32
+; CHECK: [sflag:s0] = ssyncadd.tile.s32 $0x100000
+define void @syncadd_tile_ri_32(i32 addrspace(217)* %a) {
+ call void @llvm.tpu.syncadd.tile(i32 addrspace(217)* %a, i32 1048576)
+ ret void
+}
+
+; CHECK-LABEL: syncadd_tile_ri_16
+; CHECK: [sflag:s0] = ssyncadd.tile.s32 $0x20
+define void @syncadd_tile_ri_16(i32 addrspace(217)* %a) {
+ call void @llvm.tpu.syncadd.tile(i32 addrspace(217)* %a, i32 32)
+ ret void
+}
+
+; CHECK-LABEL: syncmov_r
+; CHECK: s0 = ssyncread [sflag:s0]
+define i32 @syncmov_r(i32 addrspace(204)* %a, i32 %b) {
+ %c = load i32, i32 addrspace(204)* %a
+ ret i32 %c
+}
+
+; CHECK-LABEL: syncmov_i
+; CHECK: s0 = ssyncread [sflag:g]
+define i32 @syncmov_i() {
+ %c = load i32, i32 addrspace(204)* @g
+ ret i32 %c
+}
+
+declare i32 @llvm.tpu.syncdonemov(i32 addrspace(204)*)
+
+; CHECK-LABEL: syncdonemov_i
+; CHECK: s0 = ssyncread.done [sflag:g]
+define i32 @syncdonemov_i() {
+ %c = call i32 @llvm.tpu.syncdonemov(i32 addrspace(204)* @g)
+ ret i32 %c
+}
+
+; CHECK-LABEL: syncdonemov_r
+; CHECK: s0 = ssyncread.done [sflag:s0]
+define i32 @syncdonemov_r(i32 addrspace(204)* %a) {
+ %c = call i32 @llvm.tpu.syncdonemov(i32 addrspace(204)* %a)
+ ret i32 %c
+}
+
+declare void @llvm.tpu.waiteq(i32 addrspace(204)*, i32)
+
+; CHECK-LABEL: waiteq_no_ii
+; CHECK: s0 = simm.s32 $0x1f
+; CHECK-NEXT: _ = swait.eq [sflag:s0], $0x20
+define void @waiteq_no_ii() {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.waiteq(i32 addrspace(204)* %s, i32 32)
+ ret void
+}
+
+; CHECK-LABEL: waiteq_no_ir
+; CHECK: s0 = simm.s32 $0x1f
+; CHECK-NEXT: _ = swait.eq [sflag:s0], $0x20
+define void @waiteq_no_ir(i32 %b) {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.waiteq(i32 addrspace(204)* %s, i32 32)
+ ret void
+}
+
+; CHECK-LABEL: waiteq_no_ii_global
+; CHECK: s0 = simm.s32 g
+; CHECK-NEXT: _ = swait.eq [sflag:s0], $0x20
+define void @waiteq_no_ii_global() {
+ call void @llvm.tpu.waiteq(i32 addrspace(204)* @g, i32 32)
+ ret void
+}
+
+; CHECK-LABEL: waiteq_no_ir_global
+; CHECK: s0 = simm.s32 g
+; CHECK-NEXT: _ = swait.eq [sflag:s0], $0x20
+define void @waiteq_no_ir_global(i32 %b) {
+ call void @llvm.tpu.waiteq(i32 addrspace(204)* @g, i32 32)
+ ret void
+}
+
+; CHECK-LABEL: waiteq_rr
+; CHECK: _ = swait.eq [sflag:s0], s1
+define void @waiteq_rr(i32 addrspace(204)* %a, i32 %b) {
+ call void @llvm.tpu.waiteq(i32 addrspace(204)* %a, i32 %b)
+ ret void
+}
+
+; CHECK-LABEL: waiteq_ri
+; CHECK: _ = swait.eq [sflag:s0], $0x20;
+define void @waiteq_ri(i32 addrspace(204)* %a) {
+ call void @llvm.tpu.waiteq(i32 addrspace(204)* %a, i32 32)
+ ret void
+}
+
+declare void @llvm.tpu.waitdone(i32 addrspace(204)*)
+
+; CHECK-LABEL: waitdone_r
+; CHECK: _ = swait.done [sflag:s0]
+define void @waitdone_r(i32 addrspace(204)* %a) {
+ call void @llvm.tpu.waitdone(i32 addrspace(204)* %a)
+ ret void
+}
+
+; CHECK-LABEL: waitdone_no_i
+; CHECK: s0 = simm.s32 $0x1f
+; CHECK: _ = swait.done [sflag:s0]
+define void @waitdone_no_i(i32 addrspace(204)* %a) {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.waitdone(i32 addrspace(204)* %s)
+ ret void
+}
+
+; CHECK-LABEL: vst_to_syncadd_rr
+; CHECK: { [tilespmem:s2+$0x0] = vst v0;
+; CHECK-NEXT: [sflag:s0] = ssyncadd.s32 s1 }
+; CHECK-NEXT: { [sflag:s0] = ssyncadd.s32 s1
+define void @vst_to_syncadd_rr(i32 addrspace(204)* %a, i32 %b,
+ <8 x float> addrspace(201)* %c, <8 x float> %d) {
+ store <8 x float> %d, <8 x float> addrspace(201)* %c
+ call void @llvm.tpu.syncadd(i32 addrspace(204)* %a, i32 %b)
+ call void @llvm.tpu.syncadd(i32 addrspace(204)* %a, i32 %b)
+ ret void
+}
+
+; CHECK-LABEL: vst_to_syncset_rr:
+; CHECK: { [tilespmem:s3+$0x0] = vst v0;
+; CHECK-NEXT: [sflag:s0] = ssyncset.s32 s2 }
+; CHECK-NEXT: { [sflag:s1] = ssyncset.s32 s2
+define void @vst_to_syncset_rr(i32 addrspace(204)* %x, i32 addrspace(204)* %y, i32 %z,
+ <8 x float> addrspace(201)* %c, <8 x float> %d) {
+ store <8 x float> %d, <8 x float> addrspace(201)* %c
+ store i32 %z, i32 addrspace(204)* %x
+ store i32 %z, i32 addrspace(204)* %y
+ ret void
+}
+
+; CHECK-LABEL: syncset_remote
+; CHECK-DAG: [[CORE_ID_SHL:s[0-9]+]] = sshll.u32 s{{[0-9]+}}, $0xe
+; CHECK-DAG: [[CHIP_ID_SHL:s[0-9]+]] = sshll.u32 s{{[0-9]+}}, $0x11;
+; CHECK-DAG: [[TARGET:s[0-9]+]] = sor.u32 [[CORE_ID_SHL]], [[TARGET]]
+; CHECK-DAG: [[TARGET]] = sor.u32 [[CHIP_ID_SHL]], [[TARGET]]
+; CHECK: [sflag:[[TARGET]]] = ssyncset.remote.s32 $0x10;
+define void @syncset_remote(i32 addrspace(211)* %a, i32 %chip_id, i32 %core_id) {
+ call void @llvm.tpu.syncset.remote(i32 addrspace(211)* %a, i32 16, i32 %chip_id, i32 %core_id)
+ ret void
+}
+
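+; For the immediate case below, the packed remote target folds to a single
+; constant: (chip_id 15 << 17) | (core_id 7 << 14) = 0x1e0000 | 0x1c000 =
+; 0x1fc000, matching the shift amounts checked above.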
+; CHECK-LABEL: syncset_remote_imm
+; CHECK: [[TARGET:s[0-9]+]] = sor.u32 $0x1fc000, s0
+; CHECK: [sflag:[[TARGET]]] = ssyncset.remote.s32 $0x10
+define void @syncset_remote_imm(i32 addrspace(211)* %a) {
+ call void @llvm.tpu.syncset.remote(i32 addrspace(211)* %a, i32 16, i32 15, i32 7)
+ ret void
+}
+
+; CHECK-LABEL: syncset_remote_done
+; CHECK-DAG: [[CORE_ID_SHL:s[0-9]+]] = sshll.u32 s{{[0-9]+}}, $0xe
+; CHECK-DAG: [[CHIP_ID_SHL:s[0-9]+]] = sshll.u32 s{{[0-9]+}}, $0x11;
+; CHECK-DAG: [[DONE_SHL:s[0-9]+]] = sshll.u32 s{{[0-9]+}}, $0x1f;
+; CHECK-DAG: [[TARGET:s[0-9]+]] = sor.u32 [[CORE_ID_SHL]], [[TARGET]]
+; CHECK-DAG: [[TARGET]] = sor.u32 [[CHIP_ID_SHL]], [[TARGET]]
+; CHECK-DAG: [[TARGET]] = sor.u32 [[DONE_SHL]], [[TARGET]]
+; CHECK: [sflag:[[TARGET]]] = ssyncset.remote.done $0x10;
+define void @syncset_remote_done(i32 addrspace(211)* %a, i32 %chip_id, i32 %core_id, i32 %done) {
+ call void @llvm.tpu.syncset.remote.done(i32 addrspace(211)* %a, i32 16, i32 %chip_id, i32 %core_id, i32 %done)
+ ret void
+}
+
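+; With the done bit (1 << 31) folded in as well, the constant becomes
+; 0x80000000 | 0x1fc000 = 0x801fc000, printed as the signed value -0x7fe04000.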
+; CHECK-LABEL: syncset_remote_done_imm
+; CHECK: [[TARGET:s[0-9]+]] = sor.u32 $-0x7fe04000, [[TARGET]]
+; CHECK: [sflag:[[TARGET]]] = ssyncset.remote.done $0x10
+define void @syncset_remote_done_imm(i32 addrspace(211)* %a) {
+ call void @llvm.tpu.syncset.remote.done(i32 addrspace(211)* %a, i32 16, i32 15, i32 7, i32 1)
+ ret void
+}
\ No newline at end of file
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sflag_tc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sflag_tc.ll
new file mode 100644
index 0000000..4367d08
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sflag_tc.ll
@@ -0,0 +1,221 @@
+; RUN: llc < %s -mcpu=tensorcore-jf -asm-verbose=false -disable-cgp | FileCheck %s --check-prefixes=CHECK,CHECK-JF
+; RUN: llc < %s -mcpu=tensorcore-vf -asm-verbose=false -disable-cgp | FileCheck %s --check-prefixes=CHECK,CHECK-VF
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; CHECK-LABEL: syncset_rr:
+; CHECK: [sflag:s0] = vsyncset.s32 s1
+define void @syncset_rr(i32 addrspace(204)* %x, i32 %y) {
+ store i32 %y, i32 addrspace(204)* %x
+ ret void
+}
+
+; CHECK-LABEL: syncset_ri_i32:
+; CHECK: s{{[0-9]+}} = simm.s32 $0x100000
+; CHECK: [sflag:s0] = vsyncset.s32 s{{[0-9]+}}
+define void @syncset_ri_i32(i32 addrspace(204)* %x) {
+ store i32 1048576, i32 addrspace(204)* %x
+ ret void
+}
+
+; CHECK-LABEL: syncset_ri_i16:
+; CHECK: [sflag:s0] = vsyncset.s32 $0x20
+define void @syncset_ri_i16(i32 addrspace(204)* %x) {
+ store i32 32, i32 addrspace(204)* %x
+ ret void
+}
+
+@g = addrspace(204) global i32 42
+
+; CHECK-LABEL: syncset_ir:
+; CHECK: [sflag:g] = vsyncset.s32 s0
+define void @syncset_ir(i32 %y) {
+ store i32 %y, i32 addrspace(204)* @g
+ ret void
+}
+
+; CHECK-LABEL: syncset_inttoptr_r:
+; CHECK: [sflag:$0x4] = vsyncset.s32 s0
+define void @syncset_inttoptr_r(i32 %y) {
+ store i32 %y, i32 addrspace(204)* inttoptr (i32 4 to i32 addrspace(204)*)
+ ret void
+}
+
+; CHECK-LABEL: syncset_ii:
+; CHECK: [sflag:g] = vsyncset.s32 $0x20
+define void @syncset_ii(i32 %y) {
+ store i32 32, i32 addrspace(204)* @g
+ ret void
+}
+
+declare void @llvm.tpu.syncadd(i32 addrspace(204)*, i32)
+
+; CHECK-LABEL: syncadd_ii
+; CHECK: [sflag:g] = vsyncadd.s32 $0x10
+define void @syncadd_ii() {
+ call void @llvm.tpu.syncadd(i32 addrspace(204)* @g, i32 16)
+ ret void
+}
+
+; CHECK-LABEL: syncadd_ii_inttoptr
+; CHECK: [sflag:$0x4] = vsyncadd.s32 $0x10
+define void @syncadd_ii_inttoptr() {
+ call void @llvm.tpu.syncadd(i32 addrspace(204)* inttoptr (i32 4 to i32 addrspace(204)*), i32 16)
+ ret void
+}
+
+; Test that if the immediate doesn't fit in 16 bits we insert a move immediate.
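+; (2097152 = 0x200000 needs 22 bits, so it is first materialized with
+; simm.s32, unlike the 16-bit-safe $0x10 used above.)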
+; CHECK-LABEL: syncadd_ii_inttoptr_32
+; CHECK: { [[x:s[0-9]+]] = simm.s32 $0x200000 }
+; CHECK: [sflag:$0x4] = vsyncadd.s32 [[x]]
+define void @syncadd_ii_inttoptr_32() {
+ call void @llvm.tpu.syncadd(i32 addrspace(204)* inttoptr (i32 4 to i32 addrspace(204)*), i32 2097152)
+ ret void
+}
+
+; CHECK-LABEL: syncadd_rr
+; CHECK: [sflag:s0] = vsyncadd.s32 s1
+define void @syncadd_rr(i32 addrspace(204)* %a, i32 %b) {
+ call void @llvm.tpu.syncadd(i32 addrspace(204)* %a, i32 %b)
+ ret void
+}
+
+declare void @llvm.tpu.syncadd.done(i32 addrspace(204)*, i32)
+
+; CHECK-LABEL: syncadd_done_ii
+; CHECK: [sflag:g] = vsyncadd.done.s32 $0x10
+define void @syncadd_done_ii() {
+ call void @llvm.tpu.syncadd.done(i32 addrspace(204)* @g, i32 16)
+ ret void
+}
+
+; CHECK-LABEL: syncadd_done_rr
+; CHECK: [sflag:s0] = vsyncadd.done.s32 s1
+define void @syncadd_done_rr(i32 addrspace(204)* %a, i32 %b) {
+ call void @llvm.tpu.syncadd.done(i32 addrspace(204)* %a, i32 %b)
+ ret void
+}
+
+declare void @llvm.tpu.syncadd.notdone(i32 addrspace(204)*, i32)
+
+; CHECK-LABEL: syncadd_notdone_ii
+; CHECK: [sflag:g] = vsyncadd.notdone.s32 $0x10
+define void @syncadd_notdone_ii() {
+ call void @llvm.tpu.syncadd.notdone(i32 addrspace(204)* @g, i32 16)
+ ret void
+}
+
+; CHECK-LABEL: syncadd_notdone_rr
+; CHECK: [sflag:s0] = vsyncadd.notdone.s32 s1
+define void @syncadd_notdone_rr(i32 addrspace(204)* %a, i32 %b) {
+ call void @llvm.tpu.syncadd.notdone(i32 addrspace(204)* %a, i32 %b)
+ ret void
+}
+
+declare void @llvm.tpu.syncset.remote.done(i32 addrspace(211)*, i32, i32, i32, i32)
+
+; CHECK-LABEL: syncset_remote_done
+; CHECK: [sflag:s0] = vsyncset.remote.done.s32 $0x10
+define void @syncset_remote_done(i32 addrspace(211)* %a) {
+ call void @llvm.tpu.syncset.remote.done(i32 addrspace(211)* %a, i32 16, i32 0, i32 0, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: syncset_remote_done_core_chip
+; CHECK: s0 = sor.u32 $0x1fc000, s0
+; CHECK: [sflag:s0] = vsyncset.remote.done.s32 $0x10
+define void @syncset_remote_done_core_chip(i32 addrspace(211)* %a) {
+ call void @llvm.tpu.syncset.remote.done(i32 addrspace(211)* %a, i32 16, i32 15, i32 7, i32 0)
+ ret void
+}
+
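+; Sync flag reads appear to go through a result FIFO: on JF the value is
+; popped from (v2sf) after a vnop, while on VF it is popped from (sfrf) after
+; an explicit vdelay $0x1.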
+; CHECK-LABEL: syncmov_r
+; CHECK-JF: (v2sf) = vsyncmov [sflag:s0]
+; CHECK-JF: _ = vnop
+; CHECK-JF: s0 = spop (v2sf);
+; CHECK-VF: (sfrf) = vsyncmov [sflag:s0]
+; CHECK-VF: _ = vdelay $0x1
+; CHECK-VF: s0 = spop (sfrf)
+define i32 @syncmov_r(i32 addrspace(204)* %a, i32 %b) {
+ %c = load i32, i32 addrspace(204)* %a
+ ret i32 %c
+}
+
+; CHECK-LABEL: syncmov_i
+; CHECK-JF: (v2sf) = vsyncmov [sflag:g]
+; CHECK-JF: _ = vnop
+; CHECK-JF: s0 = spop (v2sf);
+; CHECK-VF: (sfrf) = vsyncmov [sflag:g]
+; CHECK-VF: _ = vdelay $0x1
+; CHECK-VF: s0 = spop (sfrf)
+define i32 @syncmov_i() {
+ %c = load i32, i32 addrspace(204)* @g
+ ret i32 %c
+}
+
+declare i32 @llvm.tpu.syncdonemov(i32 addrspace(204)*)
+
+; CHECK-LABEL: syncdonemov_i
+; CHECK-JF: (v2sf) = vsyncmov.done [sflag:g]
+; CHECK-JF: _ = vnop
+; CHECK-JF: s0 = spop (v2sf)
+; CHECK-VF: (sfrf) = vsyncmov.done [sflag:g]
+; CHECK-VF: _ = vdelay $0x1
+; CHECK-VF: s0 = spop (sfrf)
+define i32 @syncdonemov_i() {
+ %c = call i32 @llvm.tpu.syncdonemov(i32 addrspace(204)* @g)
+ ret i32 %c
+}
+
+; CHECK-LABEL: syncdonemov_r
+; CHECK-JF: (v2sf) = vsyncmov.done [sflag:s0]
+; CHECK-JF: _ = vnop
+; CHECK-JF: s0 = spop (v2sf);
+; CHECK-VF: (sfrf) = vsyncmov.done [sflag:s0]
+; CHECK-VF: _ = vdelay $0x1
+; CHECK-VF: s0 = spop (sfrf)
+define i32 @syncdonemov_r(i32 addrspace(204)* %a) {
+ %c = call i32 @llvm.tpu.syncdonemov(i32 addrspace(204)* %a)
+ ret i32 %c
+}
+
+declare void @llvm.tpu.waiteq(i32 addrspace(204)*, i32)
+
+; CHECK-LABEL: waiteq_ii
+; CHECK: _ = vwait.eq [sflag:g], $0x20
+define void @waiteq_ii() {
+ call void @llvm.tpu.waiteq(i32 addrspace(204)* @g, i32 32)
+ ret void
+}
+
+; CHECK-LABEL: waiteq_rr
+; CHECK: _ = vwait.eq [sflag:s0], s1
+define void @waiteq_rr(i32 addrspace(204)* %a, i32 %b) {
+ call void @llvm.tpu.waiteq(i32 addrspace(204)* %a, i32 %b)
+ ret void
+}
+
+declare void @llvm.tpu.waitdone(i32 addrspace(204)*)
+
+; CHECK-LABEL: waitdone_i
+; CHECK: _ = vwait.done [sflag:g]
+define void @waitdone_i() {
+ call void @llvm.tpu.waitdone(i32 addrspace(204)* @g)
+ ret void
+}
+
+; CHECK-LABEL: waitdone_inttoptr
+; CHECK: _ = vwait.done [sflag:$0x4]
+define void @waitdone_inttoptr() {
+ call void @llvm.tpu.waitdone(i32 addrspace(204)* inttoptr (i32 4 to i32 addrspace(204)*))
+ ret void
+}
+
+; CHECK-LABEL: waitdone_r
+; CHECK: _ = vwait.done [sflag:s0]
+define void @waitdone_r(i32 addrspace(204)* %a) {
+ call void @llvm.tpu.waitdone(i32 addrspace(204)* %a)
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/software_pipeliner.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/software_pipeliner.ll
new file mode 100644
index 0000000..10eedf4
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/software_pipeliner.ll
@@ -0,0 +1,377 @@
+; RUN: llc < %s -mcpu=tensorcore-pf -asm-verbose=false -disable-cgp \
+; RUN: -stop-after=tpu-pipeliner -tpu-pipeliner-annotate-for-testing \
+; RUN: -tpu-use-swing-modulo-sched -tpu-skip-fast-opt \
+; RUN: -tpu-pipeliner-strategy=swingslack -tpu-latencies=%S/Inputs/long_load.yml \
+; RUN: -tpu-enable-vliw-prep-postiv=false -tpu-enable-vliw-prep-post-addrinc=false \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 @llvm.tpu.vmatpush.f32(<1024 x float>, <1024 x i1>, i32, i32)
+declare i32 @llvm.tpu.vdwg(i32, i32)
+declare i32 @llvm.tpu.vmatmul.f32(<1024 x float>, <1024 x i1>, i32, i32)
+declare <1024 x float> @llvm.tpu.vmatres.f32(i32, i32)
+declare <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32) #1
+
+; Simple loop to check that the induction variable is scheduled correctly before
+; the compare at the top of the loop.
+; CHECK: bb.1.loop_simple:
+; CHECK: [[i:%[0-9]+]]:gpr = nsw ADDri {{.*}} Stage-0_Cycle-0
+; CHECK: ppr = CMPNEri [[i]]{{.*}} Stage-0_Cycle-1
+; CHECK: BR
+
+define void @ind_var(<1024 x float> %gain, <1024 x float> %input) {
+entry:
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ br label %loop_simple
+
+loop_simple:
+ %ind = phi i32 [ 0, %entry ], [ %ind.1, %loop_simple ]
+ %0 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %gain, <1024 x i1> %mask, i32 0, i32 undef)
+ %1 = call i32 @llvm.tpu.vdwg(i32 0, i32 %0)
+ %2 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %1)
+ %3 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %2)
+ %ind.1 = add nuw nsw i32 %ind, 1
+ %cnd = icmp eq i32 100, %ind.1
+ br i1 %cnd, label %loop.cleanup, label %loop_simple
+
+loop.cleanup:
+ ret void
+}
+
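+; The loop below chains 16 vmatpush calls into a vdwg and then issues 16
+; vmatmul/vmatres pairs; the checks expect it to pipeline into three stages,
+; with MATPUSH, MATMUL and MATPOP landing in stages 0, 1 and 2 respectively.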
+; CHECK-NOT: Stage-3
+; CHECK: bb.1.loop:
+; CHECK-DAG: gsfnpr0 = tcMXU0MATPUSH {{.*}} Stage-0
+; CHECK-DAG: vpr = tcMXU0MATPOP {{.*}} Stage-2
+; CHECK-DAG: mrfpr0 = tcMXU0MATMUL {{.*}} Stage-1
+; CHECK: BR
+
+define void @mxu_pipeline(<1024 x float> %gain, <1024 x float> %input) {
+entry:
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ br label %loop
+
+loop:
+ %i.030 = phi i32 [ 0, %entry ], [ %inc14, %loop ]
+ %0 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %gain, <1024 x i1> %mask, i32 0, i32 undef)
+ %1 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %gain, <1024 x i1> %mask, i32 0, i32 %0)
+ %2 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %gain, <1024 x i1> %mask, i32 0, i32 %1)
+ %3 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %gain, <1024 x i1> %mask, i32 0, i32 %2)
+ %4 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %gain, <1024 x i1> %mask, i32 0, i32 %3)
+ %5 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %gain, <1024 x i1> %mask, i32 0, i32 %4)
+ %6 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %gain, <1024 x i1> %mask, i32 0, i32 %5)
+ %7 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %gain, <1024 x i1> %mask, i32 0, i32 %6)
+ %8 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %gain, <1024 x i1> %mask, i32 0, i32 %7)
+ %9 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %gain, <1024 x i1> %mask, i32 0, i32 %8)
+ %10 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %gain, <1024 x i1> %mask, i32 0, i32 %9)
+ %11 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %gain, <1024 x i1> %mask, i32 0, i32 %10)
+ %12 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %gain, <1024 x i1> %mask, i32 0, i32 %11)
+ %13 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %gain, <1024 x i1> %mask, i32 0, i32 %12)
+ %14 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %gain, <1024 x i1> %mask, i32 0, i32 %13)
+ %15 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %gain, <1024 x i1> %mask, i32 0, i32 %14)
+ %16 = call i32 @llvm.tpu.vdwg(i32 0, i32 %15)
+ %17 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %16)
+ %18 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %17)
+ %19 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %16)
+ %20 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %19)
+ %21 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %16)
+ %22 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %21)
+ %23 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %16)
+ %24 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %23)
+ %25 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %16)
+ %26 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %25)
+ %27 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %16)
+ %28 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %27)
+ %29 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %16)
+ %30 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %29)
+ %31 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %16)
+ %32 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %31)
+ %33 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %16)
+ %34 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %33)
+ %35 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %16)
+ %36 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %35)
+ %37 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %16)
+ %38 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %37)
+ %39 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %16)
+ %40 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %39)
+ %41 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %16)
+ %42 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %41)
+ %43 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %16)
+ %44 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %43)
+ %45 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %16)
+ %46 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %45)
+ %47 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %16)
+ %48 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %47)
+ %inc14 = add nuw nsw i32 %i.030, 1
+ %exitcond = icmp eq i32 %inc14, 100
+ br i1 %exitcond, label %loop.cleanup, label %loop
+
+loop.cleanup:
+ ret void
+}
+
+; Test the case of two MXUs with load instructions feeding into the matpush.
+; Since we have 2 MXU slots and 1 load slot, this causes the first load to be
+; scheduled at cycle -1. Test that the algorithm does the right thing, succeeds
+; in pipelining, and shifts all the instructions up.
+; The MATPUSH should be scheduled after the MATMUL since they have a memory
+; dependency.
+; CHECK: bb.1.loop_2mxu:
+; CHECK-NOT: Stage-4
+; CHECK-DAG: vpr = tcVLVri {{.*}} Stage-1_Cycle-4
+; CHECK-DAG: vpr = tcVLVri {{.*}} Stage-1_Cycle-5
+; CHECK-DAG: mrfpr0 = tcMXU0MATMUL {{.*}} Stage-2_Cycle-5
+; CHECK-DAG: mrfpr1 = tcMXU1MATMUL {{.*}} Stage-2_Cycle-13
+; CHECK-DAG: gsfnpr0 = tcMXU0MATPUSH {{.*}} Stage-1_Cycle-6
+; CHECK-DAG: gsfnpr1 = tcMXU1MATPUSH {{.*}} Stage-1_Cycle-6
+; CHECK: BR
+
+; Function Attrs: nounwind
+define void @mxu_pipeline_with_load(<1024 x float> %input) {
+entry:
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ br label %loop_2mxu
+
+loop_2mxu:
+ %i.052 = phi i32 [ 0, %entry ], [ %inc23, %loop_2mxu ]
+ %mul = shl i32 %i.052, 4
+ %0 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %mul)
+ %1 = load <1024 x float>, <1024 x float> addrspace(205)* %0, align 4096
+ %2 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %1, <1024 x i1> %mask, i32 0, i32 undef)
+ %add10.1 = or i32 %mul, 1
+ %3 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.1)
+ %4 = load <1024 x float>, <1024 x float> addrspace(205)* %3, align 4096
+ %5 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %4, <1024 x i1> %mask, i32 0, i32 %2)
+ %add10.2 = or i32 %mul, 2
+ %6 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.2)
+ %7 = load <1024 x float>, <1024 x float> addrspace(205)* %6, align 4096
+ %8 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %7, <1024 x i1> %mask, i32 0, i32 %5)
+ %add10.3 = or i32 %mul, 3
+ %9 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.3)
+ %10 = load <1024 x float>, <1024 x float> addrspace(205)* %9, align 4096
+ %11 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %10, <1024 x i1> %mask, i32 0, i32 %8)
+ %add10.4 = or i32 %mul, 4
+ %12 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.4)
+ %13 = load <1024 x float>, <1024 x float> addrspace(205)* %12, align 4096
+ %14 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %13, <1024 x i1> %mask, i32 0, i32 %11)
+ %add10.5 = or i32 %mul, 5
+ %15 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.5)
+ %16 = load <1024 x float>, <1024 x float> addrspace(205)* %15, align 4096
+ %17 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %16, <1024 x i1> %mask, i32 0, i32 %14)
+ %add10.6 = or i32 %mul, 6
+ %18 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.6)
+ %19 = load <1024 x float>, <1024 x float> addrspace(205)* %18, align 4096
+ %20 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %19, <1024 x i1> %mask, i32 0, i32 %17)
+ %add10.7 = or i32 %mul, 7
+ %21 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.7)
+ %22 = load <1024 x float>, <1024 x float> addrspace(205)* %21, align 4096
+ %23 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %22, <1024 x i1> %mask, i32 0, i32 %20)
+ %add10.8 = or i32 %mul, 8
+ %24 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.8)
+ %25 = load <1024 x float>, <1024 x float> addrspace(205)* %24, align 4096
+ %26 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %25, <1024 x i1> %mask, i32 0, i32 %23)
+ %add10.9 = or i32 %mul, 9
+ %27 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.9)
+ %28 = load <1024 x float>, <1024 x float> addrspace(205)* %27, align 4096
+ %29 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %28, <1024 x i1> %mask, i32 0, i32 %26)
+ %add10.10 = or i32 %mul, 10
+ %30 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.10)
+ %31 = load <1024 x float>, <1024 x float> addrspace(205)* %30, align 4096
+ %32 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %31, <1024 x i1> %mask, i32 0, i32 %29)
+ %add10.11 = or i32 %mul, 11
+ %33 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.11)
+ %34 = load <1024 x float>, <1024 x float> addrspace(205)* %33, align 4096
+ %35 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %34, <1024 x i1> %mask, i32 0, i32 %32)
+ %add10.12 = or i32 %mul, 12
+ %36 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.12)
+ %37 = load <1024 x float>, <1024 x float> addrspace(205)* %36, align 4096
+ %38 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %37, <1024 x i1> %mask, i32 0, i32 %35)
+ %add10.13 = or i32 %mul, 13
+ %39 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.13)
+ %40 = load <1024 x float>, <1024 x float> addrspace(205)* %39, align 4096
+ %41 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %40, <1024 x i1> %mask, i32 0, i32 %38)
+ %add10.14 = or i32 %mul, 14
+ %42 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.14)
+ %43 = load <1024 x float>, <1024 x float> addrspace(205)* %42, align 4096
+ %44 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %43, <1024 x i1> %mask, i32 0, i32 %41)
+ %add10.15 = or i32 %mul, 15
+ %45 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.15)
+ %46 = load <1024 x float>, <1024 x float> addrspace(205)* %45, align 4096
+ %47 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %46, <1024 x i1> %mask, i32 0, i32 %44)
+ %48 = call i32 @llvm.tpu.vdwg(i32 0, i32 %47)
+ %49 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %48)
+ %50 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %49)
+ %51 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %48)
+ %52 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %51)
+ %53 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %48)
+ %54 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %53)
+ %55 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %48)
+ %56 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %55)
+ %57 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %48)
+ %58 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %57)
+ %59 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %48)
+ %60 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %59)
+ %61 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %48)
+ %62 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %61)
+ %63 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %48)
+ %64 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %63)
+ %65 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %48)
+ %66 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %65)
+ %67 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %48)
+ %68 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %67)
+ %69 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %48)
+ %70 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %69)
+ %71 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %48)
+ %72 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %71)
+ %73 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %48)
+ %74 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %73)
+ %75 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %48)
+ %76 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %75)
+ %77 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %48)
+ %78 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %77)
+ %79 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 %48)
+ %80 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %79)
+ %81 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %7, <1024 x i1> %mask, i32 1, i32 undef)
+ %add10.1.1 = or i32 %mul, 3
+ %82 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.1.1)
+ %83 = load <1024 x float>, <1024 x float> addrspace(205)* %82, align 4096
+ %84 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %83, <1024 x i1> %mask, i32 1, i32 %81)
+ %add10.2.1 = add nuw nsw i32 %add10.2, 2
+ %85 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.2.1)
+ %86 = load <1024 x float>, <1024 x float> addrspace(205)* %85, align 4096
+ %87 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %86, <1024 x i1> %mask, i32 1, i32 %84)
+ %add10.3.1 = add nuw nsw i32 %add10.2, 3
+ %88 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.3.1)
+ %89 = load <1024 x float>, <1024 x float> addrspace(205)* %88, align 4096
+ %90 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %89, <1024 x i1> %mask, i32 1, i32 %87)
+ %add10.4.1 = or i32 %mul, 6
+ %91 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.4.1)
+ %92 = load <1024 x float>, <1024 x float> addrspace(205)* %91, align 4096
+ %93 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %92, <1024 x i1> %mask, i32 1, i32 %90)
+ %add10.5.1 = or i32 %mul, 7
+ %94 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.5.1)
+ %95 = load <1024 x float>, <1024 x float> addrspace(205)* %94, align 4096
+ %96 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %95, <1024 x i1> %mask, i32 1, i32 %93)
+ %add10.6.1 = add nuw nsw i32 %add10.2, 6
+ %97 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.6.1)
+ %98 = load <1024 x float>, <1024 x float> addrspace(205)* %97, align 4096
+ %99 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %98, <1024 x i1> %mask, i32 1, i32 %96)
+ %add10.7.1 = add nuw nsw i32 %add10.2, 7
+ %100 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.7.1)
+ %101 = load <1024 x float>, <1024 x float> addrspace(205)* %100, align 4096
+ %102 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %101, <1024 x i1> %mask, i32 1, i32 %99)
+ %add10.8.1 = or i32 %mul, 10
+ %103 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.8.1)
+ %104 = load <1024 x float>, <1024 x float> addrspace(205)* %103, align 4096
+ %105 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %104, <1024 x i1> %mask, i32 1, i32 %102)
+ %add10.9.1 = or i32 %mul, 11
+ %106 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.9.1)
+ %107 = load <1024 x float>, <1024 x float> addrspace(205)* %106, align 4096
+ %108 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %107, <1024 x i1> %mask, i32 1, i32 %105)
+ %add10.10.1 = add nuw nsw i32 %add10.2, 10
+ %109 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.10.1)
+ %110 = load <1024 x float>, <1024 x float> addrspace(205)* %109, align 4096
+ %111 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %110, <1024 x i1> %mask, i32 1, i32 %108)
+ %add10.11.1 = add nuw nsw i32 %add10.2, 11
+ %112 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.11.1)
+ %113 = load <1024 x float>, <1024 x float> addrspace(205)* %112, align 4096
+ %114 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %113, <1024 x i1> %mask, i32 1, i32 %111)
+ %add10.12.1 = or i32 %mul, 14
+ %115 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.12.1)
+ %116 = load <1024 x float>, <1024 x float> addrspace(205)* %115, align 4096
+ %117 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %116, <1024 x i1> %mask, i32 1, i32 %114)
+ %add10.13.1 = or i32 %mul, 15
+ %118 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.13.1)
+ %119 = load <1024 x float>, <1024 x float> addrspace(205)* %118, align 4096
+ %120 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %119, <1024 x i1> %mask, i32 1, i32 %117)
+ %add10.14.1 = add nuw nsw i32 %add10.2, 14
+ %121 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.14.1)
+ %122 = load <1024 x float>, <1024 x float> addrspace(205)* %121, align 4096
+ %123 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %122, <1024 x i1> %mask, i32 1, i32 %120)
+ %add10.15.1 = add nuw nsw i32 %add10.2, 15
+ %124 = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %add10.15.1)
+ %125 = load <1024 x float>, <1024 x float> addrspace(205)* %124, align 4096
+ %126 = call i32 @llvm.tpu.vmatpush.f32(<1024 x float> %125, <1024 x i1> %mask, i32 1, i32 %123)
+ %127 = call i32 @llvm.tpu.vdwg(i32 1, i32 %126)
+ %128 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 1, i32 %127)
+ %129 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 1, i32 %128)
+ %130 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 1, i32 %127)
+ %131 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 1, i32 %130)
+ %132 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 1, i32 %127)
+ %133 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 1, i32 %132)
+ %134 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 1, i32 %127)
+ %135 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 1, i32 %134)
+ %136 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 1, i32 %127)
+ %137 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 1, i32 %136)
+ %138 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 1, i32 %127)
+ %139 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 1, i32 %138)
+ %140 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 1, i32 %127)
+ %141 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 1, i32 %140)
+ %142 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 1, i32 %127)
+ %143 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 1, i32 %142)
+ %144 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 1, i32 %127)
+ %145 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 1, i32 %144)
+ %146 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 1, i32 %127)
+ %147 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 1, i32 %146)
+ %148 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 1, i32 %127)
+ %149 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 1, i32 %148)
+ %150 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 1, i32 %127)
+ %151 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 1, i32 %150)
+ %152 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 1, i32 %127)
+ %153 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 1, i32 %152)
+ %154 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 1, i32 %127)
+ %155 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 1, i32 %154)
+ %156 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 1, i32 %127)
+ %157 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 1, i32 %156)
+ %158 = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 1, i32 %127)
+ %159 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 1, i32 %158)
+ %inc23 = add nuw nsw i32 %i.052, 1
+ %exitcond = icmp eq i32 %inc23, 100
+ br i1 %exitcond, label %for.cond.cleanup, label %loop_2mxu
+
+for.cond.cleanup:                                 ; preds = %loop_2mxu
+ ret void
+}
+
+; Check that the induction variable compare is scheduled in stage 0. If we fail
+; to analyze the loop, we schedule it in stage 1 due to the sld latency.
+@_ZN10embeddingsL9smem_heapE = internal unnamed_addr global [512 x i32] zeroinitializer, align 4
+@_ZN10embeddingsL9tmem_heapE = internal global [512 x i32] zeroinitializer, align 32
+
+; CHECK: bb.1.loop_s:
+; CHECK: ppr = CMPNEri {{.*}} Stage-0
+
+define i32 @loop_small() {
+entry:
+ br label %loop_s
+loop_s:
+ %lsr.iv33 = phi [512 x i32]* [ @_ZN10embeddingsL9smem_heapE, %entry ], [ %n, %loop_s ]
+ %lsr.iv30 = phi [512 x i32]* [ @_ZN10embeddingsL9tmem_heapE, %entry ], [ %j, %loop_s ]
+ %lsr.iv28 = phi i32 [ 100, %entry ], [ %lsr.iv.next29, %loop_s ]
+ %add.1 = phi i32 [ 0, %entry ], [ %add.2, %loop_s ]
+ %a = bitcast [512 x i32]* %lsr.iv33 to i8*
+ %b = bitcast i8* %a to i32*
+ %c = bitcast [512 x i32]* %lsr.iv30 to i8 *
+ %d = bitcast i8* %c to i32*
+ %e = load i32, i32* %b, align 4
+ %add.2 = add i32 %e, %add.1
+ %lsr.iv.next29 = add nsw i32 %lsr.iv28, -1
+ %g = bitcast [512 x i32]* %lsr.iv30 to i8 *
+ %h = getelementptr i8, i8 * %g, i32 32
+ %i = bitcast i8 * %h to i32 *
+ %j = bitcast i32 * %i to [512 x i32] *
+ %k = bitcast [512 x i32]* %lsr.iv33 to i8*
+ %l = getelementptr i8, i8* %k, i32 4
+ %m = bitcast i8* %l to i32*
+ %n = bitcast i32* %m to [512 x i32]*
+ %o = icmp eq i32 %lsr.iv.next29, 0
+ br i1 %o, label %for.cond.cleanup, label %loop_s
+
+for.cond.cleanup:
+ ret i32 %add.2
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/software_pipeliner_complex_rec.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/software_pipeliner_complex_rec.ll
new file mode 100644
index 0000000..b2a5134
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/software_pipeliner_complex_rec.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -mcpu=tensorcore-pf -asm-verbose=false -disable-cgp \
+; RUN: -stop-after=tpu-pipeliner -tpu-pipeliner-annotate-for-testing \
+; RUN: -tpu-latencies=%S/Inputs/long_or.yml | FileCheck %s
+; REQUIRES: tpu
+; REQUIRES: assert
+
+; Test that the pipeliner doesn't crash when picking the schedule order for these
+; recurrences. We need to consider the backedges when picking the schedule order;
+; otherwise not all the nodes may be reachable and they wouldn't get scheduled.
+; TODO(thomasraoux): Note that this loop fails to get pipelined due to bad
+; scheduling of the induction variable. This can be fixed by improving the
+; scheduling.
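+; The recurrence here runs through the smem word at address 161: %p0 carries
+; the previous iteration's %o, which is stored back to the same location that
+; %l1 reloads, so the store and load form a cross-iteration dependence.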
+
+declare i32 @llvm.tpu.vmatpush.f32(<1024 x float>, <1024 x i1>, i32, i32)
+declare i32 @llvm.tpu.vdwg(i32, i32)
+declare i32 @llvm.tpu.vmatmul.f32(<1024 x float>, <1024 x i1>, i32, i32)
+declare <1024 x float> @llvm.tpu.vmatres.f32(i32, i32)
+declare <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32) #1
+
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare void @llvm.tpu.halt.trap(i1)
+
+define void @ind_var(<1024 x float> %gain, <1024 x float> %input, i32* noalias %sp) {
+entry:
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ br label %loop_simple
+
+loop_simple:
+ %p0 = phi i32 [ 0, %entry ], [ %o, %loop_simple ]
+ %l0 = phi i32 [ 3, %entry ], [ %l1, %loop_simple ]
+ %i1 = phi i32 [ 0, %entry ], [ %i, %loop_simple ]
+ %m = call i32 @llvm.tpu.vmatmul.f32(<1024 x float> %input, <1024 x i1> %mask, i32 0, i32 undef)
+ %m1 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 %m)
+ %i = add nuw nsw i32 %i1, 1
+ %o = or i32 %l0, 1
+ %l1 = load i32, i32* inttoptr (i32 161 to i32*)
+ store i32 %p0, i32* inttoptr (i32 161 to i32*)
+ %cnd = icmp eq i32 %i, 128
+ br i1 %cnd, label %loop.cleanup, label %loop_simple
+
+loop.cleanup:
+ ret void
+}
\ No newline at end of file
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/software_pipeliner_out3_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/software_pipeliner_out3_sc.ll
new file mode 100644
index 0000000..8354536
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/software_pipeliner_out3_sc.ll
@@ -0,0 +1,175 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp -tpu-pipeliner-annotate-for-testing \
+; RUN: -stop-after=tpu-pipeliner -instcombine-max-iterations=0 | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; This test is almost the same loop as software_pipeliner_sc.ll, but modified to use,
+; outside of the loop, a value that was defined by a phi inside the loop. The loop itself
+; uses the phi, and the phi's values are only defined by instructions pre-loop.
+
+; CHECK-LABEL: bb.2.for.cond.loopexit.i:
+; CHECK: scVST_IDX_MSK %[[o:[0-9]+]], %{{[0-9]+}}, %{{[0-9]+}}, %5, 0, $palways, 0
+; CHECK-LABEL: bb.10.for.body22.i:
+; CHECK: %{{[0-9]+}}:vpr = VFMULrr %{{[0-9]+}}, %[[m:[0-9]+]], $palways, 0
+; CHECK-LABEL: bb.11.for.body22.i:
+; CHECK: %{{[0-9]+}}:vpr = VFMULrr %[[p0:[0-9]+]], %[[m]], $palways, 0
+; CHECK-LABEL: bb.7.for.body22.i:
+; CHECK: %[[ph1:[0-9]+]]:vpr = PHI %[[p0]], %bb.11, %[[p0]], %bb.7
+; CHECK: %{{[0-9]+}}:vpr = VFMULrr %[[ph1]], %[[m]], $palways, 0
+; CHECK-LABEL: bb.14.for.body22.i:
+; CHECK-LABEL: bb.13.for.body22.i:
+; CHECK: %[[o:[0-9]+]]:vpr = COPY %[[p0]]
+
+@__sc_tile_execute_entry = dso_local alias i32, bitcast (void ()* @tile_execute to i32*)
+
+; Function Attrs: nounwind
+define dso_local void @tile_execute() #1 section ".text.tile_execute" {
+entry:
+ %0 = load i32, i32* inttoptr (i32 258 to i32*), align 4, !tbaa !4
+ %t = load i32, i32* inttoptr (i32 312 to i32*), align 4, !tbaa !4
+ %1 = load <8 x i32> addrspace(201)*, <8 x i32> addrspace(201)** inttoptr (i32 260 to <8 x i32> addrspace(201)**), align 4, !tbaa !4
+ %2 = load <8 x i32> addrspace(201)*, <8 x i32> addrspace(201)** inttoptr (i32 261 to <8 x i32> addrspace(201)**), align 4, !tbaa !4
+ %3 = load i32, i32* inttoptr (i32 262 to i32*), align 4, !tbaa !4
+ %4 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** inttoptr (i32 263 to <8 x float> addrspace(201)**), align 4, !tbaa !4
+ %5 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** inttoptr (i32 264 to <8 x float> addrspace(201)**), align 8, !tbaa !4
+ %6 = load <8 x i32>, <8 x i32> addrspace(201)* %1, align 32, !tbaa !8
+ %7 = load <8 x i32>, <8 x i32> addrspace(201)* %2, align 32, !tbaa !8
+ %8 = inttoptr i32 %3 to <8 x i32> addrspace(201)*
+ %9 = load <8 x i32>, <8 x i32> addrspace(201)* %8, align 32, !tbaa !8
+ %10 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %6, <8 x i32> %7) #6, !noalias !9
+ %11 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %6, <8 x i32> %9) #6, !noalias !12
+ %cmp106.i = icmp sgt i32 %0, 7
+ br i1 %cmp106.i, label %for.body.lr.ph.i, label %_ZN10embeddings15SegmentedReduce7ComputeEiNS_15TmemVectorArrayIiEES2_NS1_IfEES3_PS3_.exit
+
+for.body.lr.ph.i: ; preds = %entry
+ %div159 = lshr i32 %0, 3
+ %12 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %11, 1
+ %13 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %10, 1
+ %14 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %10, 0
+ %15 = inttoptr i32 %3 to <8 x float> addrspace(201)*
+ %sub.i = add nsw i32 %div159, -1
+ %16 = bitcast <8 x float> addrspace(201)* %5 to <8 x i32> addrspace(201)*
+ br label %for.body.i
+
+for.cond.loopexit.i: ; preds = %for.body22.i
+ %exitcond = icmp eq i32 %i.0107.i, %sub.i
+ ; post-loop use of loop's phi value.
+ tail call void @llvm.tpu.vst.msk.idx.p201v8f32.v8f32(<8 x i1> %29, <8 x float> addrspace(201)* %4, <8 x i32> %add29.i, <8 x float> %j.pre)
+ br i1 %exitcond, label %_ZN10embeddings15SegmentedReduce7ComputeEiNS_15TmemVectorArrayIiEES2_NS1_IfEES3_PS3_.exit, label %for.cond.loopexit.for.body_crit_edge.i
+
+for.cond.loopexit.for.body_crit_edge.i: ; preds = %for.cond.loopexit.i
+ %17 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %24, 0
+ %18 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %25, 1
+ %19 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %24, 1
+ br label %for.body.i
+
+for.body.i: ; preds = %for.cond.loopexit.for.body_crit_edge.i, %for.body.lr.ph.i
+ %sorted_gains.0110.in.i = phi <8 x i32> [ %12, %for.body.lr.ph.i ], [ %18, %for.cond.loopexit.for.body_crit_edge.i ]
+ %sorted_segments.0109.i = phi <8 x i32> [ %14, %for.body.lr.ph.i ], [ %17, %for.cond.loopexit.for.body_crit_edge.i ]
+ %sorted_indices.0108.i = phi <8 x i32> [ %13, %for.body.lr.ph.i ], [ %19, %for.cond.loopexit.for.body_crit_edge.i ]
+ %i.0107.i = phi i32 [ 0, %for.body.lr.ph.i ], [ %add.i, %for.cond.loopexit.for.body_crit_edge.i ]
+ %sorted_gains.0110.i = bitcast <8 x i32> %sorted_gains.0110.in.i to <8 x float>
+ %cmp7.i = icmp eq i32 %i.0107.i, %sub.i
+ %add.i = add nuw nsw i32 %i.0107.i, 1
+ %cond.i = select i1 %cmp7.i, i32 %i.0107.i, i32 %add.i
+ %add.ptr.i81.i = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %1, i32 %cond.i
+ %20 = load <8 x i32>, <8 x i32> addrspace(201)* %add.ptr.i81.i, align 32, !tbaa !8
+ %add.ptr.i79.i = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %2, i32 %cond.i
+ %21 = load <8 x i32>, <8 x i32> addrspace(201)* %add.ptr.i79.i, align 32, !tbaa !8
+ %add.ptr.i.i = getelementptr inbounds <8 x float>, <8 x float> addrspace(201)* %15, i32 %cond.i
+ %22 = bitcast <8 x float> addrspace(201)* %add.ptr.i.i to <8 x i32> addrspace(201)*
+ %23 = load <8 x i32>, <8 x i32> addrspace(201)* %22, align 32, !tbaa !8
+ %24 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %20, <8 x i32> %21) #6, !noalias !15
+ %25 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %20, <8 x i32> %23) #6, !noalias !18
+ %mul.i = shl <8 x i32> %sorted_indices.0108.i, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+ %mul26.i = shl <8 x i32> %sorted_segments.0109.i, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+ ; pre-loop def of loop's phi value.
+ %pre1 = tail call <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %4, <8 x i32> %sorted_segments.0109.i)
+ br i1 %cmp7.i, label %for.body22.i, label %for.body22.i.pre
+
+for.body22.i.pre:
+ %pre2 = tail call <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %4, <8 x i32> %sorted_indices.0108.i)
+ br label %for.body22.i
+
+for.body22.i: ; preds = %for.body22.i, %for.body.i
+ %j.0104.i = phi i32 [ 0, %for.body.i ], [ 0, %for.body22.i.pre ], [ %inc.i, %for.body22.i ]
+ ; loop's phi value.
+ %j.pre = phi <8 x float> [ %pre1, %for.body.i ], [ %pre2, %for.body22.i.pre ], [ %pre1, %for.body22.i ]
+ %splat.splatinsert.i = insertelement <8 x i32> undef, i32 %j.0104.i, i32 0
+ %splat.splat.i = shufflevector <8 x i32> %splat.splatinsert.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %add23.i = add <8 x i32> %splat.splat.i, %mul.i
+ %26 = tail call <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %4, <8 x i32> %add23.i), !llvm.access.group !21
+ ; in-loop use of loop's phi output.
+ %mul25.i = fmul <8 x float> %j.pre, %sorted_gains.0110.i
+ %27 = tail call { <8 x float>, <8 x i1> } @llvm.tpu.deprecated.segreduce.addf(<8 x i32> %sorted_segments.0109.i, <8 x float> %mul25.i)
+ %28 = extractvalue { <8 x float>, <8 x i1> } %27, 0
+ %29 = extractvalue { <8 x float>, <8 x i1> } %27, 1
+ %add29.i = add <8 x i32> %splat.splat.i, %mul26.i
+ %30 = tail call <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %5, <8 x i32> %add29.i), !llvm.access.group !21
+ %add31.i = fadd <8 x float> %28, %30
+ %31 = bitcast <8 x float> %add31.i to <8 x i32>
+ tail call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> %29, <8 x i32> addrspace(201)* %16, <8 x i32> %add29.i, <8 x i32> %31)
+ %inc.i = add nuw nsw i32 %j.0104.i, 1
+ %exitcond.i = icmp eq i32 %inc.i, %t
+ br i1 %exitcond.i, label %for.cond.loopexit.i, label %for.body22.i, !llvm.loop !22
+
+_ZN10embeddings15SegmentedReduce7ComputeEiNS_15TmemVectorArrayIiEES2_NS1_IfEES3_PS3_.exit: ; preds = %for.cond.loopexit.i, %entry
+ store i32 1, i32* inttoptr (i32 256 to i32*), align 256, !tbaa !4
+ ret void
+}
+
+; Function Attrs: argmemonly nounwind readonly
+declare <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>) #3
+; Function Attrs: inaccessiblememonly nounwind
+declare { <8 x float>, <8 x i1> } @llvm.tpu.deprecated.segreduce.addf(<8 x i32>, <8 x float>) #4
+; Function Attrs: argmemonly nounwind
+declare void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, <8 x i32>) #5
+declare void @llvm.tpu.vst.msk.idx.p201v8f32.v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>, <8 x float>) #5
+; Function Attrs: inaccessiblememonly nounwind
+declare { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1>, <8 x i32>, <8 x i32>) #4
+
+attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tac-vf" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="256" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-scs-vf" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { argmemonly nounwind readonly }
+attributes #4 = { inaccessiblememonly nounwind }
+attributes #5 = { argmemonly nounwind }
+attributes #6 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+!smem.spill.start = !{!2}
+!smem.spill.limit = !{!3}
+!vmem.spill.start = !{!2}
+!vmem.spill.limit = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version google3-trunk (trunk d35bcbbb5dab0e29b21a586505f5b274377cc41b)"}
+!2 = !{i32 0}
+!3 = !{i32 1024}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"int", !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C++ TBAA"}
+!8 = !{!6, !6, i64 0}
+!9 = !{!10}
+!10 = distinct !{!10, !11, !"_ZN10embeddings10VectorSortIDv8_iEENS_10SortResultIT_EES1_S1_S3_: %agg.result"}
+!11 = distinct !{!11, !"_ZN10embeddings10VectorSortIDv8_iEENS_10SortResultIT_EES1_S1_S3_"}
+!12 = !{!13}
+!13 = distinct !{!13, !14, !"_ZN10embeddings10VectorSortIDv8_fEENS_10SortResultIT_EEDv8_iS5_S3_: %agg.result"}
+!14 = distinct !{!14, !"_ZN10embeddings10VectorSortIDv8_fEENS_10SortResultIT_EEDv8_iS5_S3_"}
+!15 = !{!16}
+!16 = distinct !{!16, !17, !"_ZN10embeddings10VectorSortIDv8_iEENS_10SortResultIT_EES1_S1_S3_: %agg.result"}
+!17 = distinct !{!17, !"_ZN10embeddings10VectorSortIDv8_iEENS_10SortResultIT_EES1_S1_S3_"}
+!18 = !{!19}
+!19 = distinct !{!19, !20, !"_ZN10embeddings10VectorSortIDv8_fEENS_10SortResultIT_EEDv8_iS5_S3_: %agg.result"}
+!20 = distinct !{!20, !"_ZN10embeddings10VectorSortIDv8_fEENS_10SortResultIT_EEDv8_iS5_S3_"}
+!21 = distinct !{}
+!22 = distinct !{!22, !23, !24, !25, !26, !27}
+!23 = !{!"llvm.loop.parallel_accesses", !21}
+!24 = !{!"llvm.loop.unroll.disable"}
+!25 = !{!"llvm.loop.vectorize.width", i32 1}
+!26 = !{!"llvm.loop.interleave.count", i32 1}
+!27 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/software_pipeliner_out_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/software_pipeliner_out_sc.ll
new file mode 100644
index 0000000..970558f
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/software_pipeliner_out_sc.ll
@@ -0,0 +1,180 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -tpu-pipeliner-annotate-for-testing -tpu-enable-emulate-addressmode=false \
+; RUN: -stop-after=tpu-pipeliner -tpu-enable-vliw-prep-postiv=false \
+; RUN: -tpu-enable-vliw-prep-post-addrinc=false -instcombine-max-iterations=0 \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; This test is almost the same loop as software_pipeliner_sc.ll, but modified so that a value
+; defined by a phi inside the loop is also used outside of the loop. The loop itself uses the
+; phi as well (a C++ sketch of this pattern follows this file's diff).
+
+; The test was tailored assuming a vld latency of 4.
+
+; CHECK-LABEL: bb.2.for.cond.loopexit.i:
+; CHECK: scVST_IDX_MSK %[[e1:[0-9]+]], %{{[0-9]+}}, %{{[0-9]+}}, %5, 0, $palways, 0
+; CHECK-LABEL: bb.4.for.body.i:
+; CHECK: %[[p1:[0-9]+]]:vpr = scVLD_IDX
+; CHECK-LABEL: bb.8.for.body22.i:
+; CHECK: %[[p2:[0-9]+]]:vpr = scVLD_IDX
+; CHECK-LABEL: bb.9.for.body22.i:
+; CHECK: scVLD_IDX
+; CHECK: %[[p3:[0-9]+]]:vpr = scVLD_IDX
+; CHECK-LABEL: bb.5.for.body22.i:
+; CHECK: %[[ph1:[0-9]+]]:vpr = PHI %[[p3]], %bb.9, %[[l:[0-9]+]], %bb.5
+; CHECK: %[[ph2:[0-9]+]]:vpr = PHI %[[p3]], %bb.9, %[[l]], %bb.5
+; CHECK: %{{[0-9]+}}:vpr = VFMULrr %[[ph2]]
+; CHECK: %[[l]]:vpr = scVLD_IDX
+; CHECK-LABEL: bb.12.for.body22.i:
+; CHECK: %[[e1]]:vpr = PHI %[[ph1]], %bb.5, %[[p1]], %bb.8, %[[p2]], %bb.9
+
+@__sc_tile_execute_entry = dso_local alias i32, bitcast (void ()* @tile_execute to i32*)
+
+; Function Attrs: nounwind
+define dso_local void @tile_execute() #1 section ".text.tile_execute" {
+entry:
+ %0 = load i32, i32* inttoptr (i32 258 to i32*), align 4, !tbaa !4
+ %t = load i32, i32* inttoptr (i32 312 to i32*), align 4, !tbaa !4
+ %1 = load <8 x i32> addrspace(201)*, <8 x i32> addrspace(201)** inttoptr (i32 260 to <8 x i32> addrspace(201)**), align 4, !tbaa !4
+ %2 = load <8 x i32> addrspace(201)*, <8 x i32> addrspace(201)** inttoptr (i32 261 to <8 x i32> addrspace(201)**), align 4, !tbaa !4
+ %3 = load i32, i32* inttoptr (i32 262 to i32*), align 4, !tbaa !4
+ %4 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** inttoptr (i32 263 to <8 x float> addrspace(201)**), align 4, !tbaa !4
+ %5 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** inttoptr (i32 264 to <8 x float> addrspace(201)**), align 8, !tbaa !4
+ %6 = load <8 x i32>, <8 x i32> addrspace(201)* %1, align 32, !tbaa !8
+ %7 = load <8 x i32>, <8 x i32> addrspace(201)* %2, align 32, !tbaa !8
+ %8 = inttoptr i32 %3 to <8 x i32> addrspace(201)*
+ %9 = load <8 x i32>, <8 x i32> addrspace(201)* %8, align 32, !tbaa !8
+ %10 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %6, <8 x i32> %7) #6, !noalias !9
+ %11 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %6, <8 x i32> %9) #6, !noalias !12
+ %cmp106.i = icmp sgt i32 %0, 7
+ br i1 %cmp106.i, label %for.body.lr.ph.i, label %_ZN10embeddings15SegmentedReduce7ComputeEiNS_15TmemVectorArrayIiEES2_NS1_IfEES3_PS3_.exit
+
+for.body.lr.ph.i: ; preds = %entry
+ %div159 = lshr i32 %0, 3
+ %12 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %11, 1
+ %13 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %10, 1
+ %14 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %10, 0
+ %15 = inttoptr i32 %3 to <8 x float> addrspace(201)*
+ %sub.i = add nsw i32 %div159, -1
+ %16 = bitcast <8 x float> addrspace(201)* %5 to <8 x i32> addrspace(201)*
+ br label %for.body.i
+
+for.cond.loopexit.i: ; preds = %for.body22.i
+ %exitcond = icmp eq i32 %i.0107.i, %sub.i
+ ; post-loop use of loop's phi value.
+ tail call void @llvm.tpu.vst.msk.idx.p201v8f32.v8f32(<8 x i1> %29, <8 x float> addrspace(201)* %4, <8 x i32> %add29.i, <8 x float> %j.pre)
+ br i1 %exitcond, label %_ZN10embeddings15SegmentedReduce7ComputeEiNS_15TmemVectorArrayIiEES2_NS1_IfEES3_PS3_.exit, label %for.cond.loopexit.for.body_crit_edge.i
+
+for.cond.loopexit.for.body_crit_edge.i: ; preds = %for.cond.loopexit.i
+ %17 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %24, 0
+ %18 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %25, 1
+ %19 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %24, 1
+ br label %for.body.i
+
+for.body.i: ; preds = %for.cond.loopexit.for.body_crit_edge.i, %for.body.lr.ph.i
+ %sorted_gains.0110.in.i = phi <8 x i32> [ %12, %for.body.lr.ph.i ], [ %18, %for.cond.loopexit.for.body_crit_edge.i ]
+ %sorted_segments.0109.i = phi <8 x i32> [ %14, %for.body.lr.ph.i ], [ %17, %for.cond.loopexit.for.body_crit_edge.i ]
+ %sorted_indices.0108.i = phi <8 x i32> [ %13, %for.body.lr.ph.i ], [ %19, %for.cond.loopexit.for.body_crit_edge.i ]
+ %i.0107.i = phi i32 [ 0, %for.body.lr.ph.i ], [ %add.i, %for.cond.loopexit.for.body_crit_edge.i ]
+ %sorted_gains.0110.i = bitcast <8 x i32> %sorted_gains.0110.in.i to <8 x float>
+ %cmp7.i = icmp eq i32 %i.0107.i, %sub.i
+ %add.i = add nuw nsw i32 %i.0107.i, 1
+ %cond.i = select i1 %cmp7.i, i32 %i.0107.i, i32 %add.i
+ %add.ptr.i81.i = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %1, i32 %cond.i
+ %20 = load <8 x i32>, <8 x i32> addrspace(201)* %add.ptr.i81.i, align 32, !tbaa !8
+ %add.ptr.i79.i = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %2, i32 %cond.i
+ %21 = load <8 x i32>, <8 x i32> addrspace(201)* %add.ptr.i79.i, align 32, !tbaa !8
+ %add.ptr.i.i = getelementptr inbounds <8 x float>, <8 x float> addrspace(201)* %15, i32 %cond.i
+ %22 = bitcast <8 x float> addrspace(201)* %add.ptr.i.i to <8 x i32> addrspace(201)*
+ %23 = load <8 x i32>, <8 x i32> addrspace(201)* %22, align 32, !tbaa !8
+ %24 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %20, <8 x i32> %21) #6, !noalias !15
+ %25 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %20, <8 x i32> %23) #6, !noalias !18
+ %mul.i = shl <8 x i32> %sorted_indices.0108.i, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+ %mul26.i = shl <8 x i32> %sorted_segments.0109.i, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+ ; pre-loop def of loop's phi value.
+ %pre = tail call <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %4, <8 x i32> %sorted_segments.0109.i)
+ br label %for.body22.i
+
+for.body22.i: ; preds = %for.body22.i, %for.body.i
+ %j.0104.i = phi i32 [ 0, %for.body.i ], [ %inc.i, %for.body22.i ]
+ ; loop's phi value.
+ %j.pre = phi <8 x float> [ %pre, %for.body.i ], [ %26, %for.body22.i ]
+ %splat.splatinsert.i = insertelement <8 x i32> undef, i32 %j.0104.i, i32 0
+ %splat.splat.i = shufflevector <8 x i32> %splat.splatinsert.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %add23.i = add <8 x i32> %splat.splat.i, %mul.i
+ %26 = tail call <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %4, <8 x i32> %add23.i), !llvm.access.group !21
+ ; in-loop use of loop's phi output.
+ %mul25.i = fmul <8 x float> %j.pre, %sorted_gains.0110.i
+ %27 = tail call { <8 x float>, <8 x i1> } @llvm.tpu.deprecated.segreduce.addf(<8 x i32> %sorted_segments.0109.i, <8 x float> %mul25.i)
+ %28 = extractvalue { <8 x float>, <8 x i1> } %27, 0
+ %29 = extractvalue { <8 x float>, <8 x i1> } %27, 1
+ %add29.i = add <8 x i32> %splat.splat.i, %mul26.i
+ %30 = tail call <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %5, <8 x i32> %add29.i), !llvm.access.group !21
+ %add31.i = fadd <8 x float> %28, %30
+ %31 = bitcast <8 x float> %add31.i to <8 x i32>
+ tail call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> %29, <8 x i32> addrspace(201)* %16, <8 x i32> %add29.i, <8 x i32> %31)
+ %inc.i = add nuw nsw i32 %j.0104.i, 1
+ %exitcond.i = icmp eq i32 %inc.i, %t
+ br i1 %exitcond.i, label %for.cond.loopexit.i, label %for.body22.i, !llvm.loop !22
+
+_ZN10embeddings15SegmentedReduce7ComputeEiNS_15TmemVectorArrayIiEES2_NS1_IfEES3_PS3_.exit: ; preds = %for.cond.loopexit.i, %entry
+ store i32 1, i32* inttoptr (i32 256 to i32*), align 256, !tbaa !4
+ ret void
+}
+
+; Function Attrs: argmemonly nounwind readonly
+declare <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>) #3
+; Function Attrs: inaccessiblememonly nounwind
+declare { <8 x float>, <8 x i1> } @llvm.tpu.deprecated.segreduce.addf(<8 x i32>, <8 x float>) #4
+; Function Attrs: argmemonly nounwind
+declare void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, <8 x i32>) #5
+declare void @llvm.tpu.vst.msk.idx.p201v8f32.v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>, <8 x float>) #5
+; Function Attrs: inaccessiblememonly nounwind
+declare { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1>, <8 x i32>, <8 x i32>) #4
+
+attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tac-vf" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="256" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-tec-vf" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sparsecore-scs-vf" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { argmemonly nounwind readonly }
+attributes #4 = { inaccessiblememonly nounwind }
+attributes #5 = { argmemonly nounwind }
+attributes #6 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+!smem.spill.start = !{!2}
+!smem.spill.limit = !{!3}
+!vmem.spill.start = !{!2}
+!vmem.spill.limit = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version google3-trunk (trunk d35bcbbb5dab0e29b21a586505f5b274377cc41b)"}
+!2 = !{i32 0}
+!3 = !{i32 1024}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"int", !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C++ TBAA"}
+!8 = !{!6, !6, i64 0}
+!9 = !{!10}
+!10 = distinct !{!10, !11, !"_ZN10embeddings10VectorSortIDv8_iEENS_10SortResultIT_EES1_S1_S3_: %agg.result"}
+!11 = distinct !{!11, !"_ZN10embeddings10VectorSortIDv8_iEENS_10SortResultIT_EES1_S1_S3_"}
+!12 = !{!13}
+!13 = distinct !{!13, !14, !"_ZN10embeddings10VectorSortIDv8_fEENS_10SortResultIT_EEDv8_iS5_S3_: %agg.result"}
+!14 = distinct !{!14, !"_ZN10embeddings10VectorSortIDv8_fEENS_10SortResultIT_EEDv8_iS5_S3_"}
+!15 = !{!16}
+!16 = distinct !{!16, !17, !"_ZN10embeddings10VectorSortIDv8_iEENS_10SortResultIT_EES1_S1_S3_: %agg.result"}
+!17 = distinct !{!17, !"_ZN10embeddings10VectorSortIDv8_iEENS_10SortResultIT_EES1_S1_S3_"}
+!18 = !{!19}
+!19 = distinct !{!19, !20, !"_ZN10embeddings10VectorSortIDv8_fEENS_10SortResultIT_EEDv8_iS5_S3_: %agg.result"}
+!20 = distinct !{!20, !"_ZN10embeddings10VectorSortIDv8_fEENS_10SortResultIT_EEDv8_iS5_S3_"}
+!21 = distinct !{}
+!22 = distinct !{!22, !23, !24, !25, !26, !27}
+!23 = !{!"llvm.loop.parallel_accesses", !21}
+!24 = !{!"llvm.loop.unroll.disable"}
+!25 = !{!"llvm.loop.vectorize.width", i32 1}
+!26 = !{!"llvm.loop.interleave.count", i32 1}
+!27 = !{!"llvm.loop.vectorize.enable", i1 true}
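Before the next test file, here is a minimal C++ sketch of the loop shape that software_pipeliner_out_sc.ll above exercises: a loop-carried phi value with a pre-loop definition, an in-loop use, and a post-loop use. The names and scalar types are hypothetical; the real kernel operates on <8 x float> vectors through TPU load/store/segreduce intrinsics.

// Minimal sketch (not from the CL) of the pattern the pipeliner must handle:
// the value carried by the inner loop's phi ("carried") is defined before the
// loop, used inside the loop, and used again after the loop exits.
#include <cstddef>

// "gains" must hold at least t + 1 elements so the last feedback load is valid.
float pipelined_loop(const float* gains, const float* acc, float* out,
                     std::size_t t) {
  float carried = gains[0];        // pre-loop def of the loop's phi value
  for (std::size_t j = 0; j < t; ++j) {
    out[j] = carried * acc[j];     // in-loop use of the loop's phi output
    carried = gains[j + 1];        // next iteration's phi input
  }
  return carried;                  // post-loop use of the loop's phi value
}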
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sparsecore-no-vpu.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sparsecore-no-vpu.ll
new file mode 100644
index 0000000..0a7306c
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sparsecore-no-vpu.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mcpu=sparsecore-scs-vf -asm-verbose=false -disable-cgp 2>&1 | FileCheck %s
+; REQUIRES: tpu
+; XFAIL: *
+; Negative test to make sure we don't generate vector code for sparsecore scs,
+; which doesn't have vector instructions.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; CHECK-LABEL: vaddi:
+; CHECK: v{{[0-9]+}} = vadd.s32 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i32> @vaddi(<8 x i32> %x, <8 x i32> %y) {
+ %a = add <8 x i32> %x, %y
+ ret <8 x i32> %a
+}
\ No newline at end of file
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/spill_debug_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/spill_debug_sc.ll
new file mode 100644
index 0000000..0afb4f9
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/spill_debug_sc.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false \
+; RUN: -tpu-enable-spill-debug | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare void @llvm.tpu.spill.debug.i32(i32)
+declare void @llvm.tpu.spill.debug.v8i32(<8 x i32>)
+
+; Tests that the spill debug pass emits the expected scalar and vector debug spill stores (a C++ sketch of the checked addresses follows this file's diff).
+
+; CHECK-LABEL: spill_gpr_debug
+; CHECK-DAG: s[[s0:[0-9]+]] = simm.s32 $0x83
+; CHECK-DAG: s[[s1:[0-9]+]] = simm.s32 $0x82
+; CHECK-DAG: s[[s2:[0-9]+]] = simm.s32 $0x81
+; CHECK-DAG: s[[s3:[0-9]+]] = simm.s32 $0x80
+; CHECK-DAG: [smem:$0x7ff] = sst s[[s0]]
+; CHECK-DAG: [smem:$0x7fe] = sst s[[s1]]
+; CHECK-DAG: [smem:$0x7fd] = sst s[[s2]]
+; CHECK-DAG: [smem:$0x7fc] = sst s[[s3]]
+define void @spill_gpr_debug() {
+entry:
+ tail call void @llvm.tpu.spill.debug.i32(i32 128)
+ tail call void @llvm.tpu.spill.debug.i32(i32 129)
+ tail call void @llvm.tpu.spill.debug.i32(i32 130)
+ tail call void @llvm.tpu.spill.debug.i32(i32 131)
+ ret void
+}
+
+; CHECK-LABEL: spill_vpr_debug
+; CHECK-DAG: [tilespmem:$0x7f0] = vst v0
+; CHECK-DAG: [tilespmem:$0x7e8] = vst v1
+; CHECK-DAG: [tilespmem:$0x7e0] = vst v2
+; CHECK-DAG: v[[v3:[0-9]+]] = vlaneseq.u32
+; CHECK-DAG: [tilespmem:$0x7f8] = vst v[[v3]]
+define void @spill_vpr_debug(<8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2) {
+entry:
+ tail call void @llvm.tpu.spill.debug.v8i32(<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>)
+ tail call void @llvm.tpu.spill.debug.v8i32(<8 x i32> %v0)
+ tail call void @llvm.tpu.spill.debug.v8i32(<8 x i32> %v1)
+ tail call void @llvm.tpu.spill.debug.v8i32(<8 x i32> %v2)
+ ret void
+}
+
+!smem.funcs.spill = !{!0, !1}
+!smem.ranges.spill.start = !{!100, !100}
+!smem.ranges.spill.limit = !{!101, !101}
+!tilespmem.funcs.spill = !{!0, !1}
+!tilespmem.ranges.spill.start = !{!100, !100}
+!tilespmem.ranges.spill.limit = !{!101, !101}
+
+!0 = !{void ()* @spill_gpr_debug}
+!1 = !{void (<8 x i32>, <8 x i32>, <8 x i32>)* @spill_vpr_debug}
+
+!100 = !{i32 0}
+!101 = !{i32 2048}
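The CHECK lines in spill_debug_sc.ll above are consistent with debug spill slots being carved from the top of the configured ranges (limit 2048 = 0x800): scalar spills take one word each counting down from 0x7ff, and <8 x i32> spills take eight words each counting down from 0x7f8. The sketch below just reproduces that arithmetic; it is an inference from the CHECK lines, not the pass's actual slot-assignment code.

// Reproduces the spill-slot addresses checked above, assuming slots are
// allocated downward from the metadata limit (!101 = !{i32 2048}).
#include <cstdio>

int main() {
  const int limit = 2048;                          // 0x800
  for (int i = 0; i < 4; ++i)                      // four i32 debug spills
    std::printf("smem gpr slot %d: 0x%x\n", i, limit - 1 - i);            // 0x7ff .. 0x7fc
  for (int i = 0; i < 4; ++i)                      // four <8 x i32> debug spills
    std::printf("tilespmem vpr slot %d: 0x%x\n", i, limit - 8 * (i + 1)); // 0x7f8, 0x7f0, 0x7e8, 0x7e0
  return 0;
}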
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/spill_limits_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/spill_limits_sc.ll
new file mode 100644
index 0000000..cf49e43
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/spill_limits_sc.ll
@@ -0,0 +1,77 @@
+; RUN: opt -S -O2 -mcpu=sparsecore-scs-vf -tpu-fatal-mem-alloc-error=false < %s 2>&1 \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 @llvm.tpu.ptrtoint.pi32(i32*)
+declare i32 @llvm.tpu.ptrtoint.p201i32(i32 addrspace(201)*)
+declare i32* @llvm.tpu.allocate.smem(i32, i32)
+declare i32 addrspace(201)* @llvm.tpu.allocate.tilespmem(i32, i32)
+
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-vf" }
+
+; Test that we sanity-check explicit allocations against the configured spill limits (a C++ sketch of the check follows this file's diff).
+
+; CHECK: Sanity check failed:
+; CHECK: %0 = call i32* @llvm.tpu.allocate.smem(i32 1, i32 10)
+; CHECK: Allocation within spill limits
+define i32 @smem_lower_bound() #0 {
+entry:
+ %0 = call i32* @llvm.tpu.allocate.smem(i32 1, i32 10)
+ %r = call i32 @llvm.tpu.ptrtoint.pi32(i32* %0)
+ ret i32 %r
+}
+
+; CHECK: Sanity check failed:
+; CHECK: %0 = call i32* @llvm.tpu.allocate.smem(i32 1, i32 19)
+; CHECK: Allocation within spill limits
+define i32 @smem_upper_bound() #0 {
+entry:
+ %0 = call i32* @llvm.tpu.allocate.smem(i32 1, i32 19)
+ %r = call i32 @llvm.tpu.ptrtoint.pi32(i32* %0)
+ ret i32 %r
+}
+
+; CHECK: Sanity check failed:
+; CHECK: %0 = call i32 addrspace(201)* @llvm.tpu.allocate.tilespmem(i32 1, i32 10)
+; CHECK: Allocation within spill limits
+define i32 @tilespmem_lower_bound() #0 {
+entry:
+ %0 = call i32 addrspace(201)* @llvm.tpu.allocate.tilespmem(i32 1, i32 10)
+ %r = call i32 @llvm.tpu.ptrtoint.p201i32(i32 addrspace(201)* %0)
+ ret i32 %r
+}
+
+; CHECK: Sanity check failed:
+; CHECK: %0 = call i32 addrspace(201)* @llvm.tpu.allocate.tilespmem(i32 1, i32 19)
+; CHECK: Allocation within spill limits
+define i32 @tilespmem_upper_bound() #0 {
+entry:
+ %0 = call i32 addrspace(201)* @llvm.tpu.allocate.tilespmem(i32 1, i32 19)
+ %r = call i32 @llvm.tpu.ptrtoint.p201i32(i32 addrspace(201)* %0)
+ ret i32 %r
+}
+
+!smem.funcs.spill = !{!0, !1, !2, !3}
+!smem.ranges.spill.start = !{!80, !80, !120, !120}
+!smem.ranges.spill.limit = !{!100, !100, !140, !140}
+!tilespmem.funcs.spill = !{!0, !1, !2, !3}
+!tilespmem.ranges.spill.start = !{!120, !120, !160, !160}
+!tilespmem.ranges.spill.limit = !{!140, !140, !180, !180}
+
+!0 = !{i32 ()* @smem_lower_bound}
+!1 = !{i32 ()* @smem_upper_bound}
+!2 = !{i32 ()* @tilespmem_lower_bound}
+!3 = !{i32 ()* @tilespmem_upper_bound}
+
+!80 = !{i32 10}
+!100 = !{i32 20}
+!120 = !{i32 0}
+!140 = !{i32 0}
+!160 = !{i32 10}
+!180 = !{i32 20}
+
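spill_limits_sc.ll above expects the sanity check to fire whenever an explicit allocation overlaps the spill range configured in metadata, which for these functions is the half-open range [10, 20). The sketch below shows that interval check. Reading the intrinsic's second operand as the start offset and the first as the word count is an assumption taken from the lower/upper-bound cases (offsets 10 and 19), not a documented contract of llvm.tpu.allocate.smem or llvm.tpu.allocate.tilespmem.

// Minimal sketch of the sanity check this test expects to trip. The operand
// interpretation (size, offset) is an assumption, as noted above.
#include <cassert>

struct Range { int start, limit; };   // half-open [start, limit)

// True when an allocation [offset, offset + size) overlaps the reserved spill
// range, i.e. when the "Allocation within spill limits" check should trigger.
bool overlaps_spill_range(int offset, int size, Range spill) {
  return offset < spill.limit && offset + size > spill.start;
}

int main() {
  Range smem_spill{10, 20};                          // !80 = 10, !100 = 20
  assert(overlaps_spill_range(10, 1, smem_spill));   // smem_lower_bound case
  assert(overlaps_spill_range(19, 1, smem_spill));   // smem_upper_bound case
  assert(!overlaps_spill_range(20, 1, smem_spill));  // just past the range: fine
  return 0;
}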
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/spill_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/spill_sc.ll
new file mode 100644
index 0000000..e6810b5
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/spill_sc.ll
@@ -0,0 +1,132 @@
+; RUN: llc -O0 < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+!smem.funcs.spill = !{!0}
+!smem.ranges.spill.start = !{!1}
+!smem.ranges.spill.limit = !{!2}
+
+!0 = !{void (i32)* @smem_spill}
+!1 = !{i32 100}
+!2 = !{i32 200}
+
+declare i32* @llvm.tpu.inttoptr.p0i32(i32)
+
+; CHECK-LABEL: smem_spill:
+; CHECK-DAG: [smem:$0xc7] = sst s{{[0-9]+}}
+; CHECK-DAG: [smem:$0xc6] = sst s{{[0-9]+}}
+; CHECK-DAG: s{{[0-9]+}} = sld [smem:$0xc6]
+; CHECK-DAG: s{{[0-9]+}} = sld [smem:$0xc7]
+; CHECK-DAG: [smem:$0xc5] = sst s{{[0-9]+}}
+; CHECK-DAG: s{{[0-9]+}} = sld [smem:$0xc5]
+; CHECK: shalt
+
+define void @smem_spill(i32 %arg) {
+entry:
+ br label %region-0
+
+region-0:
+ %x = phi i32 [ %arg, %entry ], [ %79, %region-2 ]
+
+ %0 = add i32 %x, 0
+ %1 = add i32 %x, 1
+ %2 = add i32 %x, 2
+ %3 = add i32 %x, 3
+ %4 = add i32 %x, 4
+ %5 = add i32 %x, 5
+ %6 = add i32 %x, 6
+ %7 = add i32 %x, 7
+ %8 = add i32 %x, 8
+ %9 = add i32 %x, 9
+
+ %10 = add i32 %x, 10
+ %11 = add i32 %x, 11
+ %12 = add i32 %x, 12
+ %13 = add i32 %x, 13
+ %14 = add i32 %x, 14
+ %15 = add i32 %x, 15
+ %16 = add i32 %x, 16
+ %17 = add i32 %x, 17
+ %18 = add i32 %x, 18
+ %19 = add i32 %x, 19
+
+ %20 = add i32 %x, 20
+ %21 = add i32 %x, 21
+ %22 = add i32 %x, 22
+ %23 = add i32 %x, 23
+ %24 = add i32 %x, 24
+ %25 = add i32 %x, 25
+ %26 = add i32 %x, 26
+ %27 = add i32 %x, 27
+ %28 = add i32 %x, 28
+ %29 = add i32 %x, 29
+
+ %30 = add i32 %x, 30
+ %31 = add i32 %x, 31
+ %32 = add i32 %x, 32
+ %33 = add i32 %x, 33
+ %34 = add i32 %x, 34
+ %35 = add i32 %x, 35
+ %36 = add i32 %x, 36
+ %37 = add i32 %x, 37
+ %38 = add i32 %x, 38
+ %39 = add i32 %x, 39
+
+ br label %region-2
+
+region-2:
+ %40 = add i32 %arg, %0
+ %41 = add i32 %40, %1
+ %42 = add i32 %41, %2
+ %43 = add i32 %42, %3
+ %44 = add i32 %43, %4
+ %45 = add i32 %44, %5
+ %46 = add i32 %45, %6
+ %47 = add i32 %46, %7
+ %48 = add i32 %47, %8
+ %49 = add i32 %48, %9
+
+ %50 = add i32 %49, %10
+ %51 = add i32 %50, %11
+ %52 = add i32 %51, %12
+ %53 = add i32 %52, %13
+ %54 = add i32 %53, %14
+ %55 = add i32 %54, %15
+ %56 = add i32 %55, %16
+ %57 = add i32 %56, %17
+ %58 = add i32 %57, %18
+ %59 = add i32 %58, %19
+
+ %60 = add i32 %59, %20
+ %61 = add i32 %60, %21
+ %62 = add i32 %61, %22
+ %63 = add i32 %62, %23
+ %64 = add i32 %63, %24
+ %65 = add i32 %64, %25
+ %66 = add i32 %65, %26
+ %67 = add i32 %66, %27
+ %68 = add i32 %67, %28
+ %69 = add i32 %68, %29
+
+ %70 = add i32 %69, %30
+ %71 = add i32 %70, %31
+ %72 = add i32 %71, %32
+ %73 = add i32 %72, %33
+ %74 = add i32 %73, %34
+ %75 = add i32 %74, %35
+ %76 = add i32 %75, %36
+ %77 = add i32 %76, %37
+ %78 = add i32 %77, %38
+ %79 = add i32 %78, %39
+
+ %80 = icmp sge i32 %79, 1000
+ br i1 %80, label %region-0, label %region-3
+
+region-3:
+ %mem = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 0)
+ %arrayidx = getelementptr inbounds i32, i32* %mem, i32 0
+ store i32 %79, i32* %arrayidx, align 4
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/spill_scavenge_tc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/spill_scavenge_tc.ll
new file mode 100644
index 0000000..7dc0da4
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/spill_scavenge_tc.ll
@@ -0,0 +1,2652 @@
+; RUN: llc < %s -mcpu=tensorcore-jf -disable-cgp -tpu-enable-overlayer-passes -dag-maps-huge-region=10000 -memdep-block-scan-limit=1000 | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; This test creates enough GPR pressure that the register scavenger itself has to spill a GPR register.
+; The test's purpose is to make sure that case works correctly.
+
+; CHECK-LABEL: fusion.17
+
+define void @fusion.17(i8192 addrspace(203)* nocapture readnone %0, i8192 addrspace(203)* %1, i8192 addrspace(203)* %2, i8192 addrspace(203)* %3) local_unnamed_addr #0 {
+llo-region-10:
+ %4 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 0)
+ %5 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 42)
+ %6 = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 15)
+ tail call void @llvm.tpu.dma.hbm.to.smem.p203i8192(i32 addrspace(204)* %6, i8192 addrspace(203)* %1, i32* %5, i32 1)
+ %7 = getelementptr i8192, i8192 addrspace(203)* %2, i32 1
+ %8 = bitcast <1024 x i32> addrspace(205)* %4 to i4096 addrspace(205)*
+ %9 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %4, i32 1
+ %10 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %4, i32 2
+ %11 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %4, i32 3
+ %12 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %4, i32 4
+ %13 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %4, i32 5
+ %14 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %4, i32 6
+ %15 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %4, i32 7
+ %16 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %4, i32 8
+ %17 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %4, i32 9
+ %18 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %4, i32 10
+ %19 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %4, i32 11
+ %20 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %4, i32 12
+ %21 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %4, i32 13
+ %22 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %4, i32 14
+ %23 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %4, i32 15
+ %24 = getelementptr i8192, i8192 addrspace(203)* %3, i32 4
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %6, i32 1)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %6, i32 -1)
+ fence seq_cst
+ %25 = tail call i32 @llvm.tpu.ptrtoint.p0i32(i32* %5)
+ %26 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 128)
+ %27 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 136)
+ %28 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 144)
+ %29 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 152)
+ %30 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 160)
+ %31 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 168)
+ %32 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 176)
+ %33 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 184)
+ %34 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 192)
+ %35 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 200)
+ %36 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 208)
+ %37 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 216)
+ %38 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 224)
+ %39 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 232)
+ %40 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 240)
+ %41 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 248)
+ %42 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 256)
+ %43 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 264)
+ %44 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 272)
+ %45 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 280)
+ %46 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 288)
+ %47 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 296)
+ %48 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 304)
+ %49 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 312)
+ %50 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 320)
+ %51 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 328)
+ %52 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 336)
+ %53 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 344)
+ %54 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 352)
+ %55 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 360)
+ %56 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 368)
+ %57 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 376)
+ %58 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 384)
+ %59 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 392)
+ %60 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 400)
+ %61 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 408)
+ %62 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 416)
+ %63 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 424)
+ %64 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 432)
+ %65 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 440)
+ %66 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 448)
+ %67 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 456)
+ %68 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 464)
+ %69 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 472)
+ %70 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 480)
+ %71 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 488)
+ %72 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 496)
+ %73 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 504)
+ %74 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 512)
+ %75 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 520)
+ %76 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 528)
+ %77 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 536)
+ %78 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 544)
+ %79 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 552)
+ %80 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 560)
+ %81 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 568)
+ %82 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 576)
+ %83 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 584)
+ %84 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 592)
+ %85 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 600)
+ %86 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 608)
+ %87 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 616)
+ %88 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 624)
+ %89 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 632)
+ %90 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 640)
+ %91 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 648)
+ %92 = tail call i32* @llvm.tpu.inttoptr.p0i32(i32 %25)
+ %93 = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 16)
+ %94 = tail call i32 @llvm.tpu.ptrtoint.p203i8192(i8192 addrspace(203)* %7)
+ %95 = tail call i32 @llvm.tpu.ptrtoint.p203i8192(i8192 addrspace(203)* %2)
+ %96 = bitcast <1024 x i32> addrspace(205)* %90 to i4096 addrspace(205)*
+ %97 = bitcast <1024 x i32> addrspace(205)* %26 to i4096 addrspace(205)*
+ br label %llo-region-14
+
+llo-region-14: ; preds = %llo-region-14.join590, %llo-region-10
+ %98 = phi i32 [ 0, %llo-region-10 ], [ %1956, %llo-region-14.join590 ]
+ %99 = shl i32 %98, 6
+ %100 = or i32 %99, 2
+ %101 = getelementptr i32, i32* %92, i32 %100
+ %102 = load i32, i32* %101, align 4
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 0)
+ %103 = lshr i32 %102, 2
+ %104 = and i32 %103, 2
+ %105 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %104
+ tail call void @llvm.tpu.dma.hbm.to.vmem.p203i8192(i32 addrspace(204)* %93, i8192 addrspace(203)* %105, <1024 x i32> addrspace(205)* %27, i32 2)
+ %106 = or i32 %99, 6
+ %107 = getelementptr i32, i32* %92, i32 %106
+ %108 = load i32, i32* %107, align 4
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 0)
+ %109 = lshr i32 %108, 2
+ %110 = and i32 %109, 2
+ %111 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %110
+ tail call void @llvm.tpu.dma.hbm.to.vmem.p203i8192(i32 addrspace(204)* %93, i8192 addrspace(203)* %111, <1024 x i32> addrspace(205)* %31, i32 2)
+ %112 = or i32 %99, 10
+ %113 = getelementptr i32, i32* %92, i32 %112
+ %114 = load i32, i32* %113, align 4
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 0)
+ %115 = lshr i32 %114, 2
+ %116 = and i32 %115, 2
+ %117 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %116
+ tail call void @llvm.tpu.dma.hbm.to.vmem.p203i8192(i32 addrspace(204)* %93, i8192 addrspace(203)* %117, <1024 x i32> addrspace(205)* %35, i32 2)
+ %118 = or i32 %99, 14
+ %119 = getelementptr i32, i32* %92, i32 %118
+ %120 = load i32, i32* %119, align 4
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 0)
+ %121 = lshr i32 %120, 2
+ %122 = and i32 %121, 2
+ %123 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %122
+ tail call void @llvm.tpu.dma.hbm.to.vmem.p203i8192(i32 addrspace(204)* %93, i8192 addrspace(203)* %123, <1024 x i32> addrspace(205)* %39, i32 2)
+ %124 = or i32 %99, 18
+ %125 = getelementptr i32, i32* %92, i32 %124
+ %126 = load i32, i32* %125, align 4
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 0)
+ %127 = lshr i32 %126, 2
+ %128 = and i32 %127, 2
+ %129 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %128
+ tail call void @llvm.tpu.dma.hbm.to.vmem.p203i8192(i32 addrspace(204)* %93, i8192 addrspace(203)* %129, <1024 x i32> addrspace(205)* %43, i32 2)
+ %130 = or i32 %99, 22
+ %131 = getelementptr i32, i32* %92, i32 %130
+ %132 = load i32, i32* %131, align 4
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 0)
+ %133 = lshr i32 %132, 2
+ %134 = and i32 %133, 2
+ %135 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %134
+ tail call void @llvm.tpu.dma.hbm.to.vmem.p203i8192(i32 addrspace(204)* %93, i8192 addrspace(203)* %135, <1024 x i32> addrspace(205)* %47, i32 2)
+ %136 = or i32 %99, 26
+ %137 = getelementptr i32, i32* %92, i32 %136
+ %138 = load i32, i32* %137, align 4
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 0)
+ %139 = lshr i32 %138, 2
+ %140 = and i32 %139, 2
+ %141 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %140
+ tail call void @llvm.tpu.dma.hbm.to.vmem.p203i8192(i32 addrspace(204)* %93, i8192 addrspace(203)* %141, <1024 x i32> addrspace(205)* %51, i32 2)
+ %142 = or i32 %99, 30
+ %143 = getelementptr i32, i32* %92, i32 %142
+ %144 = load i32, i32* %143, align 4
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 0)
+ %145 = lshr i32 %144, 2
+ %146 = and i32 %145, 2
+ %147 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %146
+ tail call void @llvm.tpu.dma.hbm.to.vmem.p203i8192(i32 addrspace(204)* %93, i8192 addrspace(203)* %147, <1024 x i32> addrspace(205)* %55, i32 2)
+ %148 = or i32 %99, 34
+ %149 = getelementptr i32, i32* %92, i32 %148
+ %150 = load i32, i32* %149, align 4
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 0)
+ %151 = lshr i32 %150, 2
+ %152 = and i32 %151, 2
+ %153 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %152
+ tail call void @llvm.tpu.dma.hbm.to.vmem.p203i8192(i32 addrspace(204)* %93, i8192 addrspace(203)* %153, <1024 x i32> addrspace(205)* %59, i32 2)
+ %154 = or i32 %99, 38
+ %155 = getelementptr i32, i32* %92, i32 %154
+ %156 = load i32, i32* %155, align 4
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 0)
+ %157 = lshr i32 %156, 2
+ %158 = and i32 %157, 2
+ %159 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %158
+ tail call void @llvm.tpu.dma.hbm.to.vmem.p203i8192(i32 addrspace(204)* %93, i8192 addrspace(203)* %159, <1024 x i32> addrspace(205)* %63, i32 2)
+ %160 = or i32 %99, 42
+ %161 = getelementptr i32, i32* %92, i32 %160
+ %162 = load i32, i32* %161, align 4
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 0)
+ %163 = lshr i32 %162, 2
+ %164 = and i32 %163, 2
+ %165 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %164
+ tail call void @llvm.tpu.dma.hbm.to.vmem.p203i8192(i32 addrspace(204)* %93, i8192 addrspace(203)* %165, <1024 x i32> addrspace(205)* %67, i32 2)
+ %166 = or i32 %99, 46
+ %167 = getelementptr i32, i32* %92, i32 %166
+ %168 = load i32, i32* %167, align 4
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 0)
+ %169 = lshr i32 %168, 2
+ %170 = and i32 %169, 2
+ %171 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %170
+ tail call void @llvm.tpu.dma.hbm.to.vmem.p203i8192(i32 addrspace(204)* %93, i8192 addrspace(203)* %171, <1024 x i32> addrspace(205)* %71, i32 2)
+ %172 = or i32 %99, 50
+ %173 = getelementptr i32, i32* %92, i32 %172
+ %174 = load i32, i32* %173, align 4
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 0)
+ %175 = lshr i32 %174, 2
+ %176 = and i32 %175, 2
+ %177 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %176
+ tail call void @llvm.tpu.dma.hbm.to.vmem.p203i8192(i32 addrspace(204)* %93, i8192 addrspace(203)* %177, <1024 x i32> addrspace(205)* %75, i32 2)
+ %178 = or i32 %99, 54
+ %179 = getelementptr i32, i32* %92, i32 %178
+ %180 = load i32, i32* %179, align 4
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 0)
+ %181 = lshr i32 %180, 2
+ %182 = and i32 %181, 2
+ %183 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %182
+ tail call void @llvm.tpu.dma.hbm.to.vmem.p203i8192(i32 addrspace(204)* %93, i8192 addrspace(203)* %183, <1024 x i32> addrspace(205)* %79, i32 2)
+ %184 = or i32 %99, 58
+ %185 = getelementptr i32, i32* %92, i32 %184
+ %186 = load i32, i32* %185, align 4
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 0)
+ %187 = lshr i32 %186, 2
+ %188 = and i32 %187, 2
+ %189 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %188
+ tail call void @llvm.tpu.dma.hbm.to.vmem.p203i8192(i32 addrspace(204)* %93, i8192 addrspace(203)* %189, <1024 x i32> addrspace(205)* %83, i32 2)
+ %190 = or i32 %99, 62
+ %191 = getelementptr i32, i32* %92, i32 %190
+ %192 = load i32, i32* %191, align 4
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 0)
+ %193 = lshr i32 %192, 2
+ %194 = and i32 %193, 2
+ %195 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %194
+ tail call void @llvm.tpu.dma.hbm.to.vmem.p203i8192(i32 addrspace(204)* %93, i8192 addrspace(203)* %195, <1024 x i32> addrspace(205)* %87, i32 2)
+ %196 = lshr i32 %98, 6
+ %197 = and i32 %196, 4194303
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 0)
+ %198 = getelementptr i8192, i8192 addrspace(203)* %2, i32 %197
+ %199 = tail call i32 @llvm.tpu.ptrtoint.p203i8192(i8192 addrspace(203)* %198)
+ %200 = getelementptr i8192, i8192 addrspace(203)* %198, i32 1
+ %201 = tail call i32 @llvm.tpu.ptrtoint.p203i8192(i8192 addrspace(203)* %200)
+ %202 = icmp slt i32 %199, %95
+ %203 = icmp slt i32 %94, %201
+ %204 = or i1 %203, %202
+ tail call void @llvm.tpu.halt.trap(i1 %204)
+ tail call void @llvm.tpu.dma.hbm.to.vmem.p203i8192(i32 addrspace(204)* %93, i8192 addrspace(203)* %198, <1024 x i32> addrspace(205)* %91, i32 1)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %93, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %93, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %93, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %93, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %93, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %93, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %93, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %93, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %93, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %93, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %93, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %93, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %93, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %93, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %93, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %93, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %93, i32 1)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %93, i32 -1)
+ %205 = tail call <1024 x i32> @llvm.tpu.vld.replicate.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %27, i32 255, i32 1, i32 undef), !alias.scope !4, !noalias !7
+ %206 = tail call <1024 x i32> @llvm.tpu.vlaneseq.v1024i32()
+ %207 = lshr <1024 x i32> %206, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, 
i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 
3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %208 = and <1024 x i32> %207, splat (i32 16)
+ %209 = xor <1024 x i32> %208, splat (i32 16)
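; %208/%209 derive a per-element shift amount from %207: bit 4 is isolated
; (and with 16) and then inverted (xor with 16), so lanes with bit 4 clear
; get a shift of 16 and lanes with bit 4 set get a shift of 0.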
+ %210 = shl <1024 x i32> %205, %209
+ %211 = and <1024 x i32> %210, splat (i32 -65536)
+ store <1024 x i32> %211, <1024 x i32> addrspace(205)* %26, align 4096, !alias.scope !75, !noalias !76
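; Each group below follows the same pattern: load a vector with
; @llvm.tpu.vld.replicate.evenodd.sublanes, shift it left by the per-element
; amount in %209, mask it to the upper half-word (and with 0xFFFF0000,
; i.e. -65536), and store the packed result back to vmem (addrspace(205)).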
+ %212 = tail call <1024 x i32> @llvm.tpu.vld.replicate.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %31, i32 255, i32 1, i32 undef), !alias.scope !77, !noalias !78
+ %213 = shl <1024 x i32> %212, %209
+ %214 = and <1024 x i32> %213, splat (i32 -65536)
+ store <1024 x i32> %214, <1024 x i32> addrspace(205)* %30, align 4096, !alias.scope !79, !noalias !80
+ %215 = tail call <1024 x i32> @llvm.tpu.vld.replicate.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %35, i32 255, i32 1, i32 undef), !alias.scope !81, !noalias !82
+ %216 = shl <1024 x i32> %215, %209
+ %217 = and <1024 x i32> %216, splat (i32 -65536)
+ store <1024 x i32> %217, <1024 x i32> addrspace(205)* %34, align 4096, !alias.scope !83, !noalias !84
+ %218 = tail call <1024 x i32> @llvm.tpu.vld.replicate.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %39, i32 255, i32 1, i32 undef), !alias.scope !85, !noalias !86
+ %219 = shl <1024 x i32> %218, %209
+ %220 = and <1024 x i32> %219, splat (i32 -65536)
+ store <1024 x i32> %220, <1024 x i32> addrspace(205)* %38, align 4096, !alias.scope !87, !noalias !88
+ %221 = tail call <1024 x i32> @llvm.tpu.vld.replicate.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %43, i32 255, i32 1, i32 undef), !alias.scope !89, !noalias !90
+ %222 = shl <1024 x i32> %221, %209
+ %223 = and <1024 x i32> %222, <i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536>
+ store <1024 x i32> %223, <1024 x i32> addrspace(205)* %42, align 4096, !alias.scope !91, !noalias !92
+ %224 = tail call <1024 x i32> @llvm.tpu.vld.replicate.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %47, i32 255, i32 1, i32 undef), !alias.scope !93, !noalias !94
+ %225 = shl <1024 x i32> %224, %209
+ %226 = and <1024 x i32> %225, <i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536>
+ store <1024 x i32> %226, <1024 x i32> addrspace(205)* %46, align 4096, !alias.scope !95, !noalias !96
+ %227 = tail call <1024 x i32> @llvm.tpu.vld.replicate.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %51, i32 255, i32 1, i32 undef), !alias.scope !97, !noalias !98
+ %228 = shl <1024 x i32> %227, %209
+ %229 = and <1024 x i32> %228, <i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536>
+ store <1024 x i32> %229, <1024 x i32> addrspace(205)* %50, align 4096, !alias.scope !99, !noalias !100
+ %230 = tail call <1024 x i32> @llvm.tpu.vld.replicate.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %55, i32 255, i32 1, i32 undef), !alias.scope !101, !noalias !102
+ %231 = shl <1024 x i32> %230, %209
+ %232 = and <1024 x i32> %231, <i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536>
+ store <1024 x i32> %232, <1024 x i32> addrspace(205)* %54, align 4096, !alias.scope !103, !noalias !104
+ %233 = tail call <1024 x i32> @llvm.tpu.vld.replicate.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %59, i32 255, i32 1, i32 undef), !alias.scope !105, !noalias !106
+ %234 = shl <1024 x i32> %233, %209
+ %235 = and <1024 x i32> %234, <i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536>
+ store <1024 x i32> %235, <1024 x i32> addrspace(205)* %58, align 4096, !alias.scope !107, !noalias !108
+ %236 = tail call <1024 x i32> @llvm.tpu.vld.replicate.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %63, i32 255, i32 1, i32 undef), !alias.scope !109, !noalias !110
+ %237 = shl <1024 x i32> %236, %209
+ %238 = and <1024 x i32> %237, <i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536>
+ store <1024 x i32> %238, <1024 x i32> addrspace(205)* %62, align 4096, !alias.scope !111, !noalias !112
+ %239 = tail call <1024 x i32> @llvm.tpu.vld.replicate.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %67, i32 255, i32 1, i32 undef), !alias.scope !113, !noalias !114
+ %240 = shl <1024 x i32> %239, %209
+ %241 = and <1024 x i32> %240, <i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536>
+ store <1024 x i32> %241, <1024 x i32> addrspace(205)* %66, align 4096, !alias.scope !115, !noalias !116
+ %242 = tail call <1024 x i32> @llvm.tpu.vld.replicate.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %71, i32 255, i32 1, i32 undef), !alias.scope !117, !noalias !118
+ %243 = shl <1024 x i32> %242, %209
+ %244 = and <1024 x i32> %243, <i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536>
+ store <1024 x i32> %244, <1024 x i32> addrspace(205)* %70, align 4096, !alias.scope !119, !noalias !120
+ %245 = tail call <1024 x i32> @llvm.tpu.vld.replicate.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %75, i32 255, i32 1, i32 undef), !alias.scope !121, !noalias !122
+ %246 = shl <1024 x i32> %245, %209
+ %247 = and <1024 x i32> %246, <i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536>
+ store <1024 x i32> %247, <1024 x i32> addrspace(205)* %74, align 4096, !alias.scope !123, !noalias !124
+ %248 = tail call <1024 x i32> @llvm.tpu.vld.replicate.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %79, i32 255, i32 1, i32 undef), !alias.scope !125, !noalias !126
+ %249 = shl <1024 x i32> %248, %209
+ %250 = and <1024 x i32> %249, <i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536>
+ store <1024 x i32> %250, <1024 x i32> addrspace(205)* %78, align 4096, !alias.scope !127, !noalias !128
+ %251 = tail call <1024 x i32> @llvm.tpu.vld.replicate.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %83, i32 255, i32 1, i32 undef), !alias.scope !129, !noalias !130
+ %252 = shl <1024 x i32> %251, %209
+ %253 = and <1024 x i32> %252, <i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536>
+ store <1024 x i32> %253, <1024 x i32> addrspace(205)* %82, align 4096, !alias.scope !131, !noalias !132
+ %254 = tail call <1024 x i32> @llvm.tpu.vld.replicate.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %87, i32 255, i32 1, i32 undef), !alias.scope !133, !noalias !134
+ %255 = shl <1024 x i32> %254, %209
+ %256 = and <1024 x i32> %255, <i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536>
+ store <1024 x i32> %256, <1024 x i32> addrspace(205)* %86, align 4096, !alias.scope !135, !noalias !136
+ %257 = tail call <1024 x i32> @llvm.tpu.vld.replicate.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %91, i32 15, i32 1, i32 undef), !alias.scope !137, !noalias !138
+ %258 = shl <1024 x i32> %257, %209
+ %259 = and <1024 x i32> %258, <i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 
-65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536, i32 -65536>
+ tail call void @llvm.tpu.vst.strided.v1024i32.p205v1024i32(<1024 x i32> %259, <1024 x i32> addrspace(205)* %90, i32 15, i32 1, <1024 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, 
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, 
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !139, !noalias !140
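; annotation (editor's note, not part of the CL): the four groups above share one pattern —
; load a vector with llvm.tpu.vld.replicate.evenodd.sublanes, shift every 32-bit lane left by
; the runtime amount in %209, mask with the splat -65536 (0xFFFF0000) so only the upper 16 bits
; survive, and write the result back to TPU vector memory in addrspace(205), either with a plain
; store or, for the last group, a masked strided store. A rough per-lane scalar sketch, with
; `shift`, `src`, and `dst` as illustrative names only:
;   uint32_t lane = src[i];                     // replicated even/odd-sublane load
;   dst[i] = (lane << shift) & 0xFFFF0000u;     // keep the packed high half-word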
+ %260 = getelementptr i32, i32* %92, i32 %99
+ %261 = load i32, i32* %260, align 4
+ %262 = and i32 %261, 127
+ %263 = load i32, i32* %101, align 4
+ %264 = and i32 %263, 7
+ %265 = or i32 %99, 4
+ %266 = getelementptr i32, i32* %92, i32 %265
+ %267 = load i32, i32* %266, align 4
+ %268 = and i32 %267, 127
+ %269 = load i32, i32* %107, align 4
+ %270 = and i32 %269, 7
+ %271 = or i32 %99, 8
+ %272 = getelementptr i32, i32* %92, i32 %271
+ %273 = load i32, i32* %272, align 4
+ %274 = and i32 %273, 127
+ %275 = load i32, i32* %113, align 4
+ %276 = and i32 %275, 7
+ %277 = or i32 %99, 12
+ %278 = getelementptr i32, i32* %92, i32 %277
+ %279 = load i32, i32* %278, align 4
+ %280 = and i32 %279, 127
+ %281 = load i32, i32* %119, align 4
+ %282 = and i32 %281, 7
+ %283 = or i32 %99, 16
+ %284 = getelementptr i32, i32* %92, i32 %283
+ %285 = load i32, i32* %284, align 4
+ %286 = and i32 %285, 127
+ %287 = load i32, i32* %125, align 4
+ %288 = and i32 %287, 7
+ %289 = or i32 %99, 20
+ %290 = getelementptr i32, i32* %92, i32 %289
+ %291 = load i32, i32* %290, align 4
+ %292 = and i32 %291, 127
+ %293 = load i32, i32* %131, align 4
+ %294 = and i32 %293, 7
+ %295 = or i32 %99, 24
+ %296 = getelementptr i32, i32* %92, i32 %295
+ %297 = load i32, i32* %296, align 4
+ %298 = and i32 %297, 127
+ %299 = load i32, i32* %137, align 4
+ %300 = and i32 %299, 7
+ %301 = or i32 %99, 28
+ %302 = getelementptr i32, i32* %92, i32 %301
+ %303 = load i32, i32* %302, align 4
+ %304 = and i32 %303, 127
+ %305 = load i32, i32* %143, align 4
+ %306 = and i32 %305, 7
+ %307 = or i32 %99, 32
+ %308 = getelementptr i32, i32* %92, i32 %307
+ %309 = load i32, i32* %308, align 4
+ %310 = and i32 %309, 127
+ %311 = load i32, i32* %149, align 4
+ %312 = and i32 %311, 7
+ %313 = or i32 %99, 36
+ %314 = getelementptr i32, i32* %92, i32 %313
+ %315 = load i32, i32* %314, align 4
+ %316 = and i32 %315, 127
+ %317 = load i32, i32* %155, align 4
+ %318 = and i32 %317, 7
+ %319 = or i32 %99, 40
+ %320 = getelementptr i32, i32* %92, i32 %319
+ %321 = load i32, i32* %320, align 4
+ %322 = and i32 %321, 127
+ %323 = load i32, i32* %161, align 4
+ %324 = and i32 %323, 7
+ %325 = or i32 %99, 44
+ %326 = getelementptr i32, i32* %92, i32 %325
+ %327 = load i32, i32* %326, align 4
+ %328 = and i32 %327, 127
+ %329 = load i32, i32* %167, align 4
+ %330 = and i32 %329, 7
+ %331 = or i32 %99, 48
+ %332 = getelementptr i32, i32* %92, i32 %331
+ %333 = load i32, i32* %332, align 4
+ %334 = and i32 %333, 127
+ %335 = load i32, i32* %173, align 4
+ %336 = and i32 %335, 7
+ %337 = or i32 %99, 52
+ %338 = getelementptr i32, i32* %92, i32 %337
+ %339 = load i32, i32* %338, align 4
+ %340 = and i32 %339, 127
+ %341 = load i32, i32* %179, align 4
+ %342 = and i32 %341, 7
+ %343 = or i32 %99, 56
+ %344 = getelementptr i32, i32* %92, i32 %343
+ %345 = load i32, i32* %344, align 4
+ %346 = and i32 %345, 127
+ %347 = load i32, i32* %185, align 4
+ %348 = and i32 %347, 7
+ %349 = or i32 %99, 60
+ %350 = getelementptr i32, i32* %92, i32 %349
+ %351 = load i32, i32* %350, align 4
+ %352 = and i32 %351, 127
+ %353 = load i32, i32* %191, align 4
+ %354 = and i32 %353, 7
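+ ; Descriptive note: the fully unrolled block below forms a three-part key for
+ ; each of the 16 entries -- (word >> 7, the word at offset +1 in the same
+ ; table, companion >> 3) -- and compares it against every lower-numbered
+ ; entry, folding the results through select chains so each entry resolves to
+ ; the index of a matching earlier entry, or keeps its own index when nothing
+ ; matches.  These redirect indices appear to steer the vector loads further
+ ; down toward tiles that were already produced.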
+ %355 = lshr i32 %351, 7
+ %356 = lshr i32 %261, 7
+ %357 = icmp eq i32 %355, %356
+ %358 = getelementptr i32, i32* %260, i32 61
+ %359 = load i32, i32* %358, align 4
+ %360 = getelementptr i32, i32* %260, i32 1
+ %361 = load i32, i32* %360, align 4
+ %362 = icmp eq i32 %359, %361
+ %363 = and i1 %357, %362
+ %364 = lshr i32 %353, 3
+ %365 = lshr i32 %263, 3
+ %366 = icmp eq i32 %364, %365
+ %367 = and i1 %363, %366
+ %368 = select i1 %367, i32 0, i32 15
+ %369 = lshr i32 %267, 7
+ %370 = icmp eq i32 %355, %369
+ %371 = getelementptr i32, i32* %260, i32 5
+ %372 = load i32, i32* %371, align 4
+ %373 = icmp eq i32 %359, %372
+ %374 = and i1 %373, %370
+ %375 = lshr i32 %269, 3
+ %376 = icmp eq i32 %364, %375
+ %377 = and i1 %374, %376
+ %378 = select i1 %377, i32 1, i32 %368
+ %379 = lshr i32 %273, 7
+ %380 = icmp eq i32 %355, %379
+ %381 = getelementptr i32, i32* %260, i32 9
+ %382 = load i32, i32* %381, align 4
+ %383 = icmp eq i32 %359, %382
+ %384 = and i1 %383, %380
+ %385 = lshr i32 %275, 3
+ %386 = icmp eq i32 %364, %385
+ %387 = and i1 %384, %386
+ %388 = select i1 %387, i32 2, i32 %378
+ %389 = lshr i32 %279, 7
+ %390 = icmp eq i32 %355, %389
+ %391 = getelementptr i32, i32* %260, i32 13
+ %392 = load i32, i32* %391, align 4
+ %393 = icmp eq i32 %359, %392
+ %394 = and i1 %393, %390
+ %395 = lshr i32 %281, 3
+ %396 = icmp eq i32 %364, %395
+ %397 = and i1 %394, %396
+ %398 = select i1 %397, i32 3, i32 %388
+ %399 = lshr i32 %285, 7
+ %400 = icmp eq i32 %355, %399
+ %401 = getelementptr i32, i32* %260, i32 17
+ %402 = load i32, i32* %401, align 4
+ %403 = icmp eq i32 %359, %402
+ %404 = and i1 %403, %400
+ %405 = lshr i32 %287, 3
+ %406 = icmp eq i32 %364, %405
+ %407 = and i1 %404, %406
+ %408 = select i1 %407, i32 4, i32 %398
+ %409 = lshr i32 %291, 7
+ %410 = icmp eq i32 %355, %409
+ %411 = getelementptr i32, i32* %260, i32 21
+ %412 = load i32, i32* %411, align 4
+ %413 = icmp eq i32 %359, %412
+ %414 = and i1 %413, %410
+ %415 = lshr i32 %293, 3
+ %416 = icmp eq i32 %364, %415
+ %417 = and i1 %414, %416
+ %418 = select i1 %417, i32 5, i32 %408
+ %419 = lshr i32 %297, 7
+ %420 = icmp eq i32 %355, %419
+ %421 = getelementptr i32, i32* %260, i32 25
+ %422 = load i32, i32* %421, align 4
+ %423 = icmp eq i32 %359, %422
+ %424 = and i1 %423, %420
+ %425 = lshr i32 %299, 3
+ %426 = icmp eq i32 %364, %425
+ %427 = and i1 %424, %426
+ %428 = select i1 %427, i32 6, i32 %418
+ %429 = lshr i32 %303, 7
+ %430 = icmp eq i32 %355, %429
+ %431 = getelementptr i32, i32* %260, i32 29
+ %432 = load i32, i32* %431, align 4
+ %433 = icmp eq i32 %359, %432
+ %434 = and i1 %433, %430
+ %435 = lshr i32 %305, 3
+ %436 = icmp eq i32 %364, %435
+ %437 = and i1 %434, %436
+ %438 = select i1 %437, i32 7, i32 %428
+ %439 = lshr i32 %309, 7
+ %440 = icmp eq i32 %355, %439
+ %441 = getelementptr i32, i32* %260, i32 33
+ %442 = load i32, i32* %441, align 4
+ %443 = icmp eq i32 %359, %442
+ %444 = and i1 %443, %440
+ %445 = lshr i32 %311, 3
+ %446 = icmp eq i32 %364, %445
+ %447 = and i1 %444, %446
+ %448 = select i1 %447, i32 8, i32 %438
+ %449 = lshr i32 %315, 7
+ %450 = icmp eq i32 %355, %449
+ %451 = getelementptr i32, i32* %260, i32 37
+ %452 = load i32, i32* %451, align 4
+ %453 = icmp eq i32 %359, %452
+ %454 = and i1 %453, %450
+ %455 = lshr i32 %317, 3
+ %456 = icmp eq i32 %364, %455
+ %457 = and i1 %454, %456
+ %458 = select i1 %457, i32 9, i32 %448
+ %459 = lshr i32 %321, 7
+ %460 = icmp eq i32 %355, %459
+ %461 = getelementptr i32, i32* %260, i32 41
+ %462 = load i32, i32* %461, align 4
+ %463 = icmp eq i32 %359, %462
+ %464 = and i1 %463, %460
+ %465 = lshr i32 %323, 3
+ %466 = icmp eq i32 %364, %465
+ %467 = and i1 %464, %466
+ %468 = select i1 %467, i32 10, i32 %458
+ %469 = lshr i32 %327, 7
+ %470 = icmp eq i32 %355, %469
+ %471 = getelementptr i32, i32* %260, i32 45
+ %472 = load i32, i32* %471, align 4
+ %473 = icmp eq i32 %359, %472
+ %474 = and i1 %473, %470
+ %475 = lshr i32 %329, 3
+ %476 = icmp eq i32 %364, %475
+ %477 = and i1 %474, %476
+ %478 = select i1 %477, i32 11, i32 %468
+ %479 = lshr i32 %333, 7
+ %480 = icmp eq i32 %355, %479
+ %481 = getelementptr i32, i32* %260, i32 49
+ %482 = load i32, i32* %481, align 4
+ %483 = icmp eq i32 %359, %482
+ %484 = and i1 %483, %480
+ %485 = lshr i32 %335, 3
+ %486 = icmp eq i32 %364, %485
+ %487 = and i1 %484, %486
+ %488 = select i1 %487, i32 12, i32 %478
+ %489 = lshr i32 %339, 7
+ %490 = icmp eq i32 %355, %489
+ %491 = getelementptr i32, i32* %260, i32 53
+ %492 = load i32, i32* %491, align 4
+ %493 = icmp eq i32 %359, %492
+ %494 = and i1 %493, %490
+ %495 = lshr i32 %341, 3
+ %496 = icmp eq i32 %364, %495
+ %497 = and i1 %494, %496
+ %498 = select i1 %497, i32 13, i32 %488
+ %499 = lshr i32 %345, 7
+ %500 = icmp eq i32 %355, %499
+ %501 = getelementptr i32, i32* %260, i32 57
+ %502 = load i32, i32* %501, align 4
+ %503 = icmp eq i32 %359, %502
+ %504 = and i1 %503, %500
+ %505 = lshr i32 %347, 3
+ %506 = icmp eq i32 %364, %505
+ %507 = and i1 %504, %506
+ %508 = select i1 %507, i32 14, i32 %498
+ %509 = icmp eq i32 %499, %356
+ %510 = icmp eq i32 %502, %361
+ %511 = and i1 %510, %509
+ %512 = icmp eq i32 %505, %365
+ %513 = and i1 %511, %512
+ %514 = select i1 %513, i32 0, i32 14
+ %515 = icmp eq i32 %499, %369
+ %516 = icmp eq i32 %502, %372
+ %517 = and i1 %516, %515
+ %518 = icmp eq i32 %505, %375
+ %519 = and i1 %517, %518
+ %520 = select i1 %519, i32 1, i32 %514
+ %521 = icmp eq i32 %499, %379
+ %522 = icmp eq i32 %502, %382
+ %523 = and i1 %522, %521
+ %524 = icmp eq i32 %505, %385
+ %525 = and i1 %523, %524
+ %526 = select i1 %525, i32 2, i32 %520
+ %527 = icmp eq i32 %499, %389
+ %528 = icmp eq i32 %502, %392
+ %529 = and i1 %528, %527
+ %530 = icmp eq i32 %505, %395
+ %531 = and i1 %529, %530
+ %532 = select i1 %531, i32 3, i32 %526
+ %533 = icmp eq i32 %499, %399
+ %534 = icmp eq i32 %502, %402
+ %535 = and i1 %534, %533
+ %536 = icmp eq i32 %505, %405
+ %537 = and i1 %535, %536
+ %538 = select i1 %537, i32 4, i32 %532
+ %539 = icmp eq i32 %499, %409
+ %540 = icmp eq i32 %502, %412
+ %541 = and i1 %540, %539
+ %542 = icmp eq i32 %505, %415
+ %543 = and i1 %541, %542
+ %544 = select i1 %543, i32 5, i32 %538
+ %545 = icmp eq i32 %499, %419
+ %546 = icmp eq i32 %502, %422
+ %547 = and i1 %546, %545
+ %548 = icmp eq i32 %505, %425
+ %549 = and i1 %547, %548
+ %550 = select i1 %549, i32 6, i32 %544
+ %551 = icmp eq i32 %499, %429
+ %552 = icmp eq i32 %502, %432
+ %553 = and i1 %552, %551
+ %554 = icmp eq i32 %505, %435
+ %555 = and i1 %553, %554
+ %556 = select i1 %555, i32 7, i32 %550
+ %557 = icmp eq i32 %499, %439
+ %558 = icmp eq i32 %502, %442
+ %559 = and i1 %558, %557
+ %560 = icmp eq i32 %505, %445
+ %561 = and i1 %559, %560
+ %562 = select i1 %561, i32 8, i32 %556
+ %563 = icmp eq i32 %499, %449
+ %564 = icmp eq i32 %502, %452
+ %565 = and i1 %564, %563
+ %566 = icmp eq i32 %505, %455
+ %567 = and i1 %565, %566
+ %568 = select i1 %567, i32 9, i32 %562
+ %569 = icmp eq i32 %499, %459
+ %570 = icmp eq i32 %502, %462
+ %571 = and i1 %570, %569
+ %572 = icmp eq i32 %505, %465
+ %573 = and i1 %571, %572
+ %574 = select i1 %573, i32 10, i32 %568
+ %575 = icmp eq i32 %499, %469
+ %576 = icmp eq i32 %502, %472
+ %577 = and i1 %576, %575
+ %578 = icmp eq i32 %505, %475
+ %579 = and i1 %577, %578
+ %580 = select i1 %579, i32 11, i32 %574
+ %581 = icmp eq i32 %499, %479
+ %582 = icmp eq i32 %502, %482
+ %583 = and i1 %582, %581
+ %584 = icmp eq i32 %505, %485
+ %585 = and i1 %583, %584
+ %586 = select i1 %585, i32 12, i32 %580
+ %587 = icmp eq i32 %499, %489
+ %588 = icmp eq i32 %502, %492
+ %589 = and i1 %588, %587
+ %590 = icmp eq i32 %505, %495
+ %591 = and i1 %589, %590
+ %592 = select i1 %591, i32 13, i32 %586
+ %593 = icmp eq i32 %489, %356
+ %594 = icmp eq i32 %492, %361
+ %595 = and i1 %594, %593
+ %596 = icmp eq i32 %495, %365
+ %597 = and i1 %595, %596
+ %598 = select i1 %597, i32 0, i32 13
+ %599 = icmp eq i32 %489, %369
+ %600 = icmp eq i32 %492, %372
+ %601 = and i1 %600, %599
+ %602 = icmp eq i32 %495, %375
+ %603 = and i1 %601, %602
+ %604 = select i1 %603, i32 1, i32 %598
+ %605 = icmp eq i32 %489, %379
+ %606 = icmp eq i32 %492, %382
+ %607 = and i1 %606, %605
+ %608 = icmp eq i32 %495, %385
+ %609 = and i1 %607, %608
+ %610 = select i1 %609, i32 2, i32 %604
+ %611 = icmp eq i32 %489, %389
+ %612 = icmp eq i32 %492, %392
+ %613 = and i1 %612, %611
+ %614 = icmp eq i32 %495, %395
+ %615 = and i1 %613, %614
+ %616 = select i1 %615, i32 3, i32 %610
+ %617 = icmp eq i32 %489, %399
+ %618 = icmp eq i32 %492, %402
+ %619 = and i1 %618, %617
+ %620 = icmp eq i32 %495, %405
+ %621 = and i1 %619, %620
+ %622 = select i1 %621, i32 4, i32 %616
+ %623 = icmp eq i32 %489, %409
+ %624 = icmp eq i32 %492, %412
+ %625 = and i1 %624, %623
+ %626 = icmp eq i32 %495, %415
+ %627 = and i1 %625, %626
+ %628 = select i1 %627, i32 5, i32 %622
+ %629 = icmp eq i32 %489, %419
+ %630 = icmp eq i32 %492, %422
+ %631 = and i1 %630, %629
+ %632 = icmp eq i32 %495, %425
+ %633 = and i1 %631, %632
+ %634 = select i1 %633, i32 6, i32 %628
+ %635 = icmp eq i32 %489, %429
+ %636 = icmp eq i32 %492, %432
+ %637 = and i1 %636, %635
+ %638 = icmp eq i32 %495, %435
+ %639 = and i1 %637, %638
+ %640 = select i1 %639, i32 7, i32 %634
+ %641 = icmp eq i32 %489, %439
+ %642 = icmp eq i32 %492, %442
+ %643 = and i1 %642, %641
+ %644 = icmp eq i32 %495, %445
+ %645 = and i1 %643, %644
+ %646 = select i1 %645, i32 8, i32 %640
+ %647 = icmp eq i32 %489, %449
+ %648 = icmp eq i32 %492, %452
+ %649 = and i1 %648, %647
+ %650 = icmp eq i32 %495, %455
+ %651 = and i1 %649, %650
+ %652 = select i1 %651, i32 9, i32 %646
+ %653 = icmp eq i32 %489, %459
+ %654 = icmp eq i32 %492, %462
+ %655 = and i1 %654, %653
+ %656 = icmp eq i32 %495, %465
+ %657 = and i1 %655, %656
+ %658 = select i1 %657, i32 10, i32 %652
+ %659 = icmp eq i32 %489, %469
+ %660 = icmp eq i32 %492, %472
+ %661 = and i1 %660, %659
+ %662 = icmp eq i32 %495, %475
+ %663 = and i1 %661, %662
+ %664 = select i1 %663, i32 11, i32 %658
+ %665 = icmp eq i32 %489, %479
+ %666 = icmp eq i32 %492, %482
+ %667 = and i1 %666, %665
+ %668 = icmp eq i32 %495, %485
+ %669 = and i1 %667, %668
+ %670 = select i1 %669, i32 12, i32 %664
+ %671 = icmp eq i32 %479, %356
+ %672 = icmp eq i32 %482, %361
+ %673 = and i1 %672, %671
+ %674 = icmp eq i32 %485, %365
+ %675 = and i1 %673, %674
+ %676 = select i1 %675, i32 0, i32 12
+ %677 = icmp eq i32 %479, %369
+ %678 = icmp eq i32 %482, %372
+ %679 = and i1 %678, %677
+ %680 = icmp eq i32 %485, %375
+ %681 = and i1 %679, %680
+ %682 = select i1 %681, i32 1, i32 %676
+ %683 = icmp eq i32 %479, %379
+ %684 = icmp eq i32 %482, %382
+ %685 = and i1 %684, %683
+ %686 = icmp eq i32 %485, %385
+ %687 = and i1 %685, %686
+ %688 = select i1 %687, i32 2, i32 %682
+ %689 = icmp eq i32 %479, %389
+ %690 = icmp eq i32 %482, %392
+ %691 = and i1 %690, %689
+ %692 = icmp eq i32 %485, %395
+ %693 = and i1 %691, %692
+ %694 = select i1 %693, i32 3, i32 %688
+ %695 = icmp eq i32 %479, %399
+ %696 = icmp eq i32 %482, %402
+ %697 = and i1 %696, %695
+ %698 = icmp eq i32 %485, %405
+ %699 = and i1 %697, %698
+ %700 = select i1 %699, i32 4, i32 %694
+ %701 = icmp eq i32 %479, %409
+ %702 = icmp eq i32 %482, %412
+ %703 = and i1 %702, %701
+ %704 = icmp eq i32 %485, %415
+ %705 = and i1 %703, %704
+ %706 = select i1 %705, i32 5, i32 %700
+ %707 = icmp eq i32 %479, %419
+ %708 = icmp eq i32 %482, %422
+ %709 = and i1 %708, %707
+ %710 = icmp eq i32 %485, %425
+ %711 = and i1 %709, %710
+ %712 = select i1 %711, i32 6, i32 %706
+ %713 = icmp eq i32 %479, %429
+ %714 = icmp eq i32 %482, %432
+ %715 = and i1 %714, %713
+ %716 = icmp eq i32 %485, %435
+ %717 = and i1 %715, %716
+ %718 = select i1 %717, i32 7, i32 %712
+ %719 = icmp eq i32 %479, %439
+ %720 = icmp eq i32 %482, %442
+ %721 = and i1 %720, %719
+ %722 = icmp eq i32 %485, %445
+ %723 = and i1 %721, %722
+ %724 = select i1 %723, i32 8, i32 %718
+ %725 = icmp eq i32 %479, %449
+ %726 = icmp eq i32 %482, %452
+ %727 = and i1 %726, %725
+ %728 = icmp eq i32 %485, %455
+ %729 = and i1 %727, %728
+ %730 = select i1 %729, i32 9, i32 %724
+ %731 = icmp eq i32 %479, %459
+ %732 = icmp eq i32 %482, %462
+ %733 = and i1 %732, %731
+ %734 = icmp eq i32 %485, %465
+ %735 = and i1 %733, %734
+ %736 = select i1 %735, i32 10, i32 %730
+ %737 = icmp eq i32 %479, %469
+ %738 = icmp eq i32 %482, %472
+ %739 = and i1 %738, %737
+ %740 = icmp eq i32 %485, %475
+ %741 = and i1 %739, %740
+ %742 = select i1 %741, i32 11, i32 %736
+ %743 = icmp eq i32 %469, %356
+ %744 = icmp eq i32 %472, %361
+ %745 = and i1 %744, %743
+ %746 = icmp eq i32 %475, %365
+ %747 = and i1 %745, %746
+ %748 = select i1 %747, i32 0, i32 11
+ %749 = icmp eq i32 %469, %369
+ %750 = icmp eq i32 %472, %372
+ %751 = and i1 %750, %749
+ %752 = icmp eq i32 %475, %375
+ %753 = and i1 %751, %752
+ %754 = select i1 %753, i32 1, i32 %748
+ %755 = icmp eq i32 %469, %379
+ %756 = icmp eq i32 %472, %382
+ %757 = and i1 %756, %755
+ %758 = icmp eq i32 %475, %385
+ %759 = and i1 %757, %758
+ %760 = select i1 %759, i32 2, i32 %754
+ %761 = icmp eq i32 %469, %389
+ %762 = icmp eq i32 %472, %392
+ %763 = and i1 %762, %761
+ %764 = icmp eq i32 %475, %395
+ %765 = and i1 %763, %764
+ %766 = select i1 %765, i32 3, i32 %760
+ %767 = icmp eq i32 %469, %399
+ %768 = icmp eq i32 %472, %402
+ %769 = and i1 %768, %767
+ %770 = icmp eq i32 %475, %405
+ %771 = and i1 %769, %770
+ %772 = select i1 %771, i32 4, i32 %766
+ %773 = icmp eq i32 %469, %409
+ %774 = icmp eq i32 %472, %412
+ %775 = and i1 %774, %773
+ %776 = icmp eq i32 %475, %415
+ %777 = and i1 %775, %776
+ %778 = select i1 %777, i32 5, i32 %772
+ %779 = icmp eq i32 %469, %419
+ %780 = icmp eq i32 %472, %422
+ %781 = and i1 %780, %779
+ %782 = icmp eq i32 %475, %425
+ %783 = and i1 %781, %782
+ %784 = select i1 %783, i32 6, i32 %778
+ %785 = icmp eq i32 %469, %429
+ %786 = icmp eq i32 %472, %432
+ %787 = and i1 %786, %785
+ %788 = icmp eq i32 %475, %435
+ %789 = and i1 %787, %788
+ %790 = select i1 %789, i32 7, i32 %784
+ %791 = icmp eq i32 %469, %439
+ %792 = icmp eq i32 %472, %442
+ %793 = and i1 %792, %791
+ %794 = icmp eq i32 %475, %445
+ %795 = and i1 %793, %794
+ %796 = select i1 %795, i32 8, i32 %790
+ %797 = icmp eq i32 %469, %449
+ %798 = icmp eq i32 %472, %452
+ %799 = and i1 %798, %797
+ %800 = icmp eq i32 %475, %455
+ %801 = and i1 %799, %800
+ %802 = select i1 %801, i32 9, i32 %796
+ %803 = icmp eq i32 %469, %459
+ %804 = icmp eq i32 %472, %462
+ %805 = and i1 %804, %803
+ %806 = icmp eq i32 %475, %465
+ %807 = and i1 %805, %806
+ %808 = select i1 %807, i32 10, i32 %802
+ %809 = icmp eq i32 %459, %356
+ %810 = icmp eq i32 %462, %361
+ %811 = and i1 %810, %809
+ %812 = icmp eq i32 %465, %365
+ %813 = and i1 %811, %812
+ %814 = select i1 %813, i32 0, i32 10
+ %815 = icmp eq i32 %459, %369
+ %816 = icmp eq i32 %462, %372
+ %817 = and i1 %816, %815
+ %818 = icmp eq i32 %465, %375
+ %819 = and i1 %817, %818
+ %820 = select i1 %819, i32 1, i32 %814
+ %821 = icmp eq i32 %459, %379
+ %822 = icmp eq i32 %462, %382
+ %823 = and i1 %822, %821
+ %824 = icmp eq i32 %465, %385
+ %825 = and i1 %823, %824
+ %826 = select i1 %825, i32 2, i32 %820
+ %827 = icmp eq i32 %459, %389
+ %828 = icmp eq i32 %462, %392
+ %829 = and i1 %828, %827
+ %830 = icmp eq i32 %465, %395
+ %831 = and i1 %829, %830
+ %832 = select i1 %831, i32 3, i32 %826
+ %833 = icmp eq i32 %459, %399
+ %834 = icmp eq i32 %462, %402
+ %835 = and i1 %834, %833
+ %836 = icmp eq i32 %465, %405
+ %837 = and i1 %835, %836
+ %838 = select i1 %837, i32 4, i32 %832
+ %839 = icmp eq i32 %459, %409
+ %840 = icmp eq i32 %462, %412
+ %841 = and i1 %840, %839
+ %842 = icmp eq i32 %465, %415
+ %843 = and i1 %841, %842
+ %844 = select i1 %843, i32 5, i32 %838
+ %845 = icmp eq i32 %459, %419
+ %846 = icmp eq i32 %462, %422
+ %847 = and i1 %846, %845
+ %848 = icmp eq i32 %465, %425
+ %849 = and i1 %847, %848
+ %850 = select i1 %849, i32 6, i32 %844
+ %851 = icmp eq i32 %459, %429
+ %852 = icmp eq i32 %462, %432
+ %853 = and i1 %852, %851
+ %854 = icmp eq i32 %465, %435
+ %855 = and i1 %853, %854
+ %856 = select i1 %855, i32 7, i32 %850
+ %857 = icmp eq i32 %459, %439
+ %858 = icmp eq i32 %462, %442
+ %859 = and i1 %858, %857
+ %860 = icmp eq i32 %465, %445
+ %861 = and i1 %859, %860
+ %862 = select i1 %861, i32 8, i32 %856
+ %863 = icmp eq i32 %459, %449
+ %864 = icmp eq i32 %462, %452
+ %865 = and i1 %864, %863
+ %866 = icmp eq i32 %465, %455
+ %867 = and i1 %865, %866
+ %868 = select i1 %867, i32 9, i32 %862
+ %869 = icmp eq i32 %449, %356
+ %870 = icmp eq i32 %452, %361
+ %871 = and i1 %870, %869
+ %872 = icmp eq i32 %455, %365
+ %873 = and i1 %871, %872
+ %874 = select i1 %873, i32 0, i32 9
+ %875 = icmp eq i32 %449, %369
+ %876 = icmp eq i32 %452, %372
+ %877 = and i1 %876, %875
+ %878 = icmp eq i32 %455, %375
+ %879 = and i1 %877, %878
+ %880 = select i1 %879, i32 1, i32 %874
+ %881 = icmp eq i32 %449, %379
+ %882 = icmp eq i32 %452, %382
+ %883 = and i1 %882, %881
+ %884 = icmp eq i32 %455, %385
+ %885 = and i1 %883, %884
+ %886 = select i1 %885, i32 2, i32 %880
+ %887 = icmp eq i32 %449, %389
+ %888 = icmp eq i32 %452, %392
+ %889 = and i1 %888, %887
+ %890 = icmp eq i32 %455, %395
+ %891 = and i1 %889, %890
+ %892 = select i1 %891, i32 3, i32 %886
+ %893 = icmp eq i32 %449, %399
+ %894 = icmp eq i32 %452, %402
+ %895 = and i1 %894, %893
+ %896 = icmp eq i32 %455, %405
+ %897 = and i1 %895, %896
+ %898 = select i1 %897, i32 4, i32 %892
+ %899 = icmp eq i32 %449, %409
+ %900 = icmp eq i32 %452, %412
+ %901 = and i1 %900, %899
+ %902 = icmp eq i32 %455, %415
+ %903 = and i1 %901, %902
+ %904 = select i1 %903, i32 5, i32 %898
+ %905 = icmp eq i32 %449, %419
+ %906 = icmp eq i32 %452, %422
+ %907 = and i1 %906, %905
+ %908 = icmp eq i32 %455, %425
+ %909 = and i1 %907, %908
+ %910 = select i1 %909, i32 6, i32 %904
+ %911 = icmp eq i32 %449, %429
+ %912 = icmp eq i32 %452, %432
+ %913 = and i1 %912, %911
+ %914 = icmp eq i32 %455, %435
+ %915 = and i1 %913, %914
+ %916 = select i1 %915, i32 7, i32 %910
+ %917 = icmp eq i32 %449, %439
+ %918 = icmp eq i32 %452, %442
+ %919 = and i1 %918, %917
+ %920 = icmp eq i32 %455, %445
+ %921 = and i1 %919, %920
+ %922 = select i1 %921, i32 8, i32 %916
+ %923 = icmp eq i32 %439, %356
+ %924 = icmp eq i32 %442, %361
+ %925 = and i1 %924, %923
+ %926 = icmp eq i32 %445, %365
+ %927 = and i1 %925, %926
+ %928 = select i1 %927, i32 0, i32 8
+ %929 = icmp eq i32 %439, %369
+ %930 = icmp eq i32 %442, %372
+ %931 = and i1 %930, %929
+ %932 = icmp eq i32 %445, %375
+ %933 = and i1 %931, %932
+ %934 = select i1 %933, i32 1, i32 %928
+ %935 = icmp eq i32 %439, %379
+ %936 = icmp eq i32 %442, %382
+ %937 = and i1 %936, %935
+ %938 = icmp eq i32 %445, %385
+ %939 = and i1 %937, %938
+ %940 = select i1 %939, i32 2, i32 %934
+ %941 = icmp eq i32 %439, %389
+ %942 = icmp eq i32 %442, %392
+ %943 = and i1 %942, %941
+ %944 = icmp eq i32 %445, %395
+ %945 = and i1 %943, %944
+ %946 = select i1 %945, i32 3, i32 %940
+ %947 = icmp eq i32 %439, %399
+ %948 = icmp eq i32 %442, %402
+ %949 = and i1 %948, %947
+ %950 = icmp eq i32 %445, %405
+ %951 = and i1 %949, %950
+ %952 = select i1 %951, i32 4, i32 %946
+ %953 = icmp eq i32 %439, %409
+ %954 = icmp eq i32 %442, %412
+ %955 = and i1 %954, %953
+ %956 = icmp eq i32 %445, %415
+ %957 = and i1 %955, %956
+ %958 = select i1 %957, i32 5, i32 %952
+ %959 = icmp eq i32 %439, %419
+ %960 = icmp eq i32 %442, %422
+ %961 = and i1 %960, %959
+ %962 = icmp eq i32 %445, %425
+ %963 = and i1 %961, %962
+ %964 = select i1 %963, i32 6, i32 %958
+ %965 = icmp eq i32 %439, %429
+ %966 = icmp eq i32 %442, %432
+ %967 = and i1 %966, %965
+ %968 = icmp eq i32 %445, %435
+ %969 = and i1 %967, %968
+ %970 = select i1 %969, i32 7, i32 %964
+ %971 = icmp eq i32 %429, %356
+ %972 = icmp eq i32 %432, %361
+ %973 = and i1 %972, %971
+ %974 = icmp eq i32 %435, %365
+ %975 = and i1 %973, %974
+ %976 = select i1 %975, i32 0, i32 7
+ %977 = icmp eq i32 %429, %369
+ %978 = icmp eq i32 %432, %372
+ %979 = and i1 %978, %977
+ %980 = icmp eq i32 %435, %375
+ %981 = and i1 %979, %980
+ %982 = select i1 %981, i32 1, i32 %976
+ %983 = icmp eq i32 %429, %379
+ %984 = icmp eq i32 %432, %382
+ %985 = and i1 %984, %983
+ %986 = icmp eq i32 %435, %385
+ %987 = and i1 %985, %986
+ %988 = select i1 %987, i32 2, i32 %982
+ %989 = icmp eq i32 %429, %389
+ %990 = icmp eq i32 %432, %392
+ %991 = and i1 %990, %989
+ %992 = icmp eq i32 %435, %395
+ %993 = and i1 %991, %992
+ %994 = select i1 %993, i32 3, i32 %988
+ %995 = icmp eq i32 %429, %399
+ %996 = icmp eq i32 %432, %402
+ %997 = and i1 %996, %995
+ %998 = icmp eq i32 %435, %405
+ %999 = and i1 %997, %998
+ %1000 = select i1 %999, i32 4, i32 %994
+ %1001 = icmp eq i32 %429, %409
+ %1002 = icmp eq i32 %432, %412
+ %1003 = and i1 %1002, %1001
+ %1004 = icmp eq i32 %435, %415
+ %1005 = and i1 %1003, %1004
+ %1006 = select i1 %1005, i32 5, i32 %1000
+ %1007 = icmp eq i32 %429, %419
+ %1008 = icmp eq i32 %432, %422
+ %1009 = and i1 %1008, %1007
+ %1010 = icmp eq i32 %435, %425
+ %1011 = and i1 %1009, %1010
+ %1012 = select i1 %1011, i32 6, i32 %1006
+ %1013 = icmp eq i32 %419, %356
+ %1014 = icmp eq i32 %422, %361
+ %1015 = and i1 %1014, %1013
+ %1016 = icmp eq i32 %425, %365
+ %1017 = and i1 %1015, %1016
+ %1018 = select i1 %1017, i32 0, i32 6
+ %1019 = icmp eq i32 %419, %369
+ %1020 = icmp eq i32 %422, %372
+ %1021 = and i1 %1020, %1019
+ %1022 = icmp eq i32 %425, %375
+ %1023 = and i1 %1021, %1022
+ %1024 = select i1 %1023, i32 1, i32 %1018
+ %1025 = icmp eq i32 %419, %379
+ %1026 = icmp eq i32 %422, %382
+ %1027 = and i1 %1026, %1025
+ %1028 = icmp eq i32 %425, %385
+ %1029 = and i1 %1027, %1028
+ %1030 = select i1 %1029, i32 2, i32 %1024
+ %1031 = icmp eq i32 %419, %389
+ %1032 = icmp eq i32 %422, %392
+ %1033 = and i1 %1032, %1031
+ %1034 = icmp eq i32 %425, %395
+ %1035 = and i1 %1033, %1034
+ %1036 = select i1 %1035, i32 3, i32 %1030
+ %1037 = icmp eq i32 %419, %399
+ %1038 = icmp eq i32 %422, %402
+ %1039 = and i1 %1038, %1037
+ %1040 = icmp eq i32 %425, %405
+ %1041 = and i1 %1039, %1040
+ %1042 = select i1 %1041, i32 4, i32 %1036
+ %1043 = icmp eq i32 %419, %409
+ %1044 = icmp eq i32 %422, %412
+ %1045 = and i1 %1044, %1043
+ %1046 = icmp eq i32 %425, %415
+ %1047 = and i1 %1045, %1046
+ %1048 = select i1 %1047, i32 5, i32 %1042
+ %1049 = icmp eq i32 %409, %356
+ %1050 = icmp eq i32 %412, %361
+ %1051 = and i1 %1050, %1049
+ %1052 = icmp eq i32 %415, %365
+ %1053 = and i1 %1051, %1052
+ %1054 = select i1 %1053, i32 0, i32 5
+ %1055 = icmp eq i32 %409, %369
+ %1056 = icmp eq i32 %412, %372
+ %1057 = and i1 %1056, %1055
+ %1058 = icmp eq i32 %415, %375
+ %1059 = and i1 %1057, %1058
+ %1060 = select i1 %1059, i32 1, i32 %1054
+ %1061 = icmp eq i32 %409, %379
+ %1062 = icmp eq i32 %412, %382
+ %1063 = and i1 %1062, %1061
+ %1064 = icmp eq i32 %415, %385
+ %1065 = and i1 %1063, %1064
+ %1066 = select i1 %1065, i32 2, i32 %1060
+ %1067 = icmp eq i32 %409, %389
+ %1068 = icmp eq i32 %412, %392
+ %1069 = and i1 %1068, %1067
+ %1070 = icmp eq i32 %415, %395
+ %1071 = and i1 %1069, %1070
+ %1072 = select i1 %1071, i32 3, i32 %1066
+ %1073 = icmp eq i32 %409, %399
+ %1074 = icmp eq i32 %412, %402
+ %1075 = and i1 %1074, %1073
+ %1076 = icmp eq i32 %415, %405
+ %1077 = and i1 %1075, %1076
+ %1078 = select i1 %1077, i32 4, i32 %1072
+ %1079 = icmp eq i32 %399, %356
+ %1080 = icmp eq i32 %402, %361
+ %1081 = and i1 %1080, %1079
+ %1082 = icmp eq i32 %405, %365
+ %1083 = and i1 %1081, %1082
+ %1084 = select i1 %1083, i32 0, i32 4
+ %1085 = icmp eq i32 %399, %369
+ %1086 = icmp eq i32 %402, %372
+ %1087 = and i1 %1086, %1085
+ %1088 = icmp eq i32 %405, %375
+ %1089 = and i1 %1087, %1088
+ %1090 = select i1 %1089, i32 1, i32 %1084
+ %1091 = icmp eq i32 %399, %379
+ %1092 = icmp eq i32 %402, %382
+ %1093 = and i1 %1092, %1091
+ %1094 = icmp eq i32 %405, %385
+ %1095 = and i1 %1093, %1094
+ %1096 = select i1 %1095, i32 2, i32 %1090
+ %1097 = icmp eq i32 %399, %389
+ %1098 = icmp eq i32 %402, %392
+ %1099 = and i1 %1098, %1097
+ %1100 = icmp eq i32 %405, %395
+ %1101 = and i1 %1099, %1100
+ %1102 = select i1 %1101, i32 3, i32 %1096
+ %1103 = icmp eq i32 %389, %356
+ %1104 = icmp eq i32 %392, %361
+ %1105 = and i1 %1104, %1103
+ %1106 = icmp eq i32 %395, %365
+ %1107 = and i1 %1105, %1106
+ %1108 = select i1 %1107, i32 0, i32 3
+ %1109 = icmp eq i32 %389, %369
+ %1110 = icmp eq i32 %392, %372
+ %1111 = and i1 %1110, %1109
+ %1112 = icmp eq i32 %395, %375
+ %1113 = and i1 %1111, %1112
+ %1114 = select i1 %1113, i32 1, i32 %1108
+ %1115 = icmp eq i32 %389, %379
+ %1116 = icmp eq i32 %392, %382
+ %1117 = and i1 %1116, %1115
+ %1118 = icmp eq i32 %395, %385
+ %1119 = and i1 %1117, %1118
+ %1120 = select i1 %1119, i32 2, i32 %1114
+ %1121 = icmp eq i32 %379, %356
+ %1122 = icmp eq i32 %382, %361
+ %1123 = and i1 %1122, %1121
+ %1124 = icmp eq i32 %385, %365
+ %1125 = and i1 %1123, %1124
+ %1126 = select i1 %1125, i32 0, i32 2
+ %1127 = icmp eq i32 %379, %369
+ %1128 = icmp eq i32 %382, %372
+ %1129 = and i1 %1128, %1127
+ %1130 = icmp eq i32 %385, %375
+ %1131 = and i1 %1129, %1130
+ %1132 = select i1 %1131, i32 1, i32 %1126
+ %1133 = icmp eq i32 %369, %356
+ %1134 = icmp eq i32 %372, %361
+ %1135 = and i1 %1134, %1133
+ %1136 = icmp eq i32 %375, %365
+ %1137 = and i1 %1135, %1136
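+ ; Descriptive note: start of the vector phase.  %98 supplies the source
+ ; sublane (%98 >> 3, masked to 0..7) and the permute-pattern base (%98 << 4).
+ ; A <1024 x i32> tile is loaded from vector memory (addrspace 205), a
+ ; per-sublane permute pattern is set, the permuted data is popped through the
+ ; tpu.tc.* intrinsic (@llvm.tpu.tc.vtrfpop), accumulated with fadd into the
+ ; tile loaded for entry 0, and blended back under a per-lane mask (lanes whose
+ ; %206 fields match entry 0's tag and companion) before being stored to both
+ ; destination buffers.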
+ %1138 = shl i32 %98, 4
+ %1139 = lshr i32 %98, 3
+ %1140 = and i32 %1139, 7
+ %1141 = getelementptr i4096, i4096 addrspace(205)* %96, i32 %1140
+ %1142 = bitcast i4096 addrspace(205)* %1141 to <1024 x i32> addrspace(205)*
+ %1143 = tail call <1024 x i32> @llvm.tpu.vld.strided.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %1142, i32 255, i32 0), !alias.scope !139, !noalias !140
+ %1144 = getelementptr i4096, i4096 addrspace(205)* %97, i32 %264
+ %1145 = bitcast i4096 addrspace(205)* %1144 to <1024 x i32> addrspace(205)*
+ %1146 = tail call <1024 x i32> @llvm.tpu.vld.strided.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %1145, i32 255, i32 0), !alias.scope !75, !noalias !76
+ %.splatinsert273 = insertelement <1024 x i32> undef, i32 %1138, i32 0
+ %.splat274 = shufflevector <1024 x i32> %.splatinsert273, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1147 = tail call i32 @llvm.tpu.set.permute.sublane(<1024 x i32> %.splat274, i32 0)
+ %1148 = tail call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %1143, i32 %1147, i32 0)
+ %1149 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %1148)
+ %1150 = bitcast <1024 x i32> %1146 to <1024 x float>
+ %1151 = bitcast <1024 x i32> %1149 to <1024 x float>
+ %1152 = fadd <1024 x float> %1150, %1151
+ %1153 = bitcast <1024 x float> %1152 to <1024 x i32>
+ %.splatinsert275 = insertelement <1024 x i32> undef, i32 %262, i32 0
+ %.splat276 = shufflevector <1024 x i32> %.splatinsert275, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1154 = and <1024 x i32> %206, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, 
i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, 
i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
+ %1155 = icmp eq <1024 x i32> %1154, %.splat276
+ %.splatinsert279 = insertelement <1024 x i32> undef, i32 %264, i32 0
+ %.splat280 = shufflevector <1024 x i32> %.splatinsert279, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1156 = lshr <1024 x i32> %206, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 
7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, 
i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ %1157 = icmp eq <1024 x i32> %1156, %.splat280
+ %1158 = and <1024 x i1> %1155, %1157
+ %1159 = select <1024 x i1> %1158, <1024 x i32> %1153, <1024 x i32> %211
+ store <1024 x i32> %1159, <1024 x i32> addrspace(205)* %4, align 4096, !alias.scope !141, !noalias !142
+ store <1024 x i32> %1159, <1024 x i32> addrspace(205)* %28, align 4096, !alias.scope !143, !noalias !144
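+ ; Descriptive note: the same load / permute / vtrfpop / fadd / masked-store
+ ; pattern now repeats for the remaining entries.  Each repetition chooses its
+ ; source tile through a ptrtoint/inttoptr select driven by the redirect index
+ ; computed above: when the entry matched an earlier one, the address derived
+ ; from that index is used (apparently re-reading a tile produced earlier);
+ ; otherwise an alternate buffer (%30, %34, %38, ...) is read.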
+ %1160 = select i1 %1137, i32 0, i32 8
+ %1161 = getelementptr i4096, i4096 addrspace(205)* %8, i32 %1160
+ %1162 = bitcast i4096 addrspace(205)* %1161 to <1024 x i32> addrspace(205)*
+ %1163 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %1162)
+ %1164 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %30)
+ %1165 = select i1 %1137, i32 %1163, i32 %1164
+ %1166 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 %1165)
+ %1167 = bitcast <1024 x i32> addrspace(205)* %1166 to i4096 addrspace(205)*
+ %1168 = getelementptr i4096, i4096 addrspace(205)* %1167, i32 %270
+ %1169 = bitcast i4096 addrspace(205)* %1168 to <1024 x i32> addrspace(205)*
+ %1170 = tail call <1024 x i32> @llvm.tpu.vld.strided.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %1169, i32 255, i32 0)
+ %1171 = or i32 %1138, 1
+ %.splatinsert287 = insertelement <1024 x i32> undef, i32 %1171, i32 0
+ %.splat288 = shufflevector <1024 x i32> %.splatinsert287, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1172 = tail call i32 @llvm.tpu.set.permute.sublane(<1024 x i32> %.splat288, i32 0)
+ %1173 = tail call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %1143, i32 %1172, i32 0)
+ %1174 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %1173)
+ %1175 = bitcast <1024 x i32> %1170 to <1024 x float>
+ %1176 = bitcast <1024 x i32> %1174 to <1024 x float>
+ %1177 = fadd <1024 x float> %1175, %1176
+ %1178 = bitcast <1024 x float> %1177 to <1024 x i32>
+ %.splatinsert289 = insertelement <1024 x i32> undef, i32 %268, i32 0
+ %.splat290 = shufflevector <1024 x i32> %.splatinsert289, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1179 = icmp eq <1024 x i32> %1154, %.splat290
+ %.splatinsert293 = insertelement <1024 x i32> undef, i32 %270, i32 0
+ %.splat294 = shufflevector <1024 x i32> %.splatinsert293, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1180 = icmp eq <1024 x i32> %1156, %.splat294
+ %1181 = and <1024 x i1> %1179, %1180
+ %1182 = load <1024 x i32>, <1024 x i32> addrspace(205)* %1166, align 4096
+ %1183 = select <1024 x i1> %1181, <1024 x i32> %1178, <1024 x i32> %1182
+ store <1024 x i32> %1183, <1024 x i32> addrspace(205)* %9, align 4096, !alias.scope !141, !noalias !142
+ store <1024 x i32> %1183, <1024 x i32> addrspace(205)* %32, align 4096, !alias.scope !145, !noalias !146
+ %1184 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %4, i32 %1132
+ %1185 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %1184)
+ %1186 = icmp eq i32 %1132, 2
+ %1187 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %34)
+ %1188 = select i1 %1186, i32 %1187, i32 %1185
+ %1189 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 %1188)
+ %1190 = bitcast <1024 x i32> addrspace(205)* %1189 to i4096 addrspace(205)*
+ %1191 = getelementptr i4096, i4096 addrspace(205)* %1190, i32 %276
+ %1192 = bitcast i4096 addrspace(205)* %1191 to <1024 x i32> addrspace(205)*
+ %1193 = tail call <1024 x i32> @llvm.tpu.vld.strided.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %1192, i32 255, i32 0)
+ %1194 = or i32 %1138, 2
+ %.splatinsert301 = insertelement <1024 x i32> undef, i32 %1194, i32 0
+ %.splat302 = shufflevector <1024 x i32> %.splatinsert301, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1195 = tail call i32 @llvm.tpu.set.permute.sublane(<1024 x i32> %.splat302, i32 0)
+ %1196 = tail call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %1143, i32 %1195, i32 0)
+ %1197 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %1196)
+ %1198 = bitcast <1024 x i32> %1193 to <1024 x float>
+ %1199 = bitcast <1024 x i32> %1197 to <1024 x float>
+ %1200 = fadd <1024 x float> %1198, %1199
+ %1201 = bitcast <1024 x float> %1200 to <1024 x i32>
+ %.splatinsert303 = insertelement <1024 x i32> undef, i32 %274, i32 0
+ %.splat304 = shufflevector <1024 x i32> %.splatinsert303, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1202 = icmp eq <1024 x i32> %1154, %.splat304
+ %.splatinsert307 = insertelement <1024 x i32> undef, i32 %276, i32 0
+ %.splat308 = shufflevector <1024 x i32> %.splatinsert307, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1203 = icmp eq <1024 x i32> %1156, %.splat308
+ %1204 = and <1024 x i1> %1202, %1203
+ %1205 = load <1024 x i32>, <1024 x i32> addrspace(205)* %1189, align 4096
+ %1206 = select <1024 x i1> %1204, <1024 x i32> %1201, <1024 x i32> %1205
+ store <1024 x i32> %1206, <1024 x i32> addrspace(205)* %10, align 4096, !alias.scope !141, !noalias !142
+ store <1024 x i32> %1206, <1024 x i32> addrspace(205)* %36, align 4096, !alias.scope !147, !noalias !148
+ %1207 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %4, i32 %1120
+ %1208 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %1207)
+ %1209 = icmp eq i32 %1120, 3
+ %1210 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %38)
+ %1211 = select i1 %1209, i32 %1210, i32 %1208
+ %1212 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 %1211)
+ %1213 = bitcast <1024 x i32> addrspace(205)* %1212 to i4096 addrspace(205)*
+ %1214 = getelementptr i4096, i4096 addrspace(205)* %1213, i32 %282
+ %1215 = bitcast i4096 addrspace(205)* %1214 to <1024 x i32> addrspace(205)*
+ %1216 = tail call <1024 x i32> @llvm.tpu.vld.strided.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %1215, i32 255, i32 0)
+ %1217 = or i32 %1138, 3
+ %.splatinsert315 = insertelement <1024 x i32> undef, i32 %1217, i32 0
+ %.splat316 = shufflevector <1024 x i32> %.splatinsert315, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1218 = tail call i32 @llvm.tpu.set.permute.sublane(<1024 x i32> %.splat316, i32 0)
+ %1219 = tail call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %1143, i32 %1218, i32 0)
+ %1220 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %1219)
+ %1221 = bitcast <1024 x i32> %1216 to <1024 x float>
+ %1222 = bitcast <1024 x i32> %1220 to <1024 x float>
+ %1223 = fadd <1024 x float> %1221, %1222
+ %1224 = bitcast <1024 x float> %1223 to <1024 x i32>
+ %.splatinsert317 = insertelement <1024 x i32> undef, i32 %280, i32 0
+ %.splat318 = shufflevector <1024 x i32> %.splatinsert317, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1225 = icmp eq <1024 x i32> %1154, %.splat318
+ %.splatinsert321 = insertelement <1024 x i32> undef, i32 %282, i32 0
+ %.splat322 = shufflevector <1024 x i32> %.splatinsert321, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1226 = icmp eq <1024 x i32> %1156, %.splat322
+ %1227 = and <1024 x i1> %1225, %1226
+ %1228 = load <1024 x i32>, <1024 x i32> addrspace(205)* %1212, align 4096
+ %1229 = select <1024 x i1> %1227, <1024 x i32> %1224, <1024 x i32> %1228
+ store <1024 x i32> %1229, <1024 x i32> addrspace(205)* %11, align 4096, !alias.scope !141, !noalias !142
+ store <1024 x i32> %1229, <1024 x i32> addrspace(205)* %40, align 4096, !alias.scope !149, !noalias !150
+ %1230 = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %4, i32 %1102
+ %1231 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %1230)
+ %1232 = icmp eq i32 %1102, 4
+ %1233 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %42)
+ %1234 = select i1 %1232, i32 %1233, i32 %1231
+ %1235 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 %1234)
+ %1236 = bitcast <1024 x i32> addrspace(205)* %1235 to i4096 addrspace(205)*
+ %1237 = getelementptr i4096, i4096 addrspace(205)* %1236, i32 %288
+ %1238 = bitcast i4096 addrspace(205)* %1237 to <1024 x i32> addrspace(205)*
+ %1239 = tail call <1024 x i32> @llvm.tpu.vld.strided.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %1238, i32 255, i32 0)
+ %1240 = or i32 %1138, 4
+ %.splatinsert329 = insertelement <1024 x i32> undef, i32 %1240, i32 0
+ %.splat330 = shufflevector <1024 x i32> %.splatinsert329, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1241 = tail call i32 @llvm.tpu.set.permute.sublane(<1024 x i32> %.splat330, i32 0)
+ %1242 = tail call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %1143, i32 %1241, i32 0)
+ %1243 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %1242)
+ %1244 = bitcast <1024 x i32> %1239 to <1024 x float>
+ %1245 = bitcast <1024 x i32> %1243 to <1024 x float>
+ %1246 = fadd <1024 x float> %1244, %1245
+ %1247 = bitcast <1024 x float> %1246 to <1024 x i32>
+ %.splatinsert331 = insertelement <1024 x i32> undef, i32 %286, i32 0
+ %.splat332 = shufflevector <1024 x i32> %.splatinsert331, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1248 = icmp eq <1024 x i32> %1154, %.splat332
+ %.splatinsert335 = insertelement <1024 x i32> undef, i32 %288, i32 0
+ %.splat336 = shufflevector <1024 x i32> %.splatinsert335, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1249 = icmp eq <1024 x i32> %1156, %.splat336
+ %1250 = and <1024 x i1> %1248, %1249
+ %1251 = load <1024 x i32>, <1024 x i32> addrspace(205)* %1235, align 4096
+ %1252 = select <1024 x i1> %1250, <1024 x i32> %1247, <1024 x i32> %1251
+ store <1024 x i32> %1252, <1024 x i32> addrspace(205)* %12, align 4096, !alias.scope !141, !noalias !142
+ store <1024 x i32> %1252, <1024 x i32> addrspace(205)* %44, align 4096, !alias.scope !151, !noalias !152
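+ ; Descriptive note: from here on the redirect index is scaled into i4096
+ ; units before indexing %8 -- one <1024 x i32> tile spans eight i4096
+ ; elements, hence the shl by 3; the 0x7fffff8 mask does not change these
+ ; small indices -- instead of the direct <1024 x i32> gep on %4 used for the
+ ; earlier entries.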
+ %1253 = shl nuw nsw i32 %1078, 3
+ %1254 = and i32 %1253, 134217720
+ %1255 = getelementptr i4096, i4096 addrspace(205)* %8, i32 %1254
+ %1256 = bitcast i4096 addrspace(205)* %1255 to <1024 x i32> addrspace(205)*
+ %1257 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %1256)
+ %1258 = icmp eq i32 %1078, 5
+ %1259 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %46)
+ %1260 = select i1 %1258, i32 %1259, i32 %1257
+ %1261 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 %1260)
+ %1262 = bitcast <1024 x i32> addrspace(205)* %1261 to i4096 addrspace(205)*
+ %1263 = getelementptr i4096, i4096 addrspace(205)* %1262, i32 %294
+ %1264 = bitcast i4096 addrspace(205)* %1263 to <1024 x i32> addrspace(205)*
+ %1265 = tail call <1024 x i32> @llvm.tpu.vld.strided.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %1264, i32 255, i32 0)
+ %1266 = or i32 %1138, 5
+ %.splatinsert343 = insertelement <1024 x i32> undef, i32 %1266, i32 0
+ %.splat344 = shufflevector <1024 x i32> %.splatinsert343, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1267 = tail call i32 @llvm.tpu.set.permute.sublane(<1024 x i32> %.splat344, i32 0)
+ %1268 = tail call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %1143, i32 %1267, i32 0)
+ %1269 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %1268)
+ %1270 = bitcast <1024 x i32> %1265 to <1024 x float>
+ %1271 = bitcast <1024 x i32> %1269 to <1024 x float>
+ %1272 = fadd <1024 x float> %1270, %1271
+ %1273 = bitcast <1024 x float> %1272 to <1024 x i32>
+ %.splatinsert345 = insertelement <1024 x i32> undef, i32 %292, i32 0
+ %.splat346 = shufflevector <1024 x i32> %.splatinsert345, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1274 = icmp eq <1024 x i32> %1154, %.splat346
+ %.splatinsert349 = insertelement <1024 x i32> undef, i32 %294, i32 0
+ %.splat350 = shufflevector <1024 x i32> %.splatinsert349, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1275 = icmp eq <1024 x i32> %1156, %.splat350
+ %1276 = and <1024 x i1> %1274, %1275
+ %1277 = load <1024 x i32>, <1024 x i32> addrspace(205)* %1261, align 4096
+ %1278 = select <1024 x i1> %1276, <1024 x i32> %1273, <1024 x i32> %1277
+ store <1024 x i32> %1278, <1024 x i32> addrspace(205)* %13, align 4096, !alias.scope !141, !noalias !142
+ store <1024 x i32> %1278, <1024 x i32> addrspace(205)* %48, align 4096, !alias.scope !153, !noalias !154
+ %1279 = shl nuw nsw i32 %1048, 3
+ %1280 = and i32 %1279, 134217720
+ %1281 = getelementptr i4096, i4096 addrspace(205)* %8, i32 %1280
+ %1282 = bitcast i4096 addrspace(205)* %1281 to <1024 x i32> addrspace(205)*
+ %1283 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %1282)
+ %1284 = icmp eq i32 %1048, 6
+ %1285 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %50)
+ %1286 = select i1 %1284, i32 %1285, i32 %1283
+ %1287 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 %1286)
+ %1288 = bitcast <1024 x i32> addrspace(205)* %1287 to i4096 addrspace(205)*
+ %1289 = getelementptr i4096, i4096 addrspace(205)* %1288, i32 %300
+ %1290 = bitcast i4096 addrspace(205)* %1289 to <1024 x i32> addrspace(205)*
+ %1291 = tail call <1024 x i32> @llvm.tpu.vld.strided.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %1290, i32 255, i32 0)
+ %1292 = or i32 %1138, 6
+ %.splatinsert357 = insertelement <1024 x i32> undef, i32 %1292, i32 0
+ %.splat358 = shufflevector <1024 x i32> %.splatinsert357, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1293 = tail call i32 @llvm.tpu.set.permute.sublane(<1024 x i32> %.splat358, i32 0)
+ %1294 = tail call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %1143, i32 %1293, i32 0)
+ %1295 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %1294)
+ %1296 = bitcast <1024 x i32> %1291 to <1024 x float>
+ %1297 = bitcast <1024 x i32> %1295 to <1024 x float>
+ %1298 = fadd <1024 x float> %1296, %1297
+ %1299 = bitcast <1024 x float> %1298 to <1024 x i32>
+ %.splatinsert359 = insertelement <1024 x i32> undef, i32 %298, i32 0
+ %.splat360 = shufflevector <1024 x i32> %.splatinsert359, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1300 = icmp eq <1024 x i32> %1154, %.splat360
+ %.splatinsert363 = insertelement <1024 x i32> undef, i32 %300, i32 0
+ %.splat364 = shufflevector <1024 x i32> %.splatinsert363, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1301 = icmp eq <1024 x i32> %1156, %.splat364
+ %1302 = and <1024 x i1> %1300, %1301
+ %1303 = load <1024 x i32>, <1024 x i32> addrspace(205)* %1287, align 4096
+ %1304 = select <1024 x i1> %1302, <1024 x i32> %1299, <1024 x i32> %1303
+ store <1024 x i32> %1304, <1024 x i32> addrspace(205)* %14, align 4096, !alias.scope !141, !noalias !142
+ store <1024 x i32> %1304, <1024 x i32> addrspace(205)* %52, align 4096, !alias.scope !155, !noalias !156
+ %1305 = shl nuw nsw i32 %1012, 3
+ %1306 = and i32 %1305, 134217720
+ %1307 = getelementptr i4096, i4096 addrspace(205)* %8, i32 %1306
+ %1308 = bitcast i4096 addrspace(205)* %1307 to <1024 x i32> addrspace(205)*
+ %1309 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %1308)
+ %1310 = icmp eq i32 %1012, 7
+ %1311 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %54)
+ %1312 = select i1 %1310, i32 %1311, i32 %1309
+ %1313 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 %1312)
+ %1314 = bitcast <1024 x i32> addrspace(205)* %1313 to i4096 addrspace(205)*
+ %1315 = getelementptr i4096, i4096 addrspace(205)* %1314, i32 %306
+ %1316 = bitcast i4096 addrspace(205)* %1315 to <1024 x i32> addrspace(205)*
+ %1317 = tail call <1024 x i32> @llvm.tpu.vld.strided.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %1316, i32 255, i32 0)
+ %1318 = or i32 %1138, 7
+ %.splatinsert371 = insertelement <1024 x i32> undef, i32 %1318, i32 0
+ %.splat372 = shufflevector <1024 x i32> %.splatinsert371, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1319 = tail call i32 @llvm.tpu.set.permute.sublane(<1024 x i32> %.splat372, i32 0)
+ %1320 = tail call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %1143, i32 %1319, i32 0)
+ %1321 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %1320)
+ %1322 = bitcast <1024 x i32> %1317 to <1024 x float>
+ %1323 = bitcast <1024 x i32> %1321 to <1024 x float>
+ %1324 = fadd <1024 x float> %1322, %1323
+ %1325 = bitcast <1024 x float> %1324 to <1024 x i32>
+ %.splatinsert373 = insertelement <1024 x i32> undef, i32 %304, i32 0
+ %.splat374 = shufflevector <1024 x i32> %.splatinsert373, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1326 = icmp eq <1024 x i32> %1154, %.splat374
+ %.splatinsert377 = insertelement <1024 x i32> undef, i32 %306, i32 0
+ %.splat378 = shufflevector <1024 x i32> %.splatinsert377, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1327 = icmp eq <1024 x i32> %1156, %.splat378
+ %1328 = and <1024 x i1> %1326, %1327
+ %1329 = load <1024 x i32>, <1024 x i32> addrspace(205)* %1313, align 4096
+ %1330 = select <1024 x i1> %1328, <1024 x i32> %1325, <1024 x i32> %1329
+ store <1024 x i32> %1330, <1024 x i32> addrspace(205)* %15, align 4096, !alias.scope !141, !noalias !142
+ store <1024 x i32> %1330, <1024 x i32> addrspace(205)* %56, align 4096, !alias.scope !157, !noalias !158
+ %1331 = shl nuw nsw i32 %970, 3
+ %1332 = and i32 %1331, 134217720
+ %1333 = getelementptr i4096, i4096 addrspace(205)* %8, i32 %1332
+ %1334 = bitcast i4096 addrspace(205)* %1333 to <1024 x i32> addrspace(205)*
+ %1335 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %1334)
+ %1336 = icmp eq i32 %970, 8
+ %1337 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %58)
+ %1338 = select i1 %1336, i32 %1337, i32 %1335
+ %1339 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 %1338)
+ %1340 = bitcast <1024 x i32> addrspace(205)* %1339 to i4096 addrspace(205)*
+ %1341 = getelementptr i4096, i4096 addrspace(205)* %1340, i32 %312
+ %1342 = bitcast i4096 addrspace(205)* %1341 to <1024 x i32> addrspace(205)*
+ %1343 = tail call <1024 x i32> @llvm.tpu.vld.strided.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %1342, i32 255, i32 0)
+ %1344 = or i32 %1138, 8
+ %.splatinsert385 = insertelement <1024 x i32> undef, i32 %1344, i32 0
+ %.splat386 = shufflevector <1024 x i32> %.splatinsert385, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1345 = tail call i32 @llvm.tpu.set.permute.sublane(<1024 x i32> %.splat386, i32 0)
+ %1346 = tail call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %1143, i32 %1345, i32 0)
+ %1347 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %1346)
+ %1348 = bitcast <1024 x i32> %1343 to <1024 x float>
+ %1349 = bitcast <1024 x i32> %1347 to <1024 x float>
+ %1350 = fadd <1024 x float> %1348, %1349
+ %1351 = bitcast <1024 x float> %1350 to <1024 x i32>
+ %.splatinsert387 = insertelement <1024 x i32> undef, i32 %310, i32 0
+ %.splat388 = shufflevector <1024 x i32> %.splatinsert387, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1352 = icmp eq <1024 x i32> %1154, %.splat388
+ %.splatinsert391 = insertelement <1024 x i32> undef, i32 %312, i32 0
+ %.splat392 = shufflevector <1024 x i32> %.splatinsert391, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1353 = icmp eq <1024 x i32> %1156, %.splat392
+ %1354 = and <1024 x i1> %1352, %1353
+ %1355 = load <1024 x i32>, <1024 x i32> addrspace(205)* %1339, align 4096
+ %1356 = select <1024 x i1> %1354, <1024 x i32> %1351, <1024 x i32> %1355
+ store <1024 x i32> %1356, <1024 x i32> addrspace(205)* %16, align 4096, !alias.scope !141, !noalias !142
+ store <1024 x i32> %1356, <1024 x i32> addrspace(205)* %60, align 4096, !alias.scope !159, !noalias !160
+ %1357 = shl nuw nsw i32 %922, 3
+ %1358 = and i32 %1357, 134217720
+ %1359 = getelementptr i4096, i4096 addrspace(205)* %8, i32 %1358
+ %1360 = bitcast i4096 addrspace(205)* %1359 to <1024 x i32> addrspace(205)*
+ %1361 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %1360)
+ %1362 = icmp eq i32 %922, 9
+ %1363 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %62)
+ %1364 = select i1 %1362, i32 %1363, i32 %1361
+ %1365 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 %1364)
+ %1366 = bitcast <1024 x i32> addrspace(205)* %1365 to i4096 addrspace(205)*
+ %1367 = getelementptr i4096, i4096 addrspace(205)* %1366, i32 %318
+ %1368 = bitcast i4096 addrspace(205)* %1367 to <1024 x i32> addrspace(205)*
+ %1369 = tail call <1024 x i32> @llvm.tpu.vld.strided.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %1368, i32 255, i32 0)
+ %1370 = or i32 %1138, 9
+ %.splatinsert399 = insertelement <1024 x i32> undef, i32 %1370, i32 0
+ %.splat400 = shufflevector <1024 x i32> %.splatinsert399, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1371 = tail call i32 @llvm.tpu.set.permute.sublane(<1024 x i32> %.splat400, i32 0)
+ %1372 = tail call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %1143, i32 %1371, i32 0)
+ %1373 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %1372)
+ %1374 = bitcast <1024 x i32> %1369 to <1024 x float>
+ %1375 = bitcast <1024 x i32> %1373 to <1024 x float>
+ %1376 = fadd <1024 x float> %1374, %1375
+ %1377 = bitcast <1024 x float> %1376 to <1024 x i32>
+ %.splatinsert401 = insertelement <1024 x i32> undef, i32 %316, i32 0
+ %.splat402 = shufflevector <1024 x i32> %.splatinsert401, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1378 = icmp eq <1024 x i32> %1154, %.splat402
+ %.splatinsert405 = insertelement <1024 x i32> undef, i32 %318, i32 0
+ %.splat406 = shufflevector <1024 x i32> %.splatinsert405, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1379 = icmp eq <1024 x i32> %1156, %.splat406
+ %1380 = and <1024 x i1> %1378, %1379
+ %1381 = load <1024 x i32>, <1024 x i32> addrspace(205)* %1365, align 4096
+ %1382 = select <1024 x i1> %1380, <1024 x i32> %1377, <1024 x i32> %1381
+ store <1024 x i32> %1382, <1024 x i32> addrspace(205)* %17, align 4096, !alias.scope !141, !noalias !142
+ store <1024 x i32> %1382, <1024 x i32> addrspace(205)* %64, align 4096, !alias.scope !161, !noalias !162
+ %1383 = shl nuw nsw i32 %868, 3
+ %1384 = and i32 %1383, 134217720
+ %1385 = getelementptr i4096, i4096 addrspace(205)* %8, i32 %1384
+ %1386 = bitcast i4096 addrspace(205)* %1385 to <1024 x i32> addrspace(205)*
+ %1387 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %1386)
+ %1388 = icmp eq i32 %868, 10
+ %1389 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %66)
+ %1390 = select i1 %1388, i32 %1389, i32 %1387
+ %1391 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 %1390)
+ %1392 = bitcast <1024 x i32> addrspace(205)* %1391 to i4096 addrspace(205)*
+ %1393 = getelementptr i4096, i4096 addrspace(205)* %1392, i32 %324
+ %1394 = bitcast i4096 addrspace(205)* %1393 to <1024 x i32> addrspace(205)*
+ %1395 = tail call <1024 x i32> @llvm.tpu.vld.strided.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %1394, i32 255, i32 0)
+ %1396 = or i32 %1138, 10
+ %.splatinsert413 = insertelement <1024 x i32> undef, i32 %1396, i32 0
+ %.splat414 = shufflevector <1024 x i32> %.splatinsert413, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1397 = tail call i32 @llvm.tpu.set.permute.sublane(<1024 x i32> %.splat414, i32 0)
+ %1398 = tail call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %1143, i32 %1397, i32 0)
+ %1399 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %1398)
+ %1400 = bitcast <1024 x i32> %1395 to <1024 x float>
+ %1401 = bitcast <1024 x i32> %1399 to <1024 x float>
+ %1402 = fadd <1024 x float> %1400, %1401
+ %1403 = bitcast <1024 x float> %1402 to <1024 x i32>
+ %.splatinsert415 = insertelement <1024 x i32> undef, i32 %322, i32 0
+ %.splat416 = shufflevector <1024 x i32> %.splatinsert415, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1404 = icmp eq <1024 x i32> %1154, %.splat416
+ %.splatinsert419 = insertelement <1024 x i32> undef, i32 %324, i32 0
+ %.splat420 = shufflevector <1024 x i32> %.splatinsert419, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1405 = icmp eq <1024 x i32> %1156, %.splat420
+ %1406 = and <1024 x i1> %1404, %1405
+ %1407 = load <1024 x i32>, <1024 x i32> addrspace(205)* %1391, align 4096
+ %1408 = select <1024 x i1> %1406, <1024 x i32> %1403, <1024 x i32> %1407
+ store <1024 x i32> %1408, <1024 x i32> addrspace(205)* %18, align 4096, !alias.scope !141, !noalias !142
+ store <1024 x i32> %1408, <1024 x i32> addrspace(205)* %68, align 4096, !alias.scope !163, !noalias !164
+ %1409 = shl nuw nsw i32 %808, 3
+ %1410 = and i32 %1409, 134217720
+ %1411 = getelementptr i4096, i4096 addrspace(205)* %8, i32 %1410
+ %1412 = bitcast i4096 addrspace(205)* %1411 to <1024 x i32> addrspace(205)*
+ %1413 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %1412)
+ %1414 = icmp eq i32 %808, 11
+ %1415 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %70)
+ %1416 = select i1 %1414, i32 %1415, i32 %1413
+ %1417 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 %1416)
+ %1418 = bitcast <1024 x i32> addrspace(205)* %1417 to i4096 addrspace(205)*
+ %1419 = getelementptr i4096, i4096 addrspace(205)* %1418, i32 %330
+ %1420 = bitcast i4096 addrspace(205)* %1419 to <1024 x i32> addrspace(205)*
+ %1421 = tail call <1024 x i32> @llvm.tpu.vld.strided.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %1420, i32 255, i32 0)
+ %1422 = or i32 %1138, 11
+ %.splatinsert427 = insertelement <1024 x i32> undef, i32 %1422, i32 0
+ %.splat428 = shufflevector <1024 x i32> %.splatinsert427, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1423 = tail call i32 @llvm.tpu.set.permute.sublane(<1024 x i32> %.splat428, i32 0)
+ %1424 = tail call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %1143, i32 %1423, i32 0)
+ %1425 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %1424)
+ %1426 = bitcast <1024 x i32> %1421 to <1024 x float>
+ %1427 = bitcast <1024 x i32> %1425 to <1024 x float>
+ %1428 = fadd <1024 x float> %1426, %1427
+ %1429 = bitcast <1024 x float> %1428 to <1024 x i32>
+ %.splatinsert429 = insertelement <1024 x i32> undef, i32 %328, i32 0
+ %.splat430 = shufflevector <1024 x i32> %.splatinsert429, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1430 = icmp eq <1024 x i32> %1154, %.splat430
+ %.splatinsert433 = insertelement <1024 x i32> undef, i32 %330, i32 0
+ %.splat434 = shufflevector <1024 x i32> %.splatinsert433, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1431 = icmp eq <1024 x i32> %1156, %.splat434
+ %1432 = and <1024 x i1> %1430, %1431
+ %1433 = load <1024 x i32>, <1024 x i32> addrspace(205)* %1417, align 4096
+ %1434 = select <1024 x i1> %1432, <1024 x i32> %1429, <1024 x i32> %1433
+ store <1024 x i32> %1434, <1024 x i32> addrspace(205)* %19, align 4096, !alias.scope !141, !noalias !142
+ store <1024 x i32> %1434, <1024 x i32> addrspace(205)* %72, align 4096, !alias.scope !165, !noalias !166
+ %1435 = shl nuw nsw i32 %742, 3
+ %1436 = and i32 %1435, 134217720
+ %1437 = getelementptr i4096, i4096 addrspace(205)* %8, i32 %1436
+ %1438 = bitcast i4096 addrspace(205)* %1437 to <1024 x i32> addrspace(205)*
+ %1439 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %1438)
+ %1440 = icmp eq i32 %742, 12
+ %1441 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %74)
+ %1442 = select i1 %1440, i32 %1441, i32 %1439
+ %1443 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 %1442)
+ %1444 = bitcast <1024 x i32> addrspace(205)* %1443 to i4096 addrspace(205)*
+ %1445 = getelementptr i4096, i4096 addrspace(205)* %1444, i32 %336
+ %1446 = bitcast i4096 addrspace(205)* %1445 to <1024 x i32> addrspace(205)*
+ %1447 = tail call <1024 x i32> @llvm.tpu.vld.strided.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %1446, i32 255, i32 0)
+ %1448 = or i32 %1138, 12
+ %.splatinsert441 = insertelement <1024 x i32> undef, i32 %1448, i32 0
+ %.splat442 = shufflevector <1024 x i32> %.splatinsert441, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1449 = tail call i32 @llvm.tpu.set.permute.sublane(<1024 x i32> %.splat442, i32 0)
+ %1450 = tail call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %1143, i32 %1449, i32 0)
+ %1451 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %1450)
+ %1452 = bitcast <1024 x i32> %1447 to <1024 x float>
+ %1453 = bitcast <1024 x i32> %1451 to <1024 x float>
+ %1454 = fadd <1024 x float> %1452, %1453
+ %1455 = bitcast <1024 x float> %1454 to <1024 x i32>
+ %.splatinsert443 = insertelement <1024 x i32> undef, i32 %334, i32 0
+ %.splat444 = shufflevector <1024 x i32> %.splatinsert443, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1456 = icmp eq <1024 x i32> %1154, %.splat444
+ %.splatinsert447 = insertelement <1024 x i32> undef, i32 %336, i32 0
+ %.splat448 = shufflevector <1024 x i32> %.splatinsert447, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1457 = icmp eq <1024 x i32> %1156, %.splat448
+ %1458 = and <1024 x i1> %1456, %1457
+ %1459 = load <1024 x i32>, <1024 x i32> addrspace(205)* %1443, align 4096
+ %1460 = select <1024 x i1> %1458, <1024 x i32> %1455, <1024 x i32> %1459
+ store <1024 x i32> %1460, <1024 x i32> addrspace(205)* %20, align 4096, !alias.scope !141, !noalias !142
+ store <1024 x i32> %1460, <1024 x i32> addrspace(205)* %76, align 4096, !alias.scope !167, !noalias !168
+ %1461 = shl nuw nsw i32 %670, 3
+ %1462 = and i32 %1461, 134217720
+ %1463 = getelementptr i4096, i4096 addrspace(205)* %8, i32 %1462
+ %1464 = bitcast i4096 addrspace(205)* %1463 to <1024 x i32> addrspace(205)*
+ %1465 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %1464)
+ %1466 = icmp eq i32 %670, 13
+ %1467 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %78)
+ %1468 = select i1 %1466, i32 %1467, i32 %1465
+ %1469 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 %1468)
+ %1470 = bitcast <1024 x i32> addrspace(205)* %1469 to i4096 addrspace(205)*
+ %1471 = getelementptr i4096, i4096 addrspace(205)* %1470, i32 %342
+ %1472 = bitcast i4096 addrspace(205)* %1471 to <1024 x i32> addrspace(205)*
+ %1473 = tail call <1024 x i32> @llvm.tpu.vld.strided.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %1472, i32 255, i32 0)
+ %1474 = or i32 %1138, 13
+ %.splatinsert455 = insertelement <1024 x i32> undef, i32 %1474, i32 0
+ %.splat456 = shufflevector <1024 x i32> %.splatinsert455, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1475 = tail call i32 @llvm.tpu.set.permute.sublane(<1024 x i32> %.splat456, i32 0)
+ %1476 = tail call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %1143, i32 %1475, i32 0)
+ %1477 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %1476)
+ %1478 = bitcast <1024 x i32> %1473 to <1024 x float>
+ %1479 = bitcast <1024 x i32> %1477 to <1024 x float>
+ %1480 = fadd <1024 x float> %1478, %1479
+ %1481 = bitcast <1024 x float> %1480 to <1024 x i32>
+ %.splatinsert457 = insertelement <1024 x i32> undef, i32 %340, i32 0
+ %.splat458 = shufflevector <1024 x i32> %.splatinsert457, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1482 = icmp eq <1024 x i32> %1154, %.splat458
+ %.splatinsert461 = insertelement <1024 x i32> undef, i32 %342, i32 0
+ %.splat462 = shufflevector <1024 x i32> %.splatinsert461, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1483 = icmp eq <1024 x i32> %1156, %.splat462
+ %1484 = and <1024 x i1> %1482, %1483
+ %1485 = load <1024 x i32>, <1024 x i32> addrspace(205)* %1469, align 4096
+ %1486 = select <1024 x i1> %1484, <1024 x i32> %1481, <1024 x i32> %1485
+ store <1024 x i32> %1486, <1024 x i32> addrspace(205)* %21, align 4096, !alias.scope !141, !noalias !142
+ store <1024 x i32> %1486, <1024 x i32> addrspace(205)* %80, align 4096, !alias.scope !169, !noalias !170
+ %1487 = shl nuw nsw i32 %592, 3
+ %1488 = and i32 %1487, 134217720
+ %1489 = getelementptr i4096, i4096 addrspace(205)* %8, i32 %1488
+ %1490 = bitcast i4096 addrspace(205)* %1489 to <1024 x i32> addrspace(205)*
+ %1491 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %1490)
+ %1492 = icmp eq i32 %592, 14
+ %1493 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %82)
+ %1494 = select i1 %1492, i32 %1493, i32 %1491
+ %1495 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 %1494)
+ %1496 = bitcast <1024 x i32> addrspace(205)* %1495 to i4096 addrspace(205)*
+ %1497 = getelementptr i4096, i4096 addrspace(205)* %1496, i32 %348
+ %1498 = bitcast i4096 addrspace(205)* %1497 to <1024 x i32> addrspace(205)*
+ %1499 = tail call <1024 x i32> @llvm.tpu.vld.strided.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %1498, i32 255, i32 0)
+ %1500 = or i32 %1138, 14
+ %.splatinsert469 = insertelement <1024 x i32> undef, i32 %1500, i32 0
+ %.splat470 = shufflevector <1024 x i32> %.splatinsert469, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1501 = tail call i32 @llvm.tpu.set.permute.sublane(<1024 x i32> %.splat470, i32 0)
+ %1502 = tail call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %1143, i32 %1501, i32 0)
+ %1503 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %1502)
+ %1504 = bitcast <1024 x i32> %1499 to <1024 x float>
+ %1505 = bitcast <1024 x i32> %1503 to <1024 x float>
+ %1506 = fadd <1024 x float> %1504, %1505
+ %1507 = bitcast <1024 x float> %1506 to <1024 x i32>
+ %.splatinsert471 = insertelement <1024 x i32> undef, i32 %346, i32 0
+ %.splat472 = shufflevector <1024 x i32> %.splatinsert471, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1508 = icmp eq <1024 x i32> %1154, %.splat472
+ %.splatinsert475 = insertelement <1024 x i32> undef, i32 %348, i32 0
+ %.splat476 = shufflevector <1024 x i32> %.splatinsert475, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1509 = icmp eq <1024 x i32> %1156, %.splat476
+ %1510 = and <1024 x i1> %1508, %1509
+ %1511 = load <1024 x i32>, <1024 x i32> addrspace(205)* %1495, align 4096
+ %1512 = select <1024 x i1> %1510, <1024 x i32> %1507, <1024 x i32> %1511
+ store <1024 x i32> %1512, <1024 x i32> addrspace(205)* %22, align 4096, !alias.scope !141, !noalias !142
+ store <1024 x i32> %1512, <1024 x i32> addrspace(205)* %84, align 4096, !alias.scope !171, !noalias !172
+ %1513 = shl nuw nsw i32 %508, 3
+ %1514 = and i32 %1513, 134217720
+ %1515 = getelementptr i4096, i4096 addrspace(205)* %8, i32 %1514
+ %1516 = bitcast i4096 addrspace(205)* %1515 to <1024 x i32> addrspace(205)*
+ %1517 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %1516)
+ %1518 = icmp eq i32 %508, 15
+ %1519 = tail call i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)* %86)
+ %1520 = select i1 %1518, i32 %1519, i32 %1517
+ %1521 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 %1520)
+ %1522 = bitcast <1024 x i32> addrspace(205)* %1521 to i4096 addrspace(205)*
+ %1523 = getelementptr i4096, i4096 addrspace(205)* %1522, i32 %354
+ %1524 = bitcast i4096 addrspace(205)* %1523 to <1024 x i32> addrspace(205)*
+ %1525 = tail call <1024 x i32> @llvm.tpu.vld.strided.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)* %1524, i32 255, i32 0)
+ %1526 = or i32 %1138, 15
+ %.splatinsert483 = insertelement <1024 x i32> undef, i32 %1526, i32 0
+ %.splat484 = shufflevector <1024 x i32> %.splatinsert483, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1527 = tail call i32 @llvm.tpu.set.permute.sublane(<1024 x i32> %.splat484, i32 0)
+ %1528 = tail call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %1143, i32 %1527, i32 0)
+ %1529 = tail call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %1528)
+ %1530 = bitcast <1024 x i32> %1525 to <1024 x float>
+ %1531 = bitcast <1024 x i32> %1529 to <1024 x float>
+ %1532 = fadd <1024 x float> %1530, %1531
+ %1533 = bitcast <1024 x float> %1532 to <1024 x i32>
+ %.splatinsert485 = insertelement <1024 x i32> undef, i32 %352, i32 0
+ %.splat486 = shufflevector <1024 x i32> %.splatinsert485, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1534 = icmp eq <1024 x i32> %1154, %.splat486
+ %.splatinsert489 = insertelement <1024 x i32> undef, i32 %354, i32 0
+ %.splat490 = shufflevector <1024 x i32> %.splatinsert489, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %1535 = icmp eq <1024 x i32> %1156, %.splat490
+ %1536 = and <1024 x i1> %1534, %1535
+ %1537 = load <1024 x i32>, <1024 x i32> addrspace(205)* %1521, align 4096
+ %1538 = select <1024 x i1> %1536, <1024 x i32> %1533, <1024 x i32> %1537
+ store <1024 x i32> %1538, <1024 x i32> addrspace(205)* %23, align 4096, !alias.scope !141, !noalias !142
+ store <1024 x i32> %1538, <1024 x i32> addrspace(205)* %88, align 4096, !alias.scope !173, !noalias !174
+ %1539 = tail call <1024 x i32> @llvm.tpu.vrot.sublane.down.v1024i32(<1024 x i32> %1159)
+ %1540 = bitcast <1024 x i32> %1539 to <1024 x float>
+ %1541 = bitcast <1024 x i32> %1159 to <1024 x float>
+ %1542 = tail call <1024 x float> @llvm.tpu.pack.v1024f32(<1024 x float> %1540, <1024 x float> %1541)
+ %1543 = and <1024 x i32> %206, <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, 
i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, 
i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>
+ %1544 = icmp eq <1024 x i32> %1543, zeroinitializer
+ %1545 = bitcast <1024 x float> %1542 to <1024 x i32>
+ tail call void @llvm.tpu.vst.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> %1545, <1024 x i32> addrspace(205)* %29, i32 255, i32 1, <1024 x i1> %1544, i32 undef), !alias.scope !175, !noalias !176
+ %1546 = tail call <1024 x i32> @llvm.tpu.vrot.sublane.down.v1024i32(<1024 x i32> %1183)
+ %1547 = bitcast <1024 x i32> %1546 to <1024 x float>
+ %1548 = bitcast <1024 x i32> %1183 to <1024 x float>
+ %1549 = tail call <1024 x float> @llvm.tpu.pack.v1024f32(<1024 x float> %1547, <1024 x float> %1548)
+ %1550 = bitcast <1024 x float> %1549 to <1024 x i32>
+ tail call void @llvm.tpu.vst.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> %1550, <1024 x i32> addrspace(205)* %33, i32 255, i32 1, <1024 x i1> %1544, i32 undef), !alias.scope !177, !noalias !178
+ %1551 = tail call <1024 x i32> @llvm.tpu.vrot.sublane.down.v1024i32(<1024 x i32> %1206)
+ %1552 = bitcast <1024 x i32> %1551 to <1024 x float>
+ %1553 = bitcast <1024 x i32> %1206 to <1024 x float>
+ %1554 = tail call <1024 x float> @llvm.tpu.pack.v1024f32(<1024 x float> %1552, <1024 x float> %1553)
+ %1555 = bitcast <1024 x float> %1554 to <1024 x i32>
+ tail call void @llvm.tpu.vst.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> %1555, <1024 x i32> addrspace(205)* %37, i32 255, i32 1, <1024 x i1> %1544, i32 undef), !alias.scope !179, !noalias !180
+ %1556 = tail call <1024 x i32> @llvm.tpu.vrot.sublane.down.v1024i32(<1024 x i32> %1229)
+ %1557 = bitcast <1024 x i32> %1556 to <1024 x float>
+ %1558 = bitcast <1024 x i32> %1229 to <1024 x float>
+ %1559 = tail call <1024 x float> @llvm.tpu.pack.v1024f32(<1024 x float> %1557, <1024 x float> %1558)
+ %1560 = bitcast <1024 x float> %1559 to <1024 x i32>
+ tail call void @llvm.tpu.vst.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> %1560, <1024 x i32> addrspace(205)* %41, i32 255, i32 1, <1024 x i1> %1544, i32 undef), !alias.scope !181, !noalias !182
+ %1561 = tail call <1024 x i32> @llvm.tpu.vrot.sublane.down.v1024i32(<1024 x i32> %1252)
+ %1562 = bitcast <1024 x i32> %1561 to <1024 x float>
+ %1563 = bitcast <1024 x i32> %1252 to <1024 x float>
+ %1564 = tail call <1024 x float> @llvm.tpu.pack.v1024f32(<1024 x float> %1562, <1024 x float> %1563)
+ %1565 = bitcast <1024 x float> %1564 to <1024 x i32>
+ tail call void @llvm.tpu.vst.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> %1565, <1024 x i32> addrspace(205)* %45, i32 255, i32 1, <1024 x i1> %1544, i32 undef), !alias.scope !183, !noalias !184
+ %1566 = tail call <1024 x i32> @llvm.tpu.vrot.sublane.down.v1024i32(<1024 x i32> %1278)
+ %1567 = bitcast <1024 x i32> %1566 to <1024 x float>
+ %1568 = bitcast <1024 x i32> %1278 to <1024 x float>
+ %1569 = tail call <1024 x float> @llvm.tpu.pack.v1024f32(<1024 x float> %1567, <1024 x float> %1568)
+ %1570 = bitcast <1024 x float> %1569 to <1024 x i32>
+ tail call void @llvm.tpu.vst.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> %1570, <1024 x i32> addrspace(205)* %49, i32 255, i32 1, <1024 x i1> %1544, i32 undef), !alias.scope !185, !noalias !186
+ %1571 = tail call <1024 x i32> @llvm.tpu.vrot.sublane.down.v1024i32(<1024 x i32> %1304)
+ %1572 = bitcast <1024 x i32> %1571 to <1024 x float>
+ %1573 = bitcast <1024 x i32> %1304 to <1024 x float>
+ %1574 = tail call <1024 x float> @llvm.tpu.pack.v1024f32(<1024 x float> %1572, <1024 x float> %1573)
+ %1575 = bitcast <1024 x float> %1574 to <1024 x i32>
+ tail call void @llvm.tpu.vst.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> %1575, <1024 x i32> addrspace(205)* %53, i32 255, i32 1, <1024 x i1> %1544, i32 undef), !alias.scope !187, !noalias !188
+ %1576 = tail call <1024 x i32> @llvm.tpu.vrot.sublane.down.v1024i32(<1024 x i32> %1330)
+ %1577 = bitcast <1024 x i32> %1576 to <1024 x float>
+ %1578 = bitcast <1024 x i32> %1330 to <1024 x float>
+ %1579 = tail call <1024 x float> @llvm.tpu.pack.v1024f32(<1024 x float> %1577, <1024 x float> %1578)
+ %1580 = bitcast <1024 x float> %1579 to <1024 x i32>
+ tail call void @llvm.tpu.vst.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> %1580, <1024 x i32> addrspace(205)* %57, i32 255, i32 1, <1024 x i1> %1544, i32 undef), !alias.scope !189, !noalias !190
+ %1581 = tail call <1024 x i32> @llvm.tpu.vrot.sublane.down.v1024i32(<1024 x i32> %1356)
+ %1582 = bitcast <1024 x i32> %1581 to <1024 x float>
+ %1583 = bitcast <1024 x i32> %1356 to <1024 x float>
+ %1584 = tail call <1024 x float> @llvm.tpu.pack.v1024f32(<1024 x float> %1582, <1024 x float> %1583)
+ %1585 = bitcast <1024 x float> %1584 to <1024 x i32>
+ tail call void @llvm.tpu.vst.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> %1585, <1024 x i32> addrspace(205)* %61, i32 255, i32 1, <1024 x i1> %1544, i32 undef), !alias.scope !191, !noalias !192
+ %1586 = tail call <1024 x i32> @llvm.tpu.vrot.sublane.down.v1024i32(<1024 x i32> %1382)
+ %1587 = bitcast <1024 x i32> %1586 to <1024 x float>
+ %1588 = bitcast <1024 x i32> %1382 to <1024 x float>
+ %1589 = tail call <1024 x float> @llvm.tpu.pack.v1024f32(<1024 x float> %1587, <1024 x float> %1588)
+ %1590 = bitcast <1024 x float> %1589 to <1024 x i32>
+ tail call void @llvm.tpu.vst.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> %1590, <1024 x i32> addrspace(205)* %65, i32 255, i32 1, <1024 x i1> %1544, i32 undef), !alias.scope !193, !noalias !194
+ %1591 = tail call <1024 x i32> @llvm.tpu.vrot.sublane.down.v1024i32(<1024 x i32> %1408)
+ %1592 = bitcast <1024 x i32> %1591 to <1024 x float>
+ %1593 = bitcast <1024 x i32> %1408 to <1024 x float>
+ %1594 = tail call <1024 x float> @llvm.tpu.pack.v1024f32(<1024 x float> %1592, <1024 x float> %1593)
+ %1595 = bitcast <1024 x float> %1594 to <1024 x i32>
+ tail call void @llvm.tpu.vst.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> %1595, <1024 x i32> addrspace(205)* %69, i32 255, i32 1, <1024 x i1> %1544, i32 undef), !alias.scope !195, !noalias !196
+ %1596 = tail call <1024 x i32> @llvm.tpu.vrot.sublane.down.v1024i32(<1024 x i32> %1434)
+ %1597 = bitcast <1024 x i32> %1596 to <1024 x float>
+ %1598 = bitcast <1024 x i32> %1434 to <1024 x float>
+ %1599 = tail call <1024 x float> @llvm.tpu.pack.v1024f32(<1024 x float> %1597, <1024 x float> %1598)
+ %1600 = bitcast <1024 x float> %1599 to <1024 x i32>
+ tail call void @llvm.tpu.vst.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> %1600, <1024 x i32> addrspace(205)* %73, i32 255, i32 1, <1024 x i1> %1544, i32 undef), !alias.scope !197, !noalias !198
+ %1601 = tail call <1024 x i32> @llvm.tpu.vrot.sublane.down.v1024i32(<1024 x i32> %1460)
+ %1602 = bitcast <1024 x i32> %1601 to <1024 x float>
+ %1603 = bitcast <1024 x i32> %1460 to <1024 x float>
+ %1604 = tail call <1024 x float> @llvm.tpu.pack.v1024f32(<1024 x float> %1602, <1024 x float> %1603)
+ %1605 = bitcast <1024 x float> %1604 to <1024 x i32>
+ tail call void @llvm.tpu.vst.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> %1605, <1024 x i32> addrspace(205)* %77, i32 255, i32 1, <1024 x i1> %1544, i32 undef), !alias.scope !199, !noalias !200
+ %1606 = tail call <1024 x i32> @llvm.tpu.vrot.sublane.down.v1024i32(<1024 x i32> %1486)
+ %1607 = bitcast <1024 x i32> %1606 to <1024 x float>
+ %1608 = bitcast <1024 x i32> %1486 to <1024 x float>
+ %1609 = tail call <1024 x float> @llvm.tpu.pack.v1024f32(<1024 x float> %1607, <1024 x float> %1608)
+ %1610 = bitcast <1024 x float> %1609 to <1024 x i32>
+ tail call void @llvm.tpu.vst.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> %1610, <1024 x i32> addrspace(205)* %81, i32 255, i32 1, <1024 x i1> %1544, i32 undef), !alias.scope !201, !noalias !202
+ %1611 = tail call <1024 x i32> @llvm.tpu.vrot.sublane.down.v1024i32(<1024 x i32> %1512)
+ %1612 = bitcast <1024 x i32> %1611 to <1024 x float>
+ %1613 = bitcast <1024 x i32> %1512 to <1024 x float>
+ %1614 = tail call <1024 x float> @llvm.tpu.pack.v1024f32(<1024 x float> %1612, <1024 x float> %1613)
+ %1615 = bitcast <1024 x float> %1614 to <1024 x i32>
+ tail call void @llvm.tpu.vst.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> %1615, <1024 x i32> addrspace(205)* %85, i32 255, i32 1, <1024 x i1> %1544, i32 undef), !alias.scope !203, !noalias !204
+ %1616 = tail call <1024 x i32> @llvm.tpu.vrot.sublane.down.v1024i32(<1024 x i32> %1538)
+ %1617 = bitcast <1024 x i32> %1616 to <1024 x float>
+ %1618 = bitcast <1024 x i32> %1538 to <1024 x float>
+ %1619 = tail call <1024 x float> @llvm.tpu.pack.v1024f32(<1024 x float> %1617, <1024 x float> %1618)
+ %1620 = bitcast <1024 x float> %1619 to <1024 x i32>
+ tail call void @llvm.tpu.vst.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> %1620, <1024 x i32> addrspace(205)* %89, i32 255, i32 1, <1024 x i1> %1544, i32 undef), !alias.scope !205, !noalias !206
+ %1621 = or i32 %99, 1
+ %1622 = getelementptr i32, i32* %92, i32 %1621
+ %1623 = load i32, i32* %1622, align 4
+ %1624 = icmp ugt i32 %261, 31
+ %1625 = icmp ne i32 %1623, 0
+ %1626 = icmp ugt i32 %263, 15
+ %1627 = or i1 %1624, %1625
+ %1628 = or i1 %1626, %1627
+ %1629 = and i32 %365, 1
+ %1630 = select i1 %1628, i32 2, i32 0
+ %1631 = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 17)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 %1630)
+ %1632 = or i32 %1630, %1629
+ %1633 = shl nuw nsw i32 %1632, 1
+ %1634 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %1633
+ %1635 = getelementptr i8192, i8192 addrspace(203)* %1634, i32 2
+ %1636 = tail call i32 @llvm.tpu.ptrtoint.p203i8192(i8192 addrspace(203)* %1635)
+ %1637 = tail call i32 @llvm.tpu.ptrtoint.p203i8192(i8192 addrspace(203)* %24)
+ %1638 = icmp sge i32 %1637, %1636
+ %1639 = or i1 %1628, %1638
+ %1640 = xor i1 %1639, true
+ tail call void @llvm.tpu.halt.trap(i1 %1640)
+ br i1 %1628, label %llo-region-14.join, label %llo-region-14.pred
+
+llo-region-14.join: ; preds = %llo-region-14, %llo-region-14.pred
+ %1641 = load i32, i32* %266, align 4
+ %1642 = or i32 %99, 5
+ %1643 = getelementptr i32, i32* %92, i32 %1642
+ %1644 = load i32, i32* %1643, align 4
+ %1645 = load i32, i32* %107, align 4
+ %1646 = icmp ugt i32 %1641, 31
+ %1647 = icmp ne i32 %1644, 0
+ %1648 = icmp ugt i32 %1645, 15
+ %1649 = or i1 %1646, %1647
+ %1650 = or i1 %1649, %1648
+ %1651 = lshr i32 %1645, 3
+ %1652 = and i32 %1651, 1
+ %1653 = select i1 %1650, i32 2, i32 0
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 %1653)
+ %1654 = or i32 %1653, %1652
+ %1655 = shl nuw nsw i32 %1654, 1
+ %1656 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %1655
+ %1657 = getelementptr i8192, i8192 addrspace(203)* %1656, i32 2
+ %1658 = tail call i32 @llvm.tpu.ptrtoint.p203i8192(i8192 addrspace(203)* %1657)
+ %1659 = icmp sge i32 %1637, %1658
+ %1660 = or i1 %1659, %1650
+ %1661 = xor i1 %1660, true
+ tail call void @llvm.tpu.halt.trap(i1 %1661)
+ br i1 %1650, label %llo-region-14.join562, label %llo-region-14.pred561
+
+llo-region-14.join562: ; preds = %llo-region-14.join, %llo-region-14.pred561
+ %1662 = load i32, i32* %272, align 4
+ %1663 = or i32 %99, 9
+ %1664 = getelementptr i32, i32* %92, i32 %1663
+ %1665 = load i32, i32* %1664, align 4
+ %1666 = load i32, i32* %113, align 4
+ %1667 = icmp ugt i32 %1662, 31
+ %1668 = icmp ne i32 %1665, 0
+ %1669 = icmp ugt i32 %1666, 15
+ %1670 = or i1 %1667, %1668
+ %1671 = or i1 %1670, %1669
+ %1672 = lshr i32 %1666, 3
+ %1673 = and i32 %1672, 1
+ %1674 = select i1 %1671, i32 2, i32 0
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 %1674)
+ %1675 = or i32 %1674, %1673
+ %1676 = shl nuw nsw i32 %1675, 1
+ %1677 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %1676
+ %1678 = getelementptr i8192, i8192 addrspace(203)* %1677, i32 2
+ %1679 = tail call i32 @llvm.tpu.ptrtoint.p203i8192(i8192 addrspace(203)* %1678)
+ %1680 = icmp sge i32 %1637, %1679
+ %1681 = or i1 %1680, %1671
+ %1682 = xor i1 %1681, true
+ tail call void @llvm.tpu.halt.trap(i1 %1682)
+ br i1 %1671, label %llo-region-14.join564, label %llo-region-14.pred563
+
+llo-region-14.join564: ; preds = %llo-region-14.join562, %llo-region-14.pred563
+ %1683 = load i32, i32* %278, align 4
+ %1684 = or i32 %99, 13
+ %1685 = getelementptr i32, i32* %92, i32 %1684
+ %1686 = load i32, i32* %1685, align 4
+ %1687 = load i32, i32* %119, align 4
+ %1688 = icmp ugt i32 %1683, 31
+ %1689 = icmp ne i32 %1686, 0
+ %1690 = icmp ugt i32 %1687, 15
+ %1691 = or i1 %1688, %1689
+ %1692 = or i1 %1691, %1690
+ %1693 = lshr i32 %1687, 3
+ %1694 = and i32 %1693, 1
+ %1695 = select i1 %1692, i32 2, i32 0
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 %1695)
+ %1696 = or i32 %1695, %1694
+ %1697 = shl nuw nsw i32 %1696, 1
+ %1698 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %1697
+ %1699 = getelementptr i8192, i8192 addrspace(203)* %1698, i32 2
+ %1700 = tail call i32 @llvm.tpu.ptrtoint.p203i8192(i8192 addrspace(203)* %1699)
+ %1701 = icmp sge i32 %1637, %1700
+ %1702 = or i1 %1701, %1692
+ %1703 = xor i1 %1702, true
+ tail call void @llvm.tpu.halt.trap(i1 %1703)
+ br i1 %1692, label %llo-region-14.join566, label %llo-region-14.pred565
+
+llo-region-14.join566: ; preds = %llo-region-14.join564, %llo-region-14.pred565
+ %1704 = load i32, i32* %284, align 4
+ %1705 = or i32 %99, 17
+ %1706 = getelementptr i32, i32* %92, i32 %1705
+ %1707 = load i32, i32* %1706, align 4
+ %1708 = load i32, i32* %125, align 4
+ %1709 = icmp ugt i32 %1704, 31
+ %1710 = icmp ne i32 %1707, 0
+ %1711 = icmp ugt i32 %1708, 15
+ %1712 = or i1 %1709, %1710
+ %1713 = or i1 %1712, %1711
+ %1714 = lshr i32 %1708, 3
+ %1715 = and i32 %1714, 1
+ %1716 = select i1 %1713, i32 2, i32 0
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 %1716)
+ %1717 = or i32 %1716, %1715
+ %1718 = shl nuw nsw i32 %1717, 1
+ %1719 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %1718
+ %1720 = getelementptr i8192, i8192 addrspace(203)* %1719, i32 2
+ %1721 = tail call i32 @llvm.tpu.ptrtoint.p203i8192(i8192 addrspace(203)* %1720)
+ %1722 = icmp sge i32 %1637, %1721
+ %1723 = or i1 %1722, %1713
+ %1724 = xor i1 %1723, true
+ tail call void @llvm.tpu.halt.trap(i1 %1724)
+ br i1 %1713, label %llo-region-14.join568, label %llo-region-14.pred567
+
+llo-region-14.join568: ; preds = %llo-region-14.join566, %llo-region-14.pred567
+ %1725 = load i32, i32* %290, align 4
+ %1726 = or i32 %99, 21
+ %1727 = getelementptr i32, i32* %92, i32 %1726
+ %1728 = load i32, i32* %1727, align 4
+ %1729 = load i32, i32* %131, align 4
+ %1730 = icmp ugt i32 %1725, 31
+ %1731 = icmp ne i32 %1728, 0
+ %1732 = icmp ugt i32 %1729, 15
+ %1733 = or i1 %1730, %1731
+ %1734 = or i1 %1733, %1732
+ %1735 = lshr i32 %1729, 3
+ %1736 = and i32 %1735, 1
+ %1737 = select i1 %1734, i32 2, i32 0
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 %1737)
+ %1738 = or i32 %1737, %1736
+ %1739 = shl nuw nsw i32 %1738, 1
+ %1740 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %1739
+ %1741 = getelementptr i8192, i8192 addrspace(203)* %1740, i32 2
+ %1742 = tail call i32 @llvm.tpu.ptrtoint.p203i8192(i8192 addrspace(203)* %1741)
+ %1743 = icmp sge i32 %1637, %1742
+ %1744 = or i1 %1743, %1734
+ %1745 = xor i1 %1744, true
+ tail call void @llvm.tpu.halt.trap(i1 %1745)
+ br i1 %1734, label %llo-region-14.join570, label %llo-region-14.pred569
+
+llo-region-14.join570: ; preds = %llo-region-14.join568, %llo-region-14.pred569
+ %1746 = load i32, i32* %296, align 4
+ %1747 = or i32 %99, 25
+ %1748 = getelementptr i32, i32* %92, i32 %1747
+ %1749 = load i32, i32* %1748, align 4
+ %1750 = load i32, i32* %137, align 4
+ %1751 = icmp ugt i32 %1746, 31
+ %1752 = icmp ne i32 %1749, 0
+ %1753 = icmp ugt i32 %1750, 15
+ %1754 = or i1 %1751, %1752
+ %1755 = or i1 %1754, %1753
+ %1756 = lshr i32 %1750, 3
+ %1757 = and i32 %1756, 1
+ %1758 = select i1 %1755, i32 2, i32 0
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 %1758)
+ %1759 = or i32 %1758, %1757
+ %1760 = shl nuw nsw i32 %1759, 1
+ %1761 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %1760
+ %1762 = getelementptr i8192, i8192 addrspace(203)* %1761, i32 2
+ %1763 = tail call i32 @llvm.tpu.ptrtoint.p203i8192(i8192 addrspace(203)* %1762)
+ %1764 = icmp sge i32 %1637, %1763
+ %1765 = or i1 %1764, %1755
+ %1766 = xor i1 %1765, true
+ tail call void @llvm.tpu.halt.trap(i1 %1766)
+ br i1 %1755, label %llo-region-14.join572, label %llo-region-14.pred571
+
+llo-region-14.join572: ; preds = %llo-region-14.join570, %llo-region-14.pred571
+ %1767 = load i32, i32* %302, align 4
+ %1768 = or i32 %99, 29
+ %1769 = getelementptr i32, i32* %92, i32 %1768
+ %1770 = load i32, i32* %1769, align 4
+ %1771 = load i32, i32* %143, align 4
+ %1772 = icmp ugt i32 %1767, 31
+ %1773 = icmp ne i32 %1770, 0
+ %1774 = icmp ugt i32 %1771, 15
+ %1775 = or i1 %1772, %1773
+ %1776 = or i1 %1775, %1774
+ %1777 = lshr i32 %1771, 3
+ %1778 = and i32 %1777, 1
+ %1779 = select i1 %1776, i32 2, i32 0
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 %1779)
+ %1780 = or i32 %1779, %1778
+ %1781 = shl nuw nsw i32 %1780, 1
+ %1782 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %1781
+ %1783 = getelementptr i8192, i8192 addrspace(203)* %1782, i32 2
+ %1784 = tail call i32 @llvm.tpu.ptrtoint.p203i8192(i8192 addrspace(203)* %1783)
+ %1785 = icmp sge i32 %1637, %1784
+ %1786 = or i1 %1785, %1776
+ %1787 = xor i1 %1786, true
+ tail call void @llvm.tpu.halt.trap(i1 %1787)
+ br i1 %1776, label %llo-region-14.join574, label %llo-region-14.pred573
+
+llo-region-14.join574: ; preds = %llo-region-14.join572, %llo-region-14.pred573
+ %1788 = load i32, i32* %308, align 4
+ %1789 = or i32 %99, 33
+ %1790 = getelementptr i32, i32* %92, i32 %1789
+ %1791 = load i32, i32* %1790, align 4
+ %1792 = load i32, i32* %149, align 4
+ %1793 = icmp ugt i32 %1788, 31
+ %1794 = icmp ne i32 %1791, 0
+ %1795 = icmp ugt i32 %1792, 15
+ %1796 = or i1 %1793, %1794
+ %1797 = or i1 %1796, %1795
+ %1798 = lshr i32 %1792, 3
+ %1799 = and i32 %1798, 1
+ %1800 = select i1 %1797, i32 2, i32 0
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 %1800)
+ %1801 = or i32 %1800, %1799
+ %1802 = shl nuw nsw i32 %1801, 1
+ %1803 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %1802
+ %1804 = getelementptr i8192, i8192 addrspace(203)* %1803, i32 2
+ %1805 = tail call i32 @llvm.tpu.ptrtoint.p203i8192(i8192 addrspace(203)* %1804)
+ %1806 = icmp sge i32 %1637, %1805
+ %1807 = or i1 %1806, %1797
+ %1808 = xor i1 %1807, true
+ tail call void @llvm.tpu.halt.trap(i1 %1808)
+ br i1 %1797, label %llo-region-14.join576, label %llo-region-14.pred575
+
+llo-region-14.join576: ; preds = %llo-region-14.join574, %llo-region-14.pred575
+ %1809 = load i32, i32* %314, align 4
+ %1810 = or i32 %99, 37
+ %1811 = getelementptr i32, i32* %92, i32 %1810
+ %1812 = load i32, i32* %1811, align 4
+ %1813 = load i32, i32* %155, align 4
+ %1814 = icmp ugt i32 %1809, 31
+ %1815 = icmp ne i32 %1812, 0
+ %1816 = icmp ugt i32 %1813, 15
+ %1817 = or i1 %1814, %1815
+ %1818 = or i1 %1817, %1816
+ %1819 = lshr i32 %1813, 3
+ %1820 = and i32 %1819, 1
+ %1821 = select i1 %1818, i32 2, i32 0
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 %1821)
+ %1822 = or i32 %1821, %1820
+ %1823 = shl nuw nsw i32 %1822, 1
+ %1824 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %1823
+ %1825 = getelementptr i8192, i8192 addrspace(203)* %1824, i32 2
+ %1826 = tail call i32 @llvm.tpu.ptrtoint.p203i8192(i8192 addrspace(203)* %1825)
+ %1827 = icmp sge i32 %1637, %1826
+ %1828 = or i1 %1827, %1818
+ %1829 = xor i1 %1828, true
+ tail call void @llvm.tpu.halt.trap(i1 %1829)
+ br i1 %1818, label %llo-region-14.join578, label %llo-region-14.pred577
+
+llo-region-14.join578: ; preds = %llo-region-14.join576, %llo-region-14.pred577
+ %1830 = load i32, i32* %320, align 4
+ %1831 = or i32 %99, 41
+ %1832 = getelementptr i32, i32* %92, i32 %1831
+ %1833 = load i32, i32* %1832, align 4
+ %1834 = load i32, i32* %161, align 4
+ %1835 = icmp ugt i32 %1830, 31
+ %1836 = icmp ne i32 %1833, 0
+ %1837 = icmp ugt i32 %1834, 15
+ %1838 = or i1 %1835, %1836
+ %1839 = or i1 %1838, %1837
+ %1840 = lshr i32 %1834, 3
+ %1841 = and i32 %1840, 1
+ %1842 = select i1 %1839, i32 2, i32 0
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 %1842)
+ %1843 = or i32 %1842, %1841
+ %1844 = shl nuw nsw i32 %1843, 1
+ %1845 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %1844
+ %1846 = getelementptr i8192, i8192 addrspace(203)* %1845, i32 2
+ %1847 = tail call i32 @llvm.tpu.ptrtoint.p203i8192(i8192 addrspace(203)* %1846)
+ %1848 = icmp sge i32 %1637, %1847
+ %1849 = or i1 %1848, %1839
+ %1850 = xor i1 %1849, true
+ tail call void @llvm.tpu.halt.trap(i1 %1850)
+ br i1 %1839, label %llo-region-14.join580, label %llo-region-14.pred579
+
+llo-region-14.join580: ; preds = %llo-region-14.join578, %llo-region-14.pred579
+ %1851 = load i32, i32* %326, align 4
+ %1852 = or i32 %99, 45
+ %1853 = getelementptr i32, i32* %92, i32 %1852
+ %1854 = load i32, i32* %1853, align 4
+ %1855 = load i32, i32* %167, align 4
+ %1856 = icmp ugt i32 %1851, 31
+ %1857 = icmp ne i32 %1854, 0
+ %1858 = icmp ugt i32 %1855, 15
+ %1859 = or i1 %1856, %1857
+ %1860 = or i1 %1859, %1858
+ %1861 = lshr i32 %1855, 3
+ %1862 = and i32 %1861, 1
+ %1863 = select i1 %1860, i32 2, i32 0
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 %1863)
+ %1864 = or i32 %1863, %1862
+ %1865 = shl nuw nsw i32 %1864, 1
+ %1866 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %1865
+ %1867 = getelementptr i8192, i8192 addrspace(203)* %1866, i32 2
+ %1868 = tail call i32 @llvm.tpu.ptrtoint.p203i8192(i8192 addrspace(203)* %1867)
+ %1869 = icmp sge i32 %1637, %1868
+ %1870 = or i1 %1869, %1860
+ %1871 = xor i1 %1870, true
+ tail call void @llvm.tpu.halt.trap(i1 %1871)
+ br i1 %1860, label %llo-region-14.join582, label %llo-region-14.pred581
+
+llo-region-14.join582: ; preds = %llo-region-14.join580, %llo-region-14.pred581
+ %1872 = load i32, i32* %332, align 4
+ %1873 = or i32 %99, 49
+ %1874 = getelementptr i32, i32* %92, i32 %1873
+ %1875 = load i32, i32* %1874, align 4
+ %1876 = load i32, i32* %173, align 4
+ %1877 = icmp ugt i32 %1872, 31
+ %1878 = icmp ne i32 %1875, 0
+ %1879 = icmp ugt i32 %1876, 15
+ %1880 = or i1 %1877, %1878
+ %1881 = or i1 %1880, %1879
+ %1882 = lshr i32 %1876, 3
+ %1883 = and i32 %1882, 1
+ %1884 = select i1 %1881, i32 2, i32 0
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 %1884)
+ %1885 = or i32 %1884, %1883
+ %1886 = shl nuw nsw i32 %1885, 1
+ %1887 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %1886
+ %1888 = getelementptr i8192, i8192 addrspace(203)* %1887, i32 2
+ %1889 = tail call i32 @llvm.tpu.ptrtoint.p203i8192(i8192 addrspace(203)* %1888)
+ %1890 = icmp sge i32 %1637, %1889
+ %1891 = or i1 %1890, %1881
+ %1892 = xor i1 %1891, true
+ tail call void @llvm.tpu.halt.trap(i1 %1892)
+ br i1 %1881, label %llo-region-14.join584, label %llo-region-14.pred583
+
+llo-region-14.join584: ; preds = %llo-region-14.join582, %llo-region-14.pred583
+ %1893 = load i32, i32* %338, align 4
+ %1894 = or i32 %99, 53
+ %1895 = getelementptr i32, i32* %92, i32 %1894
+ %1896 = load i32, i32* %1895, align 4
+ %1897 = load i32, i32* %179, align 4
+ %1898 = icmp ugt i32 %1893, 31
+ %1899 = icmp ne i32 %1896, 0
+ %1900 = icmp ugt i32 %1897, 15
+ %1901 = or i1 %1898, %1899
+ %1902 = or i1 %1901, %1900
+ %1903 = lshr i32 %1897, 3
+ %1904 = and i32 %1903, 1
+ %1905 = select i1 %1902, i32 2, i32 0
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 %1905)
+ %1906 = or i32 %1905, %1904
+ %1907 = shl nuw nsw i32 %1906, 1
+ %1908 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %1907
+ %1909 = getelementptr i8192, i8192 addrspace(203)* %1908, i32 2
+ %1910 = tail call i32 @llvm.tpu.ptrtoint.p203i8192(i8192 addrspace(203)* %1909)
+ %1911 = icmp sge i32 %1637, %1910
+ %1912 = or i1 %1911, %1902
+ %1913 = xor i1 %1912, true
+ tail call void @llvm.tpu.halt.trap(i1 %1913)
+ br i1 %1902, label %llo-region-14.join586, label %llo-region-14.pred585
+
+llo-region-14.join586: ; preds = %llo-region-14.join584, %llo-region-14.pred585
+ %1914 = load i32, i32* %344, align 4
+ %1915 = or i32 %99, 57
+ %1916 = getelementptr i32, i32* %92, i32 %1915
+ %1917 = load i32, i32* %1916, align 4
+ %1918 = load i32, i32* %185, align 4
+ %1919 = icmp ugt i32 %1914, 31
+ %1920 = icmp ne i32 %1917, 0
+ %1921 = icmp ugt i32 %1918, 15
+ %1922 = or i1 %1919, %1920
+ %1923 = or i1 %1922, %1921
+ %1924 = lshr i32 %1918, 3
+ %1925 = and i32 %1924, 1
+ %1926 = select i1 %1923, i32 2, i32 0
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 %1926)
+ %1927 = or i32 %1926, %1925
+ %1928 = shl nuw nsw i32 %1927, 1
+ %1929 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %1928
+ %1930 = getelementptr i8192, i8192 addrspace(203)* %1929, i32 2
+ %1931 = tail call i32 @llvm.tpu.ptrtoint.p203i8192(i8192 addrspace(203)* %1930)
+ %1932 = icmp sge i32 %1637, %1931
+ %1933 = or i1 %1932, %1923
+ %1934 = xor i1 %1933, true
+ tail call void @llvm.tpu.halt.trap(i1 %1934)
+ br i1 %1923, label %llo-region-14.join588, label %llo-region-14.pred587
+
+llo-region-14.join588: ; preds = %llo-region-14.join586, %llo-region-14.pred587
+ %1935 = load i32, i32* %350, align 4
+ %1936 = or i32 %99, 61
+ %1937 = getelementptr i32, i32* %92, i32 %1936
+ %1938 = load i32, i32* %1937, align 4
+ %1939 = load i32, i32* %191, align 4
+ %1940 = icmp ugt i32 %1935, 31
+ %1941 = icmp ne i32 %1938, 0
+ %1942 = icmp ugt i32 %1939, 15
+ %1943 = or i1 %1940, %1941
+ %1944 = or i1 %1943, %1942
+ %1945 = lshr i32 %1939, 3
+ %1946 = and i32 %1945, 1
+ %1947 = select i1 %1944, i32 2, i32 0
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 %1947)
+ %1948 = or i32 %1947, %1946
+ %1949 = shl nuw nsw i32 %1948, 1
+ %1950 = getelementptr i8192, i8192 addrspace(203)* %3, i32 %1949
+ %1951 = getelementptr i8192, i8192 addrspace(203)* %1950, i32 2
+ %1952 = tail call i32 @llvm.tpu.ptrtoint.p203i8192(i8192 addrspace(203)* %1951)
+ %1953 = icmp sge i32 %1637, %1952
+ %1954 = or i1 %1953, %1944
+ %1955 = xor i1 %1954, true
+ tail call void @llvm.tpu.halt.trap(i1 %1955)
+ br i1 %1944, label %llo-region-14.join590, label %llo-region-14.pred589
+
+llo-region-14.join590: ; preds = %llo-region-14.join588, %llo-region-14.pred589
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %1631, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %1631, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %1631, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %1631, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %1631, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %1631, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %1631, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %1631, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %1631, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %1631, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %1631, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %1631, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %1631, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %1631, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %1631, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 -2)
+ tail call void @llvm.tpu.waitge(i32 addrspace(204)* %1631, i32 2)
+ tail call void @llvm.tpu.syncadd(i32 addrspace(204)* %1631, i32 -2)
+ %1956 = add nuw nsw i32 %98, 1
+ %exitcond = icmp eq i32 %1956, 2
+ br i1 %exitcond, label %llo-region-7, label %llo-region-14
+
+llo-region-7: ; preds = %llo-region-14.join590
+ ret void
+
+llo-region-14.pred: ; preds = %llo-region-14
+ tail call void @llvm.tpu.dma.vmem.to.hbm.p203i8192(i32 addrspace(204)* %1631, <1024 x i32> addrspace(205)* %29, i8192 addrspace(203)* %1634, i32 2)
+ br label %llo-region-14.join
+
+llo-region-14.pred561: ; preds = %llo-region-14.join
+ tail call void @llvm.tpu.dma.vmem.to.hbm.p203i8192(i32 addrspace(204)* %1631, <1024 x i32> addrspace(205)* %33, i8192 addrspace(203)* %1656, i32 2)
+ br label %llo-region-14.join562
+
+llo-region-14.pred563: ; preds = %llo-region-14.join562
+ tail call void @llvm.tpu.dma.vmem.to.hbm.p203i8192(i32 addrspace(204)* %1631, <1024 x i32> addrspace(205)* %37, i8192 addrspace(203)* %1677, i32 2)
+ br label %llo-region-14.join564
+
+llo-region-14.pred565: ; preds = %llo-region-14.join564
+ tail call void @llvm.tpu.dma.vmem.to.hbm.p203i8192(i32 addrspace(204)* %1631, <1024 x i32> addrspace(205)* %41, i8192 addrspace(203)* %1698, i32 2)
+ br label %llo-region-14.join566
+
+llo-region-14.pred567: ; preds = %llo-region-14.join566
+ tail call void @llvm.tpu.dma.vmem.to.hbm.p203i8192(i32 addrspace(204)* %1631, <1024 x i32> addrspace(205)* %45, i8192 addrspace(203)* %1719, i32 2)
+ br label %llo-region-14.join568
+
+llo-region-14.pred569: ; preds = %llo-region-14.join568
+ tail call void @llvm.tpu.dma.vmem.to.hbm.p203i8192(i32 addrspace(204)* %1631, <1024 x i32> addrspace(205)* %49, i8192 addrspace(203)* %1740, i32 2)
+ br label %llo-region-14.join570
+
+llo-region-14.pred571: ; preds = %llo-region-14.join570
+ tail call void @llvm.tpu.dma.vmem.to.hbm.p203i8192(i32 addrspace(204)* %1631, <1024 x i32> addrspace(205)* %53, i8192 addrspace(203)* %1761, i32 2)
+ br label %llo-region-14.join572
+
+llo-region-14.pred573: ; preds = %llo-region-14.join572
+ tail call void @llvm.tpu.dma.vmem.to.hbm.p203i8192(i32 addrspace(204)* %1631, <1024 x i32> addrspace(205)* %57, i8192 addrspace(203)* %1782, i32 2)
+ br label %llo-region-14.join574
+
+llo-region-14.pred575: ; preds = %llo-region-14.join574
+ tail call void @llvm.tpu.dma.vmem.to.hbm.p203i8192(i32 addrspace(204)* %1631, <1024 x i32> addrspace(205)* %61, i8192 addrspace(203)* %1803, i32 2)
+ br label %llo-region-14.join576
+
+llo-region-14.pred577: ; preds = %llo-region-14.join576
+ tail call void @llvm.tpu.dma.vmem.to.hbm.p203i8192(i32 addrspace(204)* %1631, <1024 x i32> addrspace(205)* %65, i8192 addrspace(203)* %1824, i32 2)
+ br label %llo-region-14.join578
+
+llo-region-14.pred579: ; preds = %llo-region-14.join578
+ tail call void @llvm.tpu.dma.vmem.to.hbm.p203i8192(i32 addrspace(204)* %1631, <1024 x i32> addrspace(205)* %69, i8192 addrspace(203)* %1845, i32 2)
+ br label %llo-region-14.join580
+
+llo-region-14.pred581: ; preds = %llo-region-14.join580
+ tail call void @llvm.tpu.dma.vmem.to.hbm.p203i8192(i32 addrspace(204)* %1631, <1024 x i32> addrspace(205)* %73, i8192 addrspace(203)* %1866, i32 2)
+ br label %llo-region-14.join582
+
+llo-region-14.pred583: ; preds = %llo-region-14.join582
+ tail call void @llvm.tpu.dma.vmem.to.hbm.p203i8192(i32 addrspace(204)* %1631, <1024 x i32> addrspace(205)* %77, i8192 addrspace(203)* %1887, i32 2)
+ br label %llo-region-14.join584
+
+llo-region-14.pred585: ; preds = %llo-region-14.join584
+ tail call void @llvm.tpu.dma.vmem.to.hbm.p203i8192(i32 addrspace(204)* %1631, <1024 x i32> addrspace(205)* %81, i8192 addrspace(203)* %1908, i32 2)
+ br label %llo-region-14.join586
+
+llo-region-14.pred587: ; preds = %llo-region-14.join586
+ tail call void @llvm.tpu.dma.vmem.to.hbm.p203i8192(i32 addrspace(204)* %1631, <1024 x i32> addrspace(205)* %85, i8192 addrspace(203)* %1929, i32 2)
+ br label %llo-region-14.join588
+
+llo-region-14.pred589: ; preds = %llo-region-14.join588
+ tail call void @llvm.tpu.dma.vmem.to.hbm.p203i8192(i32 addrspace(204)* %1631, <1024 x i32> addrspace(205)* %89, i8192 addrspace(203)* %1950, i32 2)
+ br label %llo-region-14.join590
+}
+; Function Attrs: nounwind readnone
+declare i32 @llvm.tpu.ptrtoint.p0i32(i32*) #1
+; Function Attrs: nounwind readnone
+declare i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32) #1
+; Function Attrs: argmemonly nounwind
+declare void @llvm.tpu.dma.hbm.to.smem.p203i8192(i32 addrspace(204)*, i8192 addrspace(203)*, i32*, i32) #2
+; Function Attrs: nounwind readnone
+declare i32 @llvm.tpu.ptrtoint.p203i8192(i8192 addrspace(203)*) #1
+; Function Attrs: nounwind
+declare void @llvm.tpu.halt.trap(i1) #0
+; Function Attrs: nounwind readnone
+declare i32* @llvm.tpu.inttoptr.p0i32(i32) #1
+; Function Attrs: nounwind
+declare void @llvm.tpu.waitge(i32 addrspace(204)*, i32) #0
+; Function Attrs: argmemonly nounwind
+declare void @llvm.tpu.syncadd(i32 addrspace(204)*, i32) #2
+; Function Attrs: argmemonly nounwind
+declare void @llvm.tpu.dma.hbm.to.vmem.p203i8192(i32 addrspace(204)*, i8192 addrspace(203)*, <1024 x i32> addrspace(205)*, i32) #2
+; Function Attrs: argmemonly nounwind readonly
+declare <1024 x i32> @llvm.tpu.vld.replicate.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)*, i32, i32, i32) #3
+; Function Attrs: nounwind readnone
+declare <1024 x i32> @llvm.tpu.vlaneseq.v1024i32() #1
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.tpu.vst.strided.v1024i32.p205v1024i32(<1024 x i32>, <1024 x i32> addrspace(205)*, i32, i32, <1024 x i1>) #4
+; Function Attrs: nounwind readnone
+declare i32 @llvm.tpu.ptrtoint.p205v1024i32(<1024 x i32> addrspace(205)*) #1
+; Function Attrs: nounwind readnone
+declare <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32) #1
+; Function Attrs: argmemonly nounwind readonly
+declare <1024 x i32> @llvm.tpu.vld.strided.v1024i32.p205v1024i32(<1024 x i32> addrspace(205)*, i32, i32) #3
+; Function Attrs: inaccessiblememonly nounwind
+declare i32 @llvm.tpu.set.permute.sublane(<1024 x i32>, i32) #5
+; Function Attrs: inaccessiblememonly nounwind
+declare i32 @llvm.tpu.permute.v1024i32(<1024 x i32>, i32, i32) #5
+; Function Attrs: inaccessiblememonly nounwind
+declare <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32, i32) #5
+; Function Attrs: nounwind readnone
+declare <1024 x i32> @llvm.tpu.vrot.sublane.down.v1024i32(<1024 x i32>) #1
+; Function Attrs: nounwind readnone
+declare <1024 x float> @llvm.tpu.pack.v1024f32(<1024 x float>, <1024 x float>) #1
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.tpu.vst.evenodd.sublanes.v1024i32.p205v1024i32(<1024 x i32>, <1024 x i32> addrspace(205)*, i32, i32, <1024 x i1>, i32) #4
+; Function Attrs: argmemonly nounwind
+declare void @llvm.tpu.dma.vmem.to.hbm.p203i8192(i32 addrspace(204)*, <1024 x i32> addrspace(205)*, i8192 addrspace(203)*, i32) #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { argmemonly nounwind }
+attributes #3 = { argmemonly nounwind readonly }
+attributes #4 = { argmemonly nounwind willreturn }
+attributes #5 = { inaccessiblememonly nounwind }
+
+!smem.funcs.spill = !{!207}
+!vmem.funcs.spill = !{!207}
+!smem.ranges.spill.start = !{!0}
+!smem.ranges.spill.limit = !{!1}
+!vmem.ranges.spill.start = !{!2}
+!vmem.ranges.spill.limit = !{!3}
+
+!207 = !{void (i8192 addrspace(203)*, i8192 addrspace(203)*, i8192 addrspace(203)*, i8192 addrspace(203)*)* @fusion.17}
+!0 = !{i32 849}
+!1 = !{i32 4078}
+!2 = !{i32 21472}
+!3 = !{i32 32768}
+!4 = !{!5}
+!5 = distinct !{!5, !6, !"alloc"}
+!6 = distinct !{!6, !"fusion.17"}
+!7 = !{!8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!8 = distinct !{!8, !6, !"alloc"}
+!9 = distinct !{!9, !6, !"alloc"}
+!10 = distinct !{!10, !6, !"alloc"}
+!11 = distinct !{!11, !6, !"alloc"}
+!12 = distinct !{!12, !6, !"alloc"}
+!13 = distinct !{!13, !6, !"alloc"}
+!14 = distinct !{!14, !6, !"alloc"}
+!15 = distinct !{!15, !6, !"alloc"}
+!16 = distinct !{!16, !6, !"alloc"}
+!17 = distinct !{!17, !6, !"alloc"}
+!18 = distinct !{!18, !6, !"alloc"}
+!19 = distinct !{!19, !6, !"alloc"}
+!20 = distinct !{!20, !6, !"alloc"}
+!21 = distinct !{!21, !6, !"alloc"}
+!22 = distinct !{!22, !6, !"alloc"}
+!23 = distinct !{!23, !6, !"alloc"}
+!24 = distinct !{!24, !6, !"alloc"}
+!25 = distinct !{!25, !6, !"alloc"}
+!26 = distinct !{!26, !6, !"alloc"}
+!27 = distinct !{!27, !6, !"alloc"}
+!28 = distinct !{!28, !6, !"alloc"}
+!29 = distinct !{!29, !6, !"alloc"}
+!30 = distinct !{!30, !6, !"alloc"}
+!31 = distinct !{!31, !6, !"alloc"}
+!32 = distinct !{!32, !6, !"alloc"}
+!33 = distinct !{!33, !6, !"alloc"}
+!34 = distinct !{!34, !6, !"alloc"}
+!35 = distinct !{!35, !6, !"alloc"}
+!36 = distinct !{!36, !6, !"alloc"}
+!37 = distinct !{!37, !6, !"alloc"}
+!38 = distinct !{!38, !6, !"alloc"}
+!39 = distinct !{!39, !6, !"alloc"}
+!40 = distinct !{!40, !6, !"alloc"}
+!41 = distinct !{!41, !6, !"alloc"}
+!42 = distinct !{!42, !6, !"alloc"}
+!43 = distinct !{!43, !6, !"alloc"}
+!44 = distinct !{!44, !6, !"alloc"}
+!45 = distinct !{!45, !6, !"alloc"}
+!46 = distinct !{!46, !6, !"alloc"}
+!47 = distinct !{!47, !6, !"alloc"}
+!48 = distinct !{!48, !6, !"alloc"}
+!49 = distinct !{!49, !6, !"alloc"}
+!50 = distinct !{!50, !6, !"alloc"}
+!51 = distinct !{!51, !6, !"alloc"}
+!52 = distinct !{!52, !6, !"alloc"}
+!53 = distinct !{!53, !6, !"alloc"}
+!54 = distinct !{!54, !6, !"alloc"}
+!55 = distinct !{!55, !6, !"alloc"}
+!56 = distinct !{!56, !6, !"alloc"}
+!57 = distinct !{!57, !6, !"alloc"}
+!58 = distinct !{!58, !6, !"alloc"}
+!59 = distinct !{!59, !6, !"alloc"}
+!60 = distinct !{!60, !6, !"alloc"}
+!61 = distinct !{!61, !6, !"alloc"}
+!62 = distinct !{!62, !6, !"alloc"}
+!63 = distinct !{!63, !6, !"alloc"}
+!64 = distinct !{!64, !6, !"alloc"}
+!65 = distinct !{!65, !6, !"alloc"}
+!66 = distinct !{!66, !6, !"alloc"}
+!67 = distinct !{!67, !6, !"alloc"}
+!68 = distinct !{!68, !6, !"alloc"}
+!69 = distinct !{!69, !6, !"alloc"}
+!70 = distinct !{!70, !6, !"alloc"}
+!71 = distinct !{!71, !6, !"alloc"}
+!72 = distinct !{!72, !6, !"alloc"}
+!73 = distinct !{!73, !6, !"alloc"}
+!74 = distinct !{!74, !6, !"alloc"}
+!75 = !{!10}
+!76 = !{!8, !9, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!77 = !{!14}
+!78 = !{!8, !9, !10, !5, !11, !12, !13, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!79 = !{!13}
+!80 = !{!8, !9, !10, !5, !11, !12, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!81 = !{!18}
+!82 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!83 = !{!17}
+!84 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!85 = !{!22}
+!86 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!87 = !{!21}
+!88 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!89 = !{!26}
+!90 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!91 = !{!25}
+!92 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!93 = !{!30}
+!94 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!95 = !{!29}
+!96 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!97 = !{!34}
+!98 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!99 = !{!33}
+!100 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!101 = !{!38}
+!102 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!103 = !{!37}
+!104 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!105 = !{!42}
+!106 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!107 = !{!41}
+!108 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!109 = !{!46}
+!110 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!111 = !{!45}
+!112 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!113 = !{!50}
+!114 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!115 = !{!49}
+!116 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!117 = !{!54}
+!118 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!119 = !{!53}
+!120 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!121 = !{!58}
+!122 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!123 = !{!57}
+!124 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!125 = !{!62}
+!126 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!127 = !{!61}
+!128 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!129 = !{!66}
+!130 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !67, !68, !69, !70, !71, !72, !73, !74}
+!131 = !{!65}
+!132 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!133 = !{!70}
+!134 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !71, !72, !73, !74}
+!135 = !{!69}
+!136 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !70, !71, !72, !73, !74}
+!137 = !{!74}
+!138 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73}
+!139 = !{!73}
+!140 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !74}
+!141 = !{!9}
+!142 = !{!8, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!143 = !{!11}
+!144 = !{!8, !9, !10, !5, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!145 = !{!15}
+!146 = !{!8, !9, !10, !5, !11, !12, !13, !14, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!147 = !{!19}
+!148 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!149 = !{!23}
+!150 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!151 = !{!27}
+!152 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!153 = !{!31}
+!154 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!155 = !{!35}
+!156 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!157 = !{!39}
+!158 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!159 = !{!43}
+!160 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!161 = !{!47}
+!162 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!163 = !{!51}
+!164 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!165 = !{!55}
+!166 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!167 = !{!59}
+!168 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!169 = !{!63}
+!170 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!171 = !{!67}
+!172 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !68, !69, !70, !71, !72, !73, !74}
+!173 = !{!71}
+!174 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !72, !73, !74}
+!175 = !{!12}
+!176 = !{!8, !9, !10, !5, !11, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!177 = !{!16}
+!178 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!179 = !{!20}
+!180 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!181 = !{!24}
+!182 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!183 = !{!28}
+!184 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!185 = !{!32}
+!186 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!187 = !{!36}
+!188 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!189 = !{!40}
+!190 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!191 = !{!44}
+!192 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!193 = !{!48}
+!194 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!195 = !{!52}
+!196 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!197 = !{!56}
+!198 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!199 = !{!60}
+!200 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!201 = !{!64}
+!202 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !65, !66, !67, !68, !69, !70, !71, !72, !73, !74}
+!203 = !{!68}
+!204 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !69, !70, !71, !72, !73, !74}
+!205 = !{!72}
+!206 = !{!8, !9, !10, !5, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63, !64, !65, !66, !67, !68, !69, !70, !71, !73, !74}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/spill_tc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/spill_tc.ll
new file mode 100644
index 0000000..16da126
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/spill_tc.ll
@@ -0,0 +1,130 @@
+; RUN: llc -O0 < %s -mcpu=tensorcore-jf -asm-verbose=false -disable-cgp -tpu-skip-fast-opt | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+!smem.funcs.spill = !{!0}
+!smem.ranges.spill.start = !{!1}
+!smem.ranges.spill.limit = !{!2}
+
+!0 = !{void (i32)* @smem_spill}
+!1 = !{i32 100}
+!2 = !{i32 200}
+
+declare void @llvm.tpu.vtrace(i32) nounwind
+
+; CHECK-LABEL: smem_spill:
+; CHECK-DAG: [smem:$0xc7] = sst s{{[0-9]+}}
+; CHECK-DAG: [smem:$0xc6] = sst s{{[0-9]+}}
+; CHECK-DAG: [smem:$0xc5] = sst s{{[0-9]+}}
+; CHECK-DAG: s{{[0-9]+}} = sld [smem:$0xc7]
+; CHECK-DAG: s{{[0-9]+}} = sld [smem:$0xc6]
+; CHECK-DAG: s{{[0-9]+}} = sld [smem:$0xc5]
+; CHECK: shalt
+
+define void @smem_spill(i32 %arg) {
+entry:
+ br label %region-0
+
+region-0:
+ %x = phi i32 [ %arg, %entry ], [ %79, %region-2 ]
+
+ %0 = add i32 %x, 0
+ %1 = add i32 %x, 1
+ %2 = add i32 %x, 2
+ %3 = add i32 %x, 3
+ %4 = add i32 %x, 4
+ %5 = add i32 %x, 5
+ %6 = add i32 %x, 6
+ %7 = add i32 %x, 7
+ %8 = add i32 %x, 8
+ %9 = add i32 %x, 9
+
+ %10 = add i32 %x, 10
+ %11 = add i32 %x, 11
+ %12 = add i32 %x, 12
+ %13 = add i32 %x, 13
+ %14 = add i32 %x, 14
+ %15 = add i32 %x, 15
+ %16 = add i32 %x, 16
+ %17 = add i32 %x, 17
+ %18 = add i32 %x, 18
+ %19 = add i32 %x, 19
+
+ %20 = add i32 %x, 20
+ %21 = add i32 %x, 21
+ %22 = add i32 %x, 22
+ %23 = add i32 %x, 23
+ %24 = add i32 %x, 24
+ %25 = add i32 %x, 25
+ %26 = add i32 %x, 26
+ %27 = add i32 %x, 27
+ %28 = add i32 %x, 28
+ %29 = add i32 %x, 29
+
+ %30 = add i32 %x, 30
+ %31 = add i32 %x, 31
+ %32 = add i32 %x, 32
+ %33 = add i32 %x, 33
+ %34 = add i32 %x, 34
+ %35 = add i32 %x, 35
+ %36 = add i32 %x, 36
+ %37 = add i32 %x, 37
+ %38 = add i32 %x, 38
+ %39 = add i32 %x, 39
+
+ br label %region-2
+
+region-2:
+ %40 = add i32 %arg, %0
+ %41 = add i32 %40, %1
+ %42 = add i32 %41, %2
+ %43 = add i32 %42, %3
+ %44 = add i32 %43, %4
+ %45 = add i32 %44, %5
+ %46 = add i32 %45, %6
+ %47 = add i32 %46, %7
+ %48 = add i32 %47, %8
+ %49 = add i32 %48, %9
+
+ %50 = add i32 %49, %10
+ %51 = add i32 %50, %11
+ %52 = add i32 %51, %12
+ %53 = add i32 %52, %13
+ %54 = add i32 %53, %14
+ %55 = add i32 %54, %15
+ %56 = add i32 %55, %16
+ %57 = add i32 %56, %17
+ %58 = add i32 %57, %18
+ %59 = add i32 %58, %19
+
+ %60 = add i32 %59, %20
+ %61 = add i32 %60, %21
+ %62 = add i32 %61, %22
+ %63 = add i32 %62, %23
+ %64 = add i32 %63, %24
+ %65 = add i32 %64, %25
+ %66 = add i32 %65, %26
+ %67 = add i32 %66, %27
+ %68 = add i32 %67, %28
+ %69 = add i32 %68, %29
+
+ %70 = add i32 %69, %30
+ %71 = add i32 %70, %31
+ %72 = add i32 %71, %32
+ %73 = add i32 %72, %33
+ %74 = add i32 %73, %34
+ %75 = add i32 %74, %35
+ %76 = add i32 %75, %36
+ %77 = add i32 %76, %37
+ %78 = add i32 %77, %38
+ %79 = add i32 %78, %39
+
+ %80 = icmp sge i32 %79, 1000
+ br i1 %80, label %region-0, label %region-3
+
+region-3:
+ call void @llvm.tpu.vtrace(i32 %79)
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_add_f32_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_add_f32_sc.ll
new file mode 100644
index 0000000..7f4c873
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_add_f32_sc.ll
@@ -0,0 +1,795 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32) nounwind
+
+; -------------------------
+; | stream.linear intrinsic |
+; -------------------------
+
+declare void @llvm.tpu.stream.linear.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.add.f32.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; ---------------------------
+; | stream.indirect intrinsic |
+; ---------------------------
+
+declare void @llvm.tpu.stream.indirect.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.add.f32.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; --------------------------------
+; | stream.indirect.vreg intrinsic |
+; --------------------------------
+
+declare void @llvm.tpu.stream.indirect.vreg.gather.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.add.f32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.add.f32.spmem.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.add.f32.tilespmem.tileN.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+
+; --------------------------
+; | stream.strided intrinsic |
+; --------------------------
+
+declare void @llvm.tpu.stream.strided.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.add.f32.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-vf" }
+
+; ------------------------
+; | Test sparsecore-tac-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.add.f32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [hbm:s0], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [hbm:s0], $0x4, $0x5
+define void @stream_linear_gather_add_f32_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [hbm:s0], s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [hbm:s0], s3, $0x5
+define void @stream_linear_gather_add_f32_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [hbm:s0], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [hbm:s0], $0x4, $0x5
+define void @stream_linear_gather_add_f32_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [hbm:s0], s2, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [hbm:s0], s2, $0x5
+define void @stream_linear_gather_add_f32_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.add.f32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.f32 [hbm:s0], $0x4, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.f32 [hbm:s0], $0x4, s3, s4, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.f32 [hbm:s0], s3, s4, s5, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.f32 [hbm:s0], s3, s4, s5, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.f32 [hbm:s0], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.f32 [hbm:s0], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.f32 [hbm:s0], s2, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.f32 [hbm:s0], s2, s3, s4, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.add.f32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.f32 [hbm:s0], $0x4, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.f32 [hbm:s0], $0x4, s3, s4, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.f32 [hbm:s0], s3, s4, s5, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.f32 [hbm:s0], s3, s4, s5, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.f32 [hbm:s0], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.f32 [hbm:s0], $0x4, s2, s3, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.f32 [hbm:s0], s2, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.f32 [hbm:s0], s2, s3, s4, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.add.f32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [hbm4b:s0+s3], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [hbm4b:s0+s3], $0x4, $0x5
+define void @stream_linear_gather_add_f32_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [hbm4b:s0+s4], s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [hbm4b:s0+s4], s3, $0x5
+define void @stream_linear_gather_add_f32_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [hbm4b:s0+s2], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [hbm4b:s0+s2], $0x4, $0x5
+define void @stream_linear_gather_add_f32_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [hbm4b:s0+s3], s2, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [hbm4b:s0+s3], s2, $0x5
+define void @stream_linear_gather_add_f32_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.add.f32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.f32 [hbm4b:s0+s4], $0x4, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.f32 [hbm4b:s0+s4], $0x4, s3, s4, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.f32 [hbm4b:s0+s5], s3, s4, s5, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.f32 [hbm4b:s0+s5], s3, s4, s5, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.f32 [hbm4b:s0+s3], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.f32 [hbm4b:s0+s3], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.f32 [hbm4b:s0+s4], s2, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.f32 [hbm4b:s0+s4], s2, s3, s4, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.add.f32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.f32 [hbm4b:s0+s4], $0x4, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.f32 [hbm4b:s0+s4], $0x4, s3, s4, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.f32 [hbm4b:s0+s5], s3, s4, s5, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.f32 [hbm4b:s0+s5], s3, s4, s5, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.f32 [hbm4b:s0+s3], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.f32 [hbm4b:s0+s3], $0x4, s2, s3, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.f32 [hbm4b:s0+s4], s2, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.f32 [hbm4b:s0+s4], s2, s3, s4, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.add.f32.spmem.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tac_spmem_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [spmem:s0], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [spmem:s0], $0x4, $0x5
+define void @stream_linear_gather_add_f32_tac_spmem_to_tilespmem_ri(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tac_spmem_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [spmem:s0], s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [spmem:s0], s3, $0x5
+define void @stream_linear_gather_add_f32_tac_spmem_to_tilespmem_rr(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tac_spmem_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [spmem:s0], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [spmem:s0], $0x4, $0x5
+define void @stream_linear_gather_add_f32_tac_spmem_to_tilespmem_ii(i32 addrspace(202)* %src, i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tac_spmem_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [spmem:s0], s2, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [spmem:s0], s2, $0x5
+define void @stream_linear_gather_add_f32_tac_spmem_to_tilespmem_ir(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.add.f32.spmem.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.add.f32.spmem.to.tilespmem
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.add.f32.tilespmem.tileN.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tac_tilespmem_tileN_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [tilespmem.tileN:s0], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [tilespmem.tileN:s0], $0x4, $0x5
+define void @stream_linear_gather_add_f32_tac_tilespmem_tileN_to_tilespmem_ri(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tac_tilespmem_tileN_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [tilespmem.tileN:s0], s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [tilespmem.tileN:s0], s3, $0x5
+define void @stream_linear_gather_add_f32_tac_tilespmem_tileN_to_tilespmem_rr(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.gather.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tac_tilespmem_tileN_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [tilespmem.tileN:s0], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [tilespmem.tileN:s0], $0x4, $0x5
+define void @stream_linear_gather_add_f32_tac_tilespmem_tileN_to_tilespmem_ii(i32 addrspace(201)* %src, i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tac_tilespmem_tileN_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [tilespmem.tileN:s0], s2, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [tilespmem.tileN:s0], s2, $0x5
+define void @stream_linear_gather_add_f32_tac_tilespmem_tileN_to_tilespmem_ir(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.add.f32.tilespmem.tileN.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.add.f32.tilespmem.tileN.to.tilespmem
+
+; ------------------------
+; | Test sparsecore-tec-vf |
+; ------------------------
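+; The functions below carry attribute #2 ("target-cpu"="sparsecore-tec-vf") and mirror the
+; sparsecore-tac-vf tests above; they pass 0 and 4 (rather than 1 and 5) as the second
+; intrinsic operand, which shows up as the trailing immediate ($0 / $0x4) in the CHECK lines.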
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.add.f32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [hbm:s0], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [hbm:s0], $0x4, $0x4
+define void @stream_linear_gather_add_f32_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [hbm:s0], s3, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [hbm:s0], s3, $0x4
+define void @stream_linear_gather_add_f32_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [hbm:s0], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [hbm:s0], $0x4, $0x4
+define void @stream_linear_gather_add_f32_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [hbm:s0], s2, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [hbm:s0], s2, $0x4
+define void @stream_linear_gather_add_f32_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.add.f32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.f32 [hbm:s0], $0x4, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.f32 [hbm:s0], $0x4, s3, s4, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.f32 [hbm:s0], s3, s4, s5, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.f32 [hbm:s0], s3, s4, s5, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.f32 [hbm:s0], $0x4, s2, s3, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.f32 [hbm:s0], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.f32 [hbm:s0], s2, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.f32 [hbm:s0], s2, s3, s4, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.add.f32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.f32 [hbm:s0], $0x4, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.f32 [hbm:s0], $0x4, s3, s4, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.f32 [hbm:s0], s3, s4, s5, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.f32 [hbm:s0], s3, s4, s5, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.f32 [hbm:s0], $0x4, s2, s3, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.f32 [hbm:s0], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.f32 [hbm:s0], s2, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.f32 [hbm:s0], s2, s3, s4, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.gather.add.f32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_vreg_gather_add_f32_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather.add.f32 [hbm:s0], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather.add.f32 [hbm:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_add_f32_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_add_f32_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather.add.f32 [hbm:s0], s3, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather.add.f32 [hbm:s0], s3, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_add_f32_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_add_f32_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather.add.f32 [hbm:s0], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather.add.f32 [hbm:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_add_f32_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_add_f32_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather.add.f32 [hbm:s0], s2, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather.add.f32 [hbm:s0], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_add_f32_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.add.f32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [hbm4b:s0+s3], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [hbm4b:s0+s3], $0x4, $0x4
+define void @stream_linear_gather_add_f32_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [hbm4b:s0+s4], s3, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [hbm4b:s0+s4], s3, $0x4
+define void @stream_linear_gather_add_f32_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [hbm4b:s0+s2], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [hbm4b:s0+s2], $0x4, $0x4
+define void @stream_linear_gather_add_f32_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [hbm4b:s0+s3], s2, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [hbm4b:s0+s3], s2, $0x4
+define void @stream_linear_gather_add_f32_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.add.f32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.f32 [hbm4b:s0+s4], $0x4, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.f32 [hbm4b:s0+s4], $0x4, s3, s4, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.f32 [hbm4b:s0+s5], s3, s4, s5, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.f32 [hbm4b:s0+s5], s3, s4, s5, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.f32 [hbm4b:s0+s3], $0x4, s2, s3, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.f32 [hbm4b:s0+s3], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.f32 [hbm4b:s0+s4], s2, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.f32 [hbm4b:s0+s4], s2, s3, s4, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.add.f32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.f32 [hbm4b:s0+s4], $0x4, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.f32 [hbm4b:s0+s4], $0x4, s3, s4, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.f32 [hbm4b:s0+s5], s3, s4, s5, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.f32 [hbm4b:s0+s5], s3, s4, s5, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.f32 [hbm4b:s0+s3], $0x4, s2, s3, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.f32 [hbm4b:s0+s3], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.f32 [hbm4b:s0+s4], s2, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.f32 [hbm4b:s0+s4], s2, s3, s4, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.gather.add.f32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_vreg_gather_add_f32_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather.add.f32 [hbm4b:s0+s3], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather.add.f32 [hbm4b:s0+s3], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_add_f32_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.f32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.f32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_add_f32_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather.add.f32 [hbm4b:s0+s4], s3, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather.add.f32 [hbm4b:s0+s4], s3, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_add_f32_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.f32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.f32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_add_f32_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather.add.f32 [hbm4b:s0+s2], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather.add.f32 [hbm4b:s0+s2], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_add_f32_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.f32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.f32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_add_f32_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather.add.f32 [hbm4b:s0+s3], s2, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather.add.f32 [hbm4b:s0+s3], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_add_f32_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.f32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.f32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.add.f32.spmem.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tec_spmem_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [spmem:s0], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_add_f32_tec_spmem_to_tilespmem_ri(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tec_spmem_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [spmem:s0], s3, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [spmem:s0], s3, $0x4
+define void @stream_linear_gather_add_f32_tec_spmem_to_tilespmem_rr(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tec_spmem_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [spmem:s0], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_add_f32_tec_spmem_to_tilespmem_ii(i32 addrspace(202)* %src, i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tec_spmem_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [spmem:s0], s2, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [spmem:s0], s2, $0x4
+define void @stream_linear_gather_add_f32_tec_spmem_to_tilespmem_ir(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.add.f32.spmem.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.add.f32.spmem.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.gather.add.f32.spmem.to.tilespmem
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.add.f32.tilespmem.tileN.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tec_tilespmem_tileN_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_add_f32_tec_tilespmem_tileN_to_tilespmem_ri(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tec_tilespmem_tileN_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [tilespmem.tileN:s0], s3, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.f32 [tilespmem.tileN:s0], s3, $0x4
+define void @stream_linear_gather_add_f32_tec_tilespmem_tileN_to_tilespmem_rr(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.gather.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tec_tilespmem_tileN_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_add_f32_tec_tilespmem_tileN_to_tilespmem_ii(i32 addrspace(201)* %src, i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_f32_tec_tilespmem_tileN_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [tilespmem.tileN:s0], s2, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.f32 [tilespmem.tileN:s0], s2, $0x4
+define void @stream_linear_gather_add_f32_tec_tilespmem_tileN_to_tilespmem_ir(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.add.f32.tilespmem.tileN.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.add.f32.tilespmem.tileN.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.gather.add.f32.tilespmem.tileN.to.tilespmem
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_add_s32_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_add_s32_sc.ll
new file mode 100644
index 0000000..108219a
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_add_s32_sc.ll
@@ -0,0 +1,795 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
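+; This file mirrors the f32 stream-gather tests above, exercising the .s32 variants of the
+; same stream.linear / stream.indirect / stream.indirect.vreg / stream.strided gather.add
+; intrinsics.
+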
+declare i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32) nounwind
+
+; -------------------------
+; | stream.linear intrinsic |
+; -------------------------
+
+declare void @llvm.tpu.stream.linear.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.add.s32.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; ---------------------------
+; | stream.indirect intrinsic |
+; ---------------------------
+
+declare void @llvm.tpu.stream.indirect.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.add.s32.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; --------------------------------
+; | stream.indirect.vreg intrinsic |
+; --------------------------------
+
+declare void @llvm.tpu.stream.indirect.vreg.gather.add.s32.hbm.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.add.s32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.add.s32.spmem.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.add.s32.tilespmem.tileN.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+
+; --------------------------
+; | stream.strided intrinsic |
+; --------------------------
+
+declare void @llvm.tpu.stream.strided.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.add.s32.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-vf" }
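+
+; Note: the per-function "target-cpu" attributes above (#0 = scs, #1 = tac,
+; #2 = tec) select which SparseCore core each test targets; a function-level
+; "target-cpu" attribute takes precedence over the -mcpu value on the RUN line.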
+
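+; Test suffixes _ri, _rr, _ii, and _ir indicate whether the flag operand
+; (sflag) and the length operand are passed in registers (r) or as
+; immediates (i), in that order.
+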
+; ------------------------
+; | Test sparsecore-tac-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.add.s32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [hbm:s0], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [hbm:s0], $0x4, $0x5
+define void @stream_linear_gather_add_s32_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [hbm:s0], s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [hbm:s0], s3, $0x5
+define void @stream_linear_gather_add_s32_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [hbm:s0], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [hbm:s0], $0x4, $0x5
+define void @stream_linear_gather_add_s32_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [hbm:s0], s2, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [hbm:s0], s2, $0x5
+define void @stream_linear_gather_add_s32_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.add.s32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.s32 [hbm:s0], $0x4, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.s32 [hbm:s0], $0x4, s3, s4, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.s32 [hbm:s0], s3, s4, s5, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.s32 [hbm:s0], s3, s4, s5, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.s32 [hbm:s0], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.s32 [hbm:s0], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.s32 [hbm:s0], s2, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.s32 [hbm:s0], s2, s3, s4, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.add.s32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.s32 [hbm:s0], $0x4, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.s32 [hbm:s0], $0x4, s3, s4, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.s32 [hbm:s0], s3, s4, s5, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.s32 [hbm:s0], s3, s4, s5, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.s32 [hbm:s0], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.s32 [hbm:s0], $0x4, s2, s3, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.s32 [hbm:s0], s2, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.s32 [hbm:s0], s2, s3, s4, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.add.s32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [hbm4b:s0+s3], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [hbm4b:s0+s3], $0x4, $0x5
+define void @stream_linear_gather_add_s32_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [hbm4b:s0+s4], s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [hbm4b:s0+s4], s3, $0x5
+define void @stream_linear_gather_add_s32_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [hbm4b:s0+s2], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [hbm4b:s0+s2], $0x4, $0x5
+define void @stream_linear_gather_add_s32_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [hbm4b:s0+s3], s2, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [hbm4b:s0+s3], s2, $0x5
+define void @stream_linear_gather_add_s32_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.add.s32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.s32 [hbm4b:s0+s4], $0x4, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.s32 [hbm4b:s0+s4], $0x4, s3, s4, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.s32 [hbm4b:s0+s5], s3, s4, s5, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.s32 [hbm4b:s0+s5], s3, s4, s5, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.s32 [hbm4b:s0+s3], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.s32 [hbm4b:s0+s3], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.s32 [hbm4b:s0+s4], s2, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.s32 [hbm4b:s0+s4], s2, s3, s4, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.add.s32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.s32 [hbm4b:s0+s4], $0x4, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.s32 [hbm4b:s0+s4], $0x4, s3, s4, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.s32 [hbm4b:s0+s5], s3, s4, s5, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.s32 [hbm4b:s0+s5], s3, s4, s5, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.s32 [hbm4b:s0+s3], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.s32 [hbm4b:s0+s3], $0x4, s2, s3, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.s32 [hbm4b:s0+s4], s2, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.s32 [hbm4b:s0+s4], s2, s3, s4, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.add.s32.spmem.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tac_spmem_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [spmem:s0], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [spmem:s0], $0x4, $0x5
+define void @stream_linear_gather_add_s32_tac_spmem_to_tilespmem_ri(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tac_spmem_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [spmem:s0], s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [spmem:s0], s3, $0x5
+define void @stream_linear_gather_add_s32_tac_spmem_to_tilespmem_rr(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tac_spmem_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [spmem:s0], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [spmem:s0], $0x4, $0x5
+define void @stream_linear_gather_add_s32_tac_spmem_to_tilespmem_ii(i32 addrspace(202)* %src, i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tac_spmem_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [spmem:s0], s2, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [spmem:s0], s2, $0x5
+define void @stream_linear_gather_add_s32_tac_spmem_to_tilespmem_ir(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested: intrinsic @llvm.tpu.stream.indirect.gather.add.s32.spmem.to.tilespmem
+
+; Not tested: intrinsic @llvm.tpu.stream.strided.gather.add.s32.spmem.to.tilespmem
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.add.s32.tilespmem.tileN.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tac_tilespmem_tileN_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [tilespmem.tileN:s0], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [tilespmem.tileN:s0], $0x4, $0x5
+define void @stream_linear_gather_add_s32_tac_tilespmem_tileN_to_tilespmem_ri(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tac_tilespmem_tileN_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [tilespmem.tileN:s0], s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [tilespmem.tileN:s0], s3, $0x5
+define void @stream_linear_gather_add_s32_tac_tilespmem_tileN_to_tilespmem_rr(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.gather.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tac_tilespmem_tileN_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [tilespmem.tileN:s0], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [tilespmem.tileN:s0], $0x4, $0x5
+define void @stream_linear_gather_add_s32_tac_tilespmem_tileN_to_tilespmem_ii(i32 addrspace(201)* %src, i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tac_tilespmem_tileN_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [tilespmem.tileN:s0], s2, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [tilespmem.tileN:s0], s2, $0x5
+define void @stream_linear_gather_add_s32_tac_tilespmem_tileN_to_tilespmem_ir(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested: intrinsic @llvm.tpu.stream.indirect.gather.add.s32.tilespmem.tileN.to.tilespmem
+
+; Not tested: intrinsic @llvm.tpu.stream.strided.gather.add.s32.tilespmem.tileN.to.tilespmem
+
+; ------------------------
+; | Test sparsecore-tec-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.add.s32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [hbm:s0], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [hbm:s0], $0x4, $0x4
+define void @stream_linear_gather_add_s32_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [hbm:s0], s3, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [hbm:s0], s3, $0x4
+define void @stream_linear_gather_add_s32_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [hbm:s0], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [hbm:s0], $0x4, $0x4
+define void @stream_linear_gather_add_s32_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [hbm:s0], s2, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [hbm:s0], s2, $0x4
+define void @stream_linear_gather_add_s32_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.add.s32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.s32 [hbm:s0], $0x4, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.s32 [hbm:s0], $0x4, s3, s4, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.s32 [hbm:s0], s3, s4, s5, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.s32 [hbm:s0], s3, s4, s5, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.s32 [hbm:s0], $0x4, s2, s3, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.s32 [hbm:s0], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.s32 [hbm:s0], s2, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.s32 [hbm:s0], s2, s3, s4, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.add.s32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.s32 [hbm:s0], $0x4, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.s32 [hbm:s0], $0x4, s3, s4, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.s32 [hbm:s0], s3, s4, s5, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.s32 [hbm:s0], s3, s4, s5, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.s32 [hbm:s0], $0x4, s2, s3, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.s32 [hbm:s0], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.s32 [hbm:s0], s2, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.s32 [hbm:s0], s2, s3, s4, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.gather.add.s32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_vreg_gather_add_s32_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather.add.s32 [hbm:s0], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather.add.s32 [hbm:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_add_s32_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.s32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.s32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_add_s32_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather.add.s32 [hbm:s0], s3, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather.add.s32 [hbm:s0], s3, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_add_s32_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.s32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.s32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_add_s32_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather.add.s32 [hbm:s0], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather.add.s32 [hbm:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_add_s32_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.s32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.s32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_add_s32_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather.add.s32 [hbm:s0], s2, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather.add.s32 [hbm:s0], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_add_s32_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.s32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.s32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.add.s32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [hbm4b:s0+s3], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [hbm4b:s0+s3], $0x4, $0x4
+define void @stream_linear_gather_add_s32_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [hbm4b:s0+s4], s3, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [hbm4b:s0+s4], s3, $0x4
+define void @stream_linear_gather_add_s32_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [hbm4b:s0+s2], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [hbm4b:s0+s2], $0x4, $0x4
+define void @stream_linear_gather_add_s32_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [hbm4b:s0+s3], s2, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [hbm4b:s0+s3], s2, $0x4
+define void @stream_linear_gather_add_s32_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.add.s32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.s32 [hbm4b:s0+s4], $0x4, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.s32 [hbm4b:s0+s4], $0x4, s3, s4, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.s32 [hbm4b:s0+s5], s3, s4, s5, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather.add.s32 [hbm4b:s0+s5], s3, s4, s5, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.s32 [hbm4b:s0+s3], $0x4, s2, s3, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.s32 [hbm4b:s0+s3], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.s32 [hbm4b:s0+s4], s2, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather.add.s32 [hbm4b:s0+s4], s2, s3, s4, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.add.s32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.s32 [hbm4b:s0+s4], $0x4, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.s32 [hbm4b:s0+s4], $0x4, s3, s4, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.s32 [hbm4b:s0+s5], s3, s4, s5, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather.add.s32 [hbm4b:s0+s5], s3, s4, s5, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.s32 [hbm4b:s0+s3], $0x4, s2, s3, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.s32 [hbm4b:s0+s3], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.s32 [hbm4b:s0+s4], s2, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather.add.s32 [hbm4b:s0+s4], s2, s3, s4, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.gather.add.s32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_vreg_gather_add_s32_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather.add.s32 [hbm4b:s0+s3], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather.add.s32 [hbm4b:s0+s3], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_add_s32_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.s32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.s32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_add_s32_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather.add.s32 [hbm4b:s0+s4], s3, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather.add.s32 [hbm4b:s0+s4], s3, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_add_s32_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.s32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.s32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_add_s32_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather.add.s32 [hbm4b:s0+s2], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather.add.s32 [hbm4b:s0+s2], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_add_s32_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.s32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.s32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_add_s32_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather.add.s32 [hbm4b:s0+s3], s2, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather.add.s32 [hbm4b:s0+s3], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_add_s32_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.s32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.add.s32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.add.s32.spmem.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tec_spmem_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [spmem:s0], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_add_s32_tec_spmem_to_tilespmem_ri(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tec_spmem_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [spmem:s0], s3, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [spmem:s0], s3, $0x4
+define void @stream_linear_gather_add_s32_tec_spmem_to_tilespmem_rr(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tec_spmem_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [spmem:s0], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_add_s32_tec_spmem_to_tilespmem_ii(i32 addrspace(202)* %src, i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tec_spmem_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [spmem:s0], s2, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [spmem:s0], s2, $0x4
+define void @stream_linear_gather_add_s32_tec_spmem_to_tilespmem_ir(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.add.s32.spmem.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.add.s32.spmem.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.gather.add.s32.spmem.to.tilespmem
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.add.s32.tilespmem.tileN.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tec_tilespmem_tileN_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_add_s32_tec_tilespmem_tileN_to_tilespmem_ri(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tec_tilespmem_tileN_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [tilespmem.tileN:s0], s3, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather.add.s32 [tilespmem.tileN:s0], s3, $0x4
+define void @stream_linear_gather_add_s32_tec_tilespmem_tileN_to_tilespmem_rr(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.gather.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tec_tilespmem_tileN_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_add_s32_tec_tilespmem_tileN_to_tilespmem_ii(i32 addrspace(201)* %src, i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_add_s32_tec_tilespmem_tileN_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [tilespmem.tileN:s0], s2, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather.add.s32 [tilespmem.tileN:s0], s2, $0x4
+define void @stream_linear_gather_add_s32_tec_tilespmem_tileN_to_tilespmem_ir(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.add.s32.tilespmem.tileN.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.add.s32.tilespmem.tileN.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.gather.add.s32.tilespmem.tileN.to.tilespmem
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_cb_add_f32_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_cb_add_f32_sc.ll
new file mode 100644
index 0000000..b3ef9bd
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_cb_add_f32_sc.ll
@@ -0,0 +1,795 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
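+; -asm-verbose=false keeps the assembly output minimal for FileCheck matching, and
+; -disable-cgp disables the CodeGenPrepare pass so the IR reaches instruction selection
+; as written.
+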
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32) nounwind
+
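+; @llvm.tpu.inttoptr.p204i32 materializes a sync-flag pointer (address space 204) from a
+; scalar constant; the tests below pass 31, which appears as the [sflag:$0x1f] immediate
+; operand in the CHECK lines.
+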
+; -------------------------
+; | stream.linear intrinsic |
+; -------------------------
+
+declare void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.cb.add.f32.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.cb.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, i32) argmemonly nounwind
+
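+; The cb variants take an opaque x86_mmx operand (presumably the circular-buffer
+; descriptor implied by "cb") where the non-cb intrinsics take a destination pointer;
+; the tests pass undef for it, and the destination prints as [tilespmem:$0x0].
+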
+; ---------------------------
+; | stream.indirect intrinsic |
+; ---------------------------
+
+declare void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.cb.add.f32.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.cb.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; --------------------------------
+; | stream.indirect.vreg intrinsic |
+; --------------------------------
+
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.add.f32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.add.f32.spmem.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.add.f32.tilespmem.tileN.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+
+; --------------------------
+; | stream.strided intrinsic |
+; --------------------------
+
+declare void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.cb.add.f32.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.cb.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
+
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-vf" }
+
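+; The target-cpu attributes select the SparseCore core each test function is compiled
+; for: #0 = scs, #1 = tac, #2 = tec.
+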
+; ------------------------
+; | Test sparsecore-tac-vf |
+; ------------------------
+
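+; The _ri/_rr/_ii/_ir suffixes describe how the sync flag and the length operand are
+; supplied: the first letter is the flag (register argument vs. immediate built with
+; inttoptr), the second is the length (register vs. immediate constant).
+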
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.add.f32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [hbm:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [hbm:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [hbm:s0], s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [hbm:s0], s2, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [hbm:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [hbm:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [hbm:s0], s1, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [hbm:s0], s1, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.f32 [hbm:s0], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.f32 [hbm:s0], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.f32 [hbm:s0], s2, s3, s4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.f32 [hbm:s0], s2, s3, s4, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.f32 [hbm:s0], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.f32 [hbm:s0], $0x4, s1, s2, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.f32 [hbm:s0], s1, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.f32 [hbm:s0], s1, s2, s3, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.add.f32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.f32 [hbm:s0], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.f32 [hbm:s0], $0x4, s2, s3, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.f32 [hbm:s0], s2, s3, s4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.f32 [hbm:s0], s2, s3, s4, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.f32 [hbm:s0], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.f32 [hbm:s0], $0x4, s1, s2, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.f32 [hbm:s0], s1, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.f32 [hbm:s0], s1, s2, s3, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.add.f32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [hbm4b:s0+s2], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [hbm4b:s0+s2], $0x4, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [hbm4b:s0+s3], s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [hbm4b:s0+s3], s2, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [hbm4b:s0+s1], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [hbm4b:s0+s1], $0x4, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [hbm4b:s0+s2], s1, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [hbm4b:s0+s2], s1, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.f32 [hbm4b:s0+s3], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.f32 [hbm4b:s0+s3], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.f32 [hbm4b:s0+s4], s2, s3, s4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.f32 [hbm4b:s0+s4], s2, s3, s4, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.f32 [hbm4b:s0+s2], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.f32 [hbm4b:s0+s2], $0x4, s1, s2, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.f32 [hbm4b:s0+s3], s1, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.f32 [hbm4b:s0+s3], s1, s2, s3, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.add.f32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.f32 [hbm4b:s0+s3], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.f32 [hbm4b:s0+s3], $0x4, s2, s3, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.f32 [hbm4b:s0+s4], s2, s3, s4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.f32 [hbm4b:s0+s4], s2, s3, s4, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.f32 [hbm4b:s0+s2], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.f32 [hbm4b:s0+s2], $0x4, s1, s2, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.f32 [hbm4b:s0+s3], s1, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.f32 [hbm4b:s0+s3], s1, s2, s3, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.add.f32.spmem.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_spmem_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [spmem:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [spmem:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_spmem_to_tilespmem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_spmem_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [spmem:s0], s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [spmem:s0], s2, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_spmem_to_tilespmem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_spmem_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [spmem:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [spmem:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_spmem_to_tilespmem_ii(i32 addrspace(202)* %src) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_spmem_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [spmem:s0], s1, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [spmem:s0], s1, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_spmem_to_tilespmem_ir(i32 addrspace(202)* %src, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.cb.add.f32.spmem.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.cb.add.f32.spmem.to.tilespmem
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.add.f32.tilespmem.tileN.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_tilespmem_tileN_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [tilespmem.tileN:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [tilespmem.tileN:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_tilespmem_tileN_to_tilespmem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_tilespmem_tileN_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [tilespmem.tileN:s0], s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [tilespmem.tileN:s0], s2, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_tilespmem_tileN_to_tilespmem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_tilespmem_tileN_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [tilespmem.tileN:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [tilespmem.tileN:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_tilespmem_tileN_to_tilespmem_ii(i32 addrspace(201)* %src) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_tilespmem_tileN_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [tilespmem.tileN:s0], s1, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [tilespmem.tileN:s0], s1, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_tilespmem_tileN_to_tilespmem_ir(i32 addrspace(201)* %src, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.cb.add.f32.tilespmem.tileN.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.cb.add.f32.tilespmem.tileN.to.tilespmem
+
+; ------------------------
+; | Test sparsecore-tec-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.add.f32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [hbm:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [hbm:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [hbm:s0], s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [hbm:s0], s2, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [hbm:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [hbm:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [hbm:s0], s1, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [hbm:s0], s1, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.f32 [hbm:s0], $0x4, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.f32 [hbm:s0], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.f32 [hbm:s0], s2, s3, s4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.f32 [hbm:s0], s2, s3, s4, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.f32 [hbm:s0], $0x4, s1, s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.f32 [hbm:s0], $0x4, s1, s2, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.f32 [hbm:s0], s1, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.f32 [hbm:s0], s1, s2, s3, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.add.f32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.f32 [hbm:s0], $0x4, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.f32 [hbm:s0], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.f32 [hbm:s0], s2, s3, s4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.f32 [hbm:s0], s2, s3, s4, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.f32 [hbm:s0], $0x4, s1, s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.f32 [hbm:s0], $0x4, s1, s2, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.f32 [hbm:s0], s1, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.f32 [hbm:s0], s1, s2, s3, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.gather.cb.add.f32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_add_f32_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.add.f32 [hbm:s0], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.add.f32 [hbm:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_add_f32_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_add_f32_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.add.f32 [hbm:s0], s2, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.add.f32 [hbm:s0], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_add_f32_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_add_f32_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.add.f32 [hbm:s0], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.add.f32 [hbm:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_add_f32_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_add_f32_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.add.f32 [hbm:s0], s1, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.add.f32 [hbm:s0], s1, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_add_f32_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.add.f32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [hbm4b:s0+s2], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [hbm4b:s0+s2], $0x4, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [hbm4b:s0+s3], s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [hbm4b:s0+s3], s2, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [hbm4b:s0+s1], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [hbm4b:s0+s1], $0x4, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [hbm4b:s0+s2], s1, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [hbm4b:s0+s2], s1, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.f32 [hbm4b:s0+s3], $0x4, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.f32 [hbm4b:s0+s3], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.f32 [hbm4b:s0+s4], s2, s3, s4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.f32 [hbm4b:s0+s4], s2, s3, s4, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.f32 [hbm4b:s0+s2], $0x4, s1, s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.f32 [hbm4b:s0+s2], $0x4, s1, s2, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.f32 [hbm4b:s0+s3], s1, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.f32 [hbm4b:s0+s3], s1, s2, s3, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.add.f32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.f32 [hbm4b:s0+s3], $0x4, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.f32 [hbm4b:s0+s3], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.f32 [hbm4b:s0+s4], s2, s3, s4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.f32 [hbm4b:s0+s4], s2, s3, s4, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.f32 [hbm4b:s0+s2], $0x4, s1, s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.f32 [hbm4b:s0+s2], $0x4, s1, s2, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.f32 [hbm4b:s0+s3], s1, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.f32 [hbm4b:s0+s3], s1, s2, s3, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.gather.cb.add.f32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_add_f32_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.add.f32 [hbm4b:s0+s2], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.add.f32 [hbm4b:s0+s2], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_add_f32_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.f32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.f32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_add_f32_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.add.f32 [hbm4b:s0+s3], s2, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.add.f32 [hbm4b:s0+s3], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_add_f32_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.f32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.f32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_add_f32_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.add.f32 [hbm4b:s0+s1], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.add.f32 [hbm4b:s0+s1], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_add_f32_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.f32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.f32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_add_f32_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.add.f32 [hbm4b:s0+s2], s1, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.add.f32 [hbm4b:s0+s2], s1, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_add_f32_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.f32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.f32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.add.f32.spmem.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_spmem_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [spmem:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_spmem_to_tilespmem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_spmem_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [spmem:s0], s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [spmem:s0], s2, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_spmem_to_tilespmem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_spmem_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [spmem:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_spmem_to_tilespmem_ii(i32 addrspace(202)* %src) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_spmem_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [spmem:s0], s1, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [spmem:s0], s1, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_spmem_to_tilespmem_ir(i32 addrspace(202)* %src, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.cb.add.f32.spmem.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.cb.add.f32.spmem.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.gather.cb.add.f32.spmem.to.tilespmem
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.add.f32.tilespmem.tileN.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_tilespmem_tileN_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_tilespmem_tileN_to_tilespmem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_tilespmem_tileN_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [tilespmem.tileN:s0], s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.f32 [tilespmem.tileN:s0], s2, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_tilespmem_tileN_to_tilespmem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_tilespmem_tileN_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_tilespmem_tileN_to_tilespmem_ii(i32 addrspace(201)* %src) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_tilespmem_tileN_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [tilespmem.tileN:s0], s1, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.f32 [tilespmem.tileN:s0], s1, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_tilespmem_tileN_to_tilespmem_ir(i32 addrspace(201)* %src, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.cb.add.f32.tilespmem.tileN.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.cb.add.f32.tilespmem.tileN.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.gather.cb.add.f32.tilespmem.tileN.to.tilespmem
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_cb_add_s32_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_cb_add_s32_sc.ll
new file mode 100644
index 0000000..9904818
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_cb_add_s32_sc.ll
@@ -0,0 +1,795 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32) nounwind
+
+; -------------------------
+; | stream.linear intrinsic |
+; -------------------------
+
+declare void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.cb.add.s32.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.cb.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, i32) argmemonly nounwind
+
+; ---------------------------
+; | stream.indirect intrinsic |
+; ---------------------------
+
+declare void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.cb.add.s32.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.cb.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
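+; As the checks below suggest, the trailing i32 addrspace(201)* and i32 operands of the
+; stream.indirect form carry the indirect-offset pointer and its size; they surface as the
+; two scalar registers emitted just before the flag-offset operand of the mnemonic.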
+
+; --------------------------------
+; | stream.indirect.vreg intrinsic |
+; --------------------------------
+
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.add.s32.hbm.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.add.s32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.add.s32.spmem.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.add.s32.tilespmem.tileN.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
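+; As in the f32 variant of this test, the <8 x i32> and <8 x i1> operands of the
+; stream.indirect.vreg form hold the per-element offsets and their mask, and lower to a
+; vector register and vector-mask register (v0, vm0) in the emitted mnemonic.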
+
+; --------------------------
+; | stream.strided intrinsic |
+; --------------------------
+
+declare void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.cb.add.s32.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.cb.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
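+; In the stream.strided form, the two extra i32 operands (stride size and stride length in
+; the tests below) lower to the two scalar registers emitted before the flag-offset operand.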
+
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-vf" }
+
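+; Naming convention used in these tests: the _ri/_rr/_ii/_ir suffixes denote how the sync
+; flag and the length operand are supplied - the first letter is the sync flag (r = register
+; argument, printed as [sflag:s1]; i = immediate built with @llvm.tpu.inttoptr.p204i32(i32 31),
+; printed as [sflag:$0x1f]), and the second letter is the length (r = register, i = immediate 4).
+; The i32 that follows the sync-flag pointer is the flag word offset (0/4 in the TEC functions,
+; 1/5 in the TAC functions) and is echoed as the last operand of each mnemonic; the x86_mmx
+; operand is passed as undef throughout.
+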
+; ------------------------
+; | Test sparsecore-tac-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.add.s32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [hbm:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [hbm:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_add_s32_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [hbm:s0], s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [hbm:s0], s2, $0x5
+define void @stream_linear_gather_cb_add_s32_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [hbm:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [hbm:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_add_s32_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [hbm:s0], s1, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [hbm:s0], s1, $0x5
+define void @stream_linear_gather_cb_add_s32_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.s32 [hbm:s0], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.s32 [hbm:s0], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.s32 [hbm:s0], s2, s3, s4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.s32 [hbm:s0], s2, s3, s4, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.s32 [hbm:s0], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.s32 [hbm:s0], $0x4, s1, s2, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.s32 [hbm:s0], s1, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.s32 [hbm:s0], s1, s2, s3, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.add.s32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.s32 [hbm:s0], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.s32 [hbm:s0], $0x4, s2, s3, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.s32 [hbm:s0], s2, s3, s4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.s32 [hbm:s0], s2, s3, s4, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.s32 [hbm:s0], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.s32 [hbm:s0], $0x4, s1, s2, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.s32 [hbm:s0], s1, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.s32 [hbm:s0], s1, s2, s3, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.add.s32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [hbm4b:s0+s2], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [hbm4b:s0+s2], $0x4, $0x5
+define void @stream_linear_gather_cb_add_s32_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [hbm4b:s0+s3], s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [hbm4b:s0+s3], s2, $0x5
+define void @stream_linear_gather_cb_add_s32_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [hbm4b:s0+s1], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [hbm4b:s0+s1], $0x4, $0x5
+define void @stream_linear_gather_cb_add_s32_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [hbm4b:s0+s2], s1, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [hbm4b:s0+s2], s1, $0x5
+define void @stream_linear_gather_cb_add_s32_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.s32 [hbm4b:s0+s3], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.s32 [hbm4b:s0+s3], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.s32 [hbm4b:s0+s4], s2, s3, s4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.s32 [hbm4b:s0+s4], s2, s3, s4, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.s32 [hbm4b:s0+s2], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.s32 [hbm4b:s0+s2], $0x4, s1, s2, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.s32 [hbm4b:s0+s3], s1, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.s32 [hbm4b:s0+s3], s1, s2, s3, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.add.s32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.s32 [hbm4b:s0+s3], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.s32 [hbm4b:s0+s3], $0x4, s2, s3, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.s32 [hbm4b:s0+s4], s2, s3, s4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.s32 [hbm4b:s0+s4], s2, s3, s4, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.s32 [hbm4b:s0+s2], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.s32 [hbm4b:s0+s2], $0x4, s1, s2, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.s32 [hbm4b:s0+s3], s1, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.s32 [hbm4b:s0+s3], s1, s2, s3, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.add.s32.spmem.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tac_spmem_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [spmem:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [spmem:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_add_s32_tac_spmem_to_tilespmem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tac_spmem_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [spmem:s0], s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [spmem:s0], s2, $0x5
+define void @stream_linear_gather_cb_add_s32_tac_spmem_to_tilespmem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tac_spmem_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [spmem:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [spmem:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_add_s32_tac_spmem_to_tilespmem_ii(i32 addrspace(202)* %src) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tac_spmem_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [spmem:s0], s1, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [spmem:s0], s1, $0x5
+define void @stream_linear_gather_cb_add_s32_tac_spmem_to_tilespmem_ir(i32 addrspace(202)* %src, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.cb.add.s32.spmem.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.cb.add.s32.spmem.to.tilespmem
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.add.s32.tilespmem.tileN.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tac_tilespmem_tileN_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [tilespmem.tileN:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [tilespmem.tileN:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_add_s32_tac_tilespmem_tileN_to_tilespmem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tac_tilespmem_tileN_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [tilespmem.tileN:s0], s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [tilespmem.tileN:s0], s2, $0x5
+define void @stream_linear_gather_cb_add_s32_tac_tilespmem_tileN_to_tilespmem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tac_tilespmem_tileN_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [tilespmem.tileN:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [tilespmem.tileN:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_add_s32_tac_tilespmem_tileN_to_tilespmem_ii(i32 addrspace(201)* %src) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tac_tilespmem_tileN_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [tilespmem.tileN:s0], s1, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [tilespmem.tileN:s0], s1, $0x5
+define void @stream_linear_gather_cb_add_s32_tac_tilespmem_tileN_to_tilespmem_ir(i32 addrspace(201)* %src, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.cb.add.s32.tilespmem.tileN.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.cb.add.s32.tilespmem.tileN.to.tilespmem
+
+; ------------------------
+; | Test sparsecore-tec-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.add.s32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [hbm:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [hbm:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_add_s32_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [hbm:s0], s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [hbm:s0], s2, $0x4
+define void @stream_linear_gather_cb_add_s32_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [hbm:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [hbm:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_add_s32_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [hbm:s0], s1, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [hbm:s0], s1, $0x4
+define void @stream_linear_gather_cb_add_s32_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.s32 [hbm:s0], $0x4, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.s32 [hbm:s0], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.s32 [hbm:s0], s2, s3, s4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.s32 [hbm:s0], s2, s3, s4, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.s32 [hbm:s0], $0x4, s1, s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.s32 [hbm:s0], $0x4, s1, s2, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.s32 [hbm:s0], s1, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.s32 [hbm:s0], s1, s2, s3, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.add.s32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.s32 [hbm:s0], $0x4, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.s32 [hbm:s0], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.s32 [hbm:s0], s2, s3, s4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.s32 [hbm:s0], s2, s3, s4, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.s32 [hbm:s0], $0x4, s1, s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.s32 [hbm:s0], $0x4, s1, s2, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.s32 [hbm:s0], s1, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.s32 [hbm:s0], s1, s2, s3, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.gather.cb.add.s32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_add_s32_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.add.s32 [hbm:s0], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.add.s32 [hbm:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_add_s32_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.s32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.s32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_add_s32_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.add.s32 [hbm:s0], s2, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.add.s32 [hbm:s0], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_add_s32_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.s32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.s32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_add_s32_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.add.s32 [hbm:s0], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.add.s32 [hbm:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_add_s32_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.s32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.s32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_add_s32_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.add.s32 [hbm:s0], s1, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.add.s32 [hbm:s0], s1, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_add_s32_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.s32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.s32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.add.s32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [hbm4b:s0+s2], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [hbm4b:s0+s2], $0x4, $0x4
+define void @stream_linear_gather_cb_add_s32_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [hbm4b:s0+s3], s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [hbm4b:s0+s3], s2, $0x4
+define void @stream_linear_gather_cb_add_s32_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [hbm4b:s0+s1], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [hbm4b:s0+s1], $0x4, $0x4
+define void @stream_linear_gather_cb_add_s32_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [hbm4b:s0+s2], s1, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [hbm4b:s0+s2], s1, $0x4
+define void @stream_linear_gather_cb_add_s32_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.s32 [hbm4b:s0+s3], $0x4, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.s32 [hbm4b:s0+s3], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.s32 [hbm4b:s0+s4], s2, s3, s4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.add.s32 [hbm4b:s0+s4], s2, s3, s4, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.s32 [hbm4b:s0+s2], $0x4, s1, s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.s32 [hbm4b:s0+s2], $0x4, s1, s2, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.s32 [hbm4b:s0+s3], s1, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.add.s32 [hbm4b:s0+s3], s1, s2, s3, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.add.s32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.s32 [hbm4b:s0+s3], $0x4, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.s32 [hbm4b:s0+s3], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.s32 [hbm4b:s0+s4], s2, s3, s4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.add.s32 [hbm4b:s0+s4], s2, s3, s4, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.s32 [hbm4b:s0+s2], $0x4, s1, s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.s32 [hbm4b:s0+s2], $0x4, s1, s2, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.s32 [hbm4b:s0+s3], s1, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.add.s32 [hbm4b:s0+s3], s1, s2, s3, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.gather.cb.add.s32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_add_s32_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.add.s32 [hbm4b:s0+s2], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.add.s32 [hbm4b:s0+s2], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_add_s32_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.s32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.s32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_add_s32_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.add.s32 [hbm4b:s0+s3], s2, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.add.s32 [hbm4b:s0+s3], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_add_s32_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.s32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.s32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_add_s32_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.add.s32 [hbm4b:s0+s1], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.add.s32 [hbm4b:s0+s1], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_add_s32_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.s32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.s32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_add_s32_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.add.s32 [hbm4b:s0+s2], s1, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.add.s32 [hbm4b:s0+s2], s1, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_add_s32_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.s32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.s32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.add.s32.spmem.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tec_spmem_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [spmem:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_add_s32_tec_spmem_to_tilespmem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tec_spmem_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [spmem:s0], s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [spmem:s0], s2, $0x4
+define void @stream_linear_gather_cb_add_s32_tec_spmem_to_tilespmem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tec_spmem_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [spmem:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_add_s32_tec_spmem_to_tilespmem_ii(i32 addrspace(202)* %src) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tec_spmem_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [spmem:s0], s1, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [spmem:s0], s1, $0x4
+define void @stream_linear_gather_cb_add_s32_tec_spmem_to_tilespmem_ir(i32 addrspace(202)* %src, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.cb.add.s32.spmem.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.cb.add.s32.spmem.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.gather.cb.add.s32.spmem.to.tilespmem
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.add.s32.tilespmem.tileN.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tec_tilespmem_tileN_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_add_s32_tec_tilespmem_tileN_to_tilespmem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tec_tilespmem_tileN_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [tilespmem.tileN:s0], s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.add.s32 [tilespmem.tileN:s0], s2, $0x4
+define void @stream_linear_gather_cb_add_s32_tec_tilespmem_tileN_to_tilespmem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tec_tilespmem_tileN_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_add_s32_tec_tilespmem_tileN_to_tilespmem_ii(i32 addrspace(201)* %src) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tec_tilespmem_tileN_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [tilespmem.tileN:s0], s1, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.add.s32 [tilespmem.tileN:s0], s1, $0x4
+define void @stream_linear_gather_cb_add_s32_tec_tilespmem_tileN_to_tilespmem_ir(i32 addrspace(201)* %src, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.cb.add.s32.tilespmem.tileN.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.cb.add.s32.tilespmem.tileN.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.gather.cb.add.s32.tilespmem.tileN.to.tilespmem
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_cb_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_cb_sc.ll
new file mode 100644
index 0000000..8cace55
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_cb_sc.ll
@@ -0,0 +1,1569 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32) nounwind
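+; Function-name suffixes below encode how the operands are materialized: the
+; first letter is the sync flag (r = pointer argument in a register, e.g.
+; [sflag:s1]; i = immediate built from the constant 31 via
+; @llvm.tpu.inttoptr.p204i32, matched as [sflag:$0x1f]) and the second letter
+; is the stream length (i = immediate $0x4, r = register).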
+
+; -------------------------
+; | stream.linear intrinsic |
+; -------------------------
+
+declare void @llvm.tpu.stream.linear.gather.cb.spmem.to.smem(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.cb.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.cb.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, i32) argmemonly nounwind
+
+; ---------------------------
+; | stream.indirect intrinsic |
+; ---------------------------
+
+declare void @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.cb.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.cb.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; --------------------------------
+; | stream.indirect.vreg intrinsic |
+; --------------------------------
+
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.spmem.to.smem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.hbm.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.spmem.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.tilespmem.tileN.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+
+; --------------------------
+; | stream.strided intrinsic |
+; --------------------------
+
+declare void @llvm.tpu.stream.strided.gather.cb.spmem.to.smem(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.cb.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.cb.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.cb.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
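+; As exercised in this test, every gather.cb intrinsic above takes the sync
+; flag pointer (addrspace 204), an i32 offset (0/4, or 1/5 in the tac section)
+; that reappears as the final operand of the emitted instruction, the source
+; pointer, an x86_mmx operand that is always passed as undef here, the stream
+; length, and a trailing i32 0; the indirect, indirect.vreg and strided forms
+; insert their offset-list, offset-vector/mask and stride operands between the
+; length and that trailing 0.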
+
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-vf" }
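+; The "target-cpu" function attributes above override the RUN line's -mcpu, so
+; the per-core sections below can check each core's encoding within a single
+; file: #0 marks the sparsecore-scs-vf tests, #1 sparsecore-tac-vf and
+; #2 sparsecore-tec-vf.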
+
+; ------------------------
+; | Test sparsecore-scs-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.spmem.to.smem
+
+; CHECK-LABEL: stream_linear_gather_cb_scs_spmem_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb [spmem:s0], $0x4, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_scs_spmem_to_smem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_scs_spmem_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb [spmem:s0], s2, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb [spmem:s0], s2, $0x4
+define void @stream_linear_gather_cb_scs_spmem_to_smem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len) #0 {
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_scs_spmem_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [spmem:s0], $0x4, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_scs_spmem_to_smem_ii(i32 addrspace(202)* %src, i32 addrspace(209)* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_scs_spmem_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [spmem:s0], s1, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [spmem:s0], s1, $0x4
+define void @stream_linear_gather_cb_scs_spmem_to_smem_ir(i32 addrspace(202)* %src, i32 %len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem
+
+; CHECK-LABEL: stream_indirect_gather_cb_scs_spmem_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb [spmem:s0], $0x4, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb [spmem:s0], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_cb_scs_spmem_to_smem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_scs_spmem_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb [spmem:s0], s2, s3, s4, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb [spmem:s0], s2, s3, s4, $0x4
+define void @stream_indirect_gather_cb_scs_spmem_to_smem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_scs_spmem_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [spmem:s0], $0x4, s1, s2, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [spmem:s0], $0x4, s1, s2, $0x4
+define void @stream_indirect_gather_cb_scs_spmem_to_smem_ii(i32 addrspace(202)* %src, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_scs_spmem_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [spmem:s0], s1, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [spmem:s0], s1, s2, s3, $0x4
+define void @stream_indirect_gather_cb_scs_spmem_to_smem_ir(i32 addrspace(202)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.spmem.to.smem
+
+; CHECK-LABEL: stream_strided_gather_cb_scs_spmem_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb [spmem:s0], $0x4, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb [spmem:s0], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_cb_scs_spmem_to_smem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb_scs_spmem_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb [spmem:s0], s2, s3, s4, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb [spmem:s0], s2, s3, s4, $0x4
+define void @stream_strided_gather_cb_scs_spmem_to_smem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb_scs_spmem_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [spmem:s0], $0x4, s1, s2, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [spmem:s0], $0x4, s1, s2, $0x4
+define void @stream_strided_gather_cb_scs_spmem_to_smem_ii(i32 addrspace(202)* %src, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb_scs_spmem_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [spmem:s0], s1, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [spmem:s0], s1, s2, s3, $0x4
+define void @stream_strided_gather_cb_scs_spmem_to_smem_ir(i32 addrspace(202)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_linear_gather_cb_scs_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_scs_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag) #0 {
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_scs_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb [tilespmem.tileN:s0], s2, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb [tilespmem.tileN:s0], s2, $0x4
+define void @stream_linear_gather_cb_scs_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len) #0 {
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_scs_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_scs_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32 addrspace(209)* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_scs_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [tilespmem.tileN:s0], s1, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [tilespmem.tileN:s0], s1, $0x4
+define void @stream_linear_gather_cb_scs_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32 %tileN_len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_indirect_gather_cb_scs_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb [tilespmem.tileN:s0], $0x4, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb [tilespmem.tileN:s0], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_cb_scs_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_scs_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb [tilespmem.tileN:s0], s2, s3, s4, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb [tilespmem.tileN:s0], s2, s3, s4, $0x4
+define void @stream_indirect_gather_cb_scs_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_scs_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [tilespmem.tileN:s0], $0x4, s1, s2, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [tilespmem.tileN:s0], $0x4, s1, s2, $0x4
+define void @stream_indirect_gather_cb_scs_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_scs_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [tilespmem.tileN:s0], s1, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [tilespmem.tileN:s0], s1, s2, s3, $0x4
+define void @stream_indirect_gather_cb_scs_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_strided_gather_cb_scs_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb [tilespmem.tileN:s0], $0x4, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb [tilespmem.tileN:s0], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_cb_scs_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb_scs_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb [tilespmem.tileN:s0], s2, s3, s4, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb [tilespmem.tileN:s0], s2, s3, s4, $0x4
+define void @stream_strided_gather_cb_scs_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb_scs_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [tilespmem.tileN:s0], $0x4, s1, s2, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [tilespmem.tileN:s0], $0x4, s1, s2, $0x4
+define void @stream_strided_gather_cb_scs_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb_scs_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [tilespmem.tileN:s0], s1, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [tilespmem.tileN:s0], s1, s2, s3, $0x4
+define void @stream_strided_gather_cb_scs_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; --------------------------
+; | Test sparsecore-tac-vf |
+; --------------------------
+
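+; A short reading of this group, inferred from the call sites rather than from
+; any documentation in the file: the _ri/_rr/_ii/_ir suffixes appear to encode
+; whether the sync flag and the length operand are passed in a register or as
+; an immediate, and each test issues its intrinsic twice with 1 and 5 as the
+; second i32 argument, so the checked assembly lines end in $0x1 and $0x5.
+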
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.spmem.to.smem
+
+; CHECK-LABEL: stream_linear_gather_cb_tac_spmem_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb [spmem:s0], $0x4, $0x1
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb [spmem:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_tac_spmem_to_smem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tac_spmem_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb [spmem:s0], s2, $0x1
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb [spmem:s0], s2, $0x5
+define void @stream_linear_gather_cb_tac_spmem_to_smem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tac_spmem_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [spmem:s0], $0x4, $0x1
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [spmem:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_tac_spmem_to_smem_ii(i32 addrspace(202)* %src, i32 addrspace(209)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tac_spmem_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [spmem:s0], s1, $0x1
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [spmem:s0], s1, $0x5
+define void @stream_linear_gather_cb_tac_spmem_to_smem_ir(i32 addrspace(202)* %src, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem
+
+; CHECK-LABEL: stream_indirect_gather_cb_tac_spmem_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb [spmem:s0], $0x4, s2, s3, $0x1
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb [spmem:s0], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_cb_tac_spmem_to_smem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_tac_spmem_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb [spmem:s0], s2, s3, s4, $0x1
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb [spmem:s0], s2, s3, s4, $0x5
+define void @stream_indirect_gather_cb_tac_spmem_to_smem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_tac_spmem_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [spmem:s0], $0x4, s1, s2, $0x1
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [spmem:s0], $0x4, s1, s2, $0x5
+define void @stream_indirect_gather_cb_tac_spmem_to_smem_ii(i32 addrspace(202)* %src, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_tac_spmem_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [spmem:s0], s1, s2, s3, $0x1
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [spmem:s0], s1, s2, s3, $0x5
+define void @stream_indirect_gather_cb_tac_spmem_to_smem_ir(i32 addrspace(202)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.spmem.to.smem
+
+; CHECK-LABEL: stream_strided_gather_cb_tac_spmem_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb [spmem:s0], $0x4, s2, s3, $0x1
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb [spmem:s0], $0x4, s2, s3, $0x5
+define void @stream_strided_gather_cb_tac_spmem_to_smem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb_tac_spmem_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb [spmem:s0], s2, s3, s4, $0x1
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb [spmem:s0], s2, s3, s4, $0x5
+define void @stream_strided_gather_cb_tac_spmem_to_smem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb_tac_spmem_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [spmem:s0], $0x4, s1, s2, $0x1
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [spmem:s0], $0x4, s1, s2, $0x5
+define void @stream_strided_gather_cb_tac_spmem_to_smem_ii(i32 addrspace(202)* %src, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb_tac_spmem_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [spmem:s0], s1, s2, s3, $0x1
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [spmem:s0], s1, s2, s3, $0x5
+define void @stream_strided_gather_cb_tac_spmem_to_smem_ir(i32 addrspace(202)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_linear_gather_cb_tac_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb [tilespmem.tileN:s0], $0x4, $0x1
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb [tilespmem.tileN:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_tac_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tac_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb [tilespmem.tileN:s0], s2, $0x1
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb [tilespmem.tileN:s0], s2, $0x5
+define void @stream_linear_gather_cb_tac_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tac_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [tilespmem.tileN:s0], $0x4, $0x1
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [tilespmem.tileN:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_tac_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32 addrspace(209)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tac_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [tilespmem.tileN:s0], s1, $0x1
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [tilespmem.tileN:s0], s1, $0x5
+define void @stream_linear_gather_cb_tac_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_indirect_gather_cb_tac_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb [tilespmem.tileN:s0], $0x4, s2, s3, $0x1
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb [tilespmem.tileN:s0], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_cb_tac_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_tac_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb [tilespmem.tileN:s0], s2, s3, s4, $0x1
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb [tilespmem.tileN:s0], s2, s3, s4, $0x5
+define void @stream_indirect_gather_cb_tac_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_tac_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [tilespmem.tileN:s0], $0x4, s1, s2, $0x1
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [tilespmem.tileN:s0], $0x4, s1, s2, $0x5
+define void @stream_indirect_gather_cb_tac_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_tac_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [tilespmem.tileN:s0], s1, s2, s3, $0x1
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [tilespmem.tileN:s0], s1, s2, s3, $0x5
+define void @stream_indirect_gather_cb_tac_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Not tested: intrinsic @llvm.tpu.stream.strided.gather.cb.tilespmem.tileN.to.smem
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [hbm:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [hbm:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [hbm:s0], s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [hbm:s0], s2, $0x5
+define void @stream_linear_gather_cb_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [hbm:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [hbm:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(209)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [hbm:s0], s1, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [hbm:s0], s1, $0x5
+define void @stream_linear_gather_cb_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_cb_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb [hbm:s0], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb [hbm:s0], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_cb_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb [hbm:s0], s2, s3, s4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb [hbm:s0], s2, s3, s4, $0x5
+define void @stream_indirect_gather_cb_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [hbm:s0], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [hbm:s0], $0x4, s1, s2, $0x5
+define void @stream_indirect_gather_cb_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [hbm:s0], s1, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [hbm:s0], s1, s2, s3, $0x5
+define void @stream_indirect_gather_cb_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_cb_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb [hbm:s0], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb [hbm:s0], $0x4, s2, s3, $0x5
+define void @stream_strided_gather_cb_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb [hbm:s0], s2, s3, s4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb [hbm:s0], s2, s3, s4, $0x5
+define void @stream_strided_gather_cb_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [hbm:s0], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [hbm:s0], $0x4, s1, s2, $0x5
+define void @stream_strided_gather_cb_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [hbm:s0], s1, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [hbm:s0], s1, s2, s3, $0x5
+define void @stream_strided_gather_cb_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [hbm4b:s0+s2], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [hbm4b:s0+s2], $0x4, $0x5
+define void @stream_linear_gather_cb_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [hbm4b:s0+s3], s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [hbm4b:s0+s3], s2, $0x5
+define void @stream_linear_gather_cb_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [hbm4b:s0+s1], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [hbm4b:s0+s1], $0x4, $0x5
+define void @stream_linear_gather_cb_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(209)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [hbm4b:s0+s2], s1, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [hbm4b:s0+s2], s1, $0x5
+define void @stream_linear_gather_cb_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_cb_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb [hbm4b:s0+s3], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb [hbm4b:s0+s3], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_cb_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb [hbm4b:s0+s4], s2, s3, s4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb [hbm4b:s0+s4], s2, s3, s4, $0x5
+define void @stream_indirect_gather_cb_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [hbm4b:s0+s2], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [hbm4b:s0+s2], $0x4, s1, s2, $0x5
+define void @stream_indirect_gather_cb_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [hbm4b:s0+s3], s1, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [hbm4b:s0+s3], s1, s2, s3, $0x5
+define void @stream_indirect_gather_cb_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_cb_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb [hbm4b:s0+s3], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb [hbm4b:s0+s3], $0x4, s2, s3, $0x5
+define void @stream_strided_gather_cb_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb [hbm4b:s0+s4], s2, s3, s4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb [hbm4b:s0+s4], s2, s3, s4, $0x5
+define void @stream_strided_gather_cb_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [hbm4b:s0+s2], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [hbm4b:s0+s2], $0x4, s1, s2, $0x5
+define void @stream_strided_gather_cb_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [hbm4b:s0+s3], s1, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [hbm4b:s0+s3], s1, s2, s3, $0x5
+define void @stream_strided_gather_cb_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.spmem.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_tac_spmem_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [spmem:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [spmem:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_tac_spmem_to_tilespmem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tac_spmem_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [spmem:s0], s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [spmem:s0], s2, $0x5
+define void @stream_linear_gather_cb_tac_spmem_to_tilespmem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tac_spmem_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [spmem:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [spmem:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_tac_spmem_to_tilespmem_ii(i32 addrspace(202)* %src, i32 addrspace(209)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tac_spmem_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [spmem:s0], s1, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [spmem:s0], s1, $0x5
+define void @stream_linear_gather_cb_tac_spmem_to_tilespmem_ir(i32 addrspace(202)* %src, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested: intrinsic @llvm.tpu.stream.indirect.gather.cb.spmem.to.tilespmem
+
+; Not tested: intrinsic @llvm.tpu.stream.strided.gather.cb.spmem.to.tilespmem
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_tac_tilespmem_tileN_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [tilespmem.tileN:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [tilespmem.tileN:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_tac_tilespmem_tileN_to_tilespmem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tac_tilespmem_tileN_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [tilespmem.tileN:s0], s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [tilespmem.tileN:s0], s2, $0x5
+define void @stream_linear_gather_cb_tac_tilespmem_tileN_to_tilespmem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tac_tilespmem_tileN_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [tilespmem.tileN:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [tilespmem.tileN:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_tac_tilespmem_tileN_to_tilespmem_ii(i32 addrspace(201)* %src, i32 addrspace(209)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tac_tilespmem_tileN_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [tilespmem.tileN:s0], s1, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [tilespmem.tileN:s0], s1, $0x5
+define void @stream_linear_gather_cb_tac_tilespmem_tileN_to_tilespmem_ir(i32 addrspace(201)* %src, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested: intrinsic @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.tilespmem
+
+; Not tested: intrinsic @llvm.tpu.stream.strided.gather.cb.tilespmem.tileN.to.tilespmem
+
+; --------------------------
+; | Test sparsecore-tec-vf |
+; --------------------------
+
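+; Same pattern as the sections above, again inferred from the call sites: these
+; tests carry function attribute #2 (presumably selecting the sparsecore-tec-vf
+; target named in the header) and pass 0 and 4 as the second i32 argument, so
+; the checked lines end in $0 and $0x4 rather than $0x1 and $0x5.
+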
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.spmem.to.smem
+
+; CHECK-LABEL: stream_linear_gather_cb_tec_spmem_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb [spmem:s0], $0x4, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_tec_spmem_to_smem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tec_spmem_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb [spmem:s0], s2, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb [spmem:s0], s2, $0x4
+define void @stream_linear_gather_cb_tec_spmem_to_smem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tec_spmem_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [spmem:s0], $0x4, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_tec_spmem_to_smem_ii(i32 addrspace(202)* %src, i32 addrspace(209)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tec_spmem_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [spmem:s0], s1, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [spmem:s0], s1, $0x4
+define void @stream_linear_gather_cb_tec_spmem_to_smem_ir(i32 addrspace(202)* %src, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem
+
+; CHECK-LABEL: stream_indirect_gather_cb_tec_spmem_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb [spmem:s0], $0x4, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb [spmem:s0], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_cb_tec_spmem_to_smem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_tec_spmem_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb [spmem:s0], s2, s3, s4, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb [spmem:s0], s2, s3, s4, $0x4
+define void @stream_indirect_gather_cb_tec_spmem_to_smem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_tec_spmem_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [spmem:s0], $0x4, s1, s2, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [spmem:s0], $0x4, s1, s2, $0x4
+define void @stream_indirect_gather_cb_tec_spmem_to_smem_ii(i32 addrspace(202)* %src, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_tec_spmem_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [spmem:s0], s1, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [spmem:s0], s1, s2, s3, $0x4
+define void @stream_indirect_gather_cb_tec_spmem_to_smem_ir(i32 addrspace(202)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.spmem.to.smem
+
+; CHECK-LABEL: stream_strided_gather_cb_tec_spmem_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb [spmem:s0], $0x4, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb [spmem:s0], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_cb_tec_spmem_to_smem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb_tec_spmem_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb [spmem:s0], s2, s3, s4, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb [spmem:s0], s2, s3, s4, $0x4
+define void @stream_strided_gather_cb_tec_spmem_to_smem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb_tec_spmem_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [spmem:s0], $0x4, s1, s2, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [spmem:s0], $0x4, s1, s2, $0x4
+define void @stream_strided_gather_cb_tec_spmem_to_smem_ii(i32 addrspace(202)* %src, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb_tec_spmem_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [spmem:s0], s1, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [spmem:s0], s1, s2, s3, $0x4
+define void @stream_strided_gather_cb_tec_spmem_to_smem_ir(i32 addrspace(202)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.gather.cb.spmem.to.smem
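+; Unlike the scalar indirect form, these indirect_vreg variants take the
+; per-lane offsets as a <8 x i32> vector and a <8 x i1> mask instead of an
+; offset pointer and a size, which is why the checked assembly shows v0 and
+; vm0 operands (a description read off the intrinsic signatures below).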
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_tec_spmem_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb [spmem:s0], $0x4, v0, vm0, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb [spmem:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_tec_spmem_to_smem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.spmem.to.smem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.spmem.to.smem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_tec_spmem_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb [spmem:s0], s2, v0, vm0, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb [spmem:s0], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_tec_spmem_to_smem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.spmem.to.smem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.spmem.to.smem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_tec_spmem_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb [spmem:s0], $0x4, v0, vm0, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb [spmem:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_tec_spmem_to_smem_ii(i32 addrspace(202)* %src, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.spmem.to.smem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.spmem.to.smem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_tec_spmem_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb [spmem:s0], s1, v0, vm0, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb [spmem:s0], s1, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_tec_spmem_to_smem_ir(i32 addrspace(202)* %src, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.spmem.to.smem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.spmem.to.smem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_linear_gather_cb_tec_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_tec_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tec_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb [tilespmem.tileN:s0], s2, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb [tilespmem.tileN:s0], s2, $0x4
+define void @stream_linear_gather_cb_tec_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tec_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_tec_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32 addrspace(209)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tec_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [tilespmem.tileN:s0], s1, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [tilespmem.tileN:s0], s1, $0x4
+define void @stream_linear_gather_cb_tec_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_indirect_gather_cb_tec_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb [tilespmem.tileN:s0], $0x4, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb [tilespmem.tileN:s0], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_cb_tec_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_tec_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb [tilespmem.tileN:s0], s2, s3, s4, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb [tilespmem.tileN:s0], s2, s3, s4, $0x4
+define void @stream_indirect_gather_cb_tec_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_tec_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [tilespmem.tileN:s0], $0x4, s1, s2, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [tilespmem.tileN:s0], $0x4, s1, s2, $0x4
+define void @stream_indirect_gather_cb_tec_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_tec_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [tilespmem.tileN:s0], s1, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [tilespmem.tileN:s0], s1, s2, s3, $0x4
+define void @stream_indirect_gather_cb_tec_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_strided_gather_cb_tec_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb [tilespmem.tileN:s0], $0x4, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb [tilespmem.tileN:s0], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_cb_tec_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb_tec_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb [tilespmem.tileN:s0], s2, s3, s4, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb [tilespmem.tileN:s0], s2, s3, s4, $0x4
+define void @stream_strided_gather_cb_tec_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb_tec_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [tilespmem.tileN:s0], $0x4, s1, s2, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [tilespmem.tileN:s0], $0x4, s1, s2, $0x4
+define void @stream_strided_gather_cb_tec_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb_tec_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [tilespmem.tileN:s0], s1, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [tilespmem.tileN:s0], s1, s2, s3, $0x4
+define void @stream_strided_gather_cb_tec_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.gather.cb.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_tec_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb [tilespmem.tileN:s0], $0x4, v0, vm0, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb [tilespmem.tileN:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_tec_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_tec_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb [tilespmem.tileN:s0], s2, v0, vm0, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb [tilespmem.tileN:s0], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_tec_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_tec_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb [tilespmem.tileN:s0], $0x4, v0, vm0, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb [tilespmem.tileN:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_tec_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_tec_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb [tilespmem.tileN:s0], s1, v0, vm0, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb [tilespmem.tileN:s0], s1, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_tec_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [hbm:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [hbm:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [hbm:s0], s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [hbm:s0], s2, $0x4
+define void @stream_linear_gather_cb_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [hbm:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [hbm:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(209)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [hbm:s0], s1, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [hbm:s0], s1, $0x4
+define void @stream_linear_gather_cb_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_cb_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb [hbm:s0], $0x4, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb [hbm:s0], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_cb_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb [hbm:s0], s2, s3, s4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb [hbm:s0], s2, s3, s4, $0x4
+define void @stream_indirect_gather_cb_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [hbm:s0], $0x4, s1, s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [hbm:s0], $0x4, s1, s2, $0x4
+define void @stream_indirect_gather_cb_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [hbm:s0], s1, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [hbm:s0], s1, s2, s3, $0x4
+define void @stream_indirect_gather_cb_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_cb_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb [hbm:s0], $0x4, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb [hbm:s0], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_cb_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb [hbm:s0], s2, s3, s4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb [hbm:s0], s2, s3, s4, $0x4
+define void @stream_strided_gather_cb_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [hbm:s0], $0x4, s1, s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [hbm:s0], $0x4, s1, s2, $0x4
+define void @stream_strided_gather_cb_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [hbm:s0], s1, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [hbm:s0], s1, s2, s3, $0x4
+define void @stream_strided_gather_cb_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.gather.cb.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb [hbm:s0], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb [hbm:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb [hbm:s0], s2, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb [hbm:s0], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb [hbm:s0], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb [hbm:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb [hbm:s0], s1, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb [hbm:s0], s1, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [hbm4b:s0+s2], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [hbm4b:s0+s2], $0x4, $0x4
+define void @stream_linear_gather_cb_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [hbm4b:s0+s3], s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [hbm4b:s0+s3], s2, $0x4
+define void @stream_linear_gather_cb_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [hbm4b:s0+s1], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [hbm4b:s0+s1], $0x4, $0x4
+define void @stream_linear_gather_cb_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(209)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [hbm4b:s0+s2], s1, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [hbm4b:s0+s2], s1, $0x4
+define void @stream_linear_gather_cb_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_cb_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb [hbm4b:s0+s3], $0x4, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb [hbm4b:s0+s3], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_cb_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb [hbm4b:s0+s4], s2, s3, s4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb [hbm4b:s0+s4], s2, s3, s4, $0x4
+define void @stream_indirect_gather_cb_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [hbm4b:s0+s2], $0x4, s1, s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [hbm4b:s0+s2], $0x4, s1, s2, $0x4
+define void @stream_indirect_gather_cb_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [hbm4b:s0+s3], s1, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb [hbm4b:s0+s3], s1, s2, s3, $0x4
+define void @stream_indirect_gather_cb_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_cb_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb [hbm4b:s0+s3], $0x4, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb [hbm4b:s0+s3], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_cb_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb [hbm4b:s0+s4], s2, s3, s4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb [hbm4b:s0+s4], s2, s3, s4, $0x4
+define void @stream_strided_gather_cb_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [hbm4b:s0+s2], $0x4, s1, s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [hbm4b:s0+s2], $0x4, s1, s2, $0x4
+define void @stream_strided_gather_cb_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [hbm4b:s0+s3], s1, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb [hbm4b:s0+s3], s1, s2, s3, $0x4
+define void @stream_strided_gather_cb_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.gather.cb.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb [hbm4b:s0+s2], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb [hbm4b:s0+s2], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb [hbm4b:s0+s3], s2, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb [hbm4b:s0+s3], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb [hbm4b:s0+s1], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb [hbm4b:s0+s1], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb [hbm4b:s0+s2], s1, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb [hbm4b:s0+s2], s1, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.spmem.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_tec_spmem_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [spmem:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_tec_spmem_to_tilespmem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tec_spmem_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [spmem:s0], s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [spmem:s0], s2, $0x4
+define void @stream_linear_gather_cb_tec_spmem_to_tilespmem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tec_spmem_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [spmem:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_tec_spmem_to_tilespmem_ii(i32 addrspace(202)* %src, i32 addrspace(209)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tec_spmem_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [spmem:s0], s1, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [spmem:s0], s1, $0x4
+define void @stream_linear_gather_cb_tec_spmem_to_tilespmem_ir(i32 addrspace(202)* %src, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.cb.spmem.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.cb.spmem.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.gather.cb.spmem.to.tilespmem
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_tec_tilespmem_tileN_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_tec_tilespmem_tileN_to_tilespmem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tec_tilespmem_tileN_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [tilespmem.tileN:s0], s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb [tilespmem.tileN:s0], s2, $0x4
+define void @stream_linear_gather_cb_tec_tilespmem_tileN_to_tilespmem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tec_tilespmem_tileN_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_tec_tilespmem_tileN_to_tilespmem_ii(i32 addrspace(201)* %src, i32 addrspace(209)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_tec_tilespmem_tileN_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [tilespmem.tileN:s0], s1, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb [tilespmem.tileN:s0], s1, $0x4
+define void @stream_linear_gather_cb_tec_tilespmem_tileN_to_tilespmem_ir(i32 addrspace(201)* %src, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.cb.tilespmem.tileN.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.cb.tilespmem.tileN.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.gather.cb.tilespmem.tileN.to.tilespmem
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_cb_upd_add_f32_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_cb_upd_add_f32_sc.ll
new file mode 100644
index 0000000..8418667
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_cb_upd_add_f32_sc.ll
@@ -0,0 +1,795 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32) nounwind
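+; @llvm.tpu.inttoptr.p204i32 is used in the *_ii and *_ir tests below to
+; materialize an immediate sflag address (31, printed as [sflag:$0x1f] in the checks).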
+
+; -------------------------
+; | stream.linear intrinsic |
+; -------------------------
+
+declare void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, i32) argmemonly nounwind
+
+; ---------------------------
+; | stream.indirect intrinsic |
+; ---------------------------
+
+declare void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32, i32, i32, i32, i32) argmemonly nounwind
+
+; --------------------------------
+; | stream.indirect.vreg intrinsic |
+; --------------------------------
+
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.f32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.f32.spmem.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.f32.tilespmem.tileN.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32, i32, i32, i32, i32) argmemonly nounwind
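+; Note: the tilespmem.tileN.to.tilespmem forms of the indirect and indirect.vreg
+; intrinsics above carry additional trailing i32 operands compared with the
+; hbm/hbm4b/spmem sources.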
+
+; --------------------------
+; | stream.strided intrinsic |
+; --------------------------
+
+declare void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
+
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-vf" }
+
+; ------------------------
+; | Test sparsecore-tac-vf |
+; ------------------------
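+; These tests are built with attribute #1 ("target-cpu"="sparsecore-tac-vf"),
+; so they are expected to exercise the TAC core rather than the RUN line's
+; default of sparsecore-tec-vf.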
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [hbm:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [hbm:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [hbm:s0], s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [hbm:s0], s2, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [hbm:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [hbm:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [hbm:s0], s1, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [hbm:s0], s1, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd.add.f32 [hbm:s0], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd.add.f32 [hbm:s0], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd.add.f32 [hbm:s0], s2, s3, s4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd.add.f32 [hbm:s0], s2, s3, s4, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd.add.f32 [hbm:s0], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd.add.f32 [hbm:s0], $0x4, s1, s2, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd.add.f32 [hbm:s0], s1, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd.add.f32 [hbm:s0], s1, s2, s3, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd.add.f32 [hbm:s0], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd.add.f32 [hbm:s0], $0x4, s2, s3, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd.add.f32 [hbm:s0], s2, s3, s4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd.add.f32 [hbm:s0], s2, s3, s4, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd.add.f32 [hbm:s0], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd.add.f32 [hbm:s0], $0x4, s1, s2, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd.add.f32 [hbm:s0], s1, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd.add.f32 [hbm:s0], s1, s2, s3, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [hbm4b:s0+s2], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [hbm4b:s0+s2], $0x4, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [hbm4b:s0+s3], s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [hbm4b:s0+s3], s2, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [hbm4b:s0+s1], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [hbm4b:s0+s1], $0x4, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [hbm4b:s0+s2], s1, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [hbm4b:s0+s2], s1, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd.add.f32 [hbm4b:s0+s3], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd.add.f32 [hbm4b:s0+s3], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd.add.f32 [hbm4b:s0+s4], s2, s3, s4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd.add.f32 [hbm4b:s0+s4], s2, s3, s4, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd.add.f32 [hbm4b:s0+s2], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd.add.f32 [hbm4b:s0+s2], $0x4, s1, s2, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd.add.f32 [hbm4b:s0+s3], s1, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd.add.f32 [hbm4b:s0+s3], s1, s2, s3, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd.add.f32 [hbm4b:s0+s3], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd.add.f32 [hbm4b:s0+s3], $0x4, s2, s3, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd.add.f32 [hbm4b:s0+s4], s2, s3, s4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd.add.f32 [hbm4b:s0+s4], s2, s3, s4, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd.add.f32 [hbm4b:s0+s2], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd.add.f32 [hbm4b:s0+s2], $0x4, s1, s2, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd.add.f32 [hbm4b:s0+s3], s1, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd.add.f32 [hbm4b:s0+s3], s1, s2, s3, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.upd.add.f32.spmem.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_spmem_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [spmem:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [spmem:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_spmem_to_tilespmem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_spmem_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [spmem:s0], s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [spmem:s0], s2, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_spmem_to_tilespmem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_spmem_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [spmem:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [spmem:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_spmem_to_tilespmem_ii(i32 addrspace(202)* %src) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_spmem_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [spmem:s0], s1, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [spmem:s0], s1, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_spmem_to_tilespmem_ir(i32 addrspace(202)* %src, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.spmem.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.cb.upd.add.f32.spmem.to.tilespmem
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.upd.add.f32.tilespmem.tileN.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_tilespmem_tileN_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [tilespmem.tileN:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [tilespmem.tileN:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_tilespmem_tileN_to_tilespmem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_tilespmem_tileN_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [tilespmem.tileN:s0], s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [tilespmem.tileN:s0], s2, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_tilespmem_tileN_to_tilespmem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_tilespmem_tileN_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [tilespmem.tileN:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [tilespmem.tileN:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_tilespmem_tileN_to_tilespmem_ii(i32 addrspace(201)* %src) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tac_tilespmem_tileN_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [tilespmem.tileN:s0], s1, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [tilespmem.tileN:s0], s1, $0x5
+define void @stream_linear_gather_cb_add_f32_tac_tilespmem_tileN_to_tilespmem_ir(i32 addrspace(201)* %src, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.tilespmem.tileN.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.cb.upd.add.f32.tilespmem.tileN.to.tilespmem
+
+; ------------------------
+; | Test sparsecore-tec-vf |
+; ------------------------
+
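+; Note: the _ri/_rr/_ii/_ir suffixes in the test names appear to encode how the
+; two variable operands are supplied: the first letter is the sync-flag operand
+; and the second the length operand, with "r" meaning a register argument and
+; "i" an immediate (immediate flags are materialized via
+; @llvm.tpu.inttoptr.p204i32(i32 31), hence the $0x1f in the CHECK lines).
+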
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [hbm:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [hbm:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [hbm:s0], s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [hbm:s0], s2, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [hbm:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [hbm:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [hbm:s0], s1, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [hbm:s0], s1, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd.add.f32 [hbm:s0], $0x4, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd.add.f32 [hbm:s0], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd.add.f32 [hbm:s0], s2, s3, s4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd.add.f32 [hbm:s0], s2, s3, s4, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd.add.f32 [hbm:s0], $0x4, s1, s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd.add.f32 [hbm:s0], $0x4, s1, s2, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd.add.f32 [hbm:s0], s1, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd.add.f32 [hbm:s0], s1, s2, s3, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd.add.f32 [hbm:s0], $0x4, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd.add.f32 [hbm:s0], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd.add.f32 [hbm:s0], s2, s3, s4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd.add.f32 [hbm:s0], s2, s3, s4, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd.add.f32 [hbm:s0], $0x4, s1, s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd.add.f32 [hbm:s0], $0x4, s1, s2, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd.add.f32 [hbm:s0], s1, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd.add.f32 [hbm:s0], s1, s2, s3, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.f32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_add_f32_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.upd.add.f32 [hbm:s0], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.upd.add.f32 [hbm:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_add_f32_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_add_f32_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.upd.add.f32 [hbm:s0], s2, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.upd.add.f32 [hbm:s0], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_add_f32_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_add_f32_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.upd.add.f32 [hbm:s0], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.upd.add.f32 [hbm:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_add_f32_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_add_f32_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.upd.add.f32 [hbm:s0], s1, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.upd.add.f32 [hbm:s0], s1, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_add_f32_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [hbm4b:s0+s2], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [hbm4b:s0+s2], $0x4, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [hbm4b:s0+s3], s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [hbm4b:s0+s3], s2, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [hbm4b:s0+s1], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [hbm4b:s0+s1], $0x4, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [hbm4b:s0+s2], s1, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [hbm4b:s0+s2], s1, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd.add.f32 [hbm4b:s0+s3], $0x4, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd.add.f32 [hbm4b:s0+s3], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd.add.f32 [hbm4b:s0+s4], s2, s3, s4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd.add.f32 [hbm4b:s0+s4], s2, s3, s4, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd.add.f32 [hbm4b:s0+s2], $0x4, s1, s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd.add.f32 [hbm4b:s0+s2], $0x4, s1, s2, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd.add.f32 [hbm4b:s0+s3], s1, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd.add.f32 [hbm4b:s0+s3], s1, s2, s3, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd.add.f32 [hbm4b:s0+s3], $0x4, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd.add.f32 [hbm4b:s0+s3], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd.add.f32 [hbm4b:s0+s4], s2, s3, s4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd.add.f32 [hbm4b:s0+s4], s2, s3, s4, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd.add.f32 [hbm4b:s0+s2], $0x4, s1, s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd.add.f32 [hbm4b:s0+s2], $0x4, s1, s2, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd.add.f32 [hbm4b:s0+s3], s1, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd.add.f32 [hbm4b:s0+s3], s1, s2, s3, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.f32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.f32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_add_f32_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.upd.add.f32 [hbm4b:s0+s2], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.upd.add.f32 [hbm4b:s0+s2], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_add_f32_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.f32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.f32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_add_f32_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.upd.add.f32 [hbm4b:s0+s3], s2, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.upd.add.f32 [hbm4b:s0+s3], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_add_f32_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.f32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.f32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_add_f32_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.upd.add.f32 [hbm4b:s0+s1], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.upd.add.f32 [hbm4b:s0+s1], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_add_f32_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.f32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.f32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_add_f32_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.upd.add.f32 [hbm4b:s0+s2], s1, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.upd.add.f32 [hbm4b:s0+s2], s1, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb_add_f32_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.f32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.f32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.upd.add.f32.spmem.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_spmem_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [spmem:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_spmem_to_tilespmem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_spmem_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [spmem:s0], s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [spmem:s0], s2, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_spmem_to_tilespmem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_spmem_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [spmem:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_spmem_to_tilespmem_ii(i32 addrspace(202)* %src) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_spmem_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [spmem:s0], s1, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [spmem:s0], s1, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_spmem_to_tilespmem_ir(i32 addrspace(202)* %src, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.spmem.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.cb.upd.add.f32.spmem.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.f32.spmem.to.tilespmem
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.upd.add.f32.tilespmem.tileN.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_tilespmem_tileN_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_tilespmem_tileN_to_tilespmem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_tilespmem_tileN_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [tilespmem.tileN:s0], s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.f32 [tilespmem.tileN:s0], s2, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_tilespmem_tileN_to_tilespmem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_tilespmem_tileN_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_tilespmem_tileN_to_tilespmem_ii(i32 addrspace(201)* %src) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_f32_tec_tilespmem_tileN_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [tilespmem.tileN:s0], s1, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.f32 [tilespmem.tileN:s0], s1, $0x4
+define void @stream_linear_gather_cb_add_f32_tec_tilespmem_tileN_to_tilespmem_ir(i32 addrspace(201)* %src, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.f32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.cb.upd.add.f32.tilespmem.tileN.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.cb.upd.add.f32.tilespmem.tileN.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.f32.tilespmem.tileN.to.tilespmem
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_cb_upd_add_s32_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_cb_upd_add_s32_sc.ll
new file mode 100644
index 0000000..a1a3df4
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_cb_upd_add_s32_sc.ll
@@ -0,0 +1,383 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32) nounwind
+
+; -------------------------
+; | stream.linear intrinsic |
+; -------------------------
+
+declare void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, i32) argmemonly nounwind
+
+; ---------------------------
+; | stream.indirect intrinsic |
+; ---------------------------
+
+declare void @llvm.tpu.stream.indirect.gather.cb.upd.add.s32.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.cb.upd.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.cb.upd.add.s32.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.cb.upd.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32, i32, i32, i32, i32) argmemonly nounwind
+
+; --------------------------------
+; | stream.indirect.vreg intrinsic |
+; --------------------------------
+
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.s32.hbm.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.s32.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.s32.spmem.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.s32.tilespmem.tileN.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32, i32, i32, i32, i32) argmemonly nounwind
+
+; --------------------------
+; | stream.strided intrinsic |
+; --------------------------
+
+declare void @llvm.tpu.stream.strided.gather.cb.upd.add.s32.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.cb.upd.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.cb.upd.add.s32.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.cb.upd.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
+
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-vf" }
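+
+; Naming convention used by the tests below: the _ri/_rr/_ii/_ir suffixes describe the
+; sync-flag and length operands, in that order (r = register, i = immediate). The _ii
+; and _ir variants materialize the sync-flag pointer with
+; @llvm.tpu.inttoptr.p204i32(i32 31), which is printed as the immediate operand
+; [sflag:$0x1f]; the _ri and _rr variants pass the flag in a register.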
+
+; ------------------------
+; | Test sparsecore-tac-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.upd.add.s32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.s32 [hbm:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.s32 [hbm:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_add_s32_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.s32 [hbm:s0], s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.s32 [hbm:s0], s2, $0x5
+define void @stream_linear_gather_cb_add_s32_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.s32 [hbm:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.s32 [hbm:s0], $0x4, $0x5
+define void @stream_linear_gather_cb_add_s32_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.s32 [hbm:s0], s1, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.s32 [hbm:s0], s1, $0x5
+define void @stream_linear_gather_cb_add_s32_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.upd.add.s32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd.add.s32 [hbm:s0], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd.add.s32 [hbm:s0], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd.add.s32 [hbm:s0], s2, s3, s4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd.add.s32 [hbm:s0], s2, s3, s4, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd.add.s32 [hbm:s0], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd.add.s32 [hbm:s0], $0x4, s1, s2, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd.add.s32 [hbm:s0], s1, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd.add.s32 [hbm:s0], s1, s2, s3, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.upd.add.s32.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd.add.s32 [hbm:s0], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd.add.s32 [hbm:s0], $0x4, s2, s3, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd.add.s32 [hbm:s0], s2, s3, s4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd.add.s32 [hbm:s0], s2, s3, s4, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd.add.s32 [hbm:s0], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd.add.s32 [hbm:s0], $0x4, s1, s2, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd.add.s32 [hbm:s0], s1, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd.add.s32 [hbm:s0], s1, s2, s3, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.s32.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.upd.add.s32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.s32 [hbm4b:s0+s2], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.s32 [hbm4b:s0+s2], $0x4, $0x5
+define void @stream_linear_gather_cb_add_s32_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.s32 [hbm4b:s0+s3], s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.s32 [hbm4b:s0+s3], s2, $0x5
+define void @stream_linear_gather_cb_add_s32_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.s32 [hbm4b:s0+s1], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.s32 [hbm4b:s0+s1], $0x4, $0x5
+define void @stream_linear_gather_cb_add_s32_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.s32 [hbm4b:s0+s2], s1, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.s32 [hbm4b:s0+s2], s1, $0x5
+define void @stream_linear_gather_cb_add_s32_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.upd.add.s32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd.add.s32 [hbm4b:s0+s3], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd.add.s32 [hbm4b:s0+s3], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd.add.s32 [hbm4b:s0+s4], s2, s3, s4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd.add.s32 [hbm4b:s0+s4], s2, s3, s4, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd.add.s32 [hbm4b:s0+s2], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd.add.s32 [hbm4b:s0+s2], $0x4, s1, s2, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd.add.s32 [hbm4b:s0+s3], s1, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd.add.s32 [hbm4b:s0+s3], s1, s2, s3, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.upd.add.s32.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd.add.s32 [hbm4b:s0+s3], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd.add.s32 [hbm4b:s0+s3], $0x4, s2, s3, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd.add.s32 [hbm4b:s0+s4], s2, s3, s4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd.add.s32 [hbm4b:s0+s4], s2, s3, s4, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd.add.s32 [hbm4b:s0+s2], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd.add.s32 [hbm4b:s0+s2], $0x4, s1, s2, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd.add.s32 [hbm4b:s0+s3], s1, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd.add.s32 [hbm4b:s0+s3], s1, s2, s3, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.add.s32.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
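+; ------------------------
+; | Test sparsecore-tec-vf |
+; ------------------------
+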
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.upd.add.s32.spmem.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tec_spmem_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.s32 [spmem:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.s32 [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_add_s32_tec_spmem_to_tilespmem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tec_spmem_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.s32 [spmem:s0], s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.s32 [spmem:s0], s2, $0x4
+define void @stream_linear_gather_cb_add_s32_tec_spmem_to_tilespmem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tec_spmem_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.s32 [spmem:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.s32 [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_add_s32_tec_spmem_to_tilespmem_ii(i32 addrspace(202)* %src) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tec_spmem_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.s32 [spmem:s0], s1, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.s32 [spmem:s0], s1, $0x4
+define void @stream_linear_gather_cb_add_s32_tec_spmem_to_tilespmem_ir(i32 addrspace(202)* %src, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.cb.upd.add.s32.spmem.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.cb.upd.add.s32.spmem.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.s32.spmem.to.tilespmem
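+
+; Illustrative sketch (not part of the original CL): the indirect spmem-to-tilespmem
+; variant noted above is declared at the top of this file but has no coverage yet. The
+; commented-out function below shows the call shape implied by that declaration,
+; mirroring the hbm indirect tests earlier in the file. The function name is invented,
+; CHECK lines are omitted, and it is left commented out because the expected assembly
+; (and backend support on tec) has not been verified.
+;
+; define void @stream_indirect_gather_tec_spmem_to_tilespmem_sketch(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+;   call void @llvm.tpu.stream.indirect.gather.cb.upd.add.s32.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+;   ret void
+; }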
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.upd.add.s32.tilespmem.tileN.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tec_tilespmem_tileN_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.s32 [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.s32 [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_add_s32_tec_tilespmem_tileN_to_tilespmem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tec_tilespmem_tileN_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.s32 [tilespmem.tileN:s0], s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd.add.s32 [tilespmem.tileN:s0], s2, $0x4
+define void @stream_linear_gather_cb_add_s32_tec_tilespmem_tileN_to_tilespmem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tec_tilespmem_tileN_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.s32 [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.s32 [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_cb_add_s32_tec_tilespmem_tileN_to_tilespmem_ii(i32 addrspace(201)* %src) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb_add_s32_tec_tilespmem_tileN_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.s32 [tilespmem.tileN:s0], s1, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd.add.s32 [tilespmem.tileN:s0], s1, $0x4
+define void @stream_linear_gather_cb_add_s32_tec_tilespmem_tileN_to_tilespmem_ir(i32 addrspace(201)* %src, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.cb.upd.add.s32.tilespmem.tileN.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.cb.upd.add.s32.tilespmem.tileN.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.s32.tilespmem.tileN.to.tilespmem
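+
+; Illustrative sketch (not part of the original CL): likewise, the strided
+; tilespmem.tileN-to-tilespmem variant is declared above but untested. The commented-out
+; function below follows that declaration and the pattern of the hbm strided tests; the
+; name is invented and CHECK lines are omitted because the expected encoding is
+; unverified.
+;
+; define void @stream_strided_gather_tec_tilespmem_tileN_to_tilespmem_sketch(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+;   call void @llvm.tpu.stream.strided.gather.cb.upd.add.s32.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+;   ret void
+; }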
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_cb_upd_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_cb_upd_sc.ll
new file mode 100644
index 0000000..da5036b
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_cb_upd_sc.ll
@@ -0,0 +1,1569 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32) nounwind
+
+; -------------------------
+; | stream.linear intrinsic |
+; -------------------------
+
+declare void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, i32) argmemonly nounwind
+
+; ---------------------------
+; | stream.indirect intrinsic |
+; ---------------------------
+
+declare void @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; --------------------------------
+; | stream.indirect.vreg intrinsic |
+; --------------------------------
+
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.spmem.to.smem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.spmem.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.tilespmem.tileN.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+
+; --------------------------
+; | stream.strided intrinsic |
+; --------------------------
+
+declare void @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.cb.upd.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, x86_mmx, i32, i32, i32, i32) argmemonly nounwind
+
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-vf" }
+
+; ------------------------
+; | Test sparsecore-scs-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_scs_spmem_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [spmem:s0], $0x4, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_cb.upd_scs_spmem_to_smem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_scs_spmem_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [spmem:s0], s2, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [spmem:s0], s2, $0x4
+define void @stream_linear_gather_cb.upd_scs_spmem_to_smem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len) #0 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_scs_spmem_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [spmem:s0], $0x4, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_cb.upd_scs_spmem_to_smem_ii(i32 addrspace(202)* %src, i32 addrspace(209)* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_scs_spmem_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [spmem:s0], s1, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [spmem:s0], s1, $0x4
+define void @stream_linear_gather_cb.upd_scs_spmem_to_smem_ir(i32 addrspace(202)* %src, i32 %len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_scs_spmem_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [spmem:s0], $0x4, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [spmem:s0], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_cb.upd_scs_spmem_to_smem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_scs_spmem_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [spmem:s0], s2, s3, s4, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [spmem:s0], s2, s3, s4, $0x4
+define void @stream_indirect_gather_cb.upd_scs_spmem_to_smem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_scs_spmem_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [spmem:s0], $0x4, s1, s2, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [spmem:s0], $0x4, s1, s2, $0x4
+define void @stream_indirect_gather_cb.upd_scs_spmem_to_smem_ii(i32 addrspace(202)* %src, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_scs_spmem_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [spmem:s0], s1, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [spmem:s0], s1, s2, s3, $0x4
+define void @stream_indirect_gather_cb.upd_scs_spmem_to_smem_ir(i32 addrspace(202)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_scs_spmem_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [spmem:s0], $0x4, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [spmem:s0], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_cb.upd_scs_spmem_to_smem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_scs_spmem_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [spmem:s0], s2, s3, s4, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [spmem:s0], s2, s3, s4, $0x4
+define void @stream_strided_gather_cb.upd_scs_spmem_to_smem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_scs_spmem_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [spmem:s0], $0x4, s1, s2, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [spmem:s0], $0x4, s1, s2, $0x4
+define void @stream_strided_gather_cb.upd_scs_spmem_to_smem_ii(i32 addrspace(202)* %src, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_scs_spmem_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [spmem:s0], s1, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [spmem:s0], s1, s2, s3, $0x4
+define void @stream_strided_gather_cb.upd_scs_spmem_to_smem_ir(i32 addrspace(202)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_scs_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_cb.upd_scs_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag) #0 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_scs_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], s2, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], s2, $0x4
+define void @stream_linear_gather_cb.upd_scs_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len) #0 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_scs_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_cb.upd_scs_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32 addrspace(209)* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_scs_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], s1, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], s1, $0x4
+define void @stream_linear_gather_cb.upd_scs_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32 %tileN_len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_scs_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [tilespmem.tileN:s0], $0x4, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [tilespmem.tileN:s0], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_cb.upd_scs_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_scs_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [tilespmem.tileN:s0], s2, s3, s4, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [tilespmem.tileN:s0], s2, s3, s4, $0x4
+define void @stream_indirect_gather_cb.upd_scs_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_scs_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [tilespmem.tileN:s0], $0x4, s1, s2, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [tilespmem.tileN:s0], $0x4, s1, s2, $0x4
+define void @stream_indirect_gather_cb.upd_scs_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_scs_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [tilespmem.tileN:s0], s1, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [tilespmem.tileN:s0], s1, s2, s3, $0x4
+define void @stream_indirect_gather_cb.upd_scs_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.upd.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_scs_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [tilespmem.tileN:s0], $0x4, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [tilespmem.tileN:s0], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_cb.upd_scs_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_scs_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [tilespmem.tileN:s0], s2, s3, s4, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [tilespmem.tileN:s0], s2, s3, s4, $0x4
+define void @stream_strided_gather_cb.upd_scs_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_scs_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [tilespmem.tileN:s0], $0x4, s1, s2, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [tilespmem.tileN:s0], $0x4, s1, s2, $0x4
+define void @stream_strided_gather_cb.upd_scs_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_scs_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [tilespmem.tileN:s0], s1, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [tilespmem.tileN:s0], s1, s2, s3, $0x4
+define void @stream_strided_gather_cb.upd_scs_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; --------------------------
+; | Test sparsecore-tac-vf |
+; --------------------------
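+; Note: the functions below repeat the cb.upd gather stream tests above for the
+; sparsecore-tac-vf configuration (most definitions carry attribute group #1,
+; presumably selecting that subtarget; the attribute groups themselves are
+; assumed to be defined elsewhere in this test). Relative to the preceding scs
+; block, the observable difference is the circular-buffer flag offsets: 1 and 5
+; instead of 0 and 4, which appear as the trailing $0x1/$0x5 immediates in the
+; expected assembly.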
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tac_spmem_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [spmem:s0], $0x4, $0x1
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [spmem:s0], $0x4, $0x5
+define void @stream_linear_gather_cb.upd_tac_spmem_to_smem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tac_spmem_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [spmem:s0], s2, $0x1
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [spmem:s0], s2, $0x5
+define void @stream_linear_gather_cb.upd_tac_spmem_to_smem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tac_spmem_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [spmem:s0], $0x4, $0x1
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [spmem:s0], $0x4, $0x5
+define void @stream_linear_gather_cb.upd_tac_spmem_to_smem_ii(i32 addrspace(202)* %src, i32 addrspace(209)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tac_spmem_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [spmem:s0], s1, $0x1
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [spmem:s0], s1, $0x5
+define void @stream_linear_gather_cb.upd_tac_spmem_to_smem_ir(i32 addrspace(202)* %src, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tac_spmem_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [spmem:s0], $0x4, s2, s3, $0x1
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [spmem:s0], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_cb.upd_tac_spmem_to_smem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tac_spmem_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [spmem:s0], s2, s3, s4, $0x1
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [spmem:s0], s2, s3, s4, $0x5
+define void @stream_indirect_gather_cb.upd_tac_spmem_to_smem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tac_spmem_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [spmem:s0], $0x4, s1, s2, $0x1
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [spmem:s0], $0x4, s1, s2, $0x5
+define void @stream_indirect_gather_cb.upd_tac_spmem_to_smem_ii(i32 addrspace(202)* %src, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tac_spmem_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [spmem:s0], s1, s2, s3, $0x1
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [spmem:s0], s1, s2, s3, $0x5
+define void @stream_indirect_gather_cb.upd_tac_spmem_to_smem_ir(i32 addrspace(202)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tac_spmem_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [spmem:s0], $0x4, s2, s3, $0x1
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [spmem:s0], $0x4, s2, s3, $0x5
+define void @stream_strided_gather_cb.upd_tac_spmem_to_smem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tac_spmem_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [spmem:s0], s2, s3, s4, $0x1
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [spmem:s0], s2, s3, s4, $0x5
+define void @stream_strided_gather_cb.upd_tac_spmem_to_smem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tac_spmem_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [spmem:s0], $0x4, s1, s2, $0x1
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [spmem:s0], $0x4, s1, s2, $0x5
+define void @stream_strided_gather_cb.upd_tac_spmem_to_smem_ii(i32 addrspace(202)* %src, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tac_spmem_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [spmem:s0], s1, s2, s3, $0x1
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [spmem:s0], s1, s2, s3, $0x5
+define void @stream_strided_gather_cb.upd_tac_spmem_to_smem_ir(i32 addrspace(202)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tac_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], $0x4, $0x1
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], $0x4, $0x5
+define void @stream_linear_gather_cb.upd_tac_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tac_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], s2, $0x1
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], s2, $0x5
+define void @stream_linear_gather_cb.upd_tac_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tac_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], $0x4, $0x1
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], $0x4, $0x5
+define void @stream_linear_gather_cb.upd_tac_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32 addrspace(209)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tac_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], s1, $0x1
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], s1, $0x5
+define void @stream_linear_gather_cb.upd_tac_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tac_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [tilespmem.tileN:s0], $0x4, s2, s3, $0x1
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [tilespmem.tileN:s0], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_cb.upd_tac_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tac_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [tilespmem.tileN:s0], s2, s3, s4, $0x1
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [tilespmem.tileN:s0], s2, s3, s4, $0x5
+define void @stream_indirect_gather_cb.upd_tac_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tac_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [tilespmem.tileN:s0], $0x4, s1, s2, $0x1
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [tilespmem.tileN:s0], $0x4, s1, s2, $0x5
+define void @stream_indirect_gather_cb.upd_tac_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tac_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [tilespmem.tileN:s0], s1, s2, s3, $0x1
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [tilespmem.tileN:s0], s1, s2, s3, $0x5
+define void @stream_indirect_gather_cb.upd_tac_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.cb.upd.tilespmem.tileN.to.smem
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.upd.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [hbm:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [hbm:s0], $0x4, $0x5
+define void @stream_linear_gather_cb.upd_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [hbm:s0], s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [hbm:s0], s2, $0x5
+define void @stream_linear_gather_cb.upd_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [hbm:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [hbm:s0], $0x4, $0x5
+define void @stream_linear_gather_cb.upd_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(209)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [hbm:s0], s1, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [hbm:s0], s1, $0x5
+define void @stream_linear_gather_cb.upd_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.upd.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [hbm:s0], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [hbm:s0], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_cb.upd_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [hbm:s0], s2, s3, s4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [hbm:s0], s2, s3, s4, $0x5
+define void @stream_indirect_gather_cb.upd_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [hbm:s0], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [hbm:s0], $0x4, s1, s2, $0x5
+define void @stream_indirect_gather_cb.upd_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [hbm:s0], s1, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [hbm:s0], s1, s2, s3, $0x5
+define void @stream_indirect_gather_cb.upd_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.upd.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [hbm:s0], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [hbm:s0], $0x4, s2, s3, $0x5
+define void @stream_strided_gather_cb.upd_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [hbm:s0], s2, s3, s4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [hbm:s0], s2, s3, s4, $0x5
+define void @stream_strided_gather_cb.upd_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [hbm:s0], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [hbm:s0], $0x4, s1, s2, $0x5
+define void @stream_strided_gather_cb.upd_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [hbm:s0], s1, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [hbm:s0], s1, s2, s3, $0x5
+define void @stream_strided_gather_cb.upd_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.upd.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [hbm4b:s0+s2], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [hbm4b:s0+s2], $0x4, $0x5
+define void @stream_linear_gather_cb.upd_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [hbm4b:s0+s3], s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [hbm4b:s0+s3], s2, $0x5
+define void @stream_linear_gather_cb.upd_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [hbm4b:s0+s1], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [hbm4b:s0+s1], $0x4, $0x5
+define void @stream_linear_gather_cb.upd_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(209)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [hbm4b:s0+s2], s1, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [hbm4b:s0+s2], s1, $0x5
+define void @stream_linear_gather_cb.upd_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.upd.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [hbm4b:s0+s3], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [hbm4b:s0+s3], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_cb.upd_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [hbm4b:s0+s4], s2, s3, s4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [hbm4b:s0+s4], s2, s3, s4, $0x5
+define void @stream_indirect_gather_cb.upd_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [hbm4b:s0+s2], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [hbm4b:s0+s2], $0x4, s1, s2, $0x5
+define void @stream_indirect_gather_cb.upd_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [hbm4b:s0+s3], s1, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [hbm4b:s0+s3], s1, s2, s3, $0x5
+define void @stream_indirect_gather_cb.upd_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.upd.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [hbm4b:s0+s3], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [hbm4b:s0+s3], $0x4, s2, s3, $0x5
+define void @stream_strided_gather_cb.upd_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [hbm4b:s0+s4], s2, s3, s4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [hbm4b:s0+s4], s2, s3, s4, $0x5
+define void @stream_strided_gather_cb.upd_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [hbm4b:s0+s2], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [hbm4b:s0+s2], $0x4, s1, s2, $0x5
+define void @stream_strided_gather_cb.upd_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [hbm4b:s0+s3], s1, s2, s3, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [hbm4b:s0+s3], s1, s2, s3, $0x5
+define void @stream_strided_gather_cb.upd_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tac_spmem_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [spmem:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [spmem:s0], $0x4, $0x5
+define void @stream_linear_gather_cb.upd_tac_spmem_to_tilespmem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tac_spmem_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [spmem:s0], s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [spmem:s0], s2, $0x5
+define void @stream_linear_gather_cb.upd_tac_spmem_to_tilespmem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tac_spmem_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [spmem:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [spmem:s0], $0x4, $0x5
+define void @stream_linear_gather_cb.upd_tac_spmem_to_tilespmem_ii(i32 addrspace(202)* %src, i32 addrspace(209)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tac_spmem_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [spmem:s0], s1, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [spmem:s0], s1, $0x5
+define void @stream_linear_gather_cb.upd_tac_spmem_to_tilespmem_ir(i32 addrspace(202)* %src, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.tilespmem
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tac_tilespmem_tileN_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], $0x4, $0x5
+define void @stream_linear_gather_cb.upd_tac_tilespmem_tileN_to_tilespmem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tac_tilespmem_tileN_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], s2, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], s2, $0x5
+define void @stream_linear_gather_cb.upd_tac_tilespmem_tileN_to_tilespmem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tac_tilespmem_tileN_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], $0x4, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], $0x4, $0x5
+define void @stream_linear_gather_cb.upd_tac_tilespmem_tileN_to_tilespmem_ii(i32 addrspace(201)* %src, i32 addrspace(209)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tac_tilespmem_tileN_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], s1, $0x1
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], s1, $0x5
+define void @stream_linear_gather_cb.upd_tac_tilespmem_tileN_to_tilespmem_ir(i32 addrspace(201)* %src, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.cb.upd.tilespmem.tileN.to.tilespmem
+
+; --------------------------
+; | Test sparsecore-tec-vf |
+; --------------------------
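+; Note: the functions below cover the same cb.upd gather streams for the
+; sparsecore-tec-vf configuration (attribute group #2, presumably selecting
+; that subtarget). Here the circular-buffer flag offsets return to 0 and 4,
+; matching the trailing $0/$0x4 immediates in the CHECK lines, and an
+; indirect_vreg variant that passes a vector of offsets plus a mask (v0, vm0)
+; is exercised as well.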
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tec_spmem_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [spmem:s0], $0x4, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_cb.upd_tec_spmem_to_smem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tec_spmem_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [spmem:s0], s2, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [spmem:s0], s2, $0x4
+define void @stream_linear_gather_cb.upd_tec_spmem_to_smem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tec_spmem_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [spmem:s0], $0x4, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_cb.upd_tec_spmem_to_smem_ii(i32 addrspace(202)* %src, i32 addrspace(209)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tec_spmem_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [spmem:s0], s1, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [spmem:s0], s1, $0x4
+define void @stream_linear_gather_cb.upd_tec_spmem_to_smem_ir(i32 addrspace(202)* %src, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tec_spmem_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [spmem:s0], $0x4, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [spmem:s0], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_cb.upd_tec_spmem_to_smem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tec_spmem_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [spmem:s0], s2, s3, s4, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [spmem:s0], s2, s3, s4, $0x4
+define void @stream_indirect_gather_cb.upd_tec_spmem_to_smem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tec_spmem_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [spmem:s0], $0x4, s1, s2, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [spmem:s0], $0x4, s1, s2, $0x4
+define void @stream_indirect_gather_cb.upd_tec_spmem_to_smem_ii(i32 addrspace(202)* %src, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tec_spmem_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [spmem:s0], s1, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [spmem:s0], s1, s2, s3, $0x4
+define void @stream_indirect_gather_cb.upd_tec_spmem_to_smem_ir(i32 addrspace(202)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tec_spmem_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [spmem:s0], $0x4, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [spmem:s0], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_cb.upd_tec_spmem_to_smem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tec_spmem_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [spmem:s0], s2, s3, s4, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [spmem:s0], s2, s3, s4, $0x4
+define void @stream_strided_gather_cb.upd_tec_spmem_to_smem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tec_spmem_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [spmem:s0], $0x4, s1, s2, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [spmem:s0], $0x4, s1, s2, $0x4
+define void @stream_strided_gather_cb.upd_tec_spmem_to_smem_ii(i32 addrspace(202)* %src, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tec_spmem_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [spmem:s0], s1, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [spmem:s0], s1, s2, s3, $0x4
+define void @stream_strided_gather_cb.upd_tec_spmem_to_smem_ir(i32 addrspace(202)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.gather.cb.upd.spmem.to.smem
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb.upd_tec_spmem_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.upd [spmem:s0], $0x4, v0, vm0, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.upd [spmem:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb.upd_tec_spmem_to_smem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.spmem.to.smem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.spmem.to.smem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb.upd_tec_spmem_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.upd [spmem:s0], s2, v0, vm0, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.upd [spmem:s0], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb.upd_tec_spmem_to_smem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.spmem.to.smem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.spmem.to.smem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb.upd_tec_spmem_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.upd [spmem:s0], $0x4, v0, vm0, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.upd [spmem:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb.upd_tec_spmem_to_smem_ii(i32 addrspace(202)* %src, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.spmem.to.smem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.spmem.to.smem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb.upd_tec_spmem_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.upd [spmem:s0], s1, v0, vm0, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.upd [spmem:s0], s1, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb.upd_tec_spmem_to_smem_ir(i32 addrspace(202)* %src, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.spmem.to.smem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.spmem.to.smem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tec_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_cb.upd_tec_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tec_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], s2, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], s2, $0x4
+define void @stream_linear_gather_cb.upd_tec_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tec_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_cb.upd_tec_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32 addrspace(209)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tec_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], s1, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], s1, $0x4
+define void @stream_linear_gather_cb.upd_tec_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tec_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [tilespmem.tileN:s0], $0x4, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [tilespmem.tileN:s0], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_cb.upd_tec_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tec_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [tilespmem.tileN:s0], s2, s3, s4, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [tilespmem.tileN:s0], s2, s3, s4, $0x4
+define void @stream_indirect_gather_cb.upd_tec_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tec_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [tilespmem.tileN:s0], $0x4, s1, s2, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [tilespmem.tileN:s0], $0x4, s1, s2, $0x4
+define void @stream_indirect_gather_cb.upd_tec_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tec_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [tilespmem.tileN:s0], s1, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [tilespmem.tileN:s0], s1, s2, s3, $0x4
+define void @stream_indirect_gather_cb.upd_tec_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.upd.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tec_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [tilespmem.tileN:s0], $0x4, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [tilespmem.tileN:s0], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_cb.upd_tec_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tec_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [tilespmem.tileN:s0], s2, s3, s4, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [tilespmem.tileN:s0], s2, s3, s4, $0x4
+define void @stream_strided_gather_cb.upd_tec_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tec_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [tilespmem.tileN:s0], $0x4, s1, s2, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [tilespmem.tileN:s0], $0x4, s1, s2, $0x4
+define void @stream_strided_gather_cb.upd_tec_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tec_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [tilespmem.tileN:s0], s1, s2, s3, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [tilespmem.tileN:s0], s1, s2, s3, $0x4
+define void @stream_strided_gather_cb.upd_tec_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.gather.cb.upd.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb.upd_tec_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.upd [tilespmem.tileN:s0], $0x4, v0, vm0, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.upd [tilespmem.tileN:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb.upd_tec_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb.upd_tec_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.upd [tilespmem.tileN:s0], s2, v0, vm0, $0
+; CHECK: [smem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.upd [tilespmem.tileN:s0], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb.upd_tec_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb.upd_tec_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.upd [tilespmem.tileN:s0], $0x4, v0, vm0, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.upd [tilespmem.tileN:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb.upd_tec_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb.upd_tec_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.upd [tilespmem.tileN:s0], s1, v0, vm0, $0
+; CHECK: [smem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.upd [tilespmem.tileN:s0], s1, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb.upd_tec_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.upd.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [hbm:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [hbm:s0], $0x4, $0x4
+define void @stream_linear_gather_cb.upd_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [hbm:s0], s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [hbm:s0], s2, $0x4
+define void @stream_linear_gather_cb.upd_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [hbm:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [hbm:s0], $0x4, $0x4
+define void @stream_linear_gather_cb.upd_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(209)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [hbm:s0], s1, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [hbm:s0], s1, $0x4
+define void @stream_linear_gather_cb.upd_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.upd.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [hbm:s0], $0x4, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [hbm:s0], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_cb.upd_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [hbm:s0], s2, s3, s4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [hbm:s0], s2, s3, s4, $0x4
+define void @stream_indirect_gather_cb.upd_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [hbm:s0], $0x4, s1, s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [hbm:s0], $0x4, s1, s2, $0x4
+define void @stream_indirect_gather_cb.upd_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [hbm:s0], s1, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [hbm:s0], s1, s2, s3, $0x4
+define void @stream_indirect_gather_cb.upd_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.upd.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [hbm:s0], $0x4, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [hbm:s0], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_cb.upd_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [hbm:s0], s2, s3, s4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [hbm:s0], s2, s3, s4, $0x4
+define void @stream_strided_gather_cb.upd_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [hbm:s0], $0x4, s1, s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [hbm:s0], $0x4, s1, s2, $0x4
+define void @stream_strided_gather_cb.upd_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [hbm:s0], s1, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [hbm:s0], s1, s2, s3, $0x4
+define void @stream_strided_gather_cb.upd_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb.upd_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.upd [hbm:s0], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.upd [hbm:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb.upd_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb.upd_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.upd [hbm:s0], s2, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.upd [hbm:s0], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb.upd_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb.upd_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.upd [hbm:s0], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.upd [hbm:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb.upd_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb.upd_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.upd [hbm:s0], s1, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.upd [hbm:s0], s1, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb.upd_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.upd.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [hbm4b:s0+s2], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [hbm4b:s0+s2], $0x4, $0x4
+define void @stream_linear_gather_cb.upd_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [hbm4b:s0+s3], s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [hbm4b:s0+s3], s2, $0x4
+define void @stream_linear_gather_cb.upd_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [hbm4b:s0+s1], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [hbm4b:s0+s1], $0x4, $0x4
+define void @stream_linear_gather_cb.upd_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(209)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [hbm4b:s0+s2], s1, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [hbm4b:s0+s2], s1, $0x4
+define void @stream_linear_gather_cb.upd_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.cb.upd.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [hbm4b:s0+s3], $0x4, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [hbm4b:s0+s3], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_cb.upd_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [hbm4b:s0+s4], s2, s3, s4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect.gather.cb.upd [hbm4b:s0+s4], s2, s3, s4, $0x4
+define void @stream_indirect_gather_cb.upd_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [hbm4b:s0+s2], $0x4, s1, s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [hbm4b:s0+s2], $0x4, s1, s2, $0x4
+define void @stream_indirect_gather_cb.upd_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_cb.upd_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [hbm4b:s0+s3], s1, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect.gather.cb.upd [hbm4b:s0+s3], s1, s2, s3, $0x4
+define void @stream_indirect_gather_cb.upd_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.cb.upd.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [hbm4b:s0+s3], $0x4, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [hbm4b:s0+s3], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_cb.upd_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [hbm4b:s0+s4], s2, s3, s4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.strided.gather.cb.upd [hbm4b:s0+s4], s2, s3, s4, $0x4
+define void @stream_strided_gather_cb.upd_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [hbm4b:s0+s2], $0x4, s1, s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [hbm4b:s0+s2], $0x4, s1, s2, $0x4
+define void @stream_strided_gather_cb.upd_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_cb.upd_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [hbm4b:s0+s3], s1, s2, s3, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.strided.gather.cb.upd [hbm4b:s0+s3], s1, s2, s3, $0x4
+define void @stream_strided_gather_cb.upd_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.cb.upd.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb.upd_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.upd [hbm4b:s0+s2], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.upd [hbm4b:s0+s2], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb.upd_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb.upd_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.upd [hbm4b:s0+s3], s2, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.indirect_vreg.gather.cb.upd [hbm4b:s0+s3], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb.upd_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb.upd_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.upd [hbm4b:s0+s1], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.upd [hbm4b:s0+s1], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb.upd_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb.upd_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.upd [hbm4b:s0+s2], s1, v0, vm0, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.indirect_vreg.gather.cb.upd [hbm4b:s0+s2], s1, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_cb.upd_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tec_spmem_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [spmem:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_cb.upd_tec_spmem_to_tilespmem_ri(i32 addrspace(202)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tec_spmem_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [spmem:s0], s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [spmem:s0], s2, $0x4
+define void @stream_linear_gather_cb.upd_tec_spmem_to_tilespmem_rr(i32 addrspace(202)* %src, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tec_spmem_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [spmem:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_cb.upd_tec_spmem_to_tilespmem_ii(i32 addrspace(202)* %src, i32 addrspace(209)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tec_spmem_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [spmem:s0], s1, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [spmem:s0], s1, $0x4
+define void @stream_linear_gather_cb.upd_tec_spmem_to_tilespmem_ir(i32 addrspace(202)* %src, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, x86_mmx undef, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.cb.upd.spmem.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.cb.upd.spmem.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.gather.cb.upd.spmem.to.tilespmem
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tec_tilespmem_tileN_to_tilespmem_ri:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_cb.upd_tec_tilespmem_tileN_to_tilespmem_ri(i32 addrspace(201)* %src, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tec_tilespmem_tileN_to_tilespmem_rr:
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], s2, $0
+; CHECK: [tilespmem:$0x0], [sflag:s1] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], s2, $0x4
+define void @stream_linear_gather_cb.upd_tec_tilespmem_tileN_to_tilespmem_rr(i32 addrspace(201)* %src, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tec_tilespmem_tileN_to_tilespmem_ii:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_cb.upd_tec_tilespmem_tileN_to_tilespmem_ii(i32 addrspace(201)* %src, i32 addrspace(209)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_cb.upd_tec_tilespmem_tileN_to_tilespmem_ir:
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], s1, $0
+; CHECK: [tilespmem:$0x0], [sflag:$0x1f] = stream.linear.gather.cb.upd [tilespmem.tileN:s0], s1, $0x4
+define void @stream_linear_gather_cb.upd_tec_tilespmem_tileN_to_tilespmem_ir(i32 addrspace(201)* %src, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.cb.upd.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, x86_mmx undef, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.cb.upd.tilespmem.tileN.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.cb.upd.tilespmem.tileN.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.gather.cb.upd.tilespmem.tileN.to.tilespmem
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_dynovrd_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_dynovrd_sc.ll
new file mode 100644
index 0000000..1b50e74
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_dynovrd_sc.ll
@@ -0,0 +1,99 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32) nounwind
+
+; -------------------------
+; | stream.linear intrinsic |
+; -------------------------
+
+declare void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; ---------------------------
+; | stream.indirect intrinsic |
+; ---------------------------
+
+declare void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; --------------------------------
+; | stream.indirect.vreg intrinsic |
+; --------------------------------
+
+declare void @llvm.tpu.stream.indirect.vreg.gather.spmem.to.smem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.hbm.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.spmem.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.tilespmem.tileN.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+
+; --------------------------
+; | stream.strided intrinsic |
+; --------------------------
+
+declare void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-vf" }
+
+; ------------------------
+; | Test sparsecore-tec-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.spmem.to.smem
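+; The checks below suggest how a run-time control value is encoded: it is shifted
+; left by five bits (sshll.u32 ..., $0x5) and OR'd with the sync-flag index
+; (sor.u32), and the result becomes the sflag operand. In the immediate-flag
+; variants the OR uses $0x1f, i.e. the constant 31 from @llvm.tpu.inttoptr.p204i32.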
+
+; CHECK-LABEL: stream_linear_gather_tec_spmem_to_smem_ri:
+; CHECK: { s[[s0:[0-9]+]] = sshll.u32 s{{[0-9]+}}, $0x5 }
+; CHECK: { s[[s1:[0-9]+]] = sor.u32 s[[s0]], s{{[0-9]+}} }
+; CHECK: [smem:s{{[0-9]+}}], [sflag:s[[s1]]] = stream.linear.gather [spmem:s{{[0-9]+}}], $0x4, $0x0
+define void @stream_linear_gather_tec_spmem_to_smem_ri(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %ctrl, i32 %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 %ctrl, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_spmem_to_smem_rr:
+; CHECK: { s[[s0:[0-9]+]] = sshll.u32 s{{[0-9]+}}, $0x5 }
+; CHECK: { s[[s1:[0-9]+]] = sor.u32 s[[s0]], s{{[0-9]+}} }
+; CHECK: [smem:s{{[0-9]+}}], [sflag:s[[s1]]] = stream.linear.gather [spmem:s{{[0-9]+}}], s{{[0-9]+}}, $0x0
+define void @stream_linear_gather_tec_spmem_to_smem_rr(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %ctrl, i32 %len) #0 {
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 %ctrl, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_spmem_to_smem_ii:
+; CHECK: { s[[s0:[0-9]+]] = sshll.u32 s{{[0-9]+}}, $0x5 }
+; CHECK: { s[[s1:[0-9]+]] = sor.u32 $0x1f, s{{[0-9]+}} }
+; CHECK: [smem:s{{[0-9]+}}], [sflag:s[[s1]]] = stream.linear.gather [spmem:s{{[0-9]+}}], $0x4, $0x0
+define void @stream_linear_gather_tec_spmem_to_smem_ii(i32 addrspace(202)* %src, i32* %dst, i32 %ctrl) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 %ctrl, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_spmem_to_smem_ir:
+; CHECK: { s[[s0:[0-9]+]] = sshll.u32 s{{[0-9]+}}, $0x5 }
+; CHECK: { s[[s1:[0-9]+]] = sor.u32 $0x1f, s{{[0-9]+}} }
+; CHECK: [smem:s{{[0-9]+}}], [sflag:s[[s1]]] = stream.linear.gather [spmem:s{{[0-9]+}}], s{{[0-9]+}}, $0x0
+define void @stream_linear_gather_tec_spmem_to_smem_ir(i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %ctrl) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 %ctrl, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 0)
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_gf_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_gf_sc.ll
new file mode 100644
index 0000000..c16ad6a
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_gf_sc.ll
@@ -0,0 +1,1569 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-gf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
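+; Address-space convention used by these declarations, as implied by the operand
+; names in the checks: 201 = tilespmem, 202 = spmem, 203 = hbm/hbm4b, 204 = sflag;
+; plain i32* operands are smem.
+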
+declare i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32) nounwind
+
+; -------------------------
+; | stream.linear intrinsic |
+; -------------------------
+
+declare void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; ---------------------------
+; | stream.indirect intrinsic |
+; ---------------------------
+
+declare void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; --------------------------------
+; | stream.indirect.vreg intrinsic |
+; --------------------------------
+
+declare void @llvm.tpu.stream.indirect.vreg.gather.spmem.to.smem.v16i32(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32*, i32, <16 x i32>, <16 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.tilespmem.tileN.to.smem.v16i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32*, i32, <16 x i32>, <16 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.hbm.to.tilespmem.v16i32(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, <16 x i32>, <16 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.hbm4b.to.tilespmem.v16i32(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, <16 x i32>, <16 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.spmem.to.tilespmem.v16i32(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32 addrspace(201)*, i32, <16 x i32>, <16 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.tilespmem.tileN.to.tilespmem.v16i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, <16 x i32>, <16 x i1>, i32) argmemonly nounwind
+
+; --------------------------
+; | stream.strided intrinsic |
+; --------------------------
+
+declare void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+
+attributes #0 = { "target-cpu"="sparsecore-scs-gf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-gf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-gf" }
+
+; ------------------------
+; | Test sparsecore-scs-gf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.spmem.to.smem
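+; Naming convention in the tests below, inferred from the operand patterns: the
+; first suffix letter records whether the sync flag is a register argument (r) or
+; the immediate flag 31 from @llvm.tpu.inttoptr.p204i32 (i, printed as
+; [sflag:$0x1f]); the second records whether the length operand is a register (r)
+; or the immediate 4 (i).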
+
+; CHECK-LABEL: stream_linear_gather_scs_spmem_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], $0x4, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_scs_spmem_to_smem_ri(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_scs_spmem_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], s3, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], s3, $0x4
+define void @stream_linear_gather_scs_spmem_to_smem_rr(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %len) #0 {
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_scs_spmem_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], $0x4, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_scs_spmem_to_smem_ii(i32 addrspace(202)* %src, i32* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_scs_spmem_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], s2, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], s2, $0x4
+define void @stream_linear_gather_scs_spmem_to_smem_ir(i32 addrspace(202)* %src, i32* %dst, i32 %len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.spmem.to.smem
+
+; CHECK-LABEL: stream_indirect_gather_scs_spmem_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [spmem:s0], $0x4, s3, s4, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [spmem:s0], $0x4, s3, s4, $0x4
+define void @stream_indirect_gather_scs_spmem_to_smem_ri(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_scs_spmem_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [spmem:s0], s3, s4, s5, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [spmem:s0], s3, s4, s5, $0x4
+define void @stream_indirect_gather_scs_spmem_to_smem_rr(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_scs_spmem_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [spmem:s0], $0x4, s2, s3, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [spmem:s0], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_scs_spmem_to_smem_ii(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_scs_spmem_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [spmem:s0], s2, s3, s4, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [spmem:s0], s2, s3, s4, $0x4
+define void @stream_indirect_gather_scs_spmem_to_smem_ir(i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.spmem.to.smem
+
+; CHECK-LABEL: stream_strided_gather_scs_spmem_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [spmem:s0], $0x4, s3, s4, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [spmem:s0], $0x4, s3, s4, $0x4
+define void @stream_strided_gather_scs_spmem_to_smem_ri(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_scs_spmem_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [spmem:s0], s3, s4, s5, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [spmem:s0], s3, s4, s5, $0x4
+define void @stream_strided_gather_scs_spmem_to_smem_rr(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_scs_spmem_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [spmem:s0], $0x4, s2, s3, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [spmem:s0], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_scs_spmem_to_smem_ii(i32 addrspace(202)* %src, i32* %dst, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_scs_spmem_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [spmem:s0], s2, s3, s4, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [spmem:s0], s2, s3, s4, $0x4
+define void @stream_strided_gather_scs_spmem_to_smem_ir(i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_linear_gather_scs_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_scs_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag) #0 {
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_scs_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], s3, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], s3, $0x4
+define void @stream_linear_gather_scs_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #0 {
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_scs_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_scs_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_scs_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], s2, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], s2, $0x4
+define void @stream_linear_gather_scs_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_indirect_gather_scs_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [tilespmem.tileN:s0], $0x4, s3, s4, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [tilespmem.tileN:s0], $0x4, s3, s4, $0x4
+define void @stream_indirect_gather_scs_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_scs_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [tilespmem.tileN:s0], s3, s4, s5, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [tilespmem.tileN:s0], s3, s4, s5, $0x4
+define void @stream_indirect_gather_scs_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_scs_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [tilespmem.tileN:s0], $0x4, s2, s3, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [tilespmem.tileN:s0], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_scs_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_scs_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [tilespmem.tileN:s0], s2, s3, s4, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [tilespmem.tileN:s0], s2, s3, s4, $0x4
+define void @stream_indirect_gather_scs_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_strided_gather_scs_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [tilespmem.tileN:s0], $0x4, s3, s4, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [tilespmem.tileN:s0], $0x4, s3, s4, $0x4
+define void @stream_strided_gather_scs_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_scs_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [tilespmem.tileN:s0], s3, s4, s5, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [tilespmem.tileN:s0], s3, s4, s5, $0x4
+define void @stream_strided_gather_scs_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_scs_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [tilespmem.tileN:s0], $0x4, s2, s3, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [tilespmem.tileN:s0], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_scs_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32* %dst, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_scs_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [tilespmem.tileN:s0], s2, s3, s4, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [tilespmem.tileN:s0], s2, s3, s4, $0x4
+define void @stream_strided_gather_scs_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; ------------------------
+; | Test sparsecore-tac-gf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.spmem.to.smem
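+; The TAC variants mirror the SCS tests above but pass 1 and 5 as the control
+; operand instead of 0 and 4, which shows up as the trailing $0x1 / $0x5
+; immediates in the checks.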
+
+; CHECK-LABEL: stream_linear_gather_tac_spmem_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], $0x4, $0x1
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], $0x4, $0x5
+define void @stream_linear_gather_tac_spmem_to_smem_ri(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_spmem_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], s3, $0x1
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], s3, $0x5
+define void @stream_linear_gather_tac_spmem_to_smem_rr(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_spmem_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], $0x4, $0x1
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], $0x4, $0x5
+define void @stream_linear_gather_tac_spmem_to_smem_ii(i32 addrspace(202)* %src, i32* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_spmem_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], s2, $0x1
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], s2, $0x5
+define void @stream_linear_gather_tac_spmem_to_smem_ir(i32 addrspace(202)* %src, i32* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.spmem.to.smem
+
+; CHECK-LABEL: stream_indirect_gather_tac_spmem_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [spmem:s0], $0x4, s3, s4, $0x1
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [spmem:s0], $0x4, s3, s4, $0x5
+define void @stream_indirect_gather_tac_spmem_to_smem_ri(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_spmem_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [spmem:s0], s3, s4, s5, $0x1
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [spmem:s0], s3, s4, s5, $0x5
+define void @stream_indirect_gather_tac_spmem_to_smem_rr(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_spmem_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [spmem:s0], $0x4, s2, s3, $0x1
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [spmem:s0], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_tac_spmem_to_smem_ii(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_spmem_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [spmem:s0], s2, s3, s4, $0x1
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [spmem:s0], s2, s3, s4, $0x5
+define void @stream_indirect_gather_tac_spmem_to_smem_ir(i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.spmem.to.smem
+
+; CHECK-LABEL: stream_strided_gather_tac_spmem_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [spmem:s0], $0x4, s3, s4, $0x1
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [spmem:s0], $0x4, s3, s4, $0x5
+define void @stream_strided_gather_tac_spmem_to_smem_ri(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_spmem_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [spmem:s0], s3, s4, s5, $0x1
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [spmem:s0], s3, s4, s5, $0x5
+define void @stream_strided_gather_tac_spmem_to_smem_rr(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_spmem_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [spmem:s0], $0x4, s2, s3, $0x1
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [spmem:s0], $0x4, s2, s3, $0x5
+define void @stream_strided_gather_tac_spmem_to_smem_ii(i32 addrspace(202)* %src, i32* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_spmem_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [spmem:s0], s2, s3, s4, $0x1
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [spmem:s0], s2, s3, s4, $0x5
+define void @stream_strided_gather_tac_spmem_to_smem_ir(i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_linear_gather_tac_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x1
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x5
+define void @stream_linear_gather_tac_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], s3, $0x1
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], s3, $0x5
+define void @stream_linear_gather_tac_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x1
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x5
+define void @stream_linear_gather_tac_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], s2, $0x1
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], s2, $0x5
+define void @stream_linear_gather_tac_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_indirect_gather_tac_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [tilespmem.tileN:s0], $0x4, s3, s4, $0x1
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [tilespmem.tileN:s0], $0x4, s3, s4, $0x5
+define void @stream_indirect_gather_tac_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [tilespmem.tileN:s0], s3, s4, s5, $0x1
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [tilespmem.tileN:s0], s3, s4, s5, $0x5
+define void @stream_indirect_gather_tac_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [tilespmem.tileN:s0], $0x4, s2, s3, $0x1
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [tilespmem.tileN:s0], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_tac_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [tilespmem.tileN:s0], s2, s3, s4, $0x1
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [tilespmem.tileN:s0], s2, s3, s4, $0x5
+define void @stream_indirect_gather_tac_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.hbm.to.tilespmem
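+; From here the destination moves from smem (plain i32*) to tilespmem
+; (addrspace(201)), so the destination operand in the checks is printed as
+; [tilespmem:...] rather than [smem:...].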
+
+; CHECK-LABEL: stream_linear_gather_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm:s0], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm:s0], $0x4, $0x5
+define void @stream_linear_gather_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm:s0], s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm:s0], s3, $0x5
+define void @stream_linear_gather_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm:s0], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm:s0], $0x4, $0x5
+define void @stream_linear_gather_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm:s0], s2, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm:s0], s2, $0x5
+define void @stream_linear_gather_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm:s0], $0x4, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm:s0], $0x4, s3, s4, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm:s0], s3, s4, s5, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm:s0], s3, s4, s5, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm:s0], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm:s0], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm:s0], s2, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm:s0], s2, s3, s4, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm:s0], $0x4, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm:s0], $0x4, s3, s4, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm:s0], s3, s4, s5, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm:s0], s3, s4, s5, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm:s0], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm:s0], $0x4, s2, s3, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm:s0], s2, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm:s0], s2, s3, s4, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm4b:s0+s3], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm4b:s0+s3], $0x4, $0x5
+define void @stream_linear_gather_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm4b:s0+s4], s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm4b:s0+s4], s3, $0x5
+define void @stream_linear_gather_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm4b:s0+s2], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm4b:s0+s2], $0x4, $0x5
+define void @stream_linear_gather_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm4b:s0+s3], s2, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm4b:s0+s3], s2, $0x5
+define void @stream_linear_gather_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm4b:s0+s4], $0x4, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm4b:s0+s4], $0x4, s3, s4, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm4b:s0+s5], s3, s4, s5, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm4b:s0+s5], s3, s4, s5, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm4b:s0+s3], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm4b:s0+s3], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm4b:s0+s4], s2, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm4b:s0+s4], s2, s3, s4, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm4b:s0+s4], $0x4, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm4b:s0+s4], $0x4, s3, s4, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm4b:s0+s5], s3, s4, s5, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm4b:s0+s5], s3, s4, s5, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm4b:s0+s3], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm4b:s0+s3], $0x4, s2, s3, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm4b:s0+s4], s2, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm4b:s0+s4], s2, s3, s4, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.spmem.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_tac_spmem_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], $0x4, $0x5
+define void @stream_linear_gather_tac_spmem_to_tilespmem_ri(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_spmem_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], s3, $0x5
+define void @stream_linear_gather_tac_spmem_to_tilespmem_rr(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_spmem_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], $0x4, $0x5
+define void @stream_linear_gather_tac_spmem_to_tilespmem_ii(i32 addrspace(202)* %src, i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_spmem_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], s2, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], s2, $0x5
+define void @stream_linear_gather_tac_spmem_to_tilespmem_ir(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.spmem.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.spmem.to.tilespmem
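+
+; Hedged sketch only (kept commented out, so it is not part of the test run): if the
+; two untested spmem-to-tilespmem variants above take the same operand layout as the
+; tested hbm-to-tilespmem and linear spmem-to-tilespmem calls in this file, the calls
+; would presumably look like the following. The exact signatures are an assumption,
+; not verified against the intrinsic definitions.
+;
+; define void @stream_indirect_gather_tac_spmem_to_tilespmem_rr_sketch(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+;   call void @llvm.tpu.stream.indirect.gather.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+;   ret void
+; }
+;
+; define void @stream_strided_gather_tac_spmem_to_tilespmem_rr_sketch(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+;   call void @llvm.tpu.stream.strided.gather.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+;   ret void
+; }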
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_tac_tilespmem_tileN_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x5
+define void @stream_linear_gather_tac_tilespmem_tileN_to_tilespmem_ri(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_tilespmem_tileN_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], s3, $0x5
+define void @stream_linear_gather_tac_tilespmem_tileN_to_tilespmem_rr(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_tilespmem_tileN_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x5
+define void @stream_linear_gather_tac_tilespmem_tileN_to_tilespmem_ii(i32 addrspace(201)* %src, i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_tilespmem_tileN_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], s2, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], s2, $0x5
+define void @stream_linear_gather_tac_tilespmem_tileN_to_tilespmem_ir(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.tilespmem
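+
+; Hedged sketch only (commented out, not executed by the test): assuming these two
+; untested tilespmem.tileN-to-tilespmem variants mirror the operand layout of the
+; tested linear tileN gather above and of the hbm-to-tilespmem indirect/strided
+; gathers, calls would presumably look like the following. The signatures are an
+; assumption, not verified output.
+;
+; define void @stream_indirect_gather_tac_tilespmem_tileN_to_tilespmem_rr_sketch(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+;   call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+;   ret void
+; }
+;
+; define void @stream_strided_gather_tac_tilespmem_tileN_to_tilespmem_rr_sketch(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #1 {
+;   call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+;   ret void
+; }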
+
+; --------------------------
+; | Test sparsecore-tec-gf |
+; --------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.spmem.to.smem
+
+; CHECK-LABEL: stream_linear_gather_tec_spmem_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], $0x4, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_tec_spmem_to_smem_ri(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_spmem_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], s3, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], s3, $0x4
+define void @stream_linear_gather_tec_spmem_to_smem_rr(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_spmem_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], $0x4, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_tec_spmem_to_smem_ii(i32 addrspace(202)* %src, i32* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_spmem_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], s2, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], s2, $0x4
+define void @stream_linear_gather_tec_spmem_to_smem_ir(i32 addrspace(202)* %src, i32* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.spmem.to.smem
+
+; CHECK-LABEL: stream_indirect_gather_tec_spmem_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [spmem:s0], $0x4, s3, s4, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [spmem:s0], $0x4, s3, s4, $0x4
+define void @stream_indirect_gather_tec_spmem_to_smem_ri(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_spmem_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [spmem:s0], s3, s4, s5, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [spmem:s0], s3, s4, s5, $0x4
+define void @stream_indirect_gather_tec_spmem_to_smem_rr(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_spmem_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [spmem:s0], $0x4, s2, s3, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [spmem:s0], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_tec_spmem_to_smem_ii(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_spmem_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [spmem:s0], s2, s3, s4, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [spmem:s0], s2, s3, s4, $0x4
+define void @stream_indirect_gather_tec_spmem_to_smem_ir(i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.spmem.to.smem
+
+; CHECK-LABEL: stream_strided_gather_tec_spmem_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [spmem:s0], $0x4, s3, s4, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [spmem:s0], $0x4, s3, s4, $0x4
+define void @stream_strided_gather_tec_spmem_to_smem_ri(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_spmem_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [spmem:s0], s3, s4, s5, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [spmem:s0], s3, s4, s5, $0x4
+define void @stream_strided_gather_tec_spmem_to_smem_rr(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_spmem_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [spmem:s0], $0x4, s2, s3, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [spmem:s0], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_tec_spmem_to_smem_ii(i32 addrspace(202)* %src, i32* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_spmem_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [spmem:s0], s2, s3, s4, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [spmem:s0], s2, s3, s4, $0x4
+define void @stream_strided_gather_tec_spmem_to_smem_ir(i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.gather.spmem.to.smem
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_spmem_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect_vreg.gather [spmem:s0], $0x4, v0, vm0, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect_vreg.gather [spmem:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_spmem_to_smem_ri(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.spmem.to.smem.v16i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.spmem.to.smem.v16i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_spmem_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect_vreg.gather [spmem:s0], s3, v0, vm0, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect_vreg.gather [spmem:s0], s3, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_spmem_to_smem_rr(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %len, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.spmem.to.smem.v16i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.spmem.to.smem.v16i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_spmem_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [spmem:s0], $0x4, v0, vm0, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [spmem:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_spmem_to_smem_ii(i32 addrspace(202)* %src, i32* %dst, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.spmem.to.smem.v16i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.spmem.to.smem.v16i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_spmem_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [spmem:s0], s2, v0, vm0, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [spmem:s0], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_spmem_to_smem_ir(i32 addrspace(202)* %src, i32* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.spmem.to.smem.v16i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.spmem.to.smem.v16i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_linear_gather_tec_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_tec_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], s3, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], s3, $0x4
+define void @stream_linear_gather_tec_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_tec_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], s2, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], s2, $0x4
+define void @stream_linear_gather_tec_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_indirect_gather_tec_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [tilespmem.tileN:s0], $0x4, s3, s4, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [tilespmem.tileN:s0], $0x4, s3, s4, $0x4
+define void @stream_indirect_gather_tec_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [tilespmem.tileN:s0], s3, s4, s5, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [tilespmem.tileN:s0], s3, s4, s5, $0x4
+define void @stream_indirect_gather_tec_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [tilespmem.tileN:s0], $0x4, s2, s3, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [tilespmem.tileN:s0], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_tec_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [tilespmem.tileN:s0], s2, s3, s4, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [tilespmem.tileN:s0], s2, s3, s4, $0x4
+define void @stream_indirect_gather_tec_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_strided_gather_tec_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [tilespmem.tileN:s0], $0x4, s3, s4, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [tilespmem.tileN:s0], $0x4, s3, s4, $0x4
+define void @stream_strided_gather_tec_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [tilespmem.tileN:s0], s3, s4, s5, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [tilespmem.tileN:s0], s3, s4, s5, $0x4
+define void @stream_strided_gather_tec_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [tilespmem.tileN:s0], $0x4, s2, s3, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [tilespmem.tileN:s0], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_tec_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [tilespmem.tileN:s0], s2, s3, s4, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [tilespmem.tileN:s0], s2, s3, s4, $0x4
+define void @stream_strided_gather_tec_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.gather.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect_vreg.gather [tilespmem.tileN:s0], $0x4, v0, vm0, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect_vreg.gather [tilespmem.tileN:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.tilespmem.tileN.to.smem.v16i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.tilespmem.tileN.to.smem.v16i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect_vreg.gather [tilespmem.tileN:s0], s3, v0, vm0, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect_vreg.gather [tilespmem.tileN:s0], s3, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.tilespmem.tileN.to.smem.v16i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.tilespmem.tileN.to.smem.v16i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [tilespmem.tileN:s0], $0x4, v0, vm0, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [tilespmem.tileN:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32* %dst, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.tilespmem.tileN.to.smem.v16i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.tilespmem.tileN.to.smem.v16i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [tilespmem.tileN:s0], s2, v0, vm0, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [tilespmem.tileN:s0], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.tilespmem.tileN.to.smem.v16i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.tilespmem.tileN.to.smem.v16i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm:s0], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm:s0], $0x4, $0x4
+define void @stream_linear_gather_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm:s0], s3, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm:s0], s3, $0x4
+define void @stream_linear_gather_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm:s0], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm:s0], $0x4, $0x4
+define void @stream_linear_gather_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm:s0], s2, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm:s0], s2, $0x4
+define void @stream_linear_gather_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm:s0], $0x4, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm:s0], $0x4, s3, s4, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm:s0], s3, s4, s5, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm:s0], s3, s4, s5, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm:s0], $0x4, s2, s3, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm:s0], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm:s0], s2, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm:s0], s2, s3, s4, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm:s0], $0x4, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm:s0], $0x4, s3, s4, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm:s0], s3, s4, s5, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm:s0], s3, s4, s5, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm:s0], $0x4, s2, s3, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm:s0], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm:s0], s2, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm:s0], s2, s3, s4, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.gather.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather [hbm:s0], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather [hbm:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm.to.tilespmem.v16i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm.to.tilespmem.v16i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather [hbm:s0], s3, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather [hbm:s0], s3, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm.to.tilespmem.v16i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm.to.tilespmem.v16i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [hbm:s0], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [hbm:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm.to.tilespmem.v16i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm.to.tilespmem.v16i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [hbm:s0], s2, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [hbm:s0], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm.to.tilespmem.v16i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm.to.tilespmem.v16i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm4b:s0+s3], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm4b:s0+s3], $0x4, $0x4
+define void @stream_linear_gather_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm4b:s0+s4], s3, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm4b:s0+s4], s3, $0x4
+define void @stream_linear_gather_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm4b:s0+s2], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm4b:s0+s2], $0x4, $0x4
+define void @stream_linear_gather_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm4b:s0+s3], s2, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm4b:s0+s3], s2, $0x4
+define void @stream_linear_gather_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm4b:s0+s4], $0x4, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm4b:s0+s4], $0x4, s3, s4, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm4b:s0+s5], s3, s4, s5, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm4b:s0+s5], s3, s4, s5, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm4b:s0+s3], $0x4, s2, s3, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm4b:s0+s3], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm4b:s0+s4], s2, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm4b:s0+s4], s2, s3, s4, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm4b:s0+s4], $0x4, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm4b:s0+s4], $0x4, s3, s4, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm4b:s0+s5], s3, s4, s5, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm4b:s0+s5], s3, s4, s5, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm4b:s0+s3], $0x4, s2, s3, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm4b:s0+s3], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm4b:s0+s4], s2, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm4b:s0+s4], s2, s3, s4, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.gather.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather [hbm4b:s0+s3], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather [hbm4b:s0+s3], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm4b.to.tilespmem.v16i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm4b.to.tilespmem.v16i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather [hbm4b:s0+s4], s3, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather [hbm4b:s0+s4], s3, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm4b.to.tilespmem.v16i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm4b.to.tilespmem.v16i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [hbm4b:s0+s2], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [hbm4b:s0+s2], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm4b.to.tilespmem.v16i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm4b.to.tilespmem.v16i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [hbm4b:s0+s3], s2, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [hbm4b:s0+s3], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm4b.to.tilespmem.v16i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm4b.to.tilespmem.v16i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.spmem.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_tec_spmem_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_tec_spmem_to_tilespmem_ri(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_spmem_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], s3, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], s3, $0x4
+define void @stream_linear_gather_tec_spmem_to_tilespmem_rr(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_spmem_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_tec_spmem_to_tilespmem_ii(i32 addrspace(202)* %src, i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_spmem_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], s2, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], s2, $0x4
+define void @stream_linear_gather_tec_spmem_to_tilespmem_ir(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.spmem.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.spmem.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.gather.spmem.to.tilespmem
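+
+; Illustrative sketch only (not part of this CL): a TEC test for the untested
+; spmem.to.tilespmem indirect variant could mirror the hbm/hbm4b tests above,
+; assuming the intrinsic is declared with the same operand order as in
+; stream_gather_sc.ll below. CHECK lines are omitted because the exact
+; register assignment has not been verified here.
+;
+;   declare void @llvm.tpu.stream.indirect.gather.spmem.to.tilespmem(
+;       i32 addrspace(204)*, i32, i32 addrspace(202)*, i32 addrspace(201)*,
+;       i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+;
+;   define void @stream_indirect_gather_tec_spmem_to_tilespmem_ri(
+;       i32 addrspace(202)* %src, i32 addrspace(201)* %dst,
+;       i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+;     call void @llvm.tpu.stream.indirect.gather.spmem.to.tilespmem(
+;         i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src,
+;         i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off,
+;         i32 %size, i32 0)
+;     ret void
+;   }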
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_tec_tilespmem_tileN_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_tec_tilespmem_tileN_to_tilespmem_ri(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_tilespmem_tileN_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], s3, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], s3, $0x4
+define void @stream_linear_gather_tec_tilespmem_tileN_to_tilespmem_rr(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_tilespmem_tileN_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_tec_tilespmem_tileN_to_tilespmem_ii(i32 addrspace(201)* %src, i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_tilespmem_tileN_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], s2, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], s2, $0x4
+define void @stream_linear_gather_tec_tilespmem_tileN_to_tilespmem_ir(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.gather.tilespmem.tileN.to.tilespmem
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_sc.ll
new file mode 100644
index 0000000..90b95b8
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_gather_sc.ll
@@ -0,0 +1,1569 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32) nounwind
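+
+; Naming convention used throughout this file (inferred from the bodies below,
+; not stated in the original CL): the _ri/_rr/_ii/_ir suffixes give the form of
+; the sync-flag operand (first letter) and the length operand (second letter),
+; register (r) or immediate (i). The immediate-flag variants materialize a
+; constant sflag address via @llvm.tpu.inttoptr.p204i32(i32 31), which is why
+; their CHECK lines expect [sflag:$0x1f].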
+
+; -------------------------
+; | stream.linear intrinsic |
+; -------------------------
+
+declare void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
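+
+; Operand layout of the linear gathers as read off the tests below (a reading
+; aid inferred from the emitted mnemonics, not a normative description): sync
+; flag pointer, an i32 that reappears as the trailing immediate of the stream
+; instruction (0/4 here, 1/5 in the TAC tests), source pointer, destination
+; pointer, length (constant 4 or %len), and a final i32 that is always 0 in
+; this file.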
+
+; ---------------------------
+; | stream.indirect intrinsic |
+; ---------------------------
+
+declare void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
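+
+; The indirect gathers add two operands after the length: an offset pointer in
+; tilespmem (addrspace 201) and a size scalar (%off and %size in the tests),
+; which show up as the two scalar registers before the trailing immediate in
+; the CHECK lines (e.g. "s3, s4"). Inferred from the tests below.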
+
+; --------------------------------
+; | stream.indirect.vreg intrinsic |
+; --------------------------------
+
+declare void @llvm.tpu.stream.indirect.vreg.gather.spmem.to.smem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.hbm.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.spmem.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.tilespmem.tileN.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
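+
+; The .vreg variants replace the offset pointer/size pair with a vector of
+; per-element offsets plus a mask (v0 and vm0 in the CHECK lines). This file
+; declares the v8i32 overloads, while the preceding test file in this CL uses
+; v16i32, presumably matching the vector length of the respective targets.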
+
+; --------------------------
+; | stream.strided intrinsic |
+; --------------------------
+
+declare void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
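+
+; The strided gathers likewise take two extra scalars after the length, a
+; stride size and a stride length (%stride_size and %stride_length below),
+; emitted in the same operand positions as the indirect offset/size pair.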
+
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-vf" }
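+
+; Attributes #0/#1/#2 select the three SparseCore target CPUs exercised below;
+; scs/tac/tec presumably stand for the sequencer, tile access core and tile
+; execute core. Note that the TAC tests use trailing immediates 1/5 where the
+; SCS/TEC tests use 0/4.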
+
+; ------------------------
+; | Test sparsecore-scs-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.spmem.to.smem
+
+; CHECK-LABEL: stream_linear_gather_scs_spmem_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], $0x4, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_scs_spmem_to_smem_ri(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag) #0 {
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_scs_spmem_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], s3, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], s3, $0x4
+define void @stream_linear_gather_scs_spmem_to_smem_rr(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %len) #0 {
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_scs_spmem_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], $0x4, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_scs_spmem_to_smem_ii(i32 addrspace(202)* %src, i32* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_scs_spmem_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], s2, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], s2, $0x4
+define void @stream_linear_gather_scs_spmem_to_smem_ir(i32 addrspace(202)* %src, i32* %dst, i32 %len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.spmem.to.smem
+
+; CHECK-LABEL: stream_indirect_gather_scs_spmem_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [spmem:s0], $0x4, s3, s4, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [spmem:s0], $0x4, s3, s4, $0x4
+define void @stream_indirect_gather_scs_spmem_to_smem_ri(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_scs_spmem_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [spmem:s0], s3, s4, s5, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [spmem:s0], s3, s4, s5, $0x4
+define void @stream_indirect_gather_scs_spmem_to_smem_rr(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_scs_spmem_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [spmem:s0], $0x4, s2, s3, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [spmem:s0], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_scs_spmem_to_smem_ii(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_scs_spmem_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [spmem:s0], s2, s3, s4, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [spmem:s0], s2, s3, s4, $0x4
+define void @stream_indirect_gather_scs_spmem_to_smem_ir(i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.spmem.to.smem
+
+; CHECK-LABEL: stream_strided_gather_scs_spmem_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [spmem:s0], $0x4, s3, s4, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [spmem:s0], $0x4, s3, s4, $0x4
+define void @stream_strided_gather_scs_spmem_to_smem_ri(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_scs_spmem_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [spmem:s0], s3, s4, s5, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [spmem:s0], s3, s4, s5, $0x4
+define void @stream_strided_gather_scs_spmem_to_smem_rr(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_scs_spmem_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [spmem:s0], $0x4, s2, s3, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [spmem:s0], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_scs_spmem_to_smem_ii(i32 addrspace(202)* %src, i32* %dst, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_scs_spmem_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [spmem:s0], s2, s3, s4, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [spmem:s0], s2, s3, s4, $0x4
+define void @stream_strided_gather_scs_spmem_to_smem_ir(i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_linear_gather_scs_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_scs_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag) #0 {
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_scs_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], s3, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], s3, $0x4
+define void @stream_linear_gather_scs_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #0 {
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_scs_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_scs_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_scs_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], s2, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], s2, $0x4
+define void @stream_linear_gather_scs_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_indirect_gather_scs_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [tilespmem.tileN:s0], $0x4, s3, s4, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [tilespmem.tileN:s0], $0x4, s3, s4, $0x4
+define void @stream_indirect_gather_scs_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_scs_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [tilespmem.tileN:s0], s3, s4, s5, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [tilespmem.tileN:s0], s3, s4, s5, $0x4
+define void @stream_indirect_gather_scs_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_scs_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [tilespmem.tileN:s0], $0x4, s2, s3, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [tilespmem.tileN:s0], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_scs_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_scs_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [tilespmem.tileN:s0], s2, s3, s4, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [tilespmem.tileN:s0], s2, s3, s4, $0x4
+define void @stream_indirect_gather_scs_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_strided_gather_scs_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [tilespmem.tileN:s0], $0x4, s3, s4, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [tilespmem.tileN:s0], $0x4, s3, s4, $0x4
+define void @stream_strided_gather_scs_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_scs_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [tilespmem.tileN:s0], s3, s4, s5, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [tilespmem.tileN:s0], s3, s4, s5, $0x4
+define void @stream_strided_gather_scs_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_scs_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [tilespmem.tileN:s0], $0x4, s2, s3, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [tilespmem.tileN:s0], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_scs_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32* %dst, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_scs_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [tilespmem.tileN:s0], s2, s3, s4, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [tilespmem.tileN:s0], s2, s3, s4, $0x4
+define void @stream_strided_gather_scs_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; ------------------------
+; | Test sparsecore-tac-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.spmem.to.smem
+
+; CHECK-LABEL: stream_linear_gather_tac_spmem_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], $0x4, $0x1
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], $0x4, $0x5
+define void @stream_linear_gather_tac_spmem_to_smem_ri(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_spmem_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], s3, $0x1
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], s3, $0x5
+define void @stream_linear_gather_tac_spmem_to_smem_rr(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_spmem_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], $0x4, $0x1
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], $0x4, $0x5
+define void @stream_linear_gather_tac_spmem_to_smem_ii(i32 addrspace(202)* %src, i32* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_spmem_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], s2, $0x1
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], s2, $0x5
+define void @stream_linear_gather_tac_spmem_to_smem_ir(i32 addrspace(202)* %src, i32* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.spmem.to.smem
+
+; CHECK-LABEL: stream_indirect_gather_tac_spmem_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [spmem:s0], $0x4, s3, s4, $0x1
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [spmem:s0], $0x4, s3, s4, $0x5
+define void @stream_indirect_gather_tac_spmem_to_smem_ri(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_spmem_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [spmem:s0], s3, s4, s5, $0x1
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [spmem:s0], s3, s4, s5, $0x5
+define void @stream_indirect_gather_tac_spmem_to_smem_rr(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_spmem_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [spmem:s0], $0x4, s2, s3, $0x1
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [spmem:s0], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_tac_spmem_to_smem_ii(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_spmem_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [spmem:s0], s2, s3, s4, $0x1
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [spmem:s0], s2, s3, s4, $0x5
+define void @stream_indirect_gather_tac_spmem_to_smem_ir(i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.spmem.to.smem
+
+; CHECK-LABEL: stream_strided_gather_tac_spmem_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [spmem:s0], $0x4, s3, s4, $0x1
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [spmem:s0], $0x4, s3, s4, $0x5
+define void @stream_strided_gather_tac_spmem_to_smem_ri(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_spmem_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [spmem:s0], s3, s4, s5, $0x1
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [spmem:s0], s3, s4, s5, $0x5
+define void @stream_strided_gather_tac_spmem_to_smem_rr(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_spmem_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [spmem:s0], $0x4, s2, s3, $0x1
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [spmem:s0], $0x4, s2, s3, $0x5
+define void @stream_strided_gather_tac_spmem_to_smem_ii(i32 addrspace(202)* %src, i32* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_spmem_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [spmem:s0], s2, s3, s4, $0x1
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [spmem:s0], s2, s3, s4, $0x5
+define void @stream_strided_gather_tac_spmem_to_smem_ir(i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_linear_gather_tac_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x1
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x5
+define void @stream_linear_gather_tac_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], s3, $0x1
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], s3, $0x5
+define void @stream_linear_gather_tac_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x1
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x5
+define void @stream_linear_gather_tac_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], s2, $0x1
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], s2, $0x5
+define void @stream_linear_gather_tac_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_indirect_gather_tac_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [tilespmem.tileN:s0], $0x4, s3, s4, $0x1
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [tilespmem.tileN:s0], $0x4, s3, s4, $0x5
+define void @stream_indirect_gather_tac_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [tilespmem.tileN:s0], s3, s4, s5, $0x1
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [tilespmem.tileN:s0], s3, s4, s5, $0x5
+define void @stream_indirect_gather_tac_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [tilespmem.tileN:s0], $0x4, s2, s3, $0x1
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [tilespmem.tileN:s0], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_tac_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [tilespmem.tileN:s0], s2, s3, s4, $0x1
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [tilespmem.tileN:s0], s2, s3, s4, $0x5
+define void @stream_indirect_gather_tac_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm:s0], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm:s0], $0x4, $0x5
+define void @stream_linear_gather_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm:s0], s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm:s0], s3, $0x5
+define void @stream_linear_gather_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm:s0], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm:s0], $0x4, $0x5
+define void @stream_linear_gather_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm:s0], s2, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm:s0], s2, $0x5
+define void @stream_linear_gather_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm:s0], $0x4, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm:s0], $0x4, s3, s4, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm:s0], s3, s4, s5, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm:s0], s3, s4, s5, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm:s0], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm:s0], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm:s0], s2, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm:s0], s2, s3, s4, $0x5
+define void @stream_indirect_gather_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm:s0], $0x4, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm:s0], $0x4, s3, s4, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm:s0], s3, s4, s5, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm:s0], s3, s4, s5, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm:s0], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm:s0], $0x4, s2, s3, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm:s0], s2, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm:s0], s2, s3, s4, $0x5
+define void @stream_strided_gather_tac_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm4b:s0+s3], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm4b:s0+s3], $0x4, $0x5
+define void @stream_linear_gather_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm4b:s0+s4], s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm4b:s0+s4], s3, $0x5
+define void @stream_linear_gather_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm4b:s0+s2], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm4b:s0+s2], $0x4, $0x5
+define void @stream_linear_gather_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm4b:s0+s3], s2, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm4b:s0+s3], s2, $0x5
+define void @stream_linear_gather_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm4b:s0+s4], $0x4, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm4b:s0+s4], $0x4, s3, s4, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm4b:s0+s5], s3, s4, s5, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm4b:s0+s5], s3, s4, s5, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm4b:s0+s3], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm4b:s0+s3], $0x4, s2, s3, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm4b:s0+s4], s2, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm4b:s0+s4], s2, s3, s4, $0x5
+define void @stream_indirect_gather_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm4b:s0+s4], $0x4, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm4b:s0+s4], $0x4, s3, s4, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm4b:s0+s5], s3, s4, s5, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm4b:s0+s5], s3, s4, s5, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm4b:s0+s3], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm4b:s0+s3], $0x4, s2, s3, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tac_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm4b:s0+s4], s2, s3, s4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm4b:s0+s4], s2, s3, s4, $0x5
+define void @stream_strided_gather_tac_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.spmem.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_tac_spmem_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], $0x4, $0x5
+define void @stream_linear_gather_tac_spmem_to_tilespmem_ri(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_spmem_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], s3, $0x5
+define void @stream_linear_gather_tac_spmem_to_tilespmem_rr(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_spmem_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], $0x4, $0x5
+define void @stream_linear_gather_tac_spmem_to_tilespmem_ii(i32 addrspace(202)* %src, i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_spmem_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], s2, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], s2, $0x5
+define void @stream_linear_gather_tac_spmem_to_tilespmem_ir(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.spmem.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.spmem.to.tilespmem
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_tac_tilespmem_tileN_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x5
+define void @stream_linear_gather_tac_tilespmem_tileN_to_tilespmem_ri(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_tilespmem_tileN_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], s3, $0x1
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], s3, $0x5
+define void @stream_linear_gather_tac_tilespmem_tileN_to_tilespmem_rr(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_tilespmem_tileN_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x5
+define void @stream_linear_gather_tac_tilespmem_tileN_to_tilespmem_ii(i32 addrspace(201)* %src, i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tac_tilespmem_tileN_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], s2, $0x1
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], s2, $0x5
+define void @stream_linear_gather_tac_tilespmem_tileN_to_tilespmem_ir(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.tilespmem
+
+; --------------------------
+; | Test sparsecore-tec-vf |
+; --------------------------
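+
+; In this tec-vf section the second operand of each intrinsic is 0 or 4, so
+; the trailing immediate on the stream instructions below is $0/$0x4 rather
+; than the $0x1/$0x5 used in the sparsecore-tac section above.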
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.spmem.to.smem
+
+; CHECK-LABEL: stream_linear_gather_tec_spmem_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], $0x4, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_tec_spmem_to_smem_ri(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_spmem_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], s3, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], s3, $0x4
+define void @stream_linear_gather_tec_spmem_to_smem_rr(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_spmem_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], $0x4, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_tec_spmem_to_smem_ii(i32 addrspace(202)* %src, i32* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_spmem_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], s2, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], s2, $0x4
+define void @stream_linear_gather_tec_spmem_to_smem_ir(i32 addrspace(202)* %src, i32* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.spmem.to.smem
+
+; CHECK-LABEL: stream_indirect_gather_tec_spmem_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [spmem:s0], $0x4, s3, s4, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [spmem:s0], $0x4, s3, s4, $0x4
+define void @stream_indirect_gather_tec_spmem_to_smem_ri(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_spmem_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [spmem:s0], s3, s4, s5, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [spmem:s0], s3, s4, s5, $0x4
+define void @stream_indirect_gather_tec_spmem_to_smem_rr(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_spmem_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [spmem:s0], $0x4, s2, s3, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [spmem:s0], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_tec_spmem_to_smem_ii(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_spmem_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [spmem:s0], s2, s3, s4, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [spmem:s0], s2, s3, s4, $0x4
+define void @stream_indirect_gather_tec_spmem_to_smem_ir(i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.spmem.to.smem
+
+; CHECK-LABEL: stream_strided_gather_tec_spmem_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [spmem:s0], $0x4, s3, s4, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [spmem:s0], $0x4, s3, s4, $0x4
+define void @stream_strided_gather_tec_spmem_to_smem_ri(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_spmem_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [spmem:s0], s3, s4, s5, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [spmem:s0], s3, s4, s5, $0x4
+define void @stream_strided_gather_tec_spmem_to_smem_rr(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_spmem_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [spmem:s0], $0x4, s2, s3, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [spmem:s0], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_tec_spmem_to_smem_ii(i32 addrspace(202)* %src, i32* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_spmem_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [spmem:s0], s2, s3, s4, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [spmem:s0], s2, s3, s4, $0x4
+define void @stream_strided_gather_tec_spmem_to_smem_ir(i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.spmem.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.gather.spmem.to.smem
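+; The indirect_vreg variants take the per-lane offsets and their mask as
+; <8 x i32>/<8 x i1> vector arguments, which lower to the v0/vm0 operands in
+; the checked assembly.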
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_spmem_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect_vreg.gather [spmem:s0], $0x4, v0, vm0, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect_vreg.gather [spmem:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_spmem_to_smem_ri(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.spmem.to.smem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.spmem.to.smem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_spmem_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect_vreg.gather [spmem:s0], s3, v0, vm0, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect_vreg.gather [spmem:s0], s3, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_spmem_to_smem_rr(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.spmem.to.smem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.spmem.to.smem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_spmem_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [spmem:s0], $0x4, v0, vm0, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [spmem:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_spmem_to_smem_ii(i32 addrspace(202)* %src, i32* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.spmem.to.smem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.spmem.to.smem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_spmem_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [spmem:s0], s2, v0, vm0, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [spmem:s0], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_spmem_to_smem_ir(i32 addrspace(202)* %src, i32* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.spmem.to.smem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.spmem.to.smem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_linear_gather_tec_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_tec_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], s3, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], s3, $0x4
+define void @stream_linear_gather_tec_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_tec_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], s2, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], s2, $0x4
+define void @stream_linear_gather_tec_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_indirect_gather_tec_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [tilespmem.tileN:s0], $0x4, s3, s4, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [tilespmem.tileN:s0], $0x4, s3, s4, $0x4
+define void @stream_indirect_gather_tec_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [tilespmem.tileN:s0], s3, s4, s5, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect.gather [tilespmem.tileN:s0], s3, s4, s5, $0x4
+define void @stream_indirect_gather_tec_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [tilespmem.tileN:s0], $0x4, s2, s3, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [tilespmem.tileN:s0], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_tec_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [tilespmem.tileN:s0], s2, s3, s4, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect.gather [tilespmem.tileN:s0], s2, s3, s4, $0x4
+define void @stream_indirect_gather_tec_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_strided_gather_tec_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [tilespmem.tileN:s0], $0x4, s3, s4, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [tilespmem.tileN:s0], $0x4, s3, s4, $0x4
+define void @stream_strided_gather_tec_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [tilespmem.tileN:s0], s3, s4, s5, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.strided.gather [tilespmem.tileN:s0], s3, s4, s5, $0x4
+define void @stream_strided_gather_tec_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [tilespmem.tileN:s0], $0x4, s2, s3, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [tilespmem.tileN:s0], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_tec_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [tilespmem.tileN:s0], s2, s3, s4, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.strided.gather [tilespmem.tileN:s0], s2, s3, s4, $0x4
+define void @stream_strided_gather_tec_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.smem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.gather.tilespmem.tileN.to.smem
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_tilespmem_tileN_to_smem_ri:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect_vreg.gather [tilespmem.tileN:s0], $0x4, v0, vm0, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect_vreg.gather [tilespmem.tileN:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_tilespmem_tileN_to_smem_ri(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_tilespmem_tileN_to_smem_rr:
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect_vreg.gather [tilespmem.tileN:s0], s3, v0, vm0, $0
+; CHECK: [smem:s1], [sflag:s2] = stream.indirect_vreg.gather [tilespmem.tileN:s0], s3, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_tilespmem_tileN_to_smem_rr(i32 addrspace(201)* %src, i32* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_tilespmem_tileN_to_smem_ii:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [tilespmem.tileN:s0], $0x4, v0, vm0, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [tilespmem.tileN:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_tilespmem_tileN_to_smem_ii(i32 addrspace(201)* %src, i32* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_tilespmem_tileN_to_smem_ir:
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [tilespmem.tileN:s0], s2, v0, vm0, $0
+; CHECK: [smem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [tilespmem.tileN:s0], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_tilespmem_tileN_to_smem_ir(i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.tilespmem.tileN.to.smem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm:s0], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm:s0], $0x4, $0x4
+define void @stream_linear_gather_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm:s0], s3, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm:s0], s3, $0x4
+define void @stream_linear_gather_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm:s0], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm:s0], $0x4, $0x4
+define void @stream_linear_gather_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm:s0], s2, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm:s0], s2, $0x4
+define void @stream_linear_gather_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm:s0], $0x4, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm:s0], $0x4, s3, s4, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm:s0], s3, s4, s5, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm:s0], s3, s4, s5, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm:s0], $0x4, s2, s3, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm:s0], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm:s0], s2, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm:s0], s2, s3, s4, $0x4
+define void @stream_indirect_gather_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm:s0], $0x4, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm:s0], $0x4, s3, s4, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm:s0], s3, s4, s5, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm:s0], s3, s4, s5, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm:s0], $0x4, s2, s3, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm:s0], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm:s0], s2, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm:s0], s2, s3, s4, $0x4
+define void @stream_strided_gather_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.gather.hbm.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_hbm_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather [hbm:s0], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather [hbm:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_hbm_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_hbm_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather [hbm:s0], s3, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather [hbm:s0], s3, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_hbm_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_hbm_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [hbm:s0], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [hbm:s0], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_hbm_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_hbm_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [hbm:s0], s2, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [hbm:s0], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_hbm_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm4b:s0+s3], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm4b:s0+s3], $0x4, $0x4
+define void @stream_linear_gather_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm4b:s0+s4], s3, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm4b:s0+s4], s3, $0x4
+define void @stream_linear_gather_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm4b:s0+s2], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm4b:s0+s2], $0x4, $0x4
+define void @stream_linear_gather_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm4b:s0+s3], s2, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [hbm4b:s0+s3], s2, $0x4
+define void @stream_linear_gather_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm4b:s0+s4], $0x4, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm4b:s0+s4], $0x4, s3, s4, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm4b:s0+s5], s3, s4, s5, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm4b:s0+s5], s3, s4, s5, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm4b:s0+s3], $0x4, s2, s3, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm4b:s0+s3], $0x4, s2, s3, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm4b:s0+s4], s2, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect.gather [hbm4b:s0+s4], s2, s3, s4, $0x4
+define void @stream_indirect_gather_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm4b:s0+s4], $0x4, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm4b:s0+s4], $0x4, s3, s4, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm4b:s0+s5], s3, s4, s5, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm4b:s0+s5], s3, s4, s5, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm4b:s0+s3], $0x4, s2, s3, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm4b:s0+s3], $0x4, s2, s3, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm4b:s0+s4], s2, s3, s4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.strided.gather [hbm4b:s0+s4], s2, s3, s4, $0x4
+define void @stream_strided_gather_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.gather.hbm4b.to.tilespmem
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_hbm4b_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather [hbm4b:s0+s3], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather [hbm4b:s0+s3], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_hbm4b_to_tilespmem_ri(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_hbm4b_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather [hbm4b:s0+s4], s3, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather [hbm4b:s0+s4], s3, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_hbm4b_to_tilespmem_rr(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_hbm4b_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [hbm4b:s0+s2], $0x4, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [hbm4b:s0+s2], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_hbm4b_to_tilespmem_ii(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_tec_hbm4b_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [hbm4b:s0+s3], s2, v0, vm0, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.indirect_vreg.gather [hbm4b:s0+s3], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_gather_tec_hbm4b_to_tilespmem_ir(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.vreg.spmem.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_tec_spmem_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_tec_spmem_to_tilespmem_ri(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_spmem_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], s3, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [spmem:s0], s3, $0x4
+define void @stream_linear_gather_tec_spmem_to_tilespmem_rr(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_spmem_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], $0x4, $0x4
+define void @stream_linear_gather_tec_spmem_to_tilespmem_ii(i32 addrspace(202)* %src, i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_spmem_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], s2, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [spmem:s0], s2, $0x4
+define void @stream_linear_gather_tec_spmem_to_tilespmem_ir(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.spmem.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.spmem.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.gather.spmem.to.tilespmem
+
+; Test intrinsic @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem
+
+; CHECK-LABEL: stream_linear_gather_tec_tilespmem_tileN_to_tilespmem_ri:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_tec_tilespmem_tileN_to_tilespmem_ri(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_tilespmem_tileN_to_tilespmem_rr:
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], s3, $0
+; CHECK: [tilespmem:s1], [sflag:s2] = stream.linear.gather [tilespmem.tileN:s0], s3, $0x4
+define void @stream_linear_gather_tec_tilespmem_tileN_to_tilespmem_rr(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_tilespmem_tileN_to_tilespmem_ii:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], $0x4, $0x4
+define void @stream_linear_gather_tec_tilespmem_tileN_to_tilespmem_ii(i32 addrspace(201)* %src, i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_tec_tilespmem_tileN_to_tilespmem_ir:
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], s2, $0
+; CHECK: [tilespmem:s1], [sflag:$0x1f] = stream.linear.gather [tilespmem.tileN:s0], s2, $0x4
+define void @stream_linear_gather_tec_tilespmem_tileN_to_tilespmem_ir(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.gather.tilespmem.tileN.to.tilespmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.gather.tilespmem.tileN.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.gather.tilespmem.tileN.to.tilespmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.gather.tilespmem.tileN.to.tilespmem
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_indirect_vreg_cb_errata_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_indirect_vreg_cb_errata_sc.ll
new file mode 100644
index 0000000..52bd99f
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_indirect_vreg_cb_errata_sc.ll
@@ -0,0 +1,178 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck --check-prefixes=CHECK,CHECK-VFC %s
+; RUN: llc < %s -mcpu=sparsecore-tec-gl -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck --check-prefixes=CHECK %s
+; REQUIRES: tpu
+
+; Tests the VFC errata b/210042404. The test is of limited value because we
+; also model the address-calculation delay for stream.indirect_vreg.
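+; The checks below expect an `_ = sdelay $0x2` bundle before the first
+; stream.indirect_vreg instruction on sparsecore-tec-vf (CHECK-VFC) and between
+; the two back-to-back stream.indirect_vreg instructions on both CPUs;
+; presumably this is the delay the errata workaround inserts.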
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.hbm.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(203)*, x86_mmx, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.smem.to.spmem.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.smem.to.spmem.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_hbm_to_tilespmem:
+; CHECK-VFC: { _ = sdelay $0x2 }
+; CHECK: { [tilespmem:$0x0], [sflag:s{{[0-9]+}}] = stream.indirect_vreg.gather.cb [hbm:s{{[0-9]+}}], $0x4, v{{[0-9]+}}, vm{{[0-9]+}}, $0x0 }
+; CHECK-NEXT: { _ = sdelay $0x2 }
+; CHECK-NEXT: { [tilespmem:$0x0], [sflag:s{{[0-9]+}}] = stream.indirect_vreg.gather.cb [hbm:s{{[0-9]+}}], $0x4, v{{[0-9]+}}, vm{{[0-9]+}}, $0x4 }
+; CHECK-NEXT: { _ = shalt }
+define void @stream_indirect_vreg_gather_cb_hbm_to_tilespmem(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_upd_hbm_to_tilespmem:
+; CHECK-VFC: { _ = sdelay $0x2 }
+; CHECK: { [tilespmem:$0x0], [sflag:s{{[0-9]+}}] = stream.indirect_vreg.gather.cb.upd [hbm:s{{[0-9]+}}], $0x4, v{{[0-9]+}}, vm{{[0-9]+}}, $0x0 }
+; CHECK-NEXT: { _ = sdelay $0x2 }
+; CHECK-NEXT: { [tilespmem:$0x0], [sflag:s{{[0-9]+}}] = stream.indirect_vreg.gather.cb.upd [hbm:s{{[0-9]+}}], $0x4, v{{[0-9]+}}, vm{{[0-9]+}}, $0x4 }
+; CHECK-NEXT: { _ = shalt }
+define void @stream_indirect_vreg_gather_cb_upd_hbm_to_tilespmem(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_add_f32_hbm_to_tilespmem:
+; CHECK-VFC: { _ = sdelay $0x2 }
+; CHECK: { [tilespmem:$0x0], [sflag:s{{[0-9]+}}] = stream.indirect_vreg.gather.cb.add.f32 [hbm:s{{[0-9]+}}], $0x4, v{{[0-9]+}}, vm{{[0-9]+}}, $0x0 }
+; CHECK-NEXT: { _ = sdelay $0x2 }
+; CHECK-NEXT: { [tilespmem:$0x0], [sflag:s{{[0-9]+}}] = stream.indirect_vreg.gather.cb.add.f32 [hbm:s{{[0-9]+}}], $0x4, v{{[0-9]+}}, vm{{[0-9]+}}, $0x4 }
+; CHECK-NEXT: { _ = shalt }
+define void @stream_indirect_vreg_gather_cb_add_f32_hbm_to_tilespmem(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_upd_add_f32_hbm_to_tilespmem:
+; CHECK-VFC: { _ = sdelay $0x2 }
+; CHECK: { [tilespmem:$0x0], [sflag:s{{[0-9]+}}] = stream.indirect_vreg.gather.cb.upd.add.f32 [hbm:s{{[0-9]+}}], $0x4, v{{[0-9]+}}, vm{{[0-9]+}}, $0x0 }
+; CHECK-NEXT: { _ = sdelay $0x2 }
+; CHECK-NEXT: { [tilespmem:$0x0], [sflag:s{{[0-9]+}}] = stream.indirect_vreg.gather.cb.upd.add.f32 [hbm:s{{[0-9]+}}], $0x4, v{{[0-9]+}}, vm{{[0-9]+}}, $0x4 }
+; CHECK-NEXT: { _ = shalt }
+define void @stream_indirect_vreg_gather_cb_upd_add_f32_hbm_to_tilespmem(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) {
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.add.f32.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_smem_to_spmem:
+; CHECK-VFC: { _ = sdelay $0x2 }
+; CHECK: { [spmem:s{{[0-9]+}}] = stream.indirect_vreg.scatter.cb [smem:$0x0], [sflag:s{{[0-9]+}}], $0x4, v{{[0-9]+}}, vm{{[0-9]+}}, $0x0 }
+; CHECK-NEXT: { _ = sdelay $0x2 }
+; CHECK-NEXT: { [spmem:s{{[0-9]+}}] = stream.indirect_vreg.scatter.cb [smem:$0x0], [sflag:s{{[0-9]+}}], $0x4, v{{[0-9]+}}, vm{{[0-9]+}}, $0x4 }
+; CHECK-NEXT: { _ = shalt }
+define void @stream_indirect_vreg_scatter_cb_smem_to_spmem(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_upd_smem_to_spmem:
+; CHECK-VFC: { _ = sdelay $0x2 }
+; CHECK: { [spmem:s{{[0-9]+}}] = stream.indirect_vreg.scatter.cb.upd [smem:$0x0], [sflag:s{{[0-9]+}}], $0x4, v{{[0-9]+}}, vm{{[0-9]+}}, $0x0 }
+; CHECK-NEXT: { _ = sdelay $0x2 }
+; CHECK-NEXT: { [spmem:s{{[0-9]+}}] = stream.indirect_vreg.scatter.cb.upd [smem:$0x0], [sflag:s{{[0-9]+}}], $0x4, v{{[0-9]+}}, vm{{[0-9]+}}, $0x4 }
+; CHECK-NEXT: { _ = shalt }
+define void @stream_indirect_vreg_scatter_cb_upd_smem_to_spmem(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_f32_smem_to_spmem:
+; CHECK-VFC: { _ = sdelay $0x2 }
+; CHECK: { [spmem:s{{[0-9]+}}] = stream.indirect_vreg.scatter.cb.add.f32 [smem:$0x0], [sflag:s{{[0-9]+}}], $0x4, v{{[0-9]+}}, vm{{[0-9]+}}, $0x0 }
+; CHECK-NEXT: { _ = sdelay $0x2 }
+; CHECK-NEXT: { [spmem:s{{[0-9]+}}] = stream.indirect_vreg.scatter.cb.add.f32 [smem:$0x0], [sflag:s{{[0-9]+}}], $0x4, v{{[0-9]+}}, vm{{[0-9]+}}, $0x4 }
+; CHECK-NEXT: { _ = shalt }
+define void @stream_indirect_vreg_scatter_cb_add_f32_smem_to_spmem(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_upd_add_f32_smem_to_spmem:
+; CHECK-VFC: { _ = sdelay $0x2 }
+; CHECK: { [spmem:s{{[0-9]+}}] = stream.indirect_vreg.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s{{[0-9]+}}], $0x4, v{{[0-9]+}}, vm{{[0-9]+}}, $0x0 }
+; CHECK-NEXT: { _ = sdelay $0x2 }
+; CHECK-NEXT: { [spmem:s{{[0-9]+}}] = stream.indirect_vreg.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s{{[0-9]+}}], $0x4, v{{[0-9]+}}, vm{{[0-9]+}}, $0x4 }
+; CHECK-NEXT: { _ = shalt }
+define void @stream_indirect_vreg_scatter_cb_upd_add_f32_smem_to_spmem(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_upd_hbm_to_tilespmem_pipelined_loop
+; CHECK-VFC: .LBB8_1
+; CHECK-VFC: {
+; CHECK-VFC: stream.indirect_vreg.gather.cb.upd
+; CHECK-VFC: {
+; CHECK-VFC: stream.indirect_vreg.gather.cb.upd
+; CHECK-VFC: {
+
+define void @stream_indirect_vreg_gather_cb_upd_hbm_to_tilespmem_pipelined_loop(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm, i32 %a, i32 %b) {
+Entry:
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ br label %Loop
+Loop:
+ %iv = phi i32 [ %a, %Entry ], [ %next_iv, %Loop ]
+ %v0 = phi i32 [ %b, %Entry ], [ %v5, %Loop ]
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 %a, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ %v1 = mul i32 %v0, 3
+ %v2 = xor i32 %v1, %b
+ %v3 = mul i32 %v2, %a
+ %v4 = add i32 %v3, 1
+ %v5 = xor i32 %v4, %a
+ %ptr = inttoptr i32 %v5 to i32*
+ store i32 %a, i32* %ptr
+ %next_iv = add i32 %iv, 1
+ %cond = icmp slt i32 %iv, %b
+ br i1 %cond, label %Loop, label %Exit, !llvm.loop !2
+Exit:
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_gather_cb_upd_hbm_to_tilespmem_cross_block
+; CHECK-VFC: { _ = sdelay $0x2 }
+; CHECK: stream.indirect_vreg.gather.cb.upd
+; CHECK-NEXT: { _ = sdelay $0x1 }
+; CHECK-NEXT: { p{{[0-9]+}} = sne.s32 s{{[0-9]+}}, s{{[0-9]+}} }
+; CHECK-NEXT: { [tilespmem:$0x0], [sflag:s{{[0-9]+}}] = stream.indirect_vreg.gather.cb.upd @p0 [hbm:s{{[0-9]+}}], $0x4, v{{[0-9]+}}, vm{{[0-9]+}}, $0x1 }
+; CHECK-NEXT: { _ = shalt @p0 }
+
+; CHECK-NEXT: .LBB9_1:
+; CHECK-NEXT: { _ = sdelay $0x2 }
+; CHECK-NEXT: stream.indirect_vreg.gather.cb.upd
+; CHECK-NEXT: { _ = shalt }
+define void @stream_indirect_vreg_gather_cb_upd_hbm_to_tilespmem_cross_block(i32 addrspace(203)* %src, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm, i32 %a, i32 %b) {
+Entry:
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ %cond = icmp eq i32 %a, %b
+ br i1 %cond, label %Eq, label %Neq
+Eq:
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+Neq:
+ call void @llvm.tpu.stream.indirect.vreg.gather.cb.upd.hbm.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, x86_mmx undef, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+!1 = distinct !{}
+!2 = distinct !{!2, !3}
+!3 = !{!"llvm.loop.parallel_accesses", !1}
\ No newline at end of file
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_sc.ll
new file mode 100644
index 0000000..74b00cd
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_sc.ll
@@ -0,0 +1,137 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -opaque-pointers | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)*, i32, i32 addrspace(202)*, i32*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)*, i32, i32*, i32 addrspace(202)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, i32) argmemonly nounwind
+
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-vf" }
+
+; Tests that a stream gather touching tilespmem is ordered with the vector
+; store, but gets bundled with it because the ordering dependence has zero latency.
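+; In the first bundle below, the vst and the first stream.linear.gather are
+; expected to share a single { } bundle; the CHECK/CHECK-NEXT pair matches that
+; one bundle split across two output lines.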
+
+; CHECK-LABEL: vst_to_stream_linear_gather_tilespmem_tec
+; CHECK: { [tilespmem:s{{[0-9]+}}+$0x0] = vst v0;
+; CHECK-NEXT: {{.*}} stream.linear.gather {{.*}} }
+; CHECK-NEXT: { _ = sdelay $0x2 }
+; CHECK-NEXT: { {{.*}} stream.linear.gather {{.*}}
+define void @vst_to_stream_linear_gather_tilespmem_tec(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, <8 x float> addrspace(201)* %a, <8 x float> %b) #2 {
+ store <8 x float> %b, <8 x float> addrspace(201)* %a
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; Tests that stream gather without tilespmem is not ordered with the vector store.
+
+; CHECK-LABEL: vst_to_stream_linear_gather_smem_tec
+; CHECK: { {{.*}} stream.linear.gather {{.*}} }
+; CHECK-NEXT: { _ = sdelay $0x2 }
+; CHECK-NEXT: { {{.*}} stream.linear.gather {{.*}}
+; CHECK-NEXT: { [tilespmem:s{{[0-9]+}}+$0x0] = vst v0;
+define void @vst_to_stream_linear_gather_smem_tec(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, <8 x float> addrspace(201)* %a, <8 x float> %b) #2 {
+ store <8 x float> %b, <8 x float> addrspace(201)* %a
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; Tests that a stream scatter touching tilespmem is ordered with the vector
+; store, but gets bundled with it because the ordering dependence has zero latency.
+
+; CHECK-LABEL: vst_to_stream_linear_scatter_tilespmem_tec
+; CHECK: { [tilespmem:s{{[0-9]+}}+$0x0] = vst v0;
+; CHECK-NEXT: {{.*}} stream.linear.scatter {{.*}} }
+; CHECK-NEXT: { _ = sdelay $0x2 }
+; CHECK-NEXT: { {{.*}} stream.linear.scatter {{.*}}
+define void @vst_to_stream_linear_scatter_tilespmem_tec(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, <8 x float> addrspace(201)* %a, <8 x float> %b) #2 {
+ store <8 x float> %b, <8 x float> addrspace(201)* %a
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; Tests that stream scatter without tilespmem is not ordered with the vector store.
+
+; CHECK-LABEL: vst_to_stream_linear_scatter_smem_tec
+; CHECK: { {{.*}} stream.linear.scatter {{.*}} }
+; CHECK-NEXT: { _ = sdelay $0x2 }
+; CHECK-NEXT: { {{.*}} stream.linear.scatter {{.*}}
+; CHECK-NEXT: { [tilespmem:s{{[0-9]+}}+$0x0] = vst v0;
+define void @vst_to_stream_linear_scatter_smem_tec(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, <8 x float> addrspace(201)* %a, <8 x float> %b) #2 {
+ store <8 x float> %b, <8 x float> addrspace(201)* %a
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; Tests the stream instruction cadence on sparsecore-tac-vf.
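+; The cadence shows up as the `_ = sdelay $0x2` bundle that the checks below
+; expect between two back-to-back stream instructions.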
+
+; CHECK-LABEL: stream_linear_gather_tilespmem_tac
+; CHECK: {{.*}} stream.linear.gather {{.*}} }
+; CHECK-NEXT: { _ = sdelay $0x2 }
+; CHECK-NEXT: { {{.*}} stream.linear.gather {{.*}}
+define void @stream_linear_gather_tilespmem_tac(i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, <8 x float> %b) #1 {
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(202)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_gather_smem_tac
+; CHECK: { {{.*}} stream.linear.gather {{.*}} }
+; CHECK-NEXT: { _ = sdelay $0x2 }
+; CHECK-NEXT: { {{.*}} stream.linear.gather {{.*}}
+define void @stream_linear_gather_smem_tac(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, <8 x float> %b) #1 {
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tilespmem_tac
+; CHECK: {{.*}} stream.linear.scatter {{.*}} }
+; CHECK-NEXT: { _ = sdelay $0x2 }
+; CHECK-NEXT: { {{.*}} stream.linear.scatter {{.*}}
+define void @stream_linear_scatter_tilespmem_tac(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, <8 x float> %b) #1 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_smem_tac
+; CHECK: { {{.*}} stream.linear.scatter {{.*}} }
+; CHECK-NEXT: { _ = sdelay $0x2 }
+; CHECK-NEXT: { {{.*}} stream.linear.scatter {{.*}}
+define void @stream_linear_scatter_smem_tac(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, <8 x float> %b) #1 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
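+;
+; Note: on sparsecore-tac-vf (as on tec above) the checks expect an
+; "_ = sdelay $0x2" bundle between back-to-back stream instructions, i.e. a
+; two-cycle spacing; this is presumably the cadence the section comment refers to.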
+
+; Tests the stream instruction cadence on sparsecore-scs-vf.
+
+; CHECK-LABEL: stream_linear_gather_smem_scs
+; CHECK: { {{.*}} stream.linear.gather {{.*}} }
+; CHECK-NEXT: { {{.*}} stream.linear.gather {{.*}}
+define void @stream_linear_gather_smem_scs(i32 addrspace(202)* %src, i32* %dst, i32 addrspace(204)* %flag, <8 x float> %b) #0 {
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.gather.spmem.to.smem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(202)* %src, i32* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_smem_scs
+; CHECK: { {{.*}} stream.linear.scatter {{.*}} }
+; CHECK-NEXT: { {{.*}} stream.linear.scatter {{.*}}
+define void @stream_linear_scatter_smem_scs(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, <8 x float> %b) #0 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
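+;
+; Note: on sparsecore-scs-vf the two stream instructions are checked in
+; consecutive bundles with no sdelay in between, so no extra cadence is
+; expected on this core.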
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_add_f32_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_add_f32_sc.ll
new file mode 100644
index 0000000..dcf2661
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_add_f32_sc.ll
@@ -0,0 +1,1001 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32) nounwind
+
+; -------------------------
+; | stream.linear intrinsic |
+; -------------------------
+
+declare void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem(i32 addrspace(204)*, i32, i32*, i32 addrspace(202)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.spmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(202)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; ---------------------------
+; | stream.indirect intrinsic |
+; ---------------------------
+
+declare void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem(i32 addrspace(204)*, i32, i32*, i32 addrspace(202)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.add.f32.tilespmem.to.spmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(202)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; --------------------------------
+; | stream.indirect.vreg intrinsic |
+; --------------------------------
+
+declare void @llvm.tpu.stream.indirect.vreg.scatter.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)*, i32, i32*, i32 addrspace(202)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.add.f32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)*, i32, i32*, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.add.f32.tilespmem.to.spmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(202)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.add.f32.tilespmem.to.tilespmem.tileN.v8i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+
+; --------------------------
+; | stream.strided intrinsic |
+; --------------------------
+
+declare void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem(i32 addrspace(204)*, i32, i32*, i32 addrspace(202)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.add.f32.tilespmem.to.spmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(202)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-vf" }
+
+; ------------------------
+; | Test sparsecore-scs-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem
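+;
+; The _ri/_rr/_ii/_ir suffixes below appear to encode whether the sync flag and
+; the length operand are a register or an immediate. For example, in the _ri
+; case the call
+;   call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem(
+;       i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+; is expected to lower to
+;   [spmem:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, $0
+; while the _ii variants materialize the flag as an immediate ([sflag:$0x1f])
+; via @llvm.tpu.inttoptr.p204i32(i32 31).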
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, $0
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_add_f32_scs_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:s2], s3, $0
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_add_f32_scs_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #0 {
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_add_f32_scs_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_add_f32_scs_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem
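+;
+; Relative to the linear form, the indirect intrinsic takes an extra offset-list
+; pointer (in tilespmem) and an element count; in the checks below these appear
+; to surface as the two additional scalar register operands before the final
+; immediate (e.g. "..., $0x4, s3, s4, $0").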
+
+; CHECK-LABEL: stream_indirect_scatter_add_f32_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_indirect_scatter_add_f32_scs_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_f32_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_indirect_scatter_add_f32_scs_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_f32_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_add_f32_scs_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_f32_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_add_f32_scs_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem
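+;
+; The strided form takes a stride size and a stride length instead of an offset
+; list; in the checks below these likewise appear to surface as the two extra
+; scalar register operands (e.g. "..., $0x4, s3, s4, $0").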
+
+; CHECK-LABEL: stream_strided_scatter_add_f32_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [spmem:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_strided_scatter_add_f32_scs_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_f32_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [spmem:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_strided_scatter_add_f32_scs_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_f32_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [spmem:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_add_f32_scs_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_f32_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [spmem:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_strided_scatter_add_f32_scs_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_add_f32_scs_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #0 {
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:s2], s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_add_f32_scs_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #0 {
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_add_f32_scs_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_add_f32_scs_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_add_f32_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_indirect_scatter_add_f32_scs_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_f32_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_indirect_scatter_add_f32_scs_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_f32_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_add_f32_scs_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_f32_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_add_f32_scs_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.add.f32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_strided_scatter_add_f32_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_strided_scatter_add_f32_scs_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_f32_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_strided_scatter_add_f32_scs_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_f32_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_add_f32_scs_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_f32_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_strided_scatter_add_f32_scs_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; ------------------------
+; | Test sparsecore-tac-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem
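+;
+; The tac tests pass 1 and 5 (rather than 0 and 4) as the second intrinsic
+; operand, and that value seems to be carried through verbatim as the final
+; immediate of the emitted instruction ($0x1 / $0x5 below).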
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, $0x5
+define void @stream_linear_scatter_add_f32_tac_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:s2], s3, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:s2], s3, $0x5
+define void @stream_linear_scatter_add_f32_tac_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_add_f32_tac_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, $0x5
+define void @stream_linear_scatter_add_f32_tac_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_add_f32_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x1
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x5
+define void @stream_indirect_scatter_add_f32_tac_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_f32_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:s2], s3, s4, s5, $0x1
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:s2], s3, s4, s5, $0x5
+define void @stream_indirect_scatter_add_f32_tac_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_f32_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x1
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_add_f32_tac_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_f32_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x1
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_add_f32_tac_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_add_f32_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x1
+; CHECK: [spmem:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x5
+define void @stream_strided_scatter_add_f32_tac_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_f32_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:s2], s3, s4, s5, $0x1
+; CHECK: [spmem:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:s2], s3, s4, s5, $0x5
+define void @stream_strided_scatter_add_f32_tac_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_f32_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x1
+; CHECK: [spmem:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x5
+define void @stream_strided_scatter_add_f32_tac_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_f32_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x1
+; CHECK: [spmem:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x5
+define void @stream_strided_scatter_add_f32_tac_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tac_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, $0x5
+define void @stream_linear_scatter_add_f32_tac_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tac_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:s2], s3, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:s2], s3, $0x5
+define void @stream_linear_scatter_add_f32_tac_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tac_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_add_f32_tac_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tac_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, $0x5
+define void @stream_linear_scatter_add_f32_tac_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_add_f32_tac_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x5
+define void @stream_indirect_scatter_add_f32_tac_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_f32_tac_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:s2], s3, s4, s5, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:s2], s3, s4, s5, $0x5
+define void @stream_indirect_scatter_add_f32_tac_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_f32_tac_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_add_f32_tac_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_f32_tac_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_add_f32_tac_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Not tested: intrinsic @llvm.tpu.stream.strided.scatter.add.f32.smem.to.tilespmem.tileN
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tac_tilespmem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:s2], $0x4, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:s2], $0x4, $0x5
+define void @stream_linear_scatter_add_f32_tac_tilespmem_to_spmem_ri(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tac_tilespmem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:s2], s3, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:s2], s3, $0x5
+define void @stream_linear_scatter_add_f32_tac_tilespmem_to_spmem_rr(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tac_tilespmem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_add_f32_tac_tilespmem_to_spmem_ii(i32 addrspace(201)* %src, i32 addrspace(202)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tac_tilespmem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:$0x1f], s2, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:$0x1f], s2, $0x5
+define void @stream_linear_scatter_add_f32_tac_tilespmem_to_spmem_ir(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested: intrinsic @llvm.tpu.stream.indirect.scatter.add.f32.tilespmem.to.spmem
+
+; Not tested: intrinsic @llvm.tpu.stream.strided.scatter.add.f32.tilespmem.to.spmem
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tac_tilespmem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:s2], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:s2], $0x4, $0x5
+define void @stream_linear_scatter_add_f32_tac_tilespmem_to_tilespmem_tileN_ri(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tac_tilespmem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:s2], s3, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:s2], s3, $0x5
+define void @stream_linear_scatter_add_f32_tac_tilespmem_to_tilespmem_tileN_rr(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tac_tilespmem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_add_f32_tac_tilespmem_to_tilespmem_tileN_ii(i32 addrspace(201)* %src, i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tac_tilespmem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:$0x1f], s2, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:$0x1f], s2, $0x5
+define void @stream_linear_scatter_add_f32_tac_tilespmem_to_tilespmem_tileN_ir(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested: intrinsic @llvm.tpu.stream.indirect.scatter.add.f32.tilespmem.to.tilespmem.tileN
+
+; Not tested: intrinsic @llvm.tpu.stream.strided.scatter.add.f32.tilespmem.to.tilespmem.tileN
+
+; ------------------------
+; | Test sparsecore-tec-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, $0
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_add_f32_tec_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:s2], s3, $0
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_add_f32_tec_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_add_f32_tec_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_add_f32_tec_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_add_f32_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_indirect_scatter_add_f32_tec_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_f32_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_indirect_scatter_add_f32_tec_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_f32_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_add_f32_tec_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_f32_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_add_f32_tec_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_add_f32_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [spmem:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_strided_scatter_add_f32_tec_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_f32_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [spmem:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_strided_scatter_add_f32_tec_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_f32_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [spmem:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_add_f32_tec_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_f32_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [spmem:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_strided_scatter_add_f32_tec_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.scatter.add.f32.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_add_f32_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, v0, vm0, $0
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_add_f32_tec_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_add_f32_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter.add.f32 [smem:s0], [sflag:s2], s3, v0, vm0, $0
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter.add.f32 [smem:s0], [sflag:s2], s3, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_add_f32_tec_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_add_f32_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, v0, vm0, $0
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_add_f32_tec_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_add_f32_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, v0, vm0, $0
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_add_f32_tec_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_add_f32_tec_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:s2], s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_add_f32_tec_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_add_f32_tec_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_add_f32_tec_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_add_f32_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_indirect_scatter_add_f32_tec_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_f32_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_indirect_scatter_add_f32_tec_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_f32_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_add_f32_tec_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_f32_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_add_f32_tec_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.add.f32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_strided_scatter_add_f32_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_strided_scatter_add_f32_tec_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_f32_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_strided_scatter_add_f32_tec_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_f32_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_add_f32_tec_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_f32_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_strided_scatter_add_f32_tec_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.scatter.add.f32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_add_f32_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter.add.f32 [smem:s0], [sflag:s2], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_add_f32_tec_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.f32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.f32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_add_f32_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter.add.f32 [smem:s0], [sflag:s2], s3, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter.add.f32 [smem:s0], [sflag:s2], s3, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_add_f32_tec_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.f32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.f32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_add_f32_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter.add.f32 [smem:s0], [sflag:$0x1f], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_add_f32_tec_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.f32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.f32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_add_f32_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter.add.f32 [smem:s0], [sflag:$0x1f], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_add_f32_tec_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.f32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.f32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tec_tilespmem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:s2], $0x4, $0
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_add_f32_tec_tilespmem_to_spmem_ri(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tec_tilespmem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:s2], s3, $0
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_add_f32_tec_tilespmem_to_spmem_rr(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tec_tilespmem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_add_f32_tec_tilespmem_to_spmem_ii(i32 addrspace(201)* %src, i32 addrspace(202)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tec_tilespmem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [spmem:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_add_f32_tec_tilespmem_to_spmem_ir(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.add.f32.tilespmem.to.spmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.add.f32.tilespmem.to.spmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.scatter.add.f32.tilespmem.to.spmem
+
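+; A minimal sketch of what coverage for the first of these could look like,
+; assuming the f32 intrinsic mirrors the s32 declaration in
+; stream_scatter_add_s32_sc.ll and prints like the smem.to.spmem tests above;
+; it is kept in comments so it does not affect this file's RUN/CHECK lines:
+;
+;   declare void @llvm.tpu.stream.indirect.scatter.add.f32.tilespmem.to.spmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(202)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+;
+;   ; expected form: [spmem:s1] = stream.indirect.scatter.add.f32 [tilespmem:s0], [sflag:s2], s3, s4, s5, $0
+;   define void @stream_indirect_scatter_add_f32_tec_tilespmem_to_spmem_rr(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+;     call void @llvm.tpu.stream.indirect.scatter.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+;     ret void
+;   }
+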
+; Test intrinsic @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tec_tilespmem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:s2], $0x4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_add_f32_tec_tilespmem_to_tilespmem_tileN_ri(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tec_tilespmem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:s2], s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_add_f32_tec_tilespmem_to_tilespmem_tileN_rr(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tec_tilespmem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_add_f32_tec_tilespmem_to_tilespmem_tileN_ii(i32 addrspace(201)* %src, i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_f32_tec_tilespmem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.f32 [tilespmem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_add_f32_tec_tilespmem_to_tilespmem_tileN_ir(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.add.f32.tilespmem.to.tilespmem.tileN
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.add.f32.tilespmem.to.tilespmem.tileN
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.scatter.add.f32.tilespmem.to.tilespmem.tileN
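+
+; As above, a commented sketch for the indirect variant, assuming it mirrors
+; the s32 tilespmem.to.tilespmem.tileN declaration and the smem.to.tilespmem.tileN
+; tests earlier in this file:
+;
+;   ; expected form: [tilespmem.tileN:s1] = stream.indirect.scatter.add.f32 [tilespmem:s0], [sflag:s2], s3, s4, s5, $0
+;   call void @llvm.tpu.stream.indirect.scatter.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)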
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_add_s32_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_add_s32_sc.ll
new file mode 100644
index 0000000..8b39d63
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_add_s32_sc.ll
@@ -0,0 +1,1001 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32) nounwind
+
+; -------------------------
+; | stream.linear intrinsic |
+; -------------------------
+
+declare void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem(i32 addrspace(204)*, i32, i32*, i32 addrspace(202)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.spmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(202)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; ---------------------------
+; | stream.indirect intrinsic |
+; ---------------------------
+
+declare void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem(i32 addrspace(204)*, i32, i32*, i32 addrspace(202)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.add.s32.tilespmem.to.spmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(202)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; --------------------------------
+; | stream.indirect.vreg intrinsic |
+; --------------------------------
+
+declare void @llvm.tpu.stream.indirect.vreg.scatter.add.s32.smem.to.spmem.v8i32(i32 addrspace(204)*, i32, i32*, i32 addrspace(202)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.add.s32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)*, i32, i32*, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.add.s32.tilespmem.to.spmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(202)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.add.s32.tilespmem.to.tilespmem.tileN.v8i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+
+; --------------------------
+; | stream.strided intrinsic |
+; --------------------------
+
+declare void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem(i32 addrspace(204)*, i32, i32*, i32 addrspace(202)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.add.s32.tilespmem.to.spmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(202)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-vf" }
+
+; ------------------------
+; | Test sparsecore-scs-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, $0
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_add_s32_scs_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:s2], s3, $0
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_add_s32_scs_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #0 {
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_add_s32_scs_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_add_s32_scs_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_add_s32_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_indirect_scatter_add_s32_scs_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_s32_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_indirect_scatter_add_s32_scs_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_s32_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_add_s32_scs_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_s32_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_add_s32_scs_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_add_s32_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [spmem:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_strided_scatter_add_s32_scs_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_s32_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [spmem:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_strided_scatter_add_s32_scs_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_s32_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [spmem:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_add_s32_scs_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_s32_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [spmem:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_strided_scatter_add_s32_scs_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_add_s32_scs_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #0 {
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:s2], s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_add_s32_scs_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #0 {
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_add_s32_scs_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_add_s32_scs_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_add_s32_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_indirect_scatter_add_s32_scs_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_s32_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_indirect_scatter_add_s32_scs_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_s32_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_add_s32_scs_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_s32_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_add_s32_scs_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.add.s32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_strided_scatter_add_s32_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_strided_scatter_add_s32_scs_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_s32_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_strided_scatter_add_s32_scs_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_s32_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_add_s32_scs_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_s32_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_strided_scatter_add_s32_scs_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; ------------------------
+; | Test sparsecore-tac-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, $0x5
+define void @stream_linear_scatter_add_s32_tac_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:s2], s3, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:s2], s3, $0x5
+define void @stream_linear_scatter_add_s32_tac_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_add_s32_tac_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, $0x5
+define void @stream_linear_scatter_add_s32_tac_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_add_s32_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x1
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x5
+define void @stream_indirect_scatter_add_s32_tac_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_s32_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:s2], s3, s4, s5, $0x1
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:s2], s3, s4, s5, $0x5
+define void @stream_indirect_scatter_add_s32_tac_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_s32_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x1
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_add_s32_tac_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_s32_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x1
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_add_s32_tac_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_add_s32_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x1
+; CHECK: [spmem:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x5
+define void @stream_strided_scatter_add_s32_tac_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_s32_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:s2], s3, s4, s5, $0x1
+; CHECK: [spmem:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:s2], s3, s4, s5, $0x5
+define void @stream_strided_scatter_add_s32_tac_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_s32_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x1
+; CHECK: [spmem:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x5
+define void @stream_strided_scatter_add_s32_tac_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_s32_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x1
+; CHECK: [spmem:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x5
+define void @stream_strided_scatter_add_s32_tac_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tac_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, $0x5
+define void @stream_linear_scatter_add_s32_tac_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tac_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:s2], s3, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:s2], s3, $0x5
+define void @stream_linear_scatter_add_s32_tac_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tac_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_add_s32_tac_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tac_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, $0x5
+define void @stream_linear_scatter_add_s32_tac_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_add_s32_tac_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x5
+define void @stream_indirect_scatter_add_s32_tac_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_s32_tac_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:s2], s3, s4, s5, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:s2], s3, s4, s5, $0x5
+define void @stream_indirect_scatter_add_s32_tac_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_s32_tac_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_add_s32_tac_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_s32_tac_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_add_s32_tac_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.add.s32.smem.to.tilespmem.tileN
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tac_tilespmem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:s2], $0x4, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:s2], $0x4, $0x5
+define void @stream_linear_scatter_add_s32_tac_tilespmem_to_spmem_ri(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tac_tilespmem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:s2], s3, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:s2], s3, $0x5
+define void @stream_linear_scatter_add_s32_tac_tilespmem_to_spmem_rr(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tac_tilespmem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_add_s32_tac_tilespmem_to_spmem_ii(i32 addrspace(201)* %src, i32 addrspace(202)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tac_tilespmem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:$0x1f], s2, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:$0x1f], s2, $0x5
+define void @stream_linear_scatter_add_s32_tac_tilespmem_to_spmem_ir(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.add.s32.tilespmem.to.spmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.add.s32.tilespmem.to.spmem
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tac_tilespmem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:s2], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:s2], $0x4, $0x5
+define void @stream_linear_scatter_add_s32_tac_tilespmem_to_tilespmem_tileN_ri(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tac_tilespmem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:s2], s3, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:s2], s3, $0x5
+define void @stream_linear_scatter_add_s32_tac_tilespmem_to_tilespmem_tileN_rr(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tac_tilespmem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_add_s32_tac_tilespmem_to_tilespmem_tileN_ii(i32 addrspace(201)* %src, i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tac_tilespmem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:$0x1f], s2, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:$0x1f], s2, $0x5
+define void @stream_linear_scatter_add_s32_tac_tilespmem_to_tilespmem_tileN_ir(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.add.s32.tilespmem.to.tilespmem.tileN
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.add.s32.tilespmem.to.tilespmem.tileN
+
+; --------------------------
+; | Test sparsecore-tec-vf |
+; --------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, $0
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_add_s32_tec_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:s2], s3, $0
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_add_s32_tec_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_add_s32_tec_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_add_s32_tec_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_add_s32_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_indirect_scatter_add_s32_tec_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_s32_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_indirect_scatter_add_s32_tec_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_s32_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_add_s32_tec_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_s32_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_add_s32_tec_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_add_s32_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [spmem:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_strided_scatter_add_s32_tec_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_s32_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [spmem:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_strided_scatter_add_s32_tec_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_s32_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [spmem:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_add_s32_tec_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_s32_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [spmem:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_strided_scatter_add_s32_tec_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.scatter.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_add_s32_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, v0, vm0, $0
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_add_s32_tec_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.s32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.s32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_add_s32_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter.add.s32 [smem:s0], [sflag:s2], s3, v0, vm0, $0
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter.add.s32 [smem:s0], [sflag:s2], s3, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_add_s32_tec_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.s32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.s32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_add_s32_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, v0, vm0, $0
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_add_s32_tec_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.s32.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.s32.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_add_s32_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, v0, vm0, $0
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_add_s32_tec_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.s32.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.s32.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_add_s32_tec_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:s2], s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_add_s32_tec_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_add_s32_tec_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_add_s32_tec_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_add_s32_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_indirect_scatter_add_s32_tec_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_s32_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_indirect_scatter_add_s32_tec_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_s32_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_add_s32_tec_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_add_s32_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_add_s32_tec_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.add.s32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_strided_scatter_add_s32_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_strided_scatter_add_s32_tec_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_s32_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_strided_scatter_add_s32_tec_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_s32_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_add_s32_tec_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_add_s32_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_strided_scatter_add_s32_tec_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.scatter.add.s32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_add_s32_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter.add.s32 [smem:s0], [sflag:s2], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_add_s32_tec_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.s32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.s32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_add_s32_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter.add.s32 [smem:s0], [sflag:s2], s3, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter.add.s32 [smem:s0], [sflag:s2], s3, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_add_s32_tec_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.s32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.s32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_add_s32_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter.add.s32 [smem:s0], [sflag:$0x1f], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_add_s32_tec_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.s32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.s32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_add_s32_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter.add.s32 [smem:s0], [sflag:$0x1f], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_add_s32_tec_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.s32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.add.s32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tec_tilespmem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:s2], $0x4, $0
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_add_s32_tec_tilespmem_to_spmem_ri(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tec_tilespmem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:s2], s3, $0
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_add_s32_tec_tilespmem_to_spmem_rr(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tec_tilespmem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_add_s32_tec_tilespmem_to_spmem_ii(i32 addrspace(201)* %src, i32 addrspace(202)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tec_tilespmem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [spmem:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_add_s32_tec_tilespmem_to_spmem_ir(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.add.s32.tilespmem.to.spmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.add.s32.tilespmem.to.spmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.scatter.add.s32.tilespmem.to.spmem
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tec_tilespmem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:s2], $0x4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_add_s32_tec_tilespmem_to_tilespmem_tileN_ri(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tec_tilespmem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:s2], s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_add_s32_tec_tilespmem_to_tilespmem_tileN_rr(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tec_tilespmem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_add_s32_tec_tilespmem_to_tilespmem_tileN_ii(i32 addrspace(201)* %src, i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_add_s32_tec_tilespmem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter.add.s32 [tilespmem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_add_s32_tec_tilespmem_to_tilespmem_tileN_ir(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.add.s32.tilespmem.to.tilespmem.tileN
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.add.s32.tilespmem.to.tilespmem.tileN
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.scatter.add.s32.tilespmem.to.tilespmem.tileN
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_cb_add_f32_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_cb_add_f32_sc.ll
new file mode 100644
index 0000000..487c2ef
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_cb_add_f32_sc.ll
@@ -0,0 +1,1001 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32) nounwind
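+; The _ii/_ir tests below materialize the sync flag as an immediate through
+; this intrinsic; the constant 31 is what shows up as [sflag:$0x1f] in the
+; expected output (an observation about these tests, not a general rule).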
+
+; -------------------------
+; | stream.linear intrinsic |
+; -------------------------
+
+declare void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; ---------------------------
+; | stream.indirect intrinsic |
+; ---------------------------
+
+declare void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.cb.add.f32.tilespmem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.cb.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; --------------------------------
+; | stream.indirect.vreg intrinsic |
+; --------------------------------
+
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.f32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.f32.tilespmem.to.spmem.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.f32.tilespmem.to.tilespmem.tileN.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+
+; --------------------------
+; | stream.strided intrinsic |
+; --------------------------
+
+declare void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.cb.add.f32.tilespmem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.cb.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
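+
+; Common operand layout, as used by the tests in this file (inferred from the
+; calls below, not an authoritative description of the intrinsics): sync-flag
+; pointer (addrspace 204), a small i32 that is echoed as the final immediate
+; of the emitted instruction (0/4 in the SCS/TEC tests, 1/5 in the TAC tests),
+; the circular-buffer descriptor (x86_mmx), the destination pointer, the
+; length, any indirect (offset pointer, size) or strided (stride size, stride
+; length) operands, and a trailing i32 that these tests always pass as 0.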
+
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-vf" }
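+
+; Attribute #0 targets sparsecore-scs-vf, #1 sparsecore-tac-vf, and #2
+; sparsecore-tec-vf, matching the three test sections below. The _ri/_rr/
+; _ii/_ir name suffixes appear to encode whether the sync-flag and length
+; operands are registers (r) or immediates (i).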
+
+; ------------------------
+; | Test sparsecore-scs-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_f32_scs_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_add_f32_scs_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #0 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_f32_scs_smem_to_spmem_ii(i32 addrspace(202)* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_add_f32_scs_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_f32_scs_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_cb_add_f32_scs_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_indirect_scatter_cb_add_f32_scs_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_f32_scs_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_f32_scs_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_strided_scatter_cb_add_f32_scs_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_strided_scatter_cb_add_f32_scs_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_f32_scs_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #0 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #0 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_indirect_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_strided_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_strided_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; ------------------------
+; | Test sparsecore-tac-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_smem_to_spmem_ii(i32 addrspace(202)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x1
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_cb_add_f32_tac_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x1
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_cb_add_f32_tac_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x1
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x5
+define void @stream_indirect_scatter_cb_add_f32_tac_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x1
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x5
+define void @stream_indirect_scatter_cb_add_f32_tac_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x1
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x5
+define void @stream_strided_scatter_cb_add_f32_tac_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x1
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x5
+define void @stream_strided_scatter_cb_add_f32_tac_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x1
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x5
+define void @stream_strided_scatter_cb_add_f32_tac_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x1
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x5
+define void @stream_strided_scatter_cb_add_f32_tac_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x5
+define void @stream_indirect_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x5
+define void @stream_indirect_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.tilespmem.tileN
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_tilespmem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:s1], $0x4, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:s1], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_tilespmem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_tilespmem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:s1], s2, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:s1], s2, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_tilespmem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_tilespmem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_tilespmem_to_spmem_ii(i32 addrspace(202)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_tilespmem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_tilespmem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.cb.add.f32.tilespmem.to.spmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.cb.add.f32.tilespmem.to.spmem
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_tilespmem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:s1], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:s1], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_tilespmem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_tilespmem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:s1], s2, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:s1], s2, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_tilespmem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_tilespmem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_tilespmem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_tilespmem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_tilespmem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.cb.add.f32.tilespmem.to.tilespmem.tileN
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.cb.add.f32.tilespmem.to.tilespmem.tileN
+
+; ------------------------
+; | Test sparsecore-tec-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_smem_to_spmem_ii(i32 addrspace(202)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_f32_tec_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_cb_add_f32_tec_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_indirect_scatter_cb_add_f32_tec_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_f32_tec_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_f32_tec_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_strided_scatter_cb_add_f32_tec_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_strided_scatter_cb_add_f32_tec_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_f32_tec_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.scatter.cb.add.f32.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, v0, vm0, $0
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, v0, vm0, $0
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, v0, vm0, $0
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_spmem_ii(i32 addrspace(202)* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, v0, vm0, $0
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_indirect_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_strided_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_strided_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.scatter.cb.add.f32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.f32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.f32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.add.f32 [smem:$0x0], [sflag:s1], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.f32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.f32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.f32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.f32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.add.f32 [smem:$0x0], [sflag:$0x1f], s1, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.f32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.f32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_tilespmem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_tilespmem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_tilespmem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:s1], s2, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_tilespmem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_tilespmem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_tilespmem_to_spmem_ii(i32 addrspace(202)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_tilespmem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_tilespmem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.cb.add.f32.tilespmem.to.spmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.cb.add.f32.tilespmem.to.spmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.scatter.cb.add.f32.tilespmem.to.spmem
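+
+; Illustrative sketch only (not an actual test in this file): assuming the
+; untested indirect tilespmem-to-spmem variant mirrors the smem-to-spmem
+; signature exercised above, a call would look like
+;   call void @llvm.tpu.stream.indirect.scatter.cb.add.f32.tilespmem.to.spmem(
+;       i32 addrspace(204)* %flag, i32 0, x86_mmx undef,
+;       i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)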
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_tilespmem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_tilespmem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_tilespmem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:s1], s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_tilespmem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_tilespmem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_tilespmem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_tilespmem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.f32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_tilespmem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.cb.add.f32.tilespmem.to.tilespmem.tileN
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.cb.add.f32.tilespmem.to.tilespmem.tileN
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.scatter.cb.add.f32.tilespmem.to.tilespmem.tileN
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_cb_add_s32_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_cb_add_s32_sc.ll
new file mode 100644
index 0000000..6ccecab
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_cb_add_s32_sc.ll
@@ -0,0 +1,1001 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
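+; llc uses sparsecore-tec-vf as the default CPU here; the per-function
+; "target-cpu" attributes (#0/#1/#2 below) override it. -disable-cgp skips the
+; CodeGenPrepare IR pass and -asm-verbose=false trims assembly comments.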
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32) nounwind
+
+; -------------------------
+; | stream.linear intrinsic |
+; -------------------------
+
+declare void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; ---------------------------
+; | stream.indirect intrinsic |
+; ---------------------------
+
+declare void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.cb.add.s32.tilespmem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.cb.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; --------------------------------
+; | stream.indirect.vreg intrinsic |
+; --------------------------------
+
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.s32.smem.to.spmem.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.s32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.s32.tilespmem.to.spmem.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.s32.tilespmem.to.tilespmem.tileN.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+
+; --------------------------
+; | stream.strided intrinsic |
+; --------------------------
+
+declare void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.cb.add.s32.tilespmem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.cb.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
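+
+; Operand layout shared by the declares above, as the tests in this file use
+; them: sflag pointer, an i32 that reappears as the last operand of the emitted
+; mnemonic, an undef x86_mmx placeholder, destination pointer, transfer length,
+; then the variant-specific operands (indirect: offset pointer plus size;
+; strided: stride size plus length; indirect.vreg: an <8 x i32> offset vector
+; plus an <8 x i1> mask), and a final i32 these tests always pass as 0.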
+
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-vf" }
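+
+; #0, #1 and #2 pin functions to the sparsecore-scs-vf, sparsecore-tac-vf and
+; sparsecore-tec-vf CPUs respectively; the TAC tests below pass 1 and 5 for the
+; second i32 operand where the SCS and TEC tests use 0 and 4.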
+
+; ------------------------
+; | Test sparsecore-scs-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_s32_scs_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #0 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_add_s32_scs_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #0 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_s32_scs_smem_to_spmem_ii(i32 addrspace(202)* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_add_s32_scs_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_s32_scs_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_cb_add_s32_scs_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_indirect_scatter_cb_add_s32_scs_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_s32_scs_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_s32_scs_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_strided_scatter_cb_add_s32_scs_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_strided_scatter_cb_add_s32_scs_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_s32_scs_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #0 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #0 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_indirect_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_strided_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_strided_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; --------------------------
+; | Test sparsecore-tac-vf |
+; --------------------------
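+
+; Note: the _ri/_rr/_ii/_ir suffixes used throughout these tests encode the
+; operand kinds exercised: the first letter is the sync-flag operand (r = a
+; register %flag argument, i = the immediate flag address materialized via
+; @llvm.tpu.inttoptr.p204i32(i32 31) and printed as [sflag:$0x1f]), and the
+; second letter is the stream-length operand (r = register, i = the
+; immediate $0x4).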
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_smem_to_spmem_ii(i32 addrspace(202)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x1
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_cb_add_s32_tac_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x1
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_cb_add_s32_tac_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x1
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x5
+define void @stream_indirect_scatter_cb_add_s32_tac_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x1
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x5
+define void @stream_indirect_scatter_cb_add_s32_tac_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x1
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x5
+define void @stream_strided_scatter_cb_add_s32_tac_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x1
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x5
+define void @stream_strided_scatter_cb_add_s32_tac_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x1
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x5
+define void @stream_strided_scatter_cb_add_s32_tac_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x1
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x5
+define void @stream_strided_scatter_cb_add_s32_tac_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x5
+define void @stream_indirect_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x5
+define void @stream_indirect_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.tilespmem.tileN
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_tilespmem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:s1], $0x4, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:s1], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_tilespmem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_tilespmem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:s1], s2, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:s1], s2, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_tilespmem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_tilespmem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_tilespmem_to_spmem_ii(i32 addrspace(202)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_tilespmem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_tilespmem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.cb.add.s32.tilespmem.to.spmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.cb.add.s32.tilespmem.to.spmem
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_tilespmem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:s1], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:s1], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_tilespmem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_tilespmem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:s1], s2, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:s1], s2, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_tilespmem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_tilespmem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_tilespmem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_tilespmem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_tilespmem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.cb.add.s32.tilespmem.to.tilespmem.tileN
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.cb.add.s32.tilespmem.to.tilespmem.tileN
+
+; --------------------------
+; | Test sparsecore-tec-vf |
+; --------------------------
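+
+; Note: besides the linear/indirect/strided forms covered above, this section
+; also exercises the indirect_vreg scatter variants, whose per-lane offsets
+; and mask are passed as <8 x i32> / <8 x i1> values and appear as the
+; v0 / vm0 operands in the checked assembly.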
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_smem_to_spmem_ii(i32 addrspace(202)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_s32_tec_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_cb_add_s32_tec_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_indirect_scatter_cb_add_s32_tec_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_s32_tec_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_s32_tec_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_strided_scatter_cb_add_s32_tec_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_strided_scatter_cb_add_s32_tec_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_s32_tec_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.scatter.cb.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, v0, vm0, $0
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.s32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.s32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, v0, vm0, $0
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.s32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.s32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, v0, vm0, $0
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_spmem_ii(i32 addrspace(202)* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.s32.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.s32.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, v0, vm0, $0
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.s32.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.s32.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_indirect_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_strided_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_strided_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.scatter.cb.add.s32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.s32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.s32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.add.s32 [smem:$0x0], [sflag:s1], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.s32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.s32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.s32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.s32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.add.s32 [smem:$0x0], [sflag:$0x1f], s1, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.s32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.add.s32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_tilespmem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_tilespmem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_tilespmem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:s1], s2, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_tilespmem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_tilespmem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_tilespmem_to_spmem_ii(i32 addrspace(202)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_tilespmem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_tilespmem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.cb.add.s32.tilespmem.to.spmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.cb.add.s32.tilespmem.to.spmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.scatter.cb.add.s32.tilespmem.to.spmem
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_tilespmem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_tilespmem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_tilespmem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:s1], s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_tilespmem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_tilespmem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_tilespmem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_tilespmem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.add.s32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_tilespmem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.cb.add.s32.tilespmem.to.tilespmem.tileN
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.cb.add.s32.tilespmem.to.tilespmem.tileN
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.scatter.cb.add.s32.tilespmem.to.tilespmem.tileN
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_cb_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_cb_sc.ll
new file mode 100644
index 0000000..287200f
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_cb_sc.ll
@@ -0,0 +1,1569 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32) nounwind
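+; @llvm.tpu.inttoptr.p204i32 materializes an integer as a sync-flag pointer
+; (address space 204); the *_ii and *_ir tests below use it to pass the
+; immediate flag address 31, which shows up as [sflag:$0x1f] in the CHECK lines.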
+
+; ---------------------------
+; | stream.linear intrinsic |
+; ---------------------------
+
+declare void @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(203)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(203)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; -----------------------------
+; | stream.indirect intrinsic |
+; -----------------------------
+
+declare void @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(203)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(203)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; ----------------------------------
+; | stream.indirect.vreg intrinsic |
+; ----------------------------------
+
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.smem.to.spmem.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.tilespmem.to.hbm.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(203)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.tilespmem.to.hbm4b.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(203)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.tilespmem.to.spmem.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.tilespmem.to.tilespmem.tileN.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
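+; Unlike the plain indirect form, the indirect.vreg variants take a <8 x i32>
+; offset vector and a <8 x i1> mask in place of the scalar offset pointer and
+; size operands.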
+
+; ----------------------------
+; | stream.strided intrinsic |
+; ----------------------------
+
+declare void @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(203)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(203)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
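+; The strided variants add two scalar stride operands (%stride_size and
+; %stride_length in the tests below) relative to the linear form's operand list.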
+
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-vf" }
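+; The target-cpu attributes select which SparseCore core each test function is
+; compiled for: #0 = sparsecore-scs-vf, #1 = sparsecore-tac-vf,
+; #2 = sparsecore-tec-vf. The same intrinsics are exercised once per core; the
+; tac tests below pass 1 and 5 for the second scalar operand where the scs
+; tests pass 0 and 4, and that value is carried through to the final operand of
+; the emitted instruction.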
+
+; --------------------------
+; | Test sparsecore-scs-vf |
+; --------------------------
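+; Test name suffixes appear to encode how the sync-flag and length operands are
+; materialized: _ri = register flag / immediate length, _rr = both registers,
+; _ii = both immediates, _ir = immediate flag / register length, as reflected
+; in the [sflag:...] and length fields of the CHECK lines.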
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_scs_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:s1], s2, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_scs_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #0 {
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_scs_smem_to_spmem_ii(i32 addrspace(202)* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_scs_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_cb_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_scs_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_cb_scs_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_indirect_scatter_cb_scs_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_scs_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_cb_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_cb_scs_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_strided_scatter_cb_scs_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_strided_scatter_cb_scs_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_strided_scatter_cb_scs_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_scs_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #0 {
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:s1], s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_scs_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #0 {
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_scs_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_scs_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_cb_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_scs_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_cb_scs_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_indirect_scatter_cb_scs_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_scs_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_strided_scatter_cb_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_cb_scs_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_strided_scatter_cb_scs_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_strided_scatter_cb_scs_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_strided_scatter_cb_scs_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; --------------------------
+; | Test sparsecore-tac-vf |
+; --------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:s1], $0x4, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:s1], $0x4, $0x5
+define void @stream_linear_scatter_cb_tac_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:s1], s2, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:s1], s2, $0x5
+define void @stream_linear_scatter_cb_tac_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_cb_tac_smem_to_spmem_ii(i32 addrspace(202)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, $0x5
+define void @stream_linear_scatter_cb_tac_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x1
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_cb_tac_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:s1], s2, s3, s4, $0x1
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:s1], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_cb_tac_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x1
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x5
+define void @stream_indirect_scatter_cb_tac_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x1
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x5
+define void @stream_indirect_scatter_cb_tac_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_cb_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x1
+; CHECK: [spmem:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x5
+define void @stream_strided_scatter_cb_tac_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:s1], s2, s3, s4, $0x1
+; CHECK: [spmem:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:s1], s2, s3, s4, $0x5
+define void @stream_strided_scatter_cb_tac_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x1
+; CHECK: [spmem:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x5
+define void @stream_strided_scatter_cb_tac_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x1
+; CHECK: [spmem:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x5
+define void @stream_strided_scatter_cb_tac_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb_tac_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:s1], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:s1], $0x4, $0x5
+define void @stream_linear_scatter_cb_tac_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tac_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:s1], s2, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:s1], s2, $0x5
+define void @stream_linear_scatter_cb_tac_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tac_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_cb_tac_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tac_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, $0x5
+define void @stream_linear_scatter_cb_tac_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tac_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_cb_tac_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tac_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:s1], s2, s3, s4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:s1], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_cb_tac_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tac_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x5
+define void @stream_indirect_scatter_cb_tac_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tac_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x5
+define void @stream_indirect_scatter_cb_tac_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.cb.smem.to.tilespmem.tileN
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm
+
+; CHECK-LABEL: stream_linear_scatter_cb_tac_tilespmem_to_hbm_ri:
+; CHECK: [hbm:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, $0x1
+; CHECK: [hbm:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, $0x5
+define void @stream_linear_scatter_cb_tac_tilespmem_to_hbm_ri(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tac_tilespmem_to_hbm_rr:
+; CHECK: [hbm:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, $0x1
+; CHECK: [hbm:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, $0x5
+define void @stream_linear_scatter_cb_tac_tilespmem_to_hbm_rr(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tac_tilespmem_to_hbm_ii:
+; CHECK: [hbm:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [hbm:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_cb_tac_tilespmem_to_hbm_ii(i32 addrspace(203)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tac_tilespmem_to_hbm_ir:
+; CHECK: [hbm:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, $0x1
+; CHECK: [hbm:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, $0x5
+define void @stream_linear_scatter_cb_tac_tilespmem_to_hbm_ir(i32 addrspace(203)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tac_tilespmem_to_hbm_ri:
+; CHECK: [hbm:s0] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, s2, s3, $0x1
+; CHECK: [hbm:s0] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_cb_tac_tilespmem_to_hbm_ri(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tac_tilespmem_to_hbm_rr:
+; CHECK: [hbm:s0] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, s3, s4, $0x1
+; CHECK: [hbm:s0] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_cb_tac_tilespmem_to_hbm_rr(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tac_tilespmem_to_hbm_ii:
+; CHECK: [hbm:s0] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x1
+; CHECK: [hbm:s0] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x5
+define void @stream_indirect_scatter_cb_tac_tilespmem_to_hbm_ii(i32 addrspace(203)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tac_tilespmem_to_hbm_ir:
+; CHECK: [hbm:s0] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x1
+; CHECK: [hbm:s0] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x5
+define void @stream_indirect_scatter_cb_tac_tilespmem_to_hbm_ir(i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm
+
+; CHECK-LABEL: stream_strided_scatter_cb_tac_tilespmem_to_hbm_ri:
+; CHECK: [hbm:s0] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, s2, s3, $0x1
+; CHECK: [hbm:s0] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, s2, s3, $0x5
+define void @stream_strided_scatter_cb_tac_tilespmem_to_hbm_ri(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_tac_tilespmem_to_hbm_rr:
+; CHECK: [hbm:s0] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, s3, s4, $0x1
+; CHECK: [hbm:s0] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, s3, s4, $0x5
+define void @stream_strided_scatter_cb_tac_tilespmem_to_hbm_rr(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_tac_tilespmem_to_hbm_ii:
+; CHECK: [hbm:s0] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x1
+; CHECK: [hbm:s0] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x5
+define void @stream_strided_scatter_cb_tac_tilespmem_to_hbm_ii(i32 addrspace(203)* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_tac_tilespmem_to_hbm_ir:
+; CHECK: [hbm:s0] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x1
+; CHECK: [hbm:s0] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x5
+define void @stream_strided_scatter_cb_tac_tilespmem_to_hbm_ir(i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm4b
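+; For the hbm4b destination the base is printed as a register pair
+; ([hbm4b:s0+sN]) in the CHECK lines, whereas the hbm tests above use a single
+; register base ([hbm:s0]).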
+
+; CHECK-LABEL: stream_linear_scatter_cb_tac_tilespmem_to_hbm4b_ri:
+; CHECK: [hbm4b:s0+s2] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, $0x1
+; CHECK: [hbm4b:s0+s2] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, $0x5
+define void @stream_linear_scatter_cb_tac_tilespmem_to_hbm4b_ri(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tac_tilespmem_to_hbm4b_rr:
+; CHECK: [hbm4b:s0+s3] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, $0x1
+; CHECK: [hbm4b:s0+s3] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, $0x5
+define void @stream_linear_scatter_cb_tac_tilespmem_to_hbm4b_rr(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tac_tilespmem_to_hbm4b_ii:
+; CHECK: [hbm4b:s0+s1] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [hbm4b:s0+s1] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_cb_tac_tilespmem_to_hbm4b_ii(i32 addrspace(203)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tac_tilespmem_to_hbm4b_ir:
+; CHECK: [hbm4b:s0+s2] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, $0x1
+; CHECK: [hbm4b:s0+s2] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, $0x5
+define void @stream_linear_scatter_cb_tac_tilespmem_to_hbm4b_ir(i32 addrspace(203)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm4b
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tac_tilespmem_to_hbm4b_ri:
+; CHECK: [hbm4b:s0+s3] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, s2, s3, $0x1
+; CHECK: [hbm4b:s0+s3] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_cb_tac_tilespmem_to_hbm4b_ri(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tac_tilespmem_to_hbm4b_rr:
+; CHECK: [hbm4b:s0+s4] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, s3, s4, $0x1
+; CHECK: [hbm4b:s0+s4] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_cb_tac_tilespmem_to_hbm4b_rr(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tac_tilespmem_to_hbm4b_ii:
+; CHECK: [hbm4b:s0+s2] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x1
+; CHECK: [hbm4b:s0+s2] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x5
+define void @stream_indirect_scatter_cb_tac_tilespmem_to_hbm4b_ii(i32 addrspace(203)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tac_tilespmem_to_hbm4b_ir:
+; CHECK: [hbm4b:s0+s3] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x1
+; CHECK: [hbm4b:s0+s3] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x5
+define void @stream_indirect_scatter_cb_tac_tilespmem_to_hbm4b_ir(i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm4b
+
+; CHECK-LABEL: stream_strided_scatter_cb_tac_tilespmem_to_hbm4b_ri:
+; CHECK: [hbm4b:s0+s3] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, s2, s3, $0x1
+; CHECK: [hbm4b:s0+s3] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, s2, s3, $0x5
+define void @stream_strided_scatter_cb_tac_tilespmem_to_hbm4b_ri(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_tac_tilespmem_to_hbm4b_rr:
+; CHECK: [hbm4b:s0+s4] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, s3, s4, $0x1
+; CHECK: [hbm4b:s0+s4] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, s3, s4, $0x5
+define void @stream_strided_scatter_cb_tac_tilespmem_to_hbm4b_rr(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_tac_tilespmem_to_hbm4b_ii:
+; CHECK: [hbm4b:s0+s2] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x1
+; CHECK: [hbm4b:s0+s2] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x5
+define void @stream_strided_scatter_cb_tac_tilespmem_to_hbm4b_ii(i32 addrspace(203)* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_tac_tilespmem_to_hbm4b_ir:
+; CHECK: [hbm4b:s0+s3] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x1
+; CHECK: [hbm4b:s0+s3] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x5
+define void @stream_strided_scatter_cb_tac_tilespmem_to_hbm4b_ir(i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb_tac_tilespmem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, $0x5
+define void @stream_linear_scatter_cb_tac_tilespmem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tac_tilespmem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, $0x5
+define void @stream_linear_scatter_cb_tac_tilespmem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tac_tilespmem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_cb_tac_tilespmem_to_spmem_ii(i32 addrspace(202)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tac_tilespmem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, $0x5
+define void @stream_linear_scatter_cb_tac_tilespmem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.spmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.spmem
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb_tac_tilespmem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, $0x5
+define void @stream_linear_scatter_cb_tac_tilespmem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tac_tilespmem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, $0x5
+define void @stream_linear_scatter_cb_tac_tilespmem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tac_tilespmem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_cb_tac_tilespmem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tac_tilespmem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, $0x5
+define void @stream_linear_scatter_cb_tac_tilespmem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.tilespmem.tileN
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.tilespmem.tileN
+
+; --------------------------
+; | Test sparsecore-tec-vf |
+; --------------------------
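+;
+; The tests below mirror the _tac_ scatter.cb tests above, but use function
+; attribute #2 (presumably the sparsecore-tec-vf subtarget attributes defined
+; elsewhere in this file) and pass 0 and 4, rather than 1 and 5, as the second
+; intrinsic operand, which shows up as the trailing $0 / $0x4 immediates in the
+; checked assembly. As above, the _ri/_rr/_ii/_ir name suffixes encode whether
+; the sync-flag and length operands are registers (r) or immediates (i).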
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_tec_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:s1], s2, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_tec_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_tec_smem_to_spmem_ii(i32 addrspace(202)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_tec_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_tec_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_cb_tec_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_indirect_scatter_cb_tec_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_tec_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_cb_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_cb_tec_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_strided_scatter_cb_tec_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_strided_scatter_cb_tec_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_strided_scatter_cb_tec_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.scatter.cb.smem.to.spmem
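+; Unlike the scalar indirect variant, the indirect_vreg tests pass an
+; <8 x i32> offset vector and an <8 x i1> mask, which appear as the v0 / vm0
+; operands in the checked assembly.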
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb [smem:$0x0], [sflag:s1], $0x4, v0, vm0, $0
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb [smem:$0x0], [sflag:s1], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_tec_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb [smem:$0x0], [sflag:s1], s2, v0, vm0, $0
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb [smem:$0x0], [sflag:s1], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_tec_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, v0, vm0, $0
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_tec_smem_to_spmem_ii(i32 addrspace(202)* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, v0, vm0, $0
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_tec_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_tec_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:s1], s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_tec_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_tec_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_tec_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_tec_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_cb_tec_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_indirect_scatter_cb_tec_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_tec_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_strided_scatter_cb_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_cb_tec_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_strided_scatter_cb_tec_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_strided_scatter_cb_tec_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_strided_scatter_cb_tec_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.scatter.cb.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb [smem:$0x0], [sflag:s1], $0x4, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb [smem:$0x0], [sflag:s1], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_tec_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb [smem:$0x0], [sflag:s1], s2, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb [smem:$0x0], [sflag:s1], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_tec_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb [smem:$0x0], [sflag:$0x1f], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_tec_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb [smem:$0x0], [sflag:$0x1f], s1, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_tec_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm
+
+; CHECK-LABEL: stream_linear_scatter_cb_tec_tilespmem_to_hbm_ri:
+; CHECK: [hbm:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [hbm:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_tec_tilespmem_to_hbm_ri(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tec_tilespmem_to_hbm_rr:
+; CHECK: [hbm:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, $0
+; CHECK: [hbm:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_tec_tilespmem_to_hbm_rr(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tec_tilespmem_to_hbm_ii:
+; CHECK: [hbm:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [hbm:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_tec_tilespmem_to_hbm_ii(i32 addrspace(203)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tec_tilespmem_to_hbm_ir:
+; CHECK: [hbm:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [hbm:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_tec_tilespmem_to_hbm_ir(i32 addrspace(203)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tec_tilespmem_to_hbm_ri:
+; CHECK: [hbm:s0] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [hbm:s0] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_tec_tilespmem_to_hbm_ri(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tec_tilespmem_to_hbm_rr:
+; CHECK: [hbm:s0] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [hbm:s0] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_cb_tec_tilespmem_to_hbm_rr(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tec_tilespmem_to_hbm_ii:
+; CHECK: [hbm:s0] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [hbm:s0] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_indirect_scatter_cb_tec_tilespmem_to_hbm_ii(i32 addrspace(203)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tec_tilespmem_to_hbm_ir:
+; CHECK: [hbm:s0] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [hbm:s0] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_tec_tilespmem_to_hbm_ir(i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm
+
+; CHECK-LABEL: stream_strided_scatter_cb_tec_tilespmem_to_hbm_ri:
+; CHECK: [hbm:s0] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [hbm:s0] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_cb_tec_tilespmem_to_hbm_ri(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_tec_tilespmem_to_hbm_rr:
+; CHECK: [hbm:s0] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [hbm:s0] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_strided_scatter_cb_tec_tilespmem_to_hbm_rr(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_tec_tilespmem_to_hbm_ii:
+; CHECK: [hbm:s0] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [hbm:s0] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_strided_scatter_cb_tec_tilespmem_to_hbm_ii(i32 addrspace(203)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_tec_tilespmem_to_hbm_ir:
+; CHECK: [hbm:s0] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [hbm:s0] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_strided_scatter_cb_tec_tilespmem_to_hbm_ir(i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.scatter.cb.tilespmem.to.hbm
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_tec_tilespmem_to_hbm_ri:
+; CHECK: [hbm:s0] = stream.indirect_vreg.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, v0, vm0, $0
+; CHECK: [hbm:s0] = stream.indirect_vreg.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_tec_tilespmem_to_hbm_ri(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.tilespmem.to.hbm.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.tilespmem.to.hbm.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_tec_tilespmem_to_hbm_rr:
+; CHECK: [hbm:s0] = stream.indirect_vreg.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, v0, vm0, $0
+; CHECK: [hbm:s0] = stream.indirect_vreg.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_tec_tilespmem_to_hbm_rr(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.tilespmem.to.hbm.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.tilespmem.to.hbm.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_tec_tilespmem_to_hbm_ii:
+; CHECK: [hbm:s0] = stream.indirect_vreg.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, v0, vm0, $0
+; CHECK: [hbm:s0] = stream.indirect_vreg.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_tec_tilespmem_to_hbm_ii(i32 addrspace(203)* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.tilespmem.to.hbm.v8i32(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.tilespmem.to.hbm.v8i32(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_tec_tilespmem_to_hbm_ir:
+; CHECK: [hbm:s0] = stream.indirect_vreg.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, v0, vm0, $0
+; CHECK: [hbm:s0] = stream.indirect_vreg.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_tec_tilespmem_to_hbm_ir(i32 addrspace(203)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.tilespmem.to.hbm.v8i32(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.tilespmem.to.hbm.v8i32(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm4b
+
+; CHECK-LABEL: stream_linear_scatter_cb_tec_tilespmem_to_hbm4b_ri:
+; CHECK: [hbm4b:s0+s2] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [hbm4b:s0+s2] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_tec_tilespmem_to_hbm4b_ri(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tec_tilespmem_to_hbm4b_rr:
+; CHECK: [hbm4b:s0+s3] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, $0
+; CHECK: [hbm4b:s0+s3] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_tec_tilespmem_to_hbm4b_rr(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tec_tilespmem_to_hbm4b_ii:
+; CHECK: [hbm4b:s0+s1] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [hbm4b:s0+s1] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_tec_tilespmem_to_hbm4b_ii(i32 addrspace(203)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tec_tilespmem_to_hbm4b_ir:
+; CHECK: [hbm4b:s0+s2] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [hbm4b:s0+s2] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_tec_tilespmem_to_hbm4b_ir(i32 addrspace(203)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm4b
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tec_tilespmem_to_hbm4b_ri:
+; CHECK: [hbm4b:s0+s3] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [hbm4b:s0+s3] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_tec_tilespmem_to_hbm4b_ri(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tec_tilespmem_to_hbm4b_rr:
+; CHECK: [hbm4b:s0+s4] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [hbm4b:s0+s4] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_cb_tec_tilespmem_to_hbm4b_rr(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tec_tilespmem_to_hbm4b_ii:
+; CHECK: [hbm4b:s0+s2] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [hbm4b:s0+s2] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_indirect_scatter_cb_tec_tilespmem_to_hbm4b_ii(i32 addrspace(203)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_tec_tilespmem_to_hbm4b_ir:
+; CHECK: [hbm4b:s0+s3] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [hbm4b:s0+s3] = stream.indirect.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_tec_tilespmem_to_hbm4b_ir(i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm4b
+
+; CHECK-LABEL: stream_strided_scatter_cb_tec_tilespmem_to_hbm4b_ri:
+; CHECK: [hbm4b:s0+s3] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [hbm4b:s0+s3] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_cb_tec_tilespmem_to_hbm4b_ri(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_tec_tilespmem_to_hbm4b_rr:
+; CHECK: [hbm4b:s0+s4] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [hbm4b:s0+s4] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_strided_scatter_cb_tec_tilespmem_to_hbm4b_rr(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_tec_tilespmem_to_hbm4b_ii:
+; CHECK: [hbm4b:s0+s2] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [hbm4b:s0+s2] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_strided_scatter_cb_tec_tilespmem_to_hbm4b_ii(i32 addrspace(203)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_tec_tilespmem_to_hbm4b_ir:
+; CHECK: [hbm4b:s0+s3] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [hbm4b:s0+s3] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_strided_scatter_cb_tec_tilespmem_to_hbm4b_ir(i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.scatter.cb.tilespmem.to.hbm4b
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_tec_tilespmem_to_hbm4b_ri:
+; CHECK: [hbm4b:s0+s2] = stream.indirect_vreg.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, v0, vm0, $0
+; CHECK: [hbm4b:s0+s2] = stream.indirect_vreg.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_tec_tilespmem_to_hbm4b_ri(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.tilespmem.to.hbm4b.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.tilespmem.to.hbm4b.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_tec_tilespmem_to_hbm4b_rr:
+; CHECK: [hbm4b:s0+s3] = stream.indirect_vreg.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, v0, vm0, $0
+; CHECK: [hbm4b:s0+s3] = stream.indirect_vreg.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_tec_tilespmem_to_hbm4b_rr(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.tilespmem.to.hbm4b.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.tilespmem.to.hbm4b.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_tec_tilespmem_to_hbm4b_ii:
+; CHECK: [hbm4b:s0+s1] = stream.indirect_vreg.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, v0, vm0, $0
+; CHECK: [hbm4b:s0+s1] = stream.indirect_vreg.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_tec_tilespmem_to_hbm4b_ii(i32 addrspace(203)* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.tilespmem.to.hbm4b.v8i32(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.tilespmem.to.hbm4b.v8i32(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_tec_tilespmem_to_hbm4b_ir:
+; CHECK: [hbm4b:s0+s2] = stream.indirect_vreg.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, v0, vm0, $0
+; CHECK: [hbm4b:s0+s2] = stream.indirect_vreg.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_tec_tilespmem_to_hbm4b_ir(i32 addrspace(203)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.tilespmem.to.hbm4b.v8i32(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.tilespmem.to.hbm4b.v8i32(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb_tec_tilespmem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_tec_tilespmem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tec_tilespmem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_tec_tilespmem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tec_tilespmem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_tec_tilespmem_to_spmem_ii(i32 addrspace(202)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tec_tilespmem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_tec_tilespmem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.spmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.spmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.scatter.cb.tilespmem.to.spmem
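+
+; Illustrative sketch only (not part of this CL): a test for the untested
+; strided tilespmem-to-spmem variant would presumably mirror the strided hbm
+; tests above. The intrinsic signature and expected mnemonic here are
+; assumptions made by analogy, so the sketch is kept commented out:
+;
+; Expected asm (assumed): [spmem:s0] = stream.strided.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, s3, s4, $0
+; define void @stream_strided_scatter_cb_tec_tilespmem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+;   call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+;   ret void
+; }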
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb_tec_tilespmem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_tec_tilespmem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tec_tilespmem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_tec_tilespmem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tec_tilespmem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_tec_tilespmem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_tec_tilespmem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb [tilespmem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_tec_tilespmem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.cb.tilespmem.to.tilespmem.tileN
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.tilespmem.tileN
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.scatter.cb.tilespmem.to.tilespmem.tileN
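+
+; Illustrative sketch only (not part of this CL): coverage for the untested
+; strided tilespmem-to-tilespmem.tileN variant would presumably follow the
+; same pattern; the signature below is an assumption by analogy with the
+; strided smem-to-tilespmem.tileN form used elsewhere, so it is left
+; commented out:
+;
+; define void @stream_strided_scatter_cb_tec_tilespmem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #2 {
+;   call void @llvm.tpu.stream.strided.scatter.cb.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+;   ret void
+; }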
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_cb_upd_add_f32_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_cb_upd_add_f32_sc.ll
new file mode 100644
index 0000000..a5a3610
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_cb_upd_add_f32_sc.ll
@@ -0,0 +1,1001 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
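+; (The RUN/REQUIRES lines above are consumed by lit and FileCheck; locally the
+; test would typically be driven with something like
+; `llvm-lit .../test/CodeGen/GoogleTPU/stream_scatter_cb_upd_add_f32_sc.ll`,
+; assuming a build that enables the GoogleTPU backend.)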
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32) nounwind
+
+; -------------------------
+; | stream.linear intrinsic |
+; -------------------------
+
+declare void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; ---------------------------
+; | stream.indirect intrinsic |
+; ---------------------------
+
+declare void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.tilespmem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32, i32, i32, i32, i32) argmemonly nounwind
+
+; --------------------------------
+; | stream.indirect.vreg intrinsic |
+; --------------------------------
+
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.f32.tilespmem.to.spmem.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.f32.tilespmem.to.tilespmem.tileN.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32, i32, i32, i32, i32) argmemonly nounwind
+
+; --------------------------
+; | stream.strided intrinsic |
+; --------------------------
+
+declare void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.tilespmem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-vf" }
+
+; ------------------------
+; | Test sparsecore-scs-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_f32_scs_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #0 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_add_f32_scs_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #0 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_f32_scs_smem_to_spmem_ii(i32 addrspace(202)* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_add_f32_scs_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_f32_scs_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_cb_add_f32_scs_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_indirect_scatter_cb_add_f32_scs_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_f32_scs_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_f32_scs_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_strided_scatter_cb_add_f32_scs_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_strided_scatter_cb_add_f32_scs_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_f32_scs_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #0 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #0 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_indirect_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_strided_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_strided_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_f32_scs_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; ------------------------
+; | Test sparsecore-tac-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_smem_to_spmem_ii(i32 addrspace(202)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x1
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_cb_add_f32_tac_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x1
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_cb_add_f32_tac_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x1
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x5
+define void @stream_indirect_scatter_cb_add_f32_tac_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x1
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x5
+define void @stream_indirect_scatter_cb_add_f32_tac_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x1
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x5
+define void @stream_strided_scatter_cb_add_f32_tac_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x1
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x5
+define void @stream_strided_scatter_cb_add_f32_tac_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x1
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x5
+define void @stream_strided_scatter_cb_add_f32_tac_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x1
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x5
+define void @stream_strided_scatter_cb_add_f32_tac_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x5
+define void @stream_indirect_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x5
+define void @stream_indirect_scatter_cb_add_f32_tac_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_tilespmem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:s1], $0x4, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:s1], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_tilespmem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_tilespmem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:s1], s2, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:s1], s2, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_tilespmem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_tilespmem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_tilespmem_to_spmem_ii(i32 addrspace(202)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_tilespmem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_tilespmem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.tilespmem.to.spmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.tilespmem.to.spmem
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_tilespmem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:s1], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:s1], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_tilespmem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_tilespmem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:s1], s2, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:s1], s2, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_tilespmem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_tilespmem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_tilespmem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tac_tilespmem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0x5
+define void @stream_linear_scatter_cb_add_f32_tac_tilespmem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.tilespmem.to.tilespmem.tileN
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.tilespmem.to.tilespmem.tileN
+
+; ------------------------
+; | Test sparsecore-tec-vf |
+; ------------------------
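+; Note: the sparsecore-tec-vf tests below exercise the second i32 operand of
+; each intrinsic with the values 0 and 4 (checked as the trailing immediates $0
+; and $0x4), whereas the sparsecore-tac-vf tests above used 1 and 5.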
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_smem_to_spmem_ii(i32 addrspace(202)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_f32_tec_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_cb_add_f32_tec_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_indirect_scatter_cb_add_f32_tec_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_f32_tec_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_f32_tec_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_strided_scatter_cb_add_f32_tec_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_strided_scatter_cb_add_f32_tec_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_f32_tec_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.f32.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, v0, vm0, $0
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, v0, vm0, $0
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, v0, vm0, $0
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_spmem_ii(i32 addrspace(202)* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, v0, vm0, $0
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.f32.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_indirect_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_strided_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_strided_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:s1], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.upd.add.f32 [smem:$0x0], [sflag:$0x1f], s1, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_f32_tec_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.f32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_tilespmem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_tilespmem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_tilespmem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:s1], s2, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_tilespmem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_tilespmem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_tilespmem_to_spmem_ii(i32 addrspace(202)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_tilespmem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_tilespmem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.tilespmem.to.spmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.tilespmem.to.spmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.f32.tilespmem.to.spmem
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_tilespmem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_tilespmem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_tilespmem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:s1], s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_tilespmem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_tilespmem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_tilespmem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_f32_tec_tilespmem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.f32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_add_f32_tec_tilespmem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.f32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.add.f32.tilespmem.to.tilespmem.tileN
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.add.f32.tilespmem.to.tilespmem.tileN
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.f32.tilespmem.to.tilespmem.tileN
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_cb_upd_add_s32_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_cb_upd_add_s32_sc.ll
new file mode 100644
index 0000000..f8b567e
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_cb_upd_add_s32_sc.ll
@@ -0,0 +1,1001 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32) nounwind
+
+; -------------------------
+; | stream.linear intrinsic |
+; -------------------------
+
+declare void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; ---------------------------
+; | stream.indirect intrinsic |
+; ---------------------------
+
+declare void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.tilespmem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32, i32, i32, i32, i32) argmemonly nounwind
+
+; --------------------------------
+; | stream.indirect.vreg intrinsic |
+; --------------------------------
+
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.s32.smem.to.spmem.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.s32.tilespmem.to.spmem.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32, i32, i32, i32, i32) argmemonly nounwind
+
+; --------------------------
+; | stream.strided intrinsic |
+; --------------------------
+
+declare void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.tilespmem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-vf" }
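+
+; Note on the test naming convention: attributes #0, #1 and #2 select the
+; sparsecore-scs-vf, sparsecore-tac-vf and sparsecore-tec-vf target CPUs, and
+; the _ri/_rr/_ii/_ir suffixes encode whether the sync-flag and length operands
+; are registers (r) or immediates (i); the immediate-flag cases materialize the
+; flag via @llvm.tpu.inttoptr.p204i32(i32 31), which is checked as
+; [sflag:$0x1f].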
+
+; ------------------------
+; | Test sparsecore-scs-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_s32_scs_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_add_s32_scs_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #0 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_s32_scs_smem_to_spmem_ii(i32 addrspace(202)* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_add_s32_scs_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_s32_scs_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_cb_add_s32_scs_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_indirect_scatter_cb_add_s32_scs_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_s32_scs_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_s32_scs_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_strided_scatter_cb_add_s32_scs_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_strided_scatter_cb_add_s32_scs_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_s32_scs_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #0 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #0 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_indirect_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_strided_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_strided_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_s32_scs_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; ------------------------
+; | Test sparsecore-tac-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_smem_to_spmem_ii(i32 addrspace(202)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x1
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_cb_add_s32_tac_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x1
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_cb_add_s32_tac_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x1
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x5
+define void @stream_indirect_scatter_cb_add_s32_tac_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x1
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x5
+define void @stream_indirect_scatter_cb_add_s32_tac_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x1
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x5
+define void @stream_strided_scatter_cb_add_s32_tac_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x1
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x5
+define void @stream_strided_scatter_cb_add_s32_tac_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x1
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x5
+define void @stream_strided_scatter_cb_add_s32_tac_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x1
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x5
+define void @stream_strided_scatter_cb_add_s32_tac_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x5
+define void @stream_indirect_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x5
+define void @stream_indirect_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN is not tested here; an illustrative sketch follows.
+
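+; A minimal sketch (not part of this CL) of what such a test could look like, mirroring
+; the ri pattern used for the scs variant above and assuming the sparsecore-tac-vf
+; attribute (#1); the @sketch_* name is hypothetical and CHECK lines are deliberately
+; omitted because no expected output is recorded here.
+define void @sketch_stream_strided_scatter_cb_add_s32_tac_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+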
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_tilespmem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:s1], $0x4, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:s1], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_tilespmem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_tilespmem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:s1], s2, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:s1], s2, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_tilespmem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_tilespmem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_tilespmem_to_spmem_ii(i32 addrspace(202)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_tilespmem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_tilespmem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.tilespmem.to.spmem is not tested here; an illustrative sketch follows.
+
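+; A minimal sketch (not part of this CL) of what a test for the indirect
+; tilespmem-to-spmem variant could look like, following its declaration above and the
+; ri pattern used for the smem-to-spmem tests; the @sketch_* name is hypothetical, the
+; tac attribute (#1) is assumed from the section heading, and CHECK lines are omitted.
+define void @sketch_stream_indirect_scatter_cb_add_s32_tac_tilespmem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+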
+; Intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.tilespmem.to.spmem is not tested here; an illustrative sketch follows.
+
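+; Likewise, a minimal sketch (not part of this CL) for the strided tilespmem-to-spmem
+; variant, under the same assumptions (hypothetical @sketch_* name, tac attribute #1,
+; no CHECK lines):
+define void @sketch_stream_strided_scatter_cb_add_s32_tac_tilespmem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+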
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_tilespmem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:s1], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:s1], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_tilespmem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_tilespmem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:s1], s2, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:s1], s2, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_tilespmem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_tilespmem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_tilespmem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tac_tilespmem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0x5
+define void @stream_linear_scatter_cb_add_s32_tac_tilespmem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN is not tested here; an illustrative sketch follows.
+
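+; A minimal sketch (not part of this CL) for the indirect tilespmem-to-tilespmem.tileN
+; variant. Its declaration carries four extra trailing i32 operands whose meaning is not
+; documented in this file, so zeros are used as placeholders; the @sketch_* name is
+; hypothetical, the tac attribute (#1) is assumed, and CHECK lines are omitted.
+define void @sketch_stream_indirect_scatter_cb_add_s32_tac_tilespmem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0, i32 0, i32 0, i32 0, i32 0)
+ ret void
+}
+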
+; Intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN is not tested here; an illustrative sketch follows.
+
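+; And a minimal sketch (not part of this CL) for the strided tilespmem-to-tilespmem.tileN
+; variant, under the same assumptions (hypothetical @sketch_* name, tac attribute #1,
+; no CHECK lines):
+define void @sketch_stream_strided_scatter_cb_add_s32_tac_tilespmem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+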
+; ------------------------
+; | Test sparsecore-tec-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_smem_to_spmem_ii(i32 addrspace(202)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_s32_tec_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_cb_add_s32_tec_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_indirect_scatter_cb_add_s32_tec_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_s32_tec_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_s32_tec_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_strided_scatter_cb_add_s32_tec_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_strided_scatter_cb_add_s32_tec_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_s32_tec_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.s32.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, v0, vm0, $0
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.s32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.s32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, v0, vm0, $0
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.s32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.s32.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, v0, vm0, $0
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_spmem_ii(i32 addrspace(202)* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.s32.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.s32.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, v0, vm0, $0
+; CHECK: [spmem:s0] = stream.indirect_vreg.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.s32.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.s32.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_indirect_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_indirect_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_strided_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_strided_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_strided_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:s1], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect_vreg.scatter.cb.upd.add.s32 [smem:$0x0], [sflag:$0x1f], s1, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_cb_add_s32_tec_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.s32.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_tilespmem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_tilespmem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_tilespmem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:s1], s2, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_tilespmem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_tilespmem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_tilespmem_to_spmem_ii(i32 addrspace(202)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_tilespmem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_tilespmem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested: intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.tilespmem.to.spmem
+
+; Not tested: intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.tilespmem.to.spmem
+
+; Not tested: intrinsic @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.s32.tilespmem.to.spmem
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_tilespmem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_tilespmem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_tilespmem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:s1], s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_tilespmem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_tilespmem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_tilespmem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb_add_s32_tec_tilespmem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd.add.s32 [tilespmem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb_add_s32_tec_tilespmem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested: intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN
+
+; Not tested: intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN
+
+; Not tested: intrinsic @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.add.s32.tilespmem.to.tilespmem.tileN
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_cb_upd_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_cb_upd_sc.ll
new file mode 100644
index 0000000..df1eb43
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_cb_upd_sc.ll
@@ -0,0 +1,837 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
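+; Function names end in _ri/_rr/_ii/_ir; the two letters appear to encode
+; whether the sflag operand and the length operand end up as a register (r) or
+; an immediate (i) in the emitted instruction.
+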
+declare i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32) nounwind
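+; The _ii/_ir tests below use @llvm.tpu.inttoptr.p204i32 to materialize an
+; immediate sflag address; the argument 31 shows up in the CHECK lines as
+; [sflag:$0x1f].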
+
+; ---------------------------
+; | stream.linear intrinsic |
+; ---------------------------
+
+declare void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.hbm(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(203)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.hbm4b(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(203)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; -----------------------------
+; | stream.indirect intrinsic |
+; -----------------------------
+
+declare void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.cb.upd.tilespmem.to.hbm(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(203)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.cb.upd.tilespmem.to.hbm4b(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(203)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.cb.upd.tilespmem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.cb.upd.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; ----------------------------------
+; | stream.indirect.vreg intrinsic |
+; ----------------------------------
+
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.smem.to.spmem.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.tilespmem.to.hbm.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(203)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.tilespmem.to.hbm4b.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(203)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.tilespmem.to.spmem.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.tilespmem.to.tilespmem.tileN.v8i32(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+
+; ----------------------------
+; | stream.strided intrinsic |
+; ----------------------------
+
+declare void @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.cb.upd.tilespmem.to.hbm(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(203)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.cb.upd.tilespmem.to.hbm4b(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(203)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.cb.upd.tilespmem.to.spmem(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(202)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.cb.upd.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, x86_mmx, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+
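+; In the CHECK lines below, the address spaces map onto the memory names in the
+; mnemonics: addrspace(202) prints as spmem, addrspace(201) as
+; tilespmem(.tileN), addrspace(204) as sflag, and the hbm/hbm4b declarations
+; use addrspace(203). The second i32 operand of each intrinsic reappears as the
+; trailing immediate of the emitted instruction ($0, $0x4, ...).
+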
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-vf" }
+
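+; The attribute groups above pin each function to one SparseCore core variant
+; (#0 scs, #1 tac, #2 tec); the per-function "target-cpu" attribute appears to
+; take precedence over the -mcpu passed on the RUN line, which is why a single
+; file can cover all three cores.
+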
+; --------------------------
+; | Test sparsecore-scs-vf |
+; --------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb.upd_scs_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #0 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:s1], s2, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb.upd_scs_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #0 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb.upd_scs_smem_to_spmem_ii(i32 addrspace(202)* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb.upd_scs_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_cb.upd_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_cb.upd_scs_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb.upd_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_cb.upd_scs_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb.upd_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_indirect_scatter_cb.upd_scs_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb.upd_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_indirect_scatter_cb.upd_scs_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_cb.upd_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_cb.upd_scs_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb.upd_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_strided_scatter_cb.upd_scs_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb.upd_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_strided_scatter_cb.upd_scs_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb.upd_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_strided_scatter_cb.upd_scs_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb.upd_scs_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #0 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:s1], s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb.upd_scs_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #0 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb.upd_scs_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb.upd_scs_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_cb.upd_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_cb.upd_scs_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb.upd_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_cb.upd_scs_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb.upd_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_indirect_scatter_cb.upd_scs_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb.upd_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_indirect_scatter_cb.upd_scs_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_strided_scatter_cb.upd_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_cb.upd_scs_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb.upd_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd [smem:$0x0], [sflag:s1], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd [smem:$0x0], [sflag:s1], s2, s3, s4, $0x4
+define void @stream_strided_scatter_cb.upd_scs_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb.upd_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x4
+define void @stream_strided_scatter_cb.upd_scs_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb.upd_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0
+; CHECK: [tilespmem.tileN:s0] = stream.strided.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x4
+define void @stream_strided_scatter_cb.upd_scs_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; --------------------------
+; | Test sparsecore-tac-vf |
+; --------------------------
+
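+; In the sparsecore-tac-vf tests below the second i32 operand is 1 and 5 rather
+; than 0 and 4, so the trailing immediate in the CHECK lines becomes $0x1 and
+; $0x5.
+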
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:s1], $0x4, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:s1], $0x4, $0x5
+define void @stream_linear_scatter_cb.upd_tac_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:s1], s2, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:s1], s2, $0x5
+define void @stream_linear_scatter_cb.upd_tac_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_cb.upd_tac_smem_to_spmem_ii(i32 addrspace(202)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], s1, $0x1
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], s1, $0x5
+define void @stream_linear_scatter_cb.upd_tac_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_cb.upd_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x1
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_cb.upd_tac_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb.upd_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:s1], s2, s3, s4, $0x1
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:s1], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_cb.upd_tac_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb.upd_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x1
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x5
+define void @stream_indirect_scatter_cb.upd_tac_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb.upd_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x1
+; CHECK: [spmem:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x5
+define void @stream_indirect_scatter_cb.upd_tac_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_cb.upd_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x1
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x5
+define void @stream_strided_scatter_cb.upd_tac_smem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb.upd_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd [smem:$0x0], [sflag:s1], s2, s3, s4, $0x1
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd [smem:$0x0], [sflag:s1], s2, s3, s4, $0x5
+define void @stream_strided_scatter_cb.upd_tac_smem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb.upd_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x1
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x5
+define void @stream_strided_scatter_cb.upd_tac_smem_to_spmem_ii(i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb.upd_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x1
+; CHECK: [spmem:s0] = stream.strided.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x5
+define void @stream_strided_scatter_cb.upd_tac_smem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.spmem(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_tac_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:s1], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:s1], $0x4, $0x5
+define void @stream_linear_scatter_cb.upd_tac_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_tac_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:s1], s2, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:s1], s2, $0x5
+define void @stream_linear_scatter_cb.upd_tac_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_tac_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_cb.upd_tac_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_tac_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], s1, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], s1, $0x5
+define void @stream_linear_scatter_cb.upd_tac_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_cb.upd_tac_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:s1], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_cb.upd_tac_smem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb.upd_tac_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:s1], s2, s3, s4, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:s1], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_cb.upd_tac_smem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb.upd_tac_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x5
+define void @stream_indirect_scatter_cb.upd_tac_smem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb.upd_tac_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x1
+; CHECK: [tilespmem.tileN:s0] = stream.indirect.scatter.cb.upd [smem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x5
+define void @stream_indirect_scatter_cb.upd_tac_smem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.smem.to.tilespmem.tileN
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.hbm
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_tac_tilespmem_to_hbm_ri:
+; CHECK: [hbm:s0] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], $0x4, $0x1
+; CHECK: [hbm:s0] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], $0x4, $0x5
+define void @stream_linear_scatter_cb.upd_tac_tilespmem_to_hbm_ri(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_tac_tilespmem_to_hbm_rr:
+; CHECK: [hbm:s0] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], s2, $0x1
+; CHECK: [hbm:s0] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], s2, $0x5
+define void @stream_linear_scatter_cb.upd_tac_tilespmem_to_hbm_rr(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_tac_tilespmem_to_hbm_ii:
+; CHECK: [hbm:s0] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [hbm:s0] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_cb.upd_tac_tilespmem_to_hbm_ii(i32 addrspace(203)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_tac_tilespmem_to_hbm_ir:
+; CHECK: [hbm:s0] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], s1, $0x1
+; CHECK: [hbm:s0] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], s1, $0x5
+define void @stream_linear_scatter_cb.upd_tac_tilespmem_to_hbm_ir(i32 addrspace(203)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.tilespmem.to.hbm
+
+; CHECK-LABEL: stream_indirect_scatter_cb.upd_tac_tilespmem_to_hbm_ri:
+; CHECK: [hbm:s0] = stream.indirect.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], $0x4, s2, s3, $0x1
+; CHECK: [hbm:s0] = stream.indirect.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_cb.upd_tac_tilespmem_to_hbm_ri(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb.upd_tac_tilespmem_to_hbm_rr:
+; CHECK: [hbm:s0] = stream.indirect.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], s2, s3, s4, $0x1
+; CHECK: [hbm:s0] = stream.indirect.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_cb.upd_tac_tilespmem_to_hbm_rr(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb.upd_tac_tilespmem_to_hbm_ii:
+; CHECK: [hbm:s0] = stream.indirect.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x1
+; CHECK: [hbm:s0] = stream.indirect.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x5
+define void @stream_indirect_scatter_cb.upd_tac_tilespmem_to_hbm_ii(i32 addrspace(203)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb.upd_tac_tilespmem_to_hbm_ir:
+; CHECK: [hbm:s0] = stream.indirect.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x1
+; CHECK: [hbm:s0] = stream.indirect.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x5
+define void @stream_indirect_scatter_cb.upd_tac_tilespmem_to_hbm_ir(i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.tilespmem.to.hbm
+
+; CHECK-LABEL: stream_strided_scatter_cb.upd_tac_tilespmem_to_hbm_ri:
+; CHECK: [hbm:s0] = stream.strided.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], $0x4, s2, s3, $0x1
+; CHECK: [hbm:s0] = stream.strided.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], $0x4, s2, s3, $0x5
+define void @stream_strided_scatter_cb.upd_tac_tilespmem_to_hbm_ri(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb.upd_tac_tilespmem_to_hbm_rr:
+; CHECK: [hbm:s0] = stream.strided.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], s2, s3, s4, $0x1
+; CHECK: [hbm:s0] = stream.strided.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], s2, s3, s4, $0x5
+define void @stream_strided_scatter_cb.upd_tac_tilespmem_to_hbm_rr(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb.upd_tac_tilespmem_to_hbm_ii:
+; CHECK: [hbm:s0] = stream.strided.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x1
+; CHECK: [hbm:s0] = stream.strided.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x5
+define void @stream_strided_scatter_cb.upd_tac_tilespmem_to_hbm_ii(i32 addrspace(203)* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb.upd_tac_tilespmem_to_hbm_ir:
+; CHECK: [hbm:s0] = stream.strided.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x1
+; CHECK: [hbm:s0] = stream.strided.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x5
+define void @stream_strided_scatter_cb.upd_tac_tilespmem_to_hbm_ir(i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.hbm4b
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_tac_tilespmem_to_hbm4b_ri:
+; CHECK: [hbm4b:s0+s2] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], $0x4, $0x1
+; CHECK: [hbm4b:s0+s2] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], $0x4, $0x5
+define void @stream_linear_scatter_cb.upd_tac_tilespmem_to_hbm4b_ri(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_tac_tilespmem_to_hbm4b_rr:
+; CHECK: [hbm4b:s0+s3] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], s2, $0x1
+; CHECK: [hbm4b:s0+s3] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], s2, $0x5
+define void @stream_linear_scatter_cb.upd_tac_tilespmem_to_hbm4b_rr(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_tac_tilespmem_to_hbm4b_ii:
+; CHECK: [hbm4b:s0+s1] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [hbm4b:s0+s1] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_cb.upd_tac_tilespmem_to_hbm4b_ii(i32 addrspace(203)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_tac_tilespmem_to_hbm4b_ir:
+; CHECK: [hbm4b:s0+s2] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], s1, $0x1
+; CHECK: [hbm4b:s0+s2] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], s1, $0x5
+define void @stream_linear_scatter_cb.upd_tac_tilespmem_to_hbm4b_ir(i32 addrspace(203)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.tilespmem.to.hbm4b
+
+; CHECK-LABEL: stream_indirect_scatter_cb.upd_tac_tilespmem_to_hbm4b_ri:
+; CHECK: [hbm4b:s0+s3] = stream.indirect.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], $0x4, s2, s3, $0x1
+; CHECK: [hbm4b:s0+s3] = stream.indirect.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_cb.upd_tac_tilespmem_to_hbm4b_ri(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb.upd_tac_tilespmem_to_hbm4b_rr:
+; CHECK: [hbm4b:s0+s4] = stream.indirect.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], s2, s3, s4, $0x1
+; CHECK: [hbm4b:s0+s4] = stream.indirect.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_cb.upd_tac_tilespmem_to_hbm4b_rr(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb.upd_tac_tilespmem_to_hbm4b_ii:
+; CHECK: [hbm4b:s0+s2] = stream.indirect.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x1
+; CHECK: [hbm4b:s0+s2] = stream.indirect.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x5
+define void @stream_indirect_scatter_cb.upd_tac_tilespmem_to_hbm4b_ii(i32 addrspace(203)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_cb.upd_tac_tilespmem_to_hbm4b_ir:
+; CHECK: [hbm4b:s0+s3] = stream.indirect.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x1
+; CHECK: [hbm4b:s0+s3] = stream.indirect.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x5
+define void @stream_indirect_scatter_cb.upd_tac_tilespmem_to_hbm4b_ir(i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.cb.upd.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.tilespmem.to.hbm4b
+
+; CHECK-LABEL: stream_strided_scatter_cb.upd_tac_tilespmem_to_hbm4b_ri:
+; CHECK: [hbm4b:s0+s3] = stream.strided.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], $0x4, s2, s3, $0x1
+; CHECK: [hbm4b:s0+s3] = stream.strided.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], $0x4, s2, s3, $0x5
+define void @stream_strided_scatter_cb.upd_tac_tilespmem_to_hbm4b_ri(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb.upd_tac_tilespmem_to_hbm4b_rr:
+; CHECK: [hbm4b:s0+s4] = stream.strided.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], s2, s3, s4, $0x1
+; CHECK: [hbm4b:s0+s4] = stream.strided.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], s2, s3, s4, $0x5
+define void @stream_strided_scatter_cb.upd_tac_tilespmem_to_hbm4b_rr(i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb.upd_tac_tilespmem_to_hbm4b_ii:
+; CHECK: [hbm4b:s0+s2] = stream.strided.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x1
+; CHECK: [hbm4b:s0+s2] = stream.strided.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], $0x4, s1, s2, $0x5
+define void @stream_strided_scatter_cb.upd_tac_tilespmem_to_hbm4b_ii(i32 addrspace(203)* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_cb.upd_tac_tilespmem_to_hbm4b_ir:
+; CHECK: [hbm4b:s0+s3] = stream.strided.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x1
+; CHECK: [hbm4b:s0+s3] = stream.strided.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], s1, s2, s3, $0x5
+define void @stream_strided_scatter_cb.upd_tac_tilespmem_to_hbm4b_ir(i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 1, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.cb.upd.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 5, x86_mmx undef, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_tec_tilespmem_to_spmem_ri:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb.upd_tec_tilespmem_to_spmem_ri(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_tec_tilespmem_to_spmem_rr:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], s2, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb.upd_tec_tilespmem_to_spmem_rr(i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_tec_tilespmem_to_spmem_ii:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb.upd_tec_tilespmem_to_spmem_ii(i32 addrspace(202)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_tec_tilespmem_to_spmem_ir:
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [spmem:s0] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb.upd_tec_tilespmem_to_spmem_ir(i32 addrspace(202)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.tilespmem.to.spmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.tilespmem.to.spmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.tilespmem.to.spmem
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_tec_tilespmem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], $0x4, $0x4
+define void @stream_linear_scatter_cb.upd_tec_tilespmem_to_tilespmem_tileN_ri(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_tec_tilespmem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], s2, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:s1], s2, $0x4
+define void @stream_linear_scatter_cb.upd_tec_tilespmem_to_tilespmem_tileN_rr(i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_tec_tilespmem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_cb.upd_tec_tilespmem_to_tilespmem_tileN_ii(i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_cb.upd_tec_tilespmem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], s1, $0
+; CHECK: [tilespmem.tileN:s0] = stream.linear.scatter.cb.upd [tilespmem:$0x0], [sflag:$0x1f], s1, $0x4
+define void @stream_linear_scatter_cb.upd_tec_tilespmem_to_tilespmem_tileN_ir(i32 addrspace(201)* %dst, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.cb.upd.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, x86_mmx undef, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.cb.upd.tilespmem.to.tilespmem.tileN
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.cb.upd.tilespmem.to.tilespmem.tileN
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.scatter.cb.upd.tilespmem.to.tilespmem.tileN
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_dynovrd_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_dynovrd_sc.ll
new file mode 100644
index 0000000..68f13c7
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_dynovrd_sc.ll
@@ -0,0 +1,99 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32) nounwind
+
+; -------------------------
+; | stream.linear intrinsic |
+; -------------------------
+
+declare void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)*, i32, i32*, i32 addrspace(202)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(202)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; ---------------------------
+; | stream.indirect intrinsic |
+; ---------------------------
+
+declare void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)*, i32, i32*, i32 addrspace(202)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.tilespmem.to.spmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(202)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; --------------------------------
+; | stream.indirect.vreg intrinsic |
+; --------------------------------
+
+declare void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.spmem.v8i32(i32 addrspace(204)*, i32, i32*, i32 addrspace(202)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)*, i32, i32*, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm.v8i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm4b.v8i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.spmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(202)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.tilespmem.tileN.v8i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+
+; --------------------------
+; | stream.strided intrinsic |
+; --------------------------
+
+declare void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)*, i32, i32*, i32 addrspace(202)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.tilespmem.to.spmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(202)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-vf" }
+
+; ------------------------
+; | Test sparsecore-tec-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_tec_smem_to_spmem_ri:
+; CHECK: { s[[s0:[0-9]+]] = sshll.u32 s{{[0-9]+}}, $0x5 }
+; CHECK: { s[[s1:[0-9]+]] = sor.u32 s[[s0]], s{{[0-9]+}} }
+; CHECK: [spmem:s{{[0-9]+}}] = stream.linear.scatter [smem:s{{[0-9]+}}], [sflag:s[[s1]]], $0x4, $0x0
+define void @stream_linear_scatter_tec_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %ctrl, i32 %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 %ctrl, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
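+
+; Reading of the checks above (inferred from the expected assembly, not from
+; ISA documentation): because the control operand is a run-time value rather
+; than a constant, the lowering appears to fold it into the sflag operand,
+; shifting it left by 5 (sshll.u32 ..., $0x5) and OR-ing it with the flag
+; address (sor.u32) so that the scatter consumes a single combined sflag
+; register. The _ii/_ir variants below show the same pattern with the
+; immediate flag address $0x1f as the OR operand.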
+
+; CHECK-LABEL: stream_linear_scatter_tec_smem_to_spmem_rr:
+; CHECK: { s[[s0:[0-9]+]] = sshll.u32 s{{[0-9]+}}, $0x5 }
+; CHECK: { s[[s1:[0-9]+]] = sor.u32 s[[s0]], s{{[0-9]+}} }
+; CHECK: [spmem:s{{[0-9]+}}] = stream.linear.scatter [smem:s{{[0-9]+}}], [sflag:s[[s1]]], s{{[0-9]+}}, $0x0
+define void @stream_linear_scatter_tec_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %ctrl, i32 %len) #0 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 %ctrl, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_smem_to_spmem_ii:
+; CHECK: { s[[s0:[0-9]+]] = sshll.u32 s{{[0-9]+}}, $0x5 }
+; CHECK: { s[[s1:[0-9]+]] = sor.u32 $0x1f, s{{[0-9]+}} }
+; CHECK: [spmem:s{{[0-9]+}}] = stream.linear.scatter [smem:s{{[0-9]+}}], [sflag:s[[s1]]], $0x4, $0x0
+define void @stream_linear_scatter_tec_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, i32 %ctrl) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 %ctrl, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_smem_to_spmem_ir:
+; CHECK: { s[[s0:[0-9]+]] = sshll.u32 s{{[0-9]+}}, $0x5 }
+; CHECK: { s[[s1:[0-9]+]] = sor.u32 $0x1f, s{{[0-9]+}} }
+; CHECK: [spmem:s{{[0-9]+}}] = stream.linear.scatter [smem:s{{[0-9]+}}], [sflag:s[[s1]]], s{{[0-9]+}}, $0x0
+define void @stream_linear_scatter_tec_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %ctrl) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 %ctrl, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_gather_hbm4b_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_gather_hbm4b_sc.ll
new file mode 100644
index 0000000..f894baf
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_gather_hbm4b_sc.ll
@@ -0,0 +1,116 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; Tests that codegen prepare generates shift/or instructions for the hbm4b pointer when appropriate.
+
+declare i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32) nounwind
+
+declare void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.gather.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)*, i32, i32 addrspace(203)*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm4b.v8i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, i32) argmemonly nounwind
+
+attributes #0 = { "target-cpu"="sparsecore-tec-vf" }
+
+; CHECK-LABEL: stream_linear_gather_hbm4b:
+; CHECK: { s3 = simm.s32 $0x0 }
+; CHECK-NEXT: { [tilespmem:s1], [sflag:s2] = stream.linear.gather [hbm4b:s0+s3], $0x4, $0x1 }
+define void @stream_linear_gather_hbm4b(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #0 {
+ call void @llvm.tpu.stream.linear.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 0);
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_hbm4b:
+; CHECK: { s4 = sor.u32 $0x1c0000, s4 }
+; CHECK-NEXT: { [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm4b:s0+s4], $0x4, s3, s4, $0x1 }
+define void @stream_indirect_gather_hbm4b(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 7);
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_gather_hbm4b_zero:
+; CHECK-NOT: { s4 = sor.u32
+; CHECK: { [tilespmem:s1], [sflag:s2] = stream.indirect.gather [hbm4b:s0+s4], $0x4, s3, s4, $0x1 }
+define void @stream_indirect_gather_hbm4b_zero(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0);
+ ret void
+}
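+
+; Observations on the checks in this file (inferred from the expected
+; assembly, not from the ISA spec): the trailing i32 operand of the hbm4b
+; intrinsics ends up in the register that extends the hbm4b pointer
+; ([hbm4b:sN+sM]). When that register is shared with the size/stride operand
+; the value is OR-ed in pre-shifted (7 << 18 = 0x1c0000 above, 5 << 18 =
+; 0x140000 further down); when a dedicated register is available it is
+; materialized directly with simm.s32; and when the value is zero no sor.u32
+; is emitted at all (the CHECK-NOT lines).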
+
+; CHECK-LABEL: stream_indirect_vreg_gather_hbm4b:
+; CHECK: { s3 = simm.s32 $0x7 }
+; CHECK-NEXT: { [tilespmem:s1], [sflag:s2] = stream.indirect_vreg.gather [hbm4b:s0+s3], $0x4, v0, vm0, $0x1 }
+define void @stream_indirect_vreg_gather_hbm4b(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #0 {
+ call void @llvm.tpu.stream.indirect.vreg.gather.hbm4b.to.tilespmem.v8i32(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 7);
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_hbm4b:
+; CHECK: { s4 = sor.u32 $0x1c0000, s4 }
+; CHECK-NEXT: { [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm4b:s0+s4], $0x4, s3, s4, $0x1 }
+define void @stream_strided_gather_hbm4b(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 7)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_gather_hbm4b_zero:
+; CHECK-NOT: { s4 = sor.u32
+; CHECK: { [tilespmem:s1], [sflag:s2] = stream.strided.gather [hbm4b:s0+s4], $0x4, s3, s4, $0x1 }
+define void @stream_strided_gather_hbm4b_zero(i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.gather.hbm4b.to.tilespmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(203)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_hbm4b:
+; CHECK: { s3 = simm.s32 $0x5 }
+; CHECK-NEXT: { [hbm4b:s1+s3] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0x0 }
+define void @stream_linear_scatter_hbm4b(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag) #0 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 5)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_hbm4b:
+; CHECK: { s4 = sor.u32 $0x140000, s4 }
+; CHECK-NEXT: { [hbm4b:s1+s4] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x1 }
+define void @stream_indirect_scatter_hbm4b(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 5)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_hbm4b_zero:
+; CHECK-NOT: { s4 = sor.u32
+; CHECK: { [hbm4b:s1+s4] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x1 }
+define void @stream_indirect_scatter_hbm4b_zero(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_hbm4b:
+; CHECK: { s3 = simm.s32 $0x5 }
+; CHECK-NEXT: { [hbm4b:s1+s3] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:s2], $0x4, v0, vm0, $0x0 }
+define void @stream_indirect_vreg_scatter_hbm4b(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #0 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm4b.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 5)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_hbm4b:
+; CHECK: { s4 = sor.u32 $0x140000, s4 }
+; CHECK-NEXT: { [hbm4b:s1+s4] = stream.strided.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x1 }
+define void @stream_strided_scatter_hbm4b(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 5)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_hbm4b_zero:
+; CHECK-NOT: { s4 = sor.u32
+; CHECK: { [hbm4b:s1+s4] = stream.strided.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x1 }
+define void @stream_strided_scatter_hbm4b_zero(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
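+
+; In the _zero variants above, the only difference from the preceding tests is
+; the trailing intrinsic argument (0 instead of 5); the CHECK-NOT lines verify
+; that the "sor.u32 $0x140000" seen in the non-zero cases is not emitted when
+; that argument is zero.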
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_gf_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_gf_sc.ll
new file mode 100644
index 0000000..7840e6a
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_gf_sc.ll
@@ -0,0 +1,1569 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-gf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32) nounwind
+
+; ---------------------------
+; | stream.linear intrinsic |
+; ---------------------------
+
+declare void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)*, i32, i32*, i32 addrspace(202)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(202)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; -----------------------------
+; | stream.indirect intrinsic |
+; -----------------------------
+
+declare void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)*, i32, i32*, i32 addrspace(202)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.tilespmem.to.spmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(202)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; ----------------------------------
+; | stream.indirect.vreg intrinsic |
+; ----------------------------------
+
+declare void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.spmem.v16i32(i32 addrspace(204)*, i32, i32*, i32 addrspace(202)*, i32, <16 x i32>, <16 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.tilespmem.tileN.v16i32(i32 addrspace(204)*, i32, i32*, i32 addrspace(201)*, i32, <16 x i32>, <16 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm.v16i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, <16 x i32>, <16 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm4b.v16i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, <16 x i32>, <16 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.spmem.v16i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(202)*, i32, <16 x i32>, <16 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.tilespmem.tileN.v16i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, <16 x i32>, <16 x i1>, i32) argmemonly nounwind
+
+; ----------------------------
+; | stream.strided intrinsic |
+; ----------------------------
+
+declare void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)*, i32, i32*, i32 addrspace(202)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.tilespmem.to.spmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(202)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
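+
+; The declarations above share a common shape (an observation from the CHECK
+; lines in this file, not a normative description of the intrinsics): a
+; sync-flag pointer, a second operand whose value appears as the trailing
+; operand of the emitted stream instruction, the source and destination
+; pointers, and a length, followed by the addressing-specific operands -- an
+; offset pointer and size for stream.indirect, a vector of offsets and a mask
+; for stream.indirect_vreg, and a stride size and stride length for
+; stream.strided.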
+
+attributes #0 = { "target-cpu"="sparsecore-scs-gf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-gf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-gf" }
+
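+; The attribute groups above select the three SparseCore target CPUs: #0 the
+; sequencer (scs), #1 the tile access core (tac), and #2 the tile execute core
+; (tec); the tests below are grouped per core. The _ri/_rr/_ii/_ir suffixes
+; appear to encode whether the sync flag and the length are given as a
+; register or an immediate (first letter: flag, second letter: length), which
+; is what the s-register vs. $imm operands in the CHECK lines exercise.
+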
+; --------------------------
+; | Test sparsecore-scs-gf |
+; --------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:s2], $0x4, $0
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_scs_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:s2], s3, $0
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_scs_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #0 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_scs_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_scs_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_indirect_scatter_scs_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_indirect_scatter_scs_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_scs_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_scs_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_strided_scatter_scs_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_strided_scatter_scs_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_scs_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_strided_scatter_scs_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:s2], $0x4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_scs_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #0 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:s2], s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_scs_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #0 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_scs_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_scs_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_indirect_scatter_scs_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_indirect_scatter_scs_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_scs_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_scs_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_strided_scatter_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_strided_scatter_scs_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_strided_scatter_scs_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_scs_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_strided_scatter_scs_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; --------------------------
+; | Test sparsecore-tac-gf |
+; --------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:s2], $0x4, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:s2], $0x4, $0x5
+define void @stream_linear_scatter_tac_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:s2], s3, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:s2], s3, $0x5
+define void @stream_linear_scatter_tac_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_tac_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], s2, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], s2, $0x5
+define void @stream_linear_scatter_tac_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x1
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x5
+define void @stream_indirect_scatter_tac_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x1
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x5
+define void @stream_indirect_scatter_tac_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x1
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_tac_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x1
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_tac_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x1
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x5
+define void @stream_strided_scatter_tac_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x1
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x5
+define void @stream_strided_scatter_tac_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x1
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x5
+define void @stream_strided_scatter_tac_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x1
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x5
+define void @stream_strided_scatter_tac_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_tac_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:s2], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:s2], $0x4, $0x5
+define void @stream_linear_scatter_tac_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:s2], s3, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:s2], s3, $0x5
+define void @stream_linear_scatter_tac_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_tac_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], s2, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], s2, $0x5
+define void @stream_linear_scatter_tac_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_tac_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x5
+define void @stream_indirect_scatter_tac_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tac_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x5
+define void @stream_indirect_scatter_tac_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tac_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_tac_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tac_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_tac_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Intrinsic @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN is not tested here.
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_hbm_ri:
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0x1
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_hbm_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_hbm_rr:
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0x1
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_hbm_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_hbm_ii:
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_hbm_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_hbm_ir:
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0x1
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_hbm_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm
+
+; CHECK-LABEL: stream_indirect_scatter_tac_tilespmem_to_hbm_ri:
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x1
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x5
+define void @stream_indirect_scatter_tac_tilespmem_to_hbm_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tac_tilespmem_to_hbm_rr:
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0x1
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0x5
+define void @stream_indirect_scatter_tac_tilespmem_to_hbm_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tac_tilespmem_to_hbm_ii:
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x1
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_tac_tilespmem_to_hbm_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tac_tilespmem_to_hbm_ir:
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0x1
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_tac_tilespmem_to_hbm_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm
+
+; CHECK-LABEL: stream_strided_scatter_tac_tilespmem_to_hbm_ri:
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x1
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x5
+define void @stream_strided_scatter_tac_tilespmem_to_hbm_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tac_tilespmem_to_hbm_rr:
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0x1
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0x5
+define void @stream_strided_scatter_tac_tilespmem_to_hbm_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tac_tilespmem_to_hbm_ii:
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x1
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x5
+define void @stream_strided_scatter_tac_tilespmem_to_hbm_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tac_tilespmem_to_hbm_ir:
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0x1
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0x5
+define void @stream_strided_scatter_tac_tilespmem_to_hbm_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_hbm4b_ri:
+; CHECK: [hbm4b:s1+s3] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0x1
+; CHECK: [hbm4b:s1+s3] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_hbm4b_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_hbm4b_rr:
+; CHECK: [hbm4b:s1+s4] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0x1
+; CHECK: [hbm4b:s1+s4] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_hbm4b_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_hbm4b_ii:
+; CHECK: [hbm4b:s1+s2] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [hbm4b:s1+s2] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_hbm4b_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_hbm4b_ir:
+; CHECK: [hbm4b:s1+s3] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0x1
+; CHECK: [hbm4b:s1+s3] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_hbm4b_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b
+
+; CHECK-LABEL: stream_indirect_scatter_tac_tilespmem_to_hbm4b_ri:
+; CHECK: [hbm4b:s1+s4] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x1
+; CHECK: [hbm4b:s1+s4] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x5
+define void @stream_indirect_scatter_tac_tilespmem_to_hbm4b_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tac_tilespmem_to_hbm4b_rr:
+; CHECK: [hbm4b:s1+s5] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0x1
+; CHECK: [hbm4b:s1+s5] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0x5
+define void @stream_indirect_scatter_tac_tilespmem_to_hbm4b_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tac_tilespmem_to_hbm4b_ii:
+; CHECK: [hbm4b:s1+s3] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x1
+; CHECK: [hbm4b:s1+s3] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_tac_tilespmem_to_hbm4b_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tac_tilespmem_to_hbm4b_ir:
+; CHECK: [hbm4b:s1+s4] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0x1
+; CHECK: [hbm4b:s1+s4] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_tac_tilespmem_to_hbm4b_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b
+
+; CHECK-LABEL: stream_strided_scatter_tac_tilespmem_to_hbm4b_ri:
+; CHECK: [hbm4b:s1+s4] = stream.strided.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x1
+; CHECK: [hbm4b:s1+s4] = stream.strided.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x5
+define void @stream_strided_scatter_tac_tilespmem_to_hbm4b_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tac_tilespmem_to_hbm4b_rr:
+; CHECK: [hbm4b:s1+s5] = stream.strided.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0x1
+; CHECK: [hbm4b:s1+s5] = stream.strided.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0x5
+define void @stream_strided_scatter_tac_tilespmem_to_hbm4b_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tac_tilespmem_to_hbm4b_ii:
+; CHECK: [hbm4b:s1+s3] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x1
+; CHECK: [hbm4b:s1+s3] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x5
+define void @stream_strided_scatter_tac_tilespmem_to_hbm4b_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tac_tilespmem_to_hbm4b_ir:
+; CHECK: [hbm4b:s1+s4] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0x1
+; CHECK: [hbm4b:s1+s4] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0x5
+define void @stream_strided_scatter_tac_tilespmem_to_hbm4b_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_spmem_ri(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_spmem_rr(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_spmem_ii(i32 addrspace(201)* %src, i32 addrspace(202)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_spmem_ir(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.tilespmem.to.spmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.tilespmem.to.spmem
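+
+; A hedged sketch of what coverage for the two untested intrinsics above could
+; look like, assuming they take the same operand layout as the tested hbm4b
+; indirect/strided variants, only with an spmem (addrspace 202) destination.
+; Left commented out because the expected CHECK output and the matching
+; declarations are not verified here:
+;
+;   define void @stream_indirect_scatter_tac_tilespmem_to_spmem_ri(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+;     call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+;     ret void
+;   }
+;
+;   define void @stream_strided_scatter_tac_tilespmem_to_spmem_ri(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+;     call void @llvm.tpu.stream.strided.scatter.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+;     ret void
+;   }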
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_tilespmem_tileN_ri(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_tilespmem_tileN_rr(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_tilespmem_tileN_ii(i32 addrspace(201)* %src, i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_tilespmem_tileN_ir(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.tilespmem.to.tilespmem.tileN
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.tilespmem.to.tilespmem.tileN
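+
+; A hedged sketch of possible coverage for the two untested intrinsics above,
+; assuming the same operand layout as the tested hbm4b indirect/strided
+; variants but with a tilespmem.tileN (addrspace 201) destination. Left
+; commented out because the expected CHECK output is not verified here:
+;
+;   define void @stream_indirect_scatter_tac_tilespmem_to_tilespmem_tileN_ri(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+;     call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+;     ret void
+;   }
+;
+;   define void @stream_strided_scatter_tac_tilespmem_to_tilespmem_tileN_ri(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+;     call void @llvm.tpu.stream.strided.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+;     ret void
+;   }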
+
+; --------------------------
+; | Test sparsecore-tec-gf |
+; --------------------------
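+
+; Note: the tec-gf functions below all use attribute group #2 and pass 0 and 4
+; as the second intrinsic operand, where the tac tests above pass 1 and 5.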
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:s2], $0x4, $0
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_tec_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:s2], s3, $0
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_tec_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_tec_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_tec_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_indirect_scatter_tec_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_indirect_scatter_tec_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_tec_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_tec_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_strided_scatter_tec_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_strided_scatter_tec_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_tec_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_strided_scatter_tec_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.scatter.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:s2], $0x4, v0, vm0, $0
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:s2], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.spmem.v16i32(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.spmem.v16i32(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:s2], s3, v0, vm0, $0
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:s2], s3, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.spmem.v16i32(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.spmem.v16i32(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:$0x1f], $0x4, v0, vm0, $0
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:$0x1f], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.spmem.v16i32(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.spmem.v16i32(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:$0x1f], s2, v0, vm0, $0
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:$0x1f], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.spmem.v16i32(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.spmem.v16i32(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:s2], $0x4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_tec_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:s2], s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_tec_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_tec_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_tec_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_indirect_scatter_tec_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_indirect_scatter_tec_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_tec_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_tec_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_strided_scatter_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_strided_scatter_tec_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_strided_scatter_tec_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_tec_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_strided_scatter_tec_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.scatter.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:s2], $0x4, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:s2], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.tilespmem.tileN.v16i32(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.tilespmem.tileN.v16i32(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:s2], s3, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:s2], s3, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.tilespmem.tileN.v16i32(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.tilespmem.tileN.v16i32(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:$0x1f], $0x4, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:$0x1f], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.tilespmem.tileN.v16i32(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.tilespmem.tileN.v16i32(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:$0x1f], s2, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:$0x1f], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.tilespmem.tileN.v16i32(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.tilespmem.tileN.v16i32(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_hbm_ri:
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_hbm_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_hbm_rr:
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_hbm_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_hbm_ii:
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_hbm_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_hbm_ir:
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_hbm_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm
+
+; CHECK-LABEL: stream_indirect_scatter_tec_tilespmem_to_hbm_ri:
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_indirect_scatter_tec_tilespmem_to_hbm_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tec_tilespmem_to_hbm_rr:
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_indirect_scatter_tec_tilespmem_to_hbm_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tec_tilespmem_to_hbm_ii:
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_tec_tilespmem_to_hbm_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tec_tilespmem_to_hbm_ir:
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_tec_tilespmem_to_hbm_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm
+
+; CHECK-LABEL: stream_strided_scatter_tec_tilespmem_to_hbm_ri:
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_strided_scatter_tec_tilespmem_to_hbm_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tec_tilespmem_to_hbm_rr:
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_strided_scatter_tec_tilespmem_to_hbm_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tec_tilespmem_to_hbm_ii:
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_tec_tilespmem_to_hbm_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tec_tilespmem_to_hbm_ir:
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_strided_scatter_tec_tilespmem_to_hbm_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_tilespmem_to_hbm_ri:
+; CHECK: [hbm:s1] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:s2], $0x4, v0, vm0, $0
+; CHECK: [hbm:s1] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:s2], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_tilespmem_to_hbm_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm.v16i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm.v16i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_tilespmem_to_hbm_rr:
+; CHECK: [hbm:s1] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:s2], s3, v0, vm0, $0
+; CHECK: [hbm:s1] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:s2], s3, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_tilespmem_to_hbm_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm.v16i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm.v16i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_tilespmem_to_hbm_ii:
+; CHECK: [hbm:s1] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, v0, vm0, $0
+; CHECK: [hbm:s1] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_tilespmem_to_hbm_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm.v16i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm.v16i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_tilespmem_to_hbm_ir:
+; CHECK: [hbm:s1] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:$0x1f], s2, v0, vm0, $0
+; CHECK: [hbm:s1] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:$0x1f], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_tilespmem_to_hbm_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm.v16i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm.v16i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_hbm4b_ri:
+; CHECK: [hbm4b:s1+s3] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0
+; CHECK: [hbm4b:s1+s3] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_hbm4b_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_hbm4b_rr:
+; CHECK: [hbm4b:s1+s4] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0
+; CHECK: [hbm4b:s1+s4] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_hbm4b_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_hbm4b_ii:
+; CHECK: [hbm4b:s1+s2] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [hbm4b:s1+s2] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_hbm4b_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_hbm4b_ir:
+; CHECK: [hbm4b:s1+s3] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [hbm4b:s1+s3] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_hbm4b_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b
+
+; CHECK-LABEL: stream_indirect_scatter_tec_tilespmem_to_hbm4b_ri:
+; CHECK: [hbm4b:s1+s4] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [hbm4b:s1+s4] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_indirect_scatter_tec_tilespmem_to_hbm4b_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tec_tilespmem_to_hbm4b_rr:
+; CHECK: [hbm4b:s1+s5] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [hbm4b:s1+s5] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_indirect_scatter_tec_tilespmem_to_hbm4b_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tec_tilespmem_to_hbm4b_ii:
+; CHECK: [hbm4b:s1+s3] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [hbm4b:s1+s3] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_tec_tilespmem_to_hbm4b_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tec_tilespmem_to_hbm4b_ir:
+; CHECK: [hbm4b:s1+s4] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [hbm4b:s1+s4] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_tec_tilespmem_to_hbm4b_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b
+
+; CHECK-LABEL: stream_strided_scatter_tec_tilespmem_to_hbm4b_ri:
+; CHECK: [hbm4b:s1+s4] = stream.strided.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [hbm4b:s1+s4] = stream.strided.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_strided_scatter_tec_tilespmem_to_hbm4b_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tec_tilespmem_to_hbm4b_rr:
+; CHECK: [hbm4b:s1+s5] = stream.strided.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [hbm4b:s1+s5] = stream.strided.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_strided_scatter_tec_tilespmem_to_hbm4b_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tec_tilespmem_to_hbm4b_ii:
+; CHECK: [hbm4b:s1+s3] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [hbm4b:s1+s3] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_tec_tilespmem_to_hbm4b_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tec_tilespmem_to_hbm4b_ir:
+; CHECK: [hbm4b:s1+s4] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [hbm4b:s1+s4] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_strided_scatter_tec_tilespmem_to_hbm4b_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm4b
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_tilespmem_to_hbm4b_ri:
+; CHECK: [hbm4b:s1+s3] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:s2], $0x4, v0, vm0, $0
+; CHECK: [hbm4b:s1+s3] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:s2], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_tilespmem_to_hbm4b_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm4b.v16i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm4b.v16i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_tilespmem_to_hbm4b_rr:
+; CHECK: [hbm4b:s1+s4] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:s2], s3, v0, vm0, $0
+; CHECK: [hbm4b:s1+s4] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:s2], s3, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_tilespmem_to_hbm4b_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm4b.v16i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm4b.v16i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_tilespmem_to_hbm4b_ii:
+; CHECK: [hbm4b:s1+s2] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, v0, vm0, $0
+; CHECK: [hbm4b:s1+s2] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_tilespmem_to_hbm4b_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm4b.v16i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm4b.v16i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_tilespmem_to_hbm4b_ir:
+; CHECK: [hbm4b:s1+s3] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:$0x1f], s2, v0, vm0, $0
+; CHECK: [hbm4b:s1+s3] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:$0x1f], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_tilespmem_to_hbm4b_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm4b.v16i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm4b.v16i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, <16 x i32> %offs, <16 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_spmem_ri(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_spmem_rr(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_spmem_ii(i32 addrspace(201)* %src, i32 addrspace(202)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_spmem_ir(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.tilespmem.to.spmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.tilespmem.to.spmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.spmem
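(Editorial illustration only, not part of CL 501725670: a minimal sketch of what an _ri test for the first untested variant above, @llvm.tpu.stream.indirect.scatter.tilespmem.to.spmem, could look like, modeled on the hbm4b indirect pattern earlier in this file and the declaration in stream_scatter_sc.ll; the CHECK line below is an assumption about the emitted mnemonic, not verified output.)

; CHECK-LABEL: stream_indirect_scatter_tec_tilespmem_to_spmem_ri:
; CHECK: stream.indirect.scatter [tilespmem:s0], [sflag:s2]
define void @stream_indirect_scatter_tec_tilespmem_to_spmem_ri(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
  ; Same operand order as the declared intrinsic: flag, flag value, src, dst, length, offset list, offset count, trailing immediate.
  call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
  ret void
}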
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_tilespmem_tileN_ri(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_tilespmem_tileN_rr(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_tilespmem_tileN_ii(i32 addrspace(201)* %src, i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_tilespmem_tileN_ir(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.tilespmem.to.tilespmem.tileN
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.tilespmem.to.tilespmem.tileN
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.tilespmem.tileN
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_sc.ll
new file mode 100644
index 0000000..4643754
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/stream_scatter_sc.ll
@@ -0,0 +1,1569 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32) nounwind
+
+; -------------------------
+; | stream.linear intrinsic |
+; -------------------------
+
+declare void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)*, i32, i32*, i32 addrspace(202)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(202)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; ---------------------------
+; | stream.indirect intrinsic |
+; ---------------------------
+
+declare void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)*, i32, i32*, i32 addrspace(202)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.tilespmem.to.spmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(202)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32 addrspace(201)*, i32, i32) argmemonly nounwind
+
+; --------------------------------
+; | stream.indirect.vreg intrinsic |
+; --------------------------------
+
+declare void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.spmem.v8i32(i32 addrspace(204)*, i32, i32*, i32 addrspace(202)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)*, i32, i32*, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm.v8i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm4b.v8i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.spmem.v8i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(202)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.tilespmem.tileN.v8i32(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, <8 x i32>, <8 x i1>, i32) argmemonly nounwind
+
+; --------------------------
+; | stream.strided intrinsic |
+; --------------------------
+
+declare void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)*, i32, i32*, i32 addrspace(202)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(203)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.tilespmem.to.spmem(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(202)*, i32, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.stream.strided.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)*, i32, i32 addrspace(201)*, i32 addrspace(201)*, i32, i32, i32, i32) argmemonly nounwind
+
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-vf" }
+
+; ------------------------
+; | Test sparsecore-scs-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:s2], $0x4, $0
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_scs_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #0 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:s2], s3, $0
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_scs_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #0 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_scs_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_scs_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_indirect_scatter_scs_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_indirect_scatter_scs_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_scs_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_scs_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_scs_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_strided_scatter_scs_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_scs_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_strided_scatter_scs_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_scs_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_scs_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_scs_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_strided_scatter_scs_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:s2], $0x4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_scs_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #0 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:s2], s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_scs_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #0 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_scs_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_scs_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_indirect_scatter_scs_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_indirect_scatter_scs_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_scs_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_scs_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_strided_scatter_scs_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_strided_scatter_scs_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_scs_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_strided_scatter_scs_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_scs_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_scs_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_scs_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_strided_scatter_scs_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #0 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; ------------------------
+; | Test sparsecore-tac-vf |
+; ------------------------
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:s2], $0x4, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:s2], $0x4, $0x5
+define void @stream_linear_scatter_tac_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:s2], s3, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:s2], s3, $0x5
+define void @stream_linear_scatter_tac_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_tac_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], s2, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], s2, $0x5
+define void @stream_linear_scatter_tac_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x1
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x5
+define void @stream_indirect_scatter_tac_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x1
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x5
+define void @stream_indirect_scatter_tac_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x1
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_tac_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x1
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_tac_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_tac_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x1
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x5
+define void @stream_strided_scatter_tac_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tac_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x1
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x5
+define void @stream_strided_scatter_tac_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tac_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x1
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x5
+define void @stream_strided_scatter_tac_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tac_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x1
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x5
+define void @stream_strided_scatter_tac_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_tac_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:s2], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:s2], $0x4, $0x5
+define void @stream_linear_scatter_tac_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:s2], s3, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:s2], s3, $0x5
+define void @stream_linear_scatter_tac_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_tac_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], s2, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], s2, $0x5
+define void @stream_linear_scatter_tac_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_tac_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x5
+define void @stream_indirect_scatter_tac_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tac_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x5
+define void @stream_indirect_scatter_tac_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tac_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_tac_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tac_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_tac_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_hbm_ri:
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0x1
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_hbm_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_hbm_rr:
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0x1
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_hbm_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_hbm_ii:
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_hbm_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_hbm_ir:
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0x1
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_hbm_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm
+
+; CHECK-LABEL: stream_indirect_scatter_tac_tilespmem_to_hbm_ri:
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x1
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x5
+define void @stream_indirect_scatter_tac_tilespmem_to_hbm_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tac_tilespmem_to_hbm_rr:
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0x1
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0x5
+define void @stream_indirect_scatter_tac_tilespmem_to_hbm_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tac_tilespmem_to_hbm_ii:
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x1
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_tac_tilespmem_to_hbm_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tac_tilespmem_to_hbm_ir:
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0x1
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_tac_tilespmem_to_hbm_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm
+
+; CHECK-LABEL: stream_strided_scatter_tac_tilespmem_to_hbm_ri:
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x1
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x5
+define void @stream_strided_scatter_tac_tilespmem_to_hbm_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tac_tilespmem_to_hbm_rr:
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0x1
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0x5
+define void @stream_strided_scatter_tac_tilespmem_to_hbm_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tac_tilespmem_to_hbm_ii:
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x1
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x5
+define void @stream_strided_scatter_tac_tilespmem_to_hbm_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tac_tilespmem_to_hbm_ir:
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0x1
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0x5
+define void @stream_strided_scatter_tac_tilespmem_to_hbm_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_hbm4b_ri:
+; CHECK: [hbm4b:s1+s3] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0x1
+; CHECK: [hbm4b:s1+s3] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_hbm4b_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_hbm4b_rr:
+; CHECK: [hbm4b:s1+s4] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0x1
+; CHECK: [hbm4b:s1+s4] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_hbm4b_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_hbm4b_ii:
+; CHECK: [hbm4b:s1+s2] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [hbm4b:s1+s2] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_hbm4b_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_hbm4b_ir:
+; CHECK: [hbm4b:s1+s3] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0x1
+; CHECK: [hbm4b:s1+s3] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_hbm4b_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b
+
+; CHECK-LABEL: stream_indirect_scatter_tac_tilespmem_to_hbm4b_ri:
+; CHECK: [hbm4b:s1+s4] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x1
+; CHECK: [hbm4b:s1+s4] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x5
+define void @stream_indirect_scatter_tac_tilespmem_to_hbm4b_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tac_tilespmem_to_hbm4b_rr:
+; CHECK: [hbm4b:s1+s5] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0x1
+; CHECK: [hbm4b:s1+s5] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0x5
+define void @stream_indirect_scatter_tac_tilespmem_to_hbm4b_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tac_tilespmem_to_hbm4b_ii:
+; CHECK: [hbm4b:s1+s3] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x1
+; CHECK: [hbm4b:s1+s3] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x5
+define void @stream_indirect_scatter_tac_tilespmem_to_hbm4b_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tac_tilespmem_to_hbm4b_ir:
+; CHECK: [hbm4b:s1+s4] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0x1
+; CHECK: [hbm4b:s1+s4] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0x5
+define void @stream_indirect_scatter_tac_tilespmem_to_hbm4b_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b
+
+; CHECK-LABEL: stream_strided_scatter_tac_tilespmem_to_hbm4b_ri:
+; CHECK: [hbm4b:s1+s4] = stream.strided.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x1
+; CHECK: [hbm4b:s1+s4] = stream.strided.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x5
+define void @stream_strided_scatter_tac_tilespmem_to_hbm4b_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tac_tilespmem_to_hbm4b_rr:
+; CHECK: [hbm4b:s1+s5] = stream.strided.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0x1
+; CHECK: [hbm4b:s1+s5] = stream.strided.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0x5
+define void @stream_strided_scatter_tac_tilespmem_to_hbm4b_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tac_tilespmem_to_hbm4b_ii:
+; CHECK: [hbm4b:s1+s3] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x1
+; CHECK: [hbm4b:s1+s3] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x5
+define void @stream_strided_scatter_tac_tilespmem_to_hbm4b_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tac_tilespmem_to_hbm4b_ir:
+; CHECK: [hbm4b:s1+s4] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0x1
+; CHECK: [hbm4b:s1+s4] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0x5
+define void @stream_strided_scatter_tac_tilespmem_to_hbm4b_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_spmem_ri(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_spmem_rr(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_spmem_ii(i32 addrspace(201)* %src, i32 addrspace(202)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0x1
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_spmem_ir(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.tilespmem.to.spmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.tilespmem.to.spmem
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_tilespmem_tileN_ri(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #1 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_tilespmem_tileN_rr(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #1 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_tilespmem_tileN_ii(i32 addrspace(201)* %src, i32 addrspace(201)* %dst) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tac_tilespmem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0x1
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0x5
+define void @stream_linear_scatter_tac_tilespmem_to_tilespmem_tileN_ir(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #1 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 1, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 5, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.tilespmem.to.tilespmem.tileN
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.tilespmem.to.tilespmem.tileN
+
+; --------------------------
+; | Test sparsecore-tec-vf |
+; --------------------------
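+; In the tec-vf tests below the second intrinsic argument is 0 or 4, so the
+; CHECK lines expect a trailing operand of $0 or $0x4, in contrast to the tac
+; tests above, which pass 1 or 5 and expect $0x1 or $0x5.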
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.smem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:s2], $0x4, $0
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_tec_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:s2], s3, $0
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_tec_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_tec_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [spmem:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_tec_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_scatter_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_indirect_scatter_tec_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_indirect_scatter_tec_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_tec_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [spmem:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_tec_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.smem.to.spmem
+
+; CHECK-LABEL: stream_strided_scatter_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_strided_scatter_tec_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_strided_scatter_tec_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_tec_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [spmem:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_strided_scatter_tec_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.spmem(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.scatter.smem.to.spmem
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_smem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:s2], $0x4, v0, vm0, $0
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:s2], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_smem_to_spmem_ri(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_smem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:s2], s3, v0, vm0, $0
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:s2], s3, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_smem_to_spmem_rr(i32* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.spmem.v8i32(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_smem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:$0x1f], $0x4, v0, vm0, $0
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:$0x1f], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_smem_to_spmem_ii(i32* %src, i32 addrspace(202)* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_smem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:$0x1f], s2, v0, vm0, $0
+; CHECK: [spmem:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:$0x1f], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_smem_to_spmem_ir(i32* %src, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.spmem.v8i32(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(202)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:s2], $0x4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_tec_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:s2], s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_tec_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_tec_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [smem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_tec_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_scatter_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_indirect_scatter_tec_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_indirect_scatter_tec_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_tec_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_tec_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_strided_scatter_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_strided_scatter_tec_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_strided_scatter_tec_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_tec_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.strided.scatter [smem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_strided_scatter_tec_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.smem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.scatter.smem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_smem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:s2], $0x4, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:s2], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_smem_to_tilespmem_tileN_ri(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_smem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:s2], s3, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:s2], s3, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_smem_to_tilespmem_tileN_rr(i32* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %flag, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_smem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:$0x1f], $0x4, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:$0x1f], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_smem_to_tilespmem_tileN_ii(i32* %src, i32 addrspace(201)* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_smem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:$0x1f], s2, v0, vm0, $0
+; CHECK: [tilespmem.tileN:s1] = stream.indirect_vreg.scatter [smem:s0], [sflag:$0x1f], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_smem_to_tilespmem_tileN_ir(i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 0, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.smem.to.tilespmem.tileN.v8i32(i32 addrspace(204)* %s, i32 4, i32* %src, i32 addrspace(201)* %dst, i32 %tileN_len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_hbm_ri:
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_hbm_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_hbm_rr:
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_hbm_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_hbm_ii:
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_hbm_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_hbm_ir:
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [hbm:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_hbm_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm
+
+; CHECK-LABEL: stream_indirect_scatter_tec_tilespmem_to_hbm_ri:
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_indirect_scatter_tec_tilespmem_to_hbm_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tec_tilespmem_to_hbm_rr:
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_indirect_scatter_tec_tilespmem_to_hbm_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tec_tilespmem_to_hbm_ii:
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_tec_tilespmem_to_hbm_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tec_tilespmem_to_hbm_ir:
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [hbm:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_tec_tilespmem_to_hbm_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm
+
+; CHECK-LABEL: stream_strided_scatter_tec_tilespmem_to_hbm_ri:
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_strided_scatter_tec_tilespmem_to_hbm_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tec_tilespmem_to_hbm_rr:
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_strided_scatter_tec_tilespmem_to_hbm_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tec_tilespmem_to_hbm_ii:
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_tec_tilespmem_to_hbm_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tec_tilespmem_to_hbm_ir:
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [hbm:s1] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_strided_scatter_tec_tilespmem_to_hbm_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_tilespmem_to_hbm_ri:
+; CHECK: [hbm:s1] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:s2], $0x4, v0, vm0, $0
+; CHECK: [hbm:s1] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:s2], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_tilespmem_to_hbm_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_tilespmem_to_hbm_rr:
+; CHECK: [hbm:s1] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:s2], s3, v0, vm0, $0
+; CHECK: [hbm:s1] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:s2], s3, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_tilespmem_to_hbm_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_tilespmem_to_hbm_ii:
+; CHECK: [hbm:s1] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, v0, vm0, $0
+; CHECK: [hbm:s1] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_tilespmem_to_hbm_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_tilespmem_to_hbm_ir:
+; CHECK: [hbm:s1] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:$0x1f], s2, v0, vm0, $0
+; CHECK: [hbm:s1] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:$0x1f], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_tilespmem_to_hbm_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_hbm4b_ri:
+; CHECK: [hbm4b:s1+s3] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0
+; CHECK: [hbm4b:s1+s3] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_hbm4b_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_hbm4b_rr:
+; CHECK: [hbm4b:s1+s4] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0
+; CHECK: [hbm4b:s1+s4] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_hbm4b_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_hbm4b_ii:
+; CHECK: [hbm4b:s1+s2] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [hbm4b:s1+s2] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_hbm4b_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_hbm4b_ir:
+; CHECK: [hbm4b:s1+s3] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [hbm4b:s1+s3] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_hbm4b_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b
+
+; CHECK-LABEL: stream_indirect_scatter_tec_tilespmem_to_hbm4b_ri:
+; CHECK: [hbm4b:s1+s4] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [hbm4b:s1+s4] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_indirect_scatter_tec_tilespmem_to_hbm4b_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tec_tilespmem_to_hbm4b_rr:
+; CHECK: [hbm4b:s1+s5] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [hbm4b:s1+s5] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_indirect_scatter_tec_tilespmem_to_hbm4b_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tec_tilespmem_to_hbm4b_ii:
+; CHECK: [hbm4b:s1+s3] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [hbm4b:s1+s3] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_indirect_scatter_tec_tilespmem_to_hbm4b_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_scatter_tec_tilespmem_to_hbm4b_ir:
+; CHECK: [hbm4b:s1+s4] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [hbm4b:s1+s4] = stream.indirect.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_indirect_scatter_tec_tilespmem_to_hbm4b_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 addrspace(201)* %off, i32 %size, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b
+
+; CHECK-LABEL: stream_strided_scatter_tec_tilespmem_to_hbm4b_ri:
+; CHECK: [hbm4b:s1+s4] = stream.strided.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0
+; CHECK: [hbm4b:s1+s4] = stream.strided.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0x4
+define void @stream_strided_scatter_tec_tilespmem_to_hbm4b_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tec_tilespmem_to_hbm4b_rr:
+; CHECK: [hbm4b:s1+s5] = stream.strided.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0
+; CHECK: [hbm4b:s1+s5] = stream.strided.scatter [tilespmem:s0], [sflag:s2], s3, s4, s5, $0x4
+define void @stream_strided_scatter_tec_tilespmem_to_hbm4b_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tec_tilespmem_to_hbm4b_ii:
+; CHECK: [hbm4b:s1+s3] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0
+; CHECK: [hbm4b:s1+s3] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, s2, s3, $0x4
+define void @stream_strided_scatter_tec_tilespmem_to_hbm4b_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_strided_scatter_tec_tilespmem_to_hbm4b_ir:
+; CHECK: [hbm4b:s1+s4] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0
+; CHECK: [hbm4b:s1+s4] = stream.strided.scatter [tilespmem:s0], [sflag:$0x1f], s2, s3, s4, $0x4
+define void @stream_strided_scatter_tec_tilespmem_to_hbm4b_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ call void @llvm.tpu.stream.strided.scatter.tilespmem.to.hbm4b(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, i32 %stride_size, i32 %stride_length, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm4b
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_tilespmem_to_hbm4b_ri:
+; CHECK: [hbm4b:s1+s3] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:s2], $0x4, v0, vm0, $0
+; CHECK: [hbm4b:s1+s3] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:s2], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_tilespmem_to_hbm4b_ri(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm4b.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm4b.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_tilespmem_to_hbm4b_rr:
+; CHECK: [hbm4b:s1+s4] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:s2], s3, v0, vm0, $0
+; CHECK: [hbm4b:s1+s4] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:s2], s3, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_tilespmem_to_hbm4b_rr(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 addrspace(204)* %flag, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm4b.v8i32(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm4b.v8i32(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_tilespmem_to_hbm4b_ii:
+; CHECK: [hbm4b:s1+s2] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, v0, vm0, $0
+; CHECK: [hbm4b:s1+s2] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_tilespmem_to_hbm4b_ii(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm4b.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm4b.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 4, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_indirect_vreg_scatter_tec_tilespmem_to_hbm4b_ir:
+; CHECK: [hbm4b:s1+s3] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:$0x1f], s2, v0, vm0, $0
+; CHECK: [hbm4b:s1+s3] = stream.indirect_vreg.scatter [tilespmem:s0], [sflag:$0x1f], s2, v0, vm0, $0x4
+define void @stream_indirect_vreg_scatter_tec_tilespmem_to_hbm4b_ir(i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm4b.v8i32(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ call void @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.hbm4b.v8i32(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(203)* %dst, i32 %len, <8 x i32> %offs, <8 x i1> %offsm, i32 0)
+ ret void
+}
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_spmem_ri:
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_spmem_ri(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_spmem_rr:
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_spmem_rr(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 addrspace(204)* %flag, i32 %len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_spmem_ii:
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_spmem_ii(i32 addrspace(201)* %src, i32 addrspace(202)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_spmem_ir:
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [spmem:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_spmem_ir(i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.spmem(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(202)* %dst, i32 %len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.tilespmem.to.spmem
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.tilespmem.to.spmem
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.spmem
+
+; Test intrinsic @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_tilespmem_tileN_ri:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], $0x4, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_tilespmem_tileN_ri(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag) #2 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_tilespmem_tileN_rr:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:s2], s3, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_tilespmem_tileN_rr(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag, i32 %tileN_len) #2 {
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %flag, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_tilespmem_tileN_ii:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], $0x4, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_tilespmem_tileN_ii(i32 addrspace(201)* %src, i32 addrspace(201)* %dst) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 4, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: stream_linear_scatter_tec_tilespmem_to_tilespmem_tileN_ir:
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0
+; CHECK: [tilespmem.tileN:s1] = stream.linear.scatter [tilespmem:s0], [sflag:$0x1f], s2, $0x4
+define void @stream_linear_scatter_tec_tilespmem_to_tilespmem_tileN_ir(i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len) #2 {
+ %s = tail call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 31)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 0, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ call void @llvm.tpu.stream.linear.scatter.tilespmem.to.tilespmem.tileN(i32 addrspace(204)* %s, i32 4, i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 %tileN_len, i32 0)
+ ret void
+}
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.scatter.tilespmem.to.tilespmem.tileN
+
+; Not tested intrinsic @llvm.tpu.stream.strided.scatter.tilespmem.to.tilespmem.tileN
+
+; Not tested intrinsic @llvm.tpu.stream.indirect.vreg.scatter.tilespmem.to.tilespmem.tileN
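+
+; The three untested tileN scatter variants above presumably share the operand layout of
+; their tilespmem.to.hbm counterparts. As an illustrative sketch only (not part of this
+; CL), a test for the indirect scatter variant could mirror the "_ri" pattern used earlier
+; in this file; the intrinsic name, operand order, and expected mnemonic are assumptions
+; carried over from the hbm tests, so the snippet is left commented out:
+;
+;   ; CHECK: [tilespmem.tileN:s1] = stream.indirect.scatter [tilespmem:s0], [sflag:s2], $0x4, s3, s4, $0
+;   define void @stream_indirect_scatter_tec_tilespmem_to_tilespmem_tileN_ri(
+;       i32 addrspace(201)* %src, i32 addrspace(201)* %dst, i32 addrspace(204)* %flag,
+;       i32 addrspace(201)* %off, i32 %size) #2 {
+;     call void @llvm.tpu.stream.indirect.scatter.tilespmem.to.tilespmem.tileN(
+;         i32 addrspace(204)* %flag, i32 0, i32 addrspace(201)* %src,
+;         i32 addrspace(201)* %dst, i32 4, i32 addrspace(201)* %off, i32 %size, i32 0)
+;     ret void
+;   }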
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_1instr_fifo_rec_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_1instr_fifo_rec_sc.ll
new file mode 100644
index 0000000..3ee2dc1
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_1instr_fifo_rec_sc.ll
@@ -0,0 +1,84 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -stop-after=tpu-pipeliner -tpu-pipeliner-annotate-for-testing \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; Tests that the loop below pipelines and does not crash. A crash would occur if the
+; recurrence consisting of a single composed fifo instruction were not considered,
+; including its backedge latency.
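+; In the IR below, this recurrence is presumably the uniquei call: the mask it returns
+; (extractvalue index 2, popped from the XRF fifo via scVPOP3_XRF1) feeds the %mask.06
+; phi of the next iteration, so the composed fifo push/pop pair sits on the recurrence
+; cycle.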
+
+@a = external local_unnamed_addr global i32, align 4
+
+declare { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.uniquei(<8 x i1>, <8 x i32>) #1
+declare <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32) #2
+
+; CHECK-LABEL: bb.4.
+; CHECK: scVPOP3_XRF1 {{.*}} Stage-1
+; CHECK: scVUNIQUE {{.*}} Stage-0
+
+define i32 @main() {
+entry:
+ %0 = tail call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 0)
+ %1 = load i32, i32* @a, align 4, !tbaa !7, !alias.scope !26, !noalias !13, !llvm.access.group !17
+ %cmp5 = icmp sgt i32 %1, 0
+ br i1 %cmp5, label %for.body, label %for.cond.cleanup, !llvm.loop !18
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret i32 0
+
+for.body: ; preds = %entry, %for.body
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %mask.06 = phi <8 x i1> [ %4, %for.body ], [ <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, %entry ]
+ %2 = load <8 x i32>, <8 x i32> addrspace(201)* %0, align 32, !alias.scope !27, !noalias !25, !llvm.access.group !17
+ %3 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.uniquei(<8 x i1> %mask.06, <8 x i32> %2), !alias.scope !26, !noalias !25, !llvm.access.group !17
+ %4 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %3, 2
+ %5 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %3, 1
+ store <8 x i32> %5, <8 x i32> addrspace(201)* %0, align 32, !alias.scope !27, !noalias !25, !llvm.access.group !17
+ %inc = add nuw nsw i32 %i.07, 1
+ %exitcond.not = icmp eq i32 %inc, %1
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !18
+}
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+!smem.funcs.spill = !{!2}
+!tilespmem.funcs.spill = !{!2}
+!vmem.funcs.spill = !{!2}
+!smem.ranges.spill.start = !{!3}
+!smem.ranges.spill.limit = !{!4}
+!tilespmem.ranges.spill.start = !{!5}
+!tilespmem.ranges.spill.limit = !{!6}
+!vmem.ranges.spill.start = !{!3}
+!vmem.ranges.spill.limit = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version google3-trunk (93fd30bac3345fea4f5beba3241f1ef4f2f5f419)"}
+!2 = !{i32 ()* @main}
+!3 = !{i32 0}
+!4 = !{i32 1024}
+!5 = !{i32 128}
+!6 = !{i32 131072}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"int", !9, i64 0}
+!9 = !{!"omnipotent char", !10, i64 0}
+!10 = !{!"Simple C++ TBAA"}
+!11 = distinct !{!11, !12, !"loop.parallel"}
+!12 = distinct !{!12, !"for.cond"}
+!13 = !{!14, !16}
+!14 = distinct !{!14, !15, !"alloc"}
+!15 = distinct !{!15, !"main"}
+!16 = distinct !{!16, !12, !"loop.parallel"}
+!17 = distinct !{}
+!18 = distinct !{!18, !19, !20, !21, !22, !23}
+!19 = !{!"llvm.loop.parallel_accesses", !17}
+!20 = !{!"llvm.loop.unroll.disable"}
+!21 = !{!"llvm.loop.vectorize.width", i32 1}
+!22 = !{!"llvm.loop.interleave.count", i32 1}
+!23 = !{!"llvm.loop.vectorize.enable", i1 true}
+!24 = !{!24, !12, !"loop.parallel"}
+!25 = !{!16}
+!26 = !{!11}
+!27 = !{!24, !14, !11}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_5cycle_lr_split_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_5cycle_lr_split_sc.ll
new file mode 100644
index 0000000..c7a0a18
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_5cycle_lr_split_sc.ll
@@ -0,0 +1,99 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -disable-cgp \
+; RUN: -stop-after=tpu-pipeliner -tpu-pipeliner-annotate-for-testing \
+; RUN: -enable-ordering-twist=false -tpu-latencies %S/Inputs/vld_4_cycle.yml \
+; RUN: -calculate-branch-delay-ii=false | FileCheck %s
+; REQUIRES: tpu
+
+; Tests whether the loop below can be pipelined in 5 cycles when manual live-range
+; splitting and bundle limiters are enabled.
+
+; The test is derived from the sorted_deduplicate baseline of the embeddings kernels.
+
+; The test was tailored assuming a vld latency of 4.
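+
+; The two early-clobber VMOV_SEr copies expected in Cycle-0 are presumably the manually
+; split live ranges; the remaining CHECK lines only pin the bundle and cycle structure
+; of the 5-cycle kernel rather than specific instructions.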
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <8 x i32> @llvm.tpu.vshift.insert(<8 x i32>, <8 x i32>, i32)
+declare <8 x i32> @llvm.tpu.vmpcnt.ones(<8 x i1>)
+declare <8 x i32> @llvm.tpu.mprefix.v8i32(<8 x i1>)
+declare void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, <8 x i32>)
+
+; CHECK-LABEL: bb.2.
+; CHECK: scBUNDLE
+; CHECK: early-clobber {{.*}} VMOV_SEr {{.*}}Cycle-0
+; CHECK: early-clobber {{.*}} VMOV_SEr {{.*}}Cycle-0
+; CHECK: {{.*}}Cycle-0
+; CHECK: scBUNDLE
+; CHECK: {{.*}}Cycle-1
+; CHECK: scBUNDLE
+; CHECK: {{.*}}Cycle-2
+; CHECK: scBUNDLE
+; CHECK: {{.*}}Cycle-3
+; CHECK: BRcond
+; CHECK: BR
+
+define void @tile_execute() {
+entry:
+ %0 = load i32, i32* inttoptr (i32 256 to i32*), align 256, !tbaa !38
+ %1 = load <8 x i32> addrspace(201)*, <8 x i32> addrspace(201)** inttoptr (i32 257 to <8 x i32> addrspace(201)**), align 4, !tbaa !38
+ %div = sdiv i32 %0, 8
+ %cmp35.i = icmp sgt i32 %0, 7
+ br i1 %cmp35.i, label %for.body.lr.ph.i, label %_ZN10embeddings17SortedDeduplicate7ComputeENS_20TileSpmemVectorArrayIiEEPS2_PiS3_.exit, !llvm.loop !42
+
+for.body.lr.ph.i: ; preds = %entry
+ %2 = load <8 x i32> addrspace(201)*, <8 x i32> addrspace(201)** inttoptr (i32 259 to <8 x i32> addrspace(201)**), align 4, !tbaa !38
+ %3 = load <8 x i32> addrspace(201)*, <8 x i32> addrspace(201)** inttoptr (i32 258 to <8 x i32> addrspace(201)**), align 4, !tbaa !38
+ %4 = load <8 x i32>, <8 x i32> addrspace(201)* %1, align 32, !tbaa !49
+ %vecext.i = extractelement <8 x i32> %4, i32 0
+ %sub.i = add nsw i32 %vecext.i, -1
+ %splat.splatinsert.i = insertelement <8 x i32> undef, i32 %sub.i, i32 0
+ %splat.splat.i = shufflevector <8 x i32> %splat.splatinsert.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ br label %for.body.i
+
+for.body.i: ; preds = %for.body.i, %for.body.lr.ph.i
+ %last_offset.038.i = phi <8 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, %for.body.lr.ph.i ], [ %add8.i, %for.body.i ]
+ %last_input_vec.037.i = phi <8 x i32> [ %splat.splat.i, %for.body.lr.ph.i ], [ %5, %for.body.i ]
+ %i.036.i = phi i32 [ 0, %for.body.lr.ph.i ], [ %inc.i, %for.body.i ]
+ %add.ptr.i34.i = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %1, i32 %i.036.i
+ %5 = load <8 x i32>, <8 x i32> addrspace(201)* %add.ptr.i34.i, align 32, !tbaa !49, !llvm.access.group !44
+ %6 = tail call <8 x i32> @llvm.tpu.vshift.insert(<8 x i32> %last_input_vec.037.i, <8 x i32> %5, i32 7)
+ %cmp4.i = icmp ne <8 x i32> %6, %5
+ %7 = tail call <8 x i32> @llvm.tpu.vmpcnt.ones(<8 x i1> %cmp4.i)
+ %8 = tail call <8 x i32> @llvm.tpu.mprefix.v8i32(<8 x i1> %cmp4.i)
+ %add.i = add <8 x i32> %8, %last_offset.038.i
+ tail call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> %cmp4.i, <8 x i32> addrspace(201)* %3, <8 x i32> %add.i, <8 x i32> %5), !llvm.access.group !44
+ %add.ptr.i.i = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %2, i32 %i.036.i
+ store <8 x i32> %add.i, <8 x i32> addrspace(201)* %add.ptr.i.i, align 32, !tbaa !49, !llvm.access.group !44
+ %add8.i = add <8 x i32> %7, %last_offset.038.i
+ %inc.i = add nuw nsw i32 %i.036.i, 1
+ %exitcond.not.i = icmp eq i32 %inc.i, %div
+ br i1 %exitcond.not.i, label %_ZN10embeddings17SortedDeduplicate7ComputeENS_20TileSpmemVectorArrayIiEEPS2_PiS3_.exit, label %for.body.i, !llvm.loop !42
+
+_ZN10embeddings17SortedDeduplicate7ComputeENS_20TileSpmemVectorArrayIiEEPS2_PiS3_.exit: ; preds = %for.body.i, %entry
+ %last_offset.0.lcssa.i = phi <8 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, %entry ], [ %add8.i, %for.body.i ]
+ %vecext9.i = extractelement <8 x i32> %last_offset.0.lcssa.i, i32 0
+ %add10.i = add nsw i32 %vecext9.i, 1
+ store i32 %add10.i, i32* inttoptr (i32 256 to i32*), align 256, !tbaa !38
+ ret void
+}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version google3-trunk (ada1e2ffa1172ede1790b4b42ef8ab01508d3a47)"}
+!3 = !{void ()* @tile_execute}
+!34 = !{i32 0}
+!35 = !{i32 1024}
+!36 = !{i32 8192}
+!37 = !{i32 131072}
+!38 = !{!39, !39, i64 0}
+!39 = !{!"int", !40, i64 0}
+!40 = !{!"omnipotent char", !41, i64 0}
+!41 = !{!"Simple C++ TBAA"}
+!42 = distinct !{!42, !43, !45, !46, !47, !48}
+!43 = !{!"llvm.loop.parallel_accesses", !44}
+!44 = distinct !{}
+!45 = !{!"llvm.loop.unroll.disable"}
+!46 = !{!"llvm.loop.vectorize.width", i32 1}
+!47 = !{!"llvm.loop.interleave.count", i32 1}
+!48 = !{!"llvm.loop.vectorize.enable", i1 true}
+!49 = !{!40, !40, i64 0}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_analyze_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_analyze_sc.ll
new file mode 100644
index 0000000..a7da59b
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_analyze_sc.ll
@@ -0,0 +1,122 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -stop-after=tpu-pipeliner \
+; RUN: -tpu-pipeliner-annotate-for-testing -calculate-latency-ii=false -max-ii=3 \
+; RUN: -enable-split-live-ranges=false -pre-bundle-limiters=false \
+; RUN: -enable-ordering-twist=false -calculate-branch-delay-ii=false \
+; RUN: -instcombine-max-iterations=0 | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare void @llvm.tpu.loop.parallel()
+declare i32* @llvm.tpu.inttoptr.pi32(i32)
+
+; Tests if the SW pipeliner's loop analyzer can understand a post-increment
+; in the loop.
+
+; CHECK-LABEL: {{.*}} pre_iv_update_loop
+; CHECK: bb.1.for.body.i
+; CHECK: {{.*}} Stage-0_Cycle-1
+; CHECK: {{.*}} Stage-1_Cycle-1
+
+define void @pre_iv_update_loop() {
+entry:
+ %0 = call i32* @llvm.tpu.inttoptr.pi32(i32 0)
+ %1 = call i32* @llvm.tpu.inttoptr.pi32(i32 32)
+ %2 = call i32* @llvm.tpu.inttoptr.pi32(i32 32)
+ br label %for.body.i
+
+for.body.i:
+ %i = phi i32 [ 0, %entry ], [ %ic, %for.body.i ]
+ %idx0 = getelementptr inbounds i32, i32* %0, i32 %i
+ %3 = load i32, i32* %idx0, align 4
+ store i32 %3, i32* %1, align 4
+ %idx1 = getelementptr inbounds i32, i32* %0, i32 32
+ %4 = load i32, i32* %idx1, align 4
+ store i32 %4, i32* %2, align 4
+ %idx2 = getelementptr inbounds i32, i32* %0, i32 16
+ %5 = load i32, i32* %idx2, align 4
+ store i32 %5, i32* %2, align 4
+ %cmp.i = icmp slt i32 %i, 31
+ %ic = add nuw nsw i32 %i, 1
+ br i1 %cmp.i, label %for.body.i, label %exit
+
+exit:
+ ret void
+}
+
+; Tests if the SW pipeliner's loop analyzer can understand an SRLri
+; before the loop condition's test.
+
+; CHECK-LABEL: {{.*}} srl_iv_update_loop
+; CHECK: bb.1.for.body.i
+; CHECK: {{.*}} Stage-0_Cycle-1
+; CHECK: {{.*}} Stage-1_Cycle-1
+
+define void @srl_iv_update_loop() {
+entry:
+ %0 = call i32* @llvm.tpu.inttoptr.pi32(i32 0)
+ %1 = call i32* @llvm.tpu.inttoptr.pi32(i32 32)
+ %2 = call i32* @llvm.tpu.inttoptr.pi32(i32 32)
+ br label %for.body.i
+
+for.body.i:
+ %i = phi i32 [ 0, %entry ], [ %ic, %for.body.i ]
+ %idx0 = getelementptr inbounds i32, i32* %0, i32 %i
+ %3 = load i32, i32* %idx0, align 4
+ store i32 %3, i32* %1, align 4
+ %idx1 = getelementptr inbounds i32, i32* %0, i32 32
+ %4 = load i32, i32* %idx1, align 4
+ store i32 %4, i32* %2, align 4
+ %idx2 = getelementptr inbounds i32, i32* %0, i32 16
+ %5 = load i32, i32* %idx2, align 4
+ store i32 %5, i32* %2, align 4
+ %6 = lshr i32 %i, 1
+ %cmp.i = icmp slt i32 %6, 15
+ %ic = add i32 %i, 1
+ br i1 %cmp.i, label %for.body.i, label %exit
+
+exit:
+ ret void
+}
+
+; Tests if the SW pipeliner's loop analyzer can understand
+; llvm.loop.pipeline.disable.
+
+; CHECK-LABEL: {{.*}} loop_pipeliner_disabled
+; CHECK-NOT: {{.*}} Stage-
+
+define void @loop_pipeliner_disabled() {
+entry:
+ %0 = call i32* @llvm.tpu.inttoptr.pi32(i32 0)
+ %1 = call i32* @llvm.tpu.inttoptr.pi32(i32 32)
+ %2 = call i32* @llvm.tpu.inttoptr.pi32(i32 32)
+ br label %for.body.i
+
+for.body.i:
+ %i = phi i32 [ 0, %entry ], [ %ic, %for.body.i ]
+ %idx0 = getelementptr inbounds i32, i32* %0, i32 %i
+ %3 = load i32, i32* %idx0, align 4
+ store i32 %3, i32* %1, align 4
+ %idx1 = getelementptr inbounds i32, i32* %0, i32 32
+ %4 = load i32, i32* %idx1, align 4
+ store i32 %4, i32* %2, align 4
+ %idx2 = getelementptr inbounds i32, i32* %0, i32 16
+ %5 = load i32, i32* %idx2, align 4
+ store i32 %5, i32* %2, align 4
+ %cmp.i = icmp slt i32 %i, 31
+ %ic = add nuw nsw i32 %i, 1
+ br i1 %cmp.i, label %for.body.i, label %exit, !llvm.loop !0
+
+exit:
+ ret void
+}
+
+!0 = distinct !{!0, !1, !3, !4, !5, !6, !7}
+!1 = !{!"llvm.loop.parallel_accesses", !2}
+!2 = distinct !{}
+!3 = !{!"llvm.loop.unroll.disable"}
+!4 = !{!"llvm.loop.vectorize.width", i32 1}
+!5 = !{!"llvm.loop.interleave.count", i32 1}
+!6 = !{!"llvm.loop.vectorize.enable", i1 true}
+!7 = !{!"llvm.loop.pipeline.disable", i32 1}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_bail_input_distance_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_bail_input_distance_sc.ll
new file mode 100644
index 0000000..9d43dea
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_bail_input_distance_sc.ll
@@ -0,0 +1,39 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -stop-after=tpu-pipeliner \
+; RUN: -tpu-pipeliner-annotate-for-testing -instcombine-max-iterations=0 \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32)
+
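+; Tests that the pipeliner bails out on this loop: no Stage/Cycle annotations are
+; expected because a loop-carried value is used at an iteration distance of two (the
+; stored %6 comes from %7, which in turn comes from %8 of the previous iteration),
+; which the pipeliner presumably does not handle, so the loop is left unpipelined.
+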
+; CHECK-LABEL: {{.*}} input_distance_two
+; CHECK: bb.1.for.body.i
+; CHECK-NOT: {{.*}} Stage{{.*}}Cycle
+
+define void @input_distance_two() {
+entry:
+ %0 = tail call i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32 0)
+ %1 = load i32, i32 addrspace(208)* %0, align 4
+ %2 = tail call i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32 16)
+ %3 = load i32, i32 addrspace(208)* %2, align 4
+ %4 = tail call i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32 32)
+ %5 = load i32, i32 addrspace(208)* %4, align 4
+ %l = tail call i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32 48)
+ %s = tail call i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32 52)
+ br label %for.body.i
+
+for.body.i:
+ %i = phi i32 [ 0, %entry ], [ %iv, %for.body.i ]
+ %6 = phi i32 [ %1, %entry ], [ %7, %for.body.i ]
+ %7 = phi i32 [ %3, %entry ], [ %8, %for.body.i ]
+ %.not = icmp slt i32 %7, %i
+ %iv = add nuw nsw i32 %i, 1
+ store i32 %6, i32 addrspace(208)* %s, align 4
+ %8 = load i32, i32 addrspace(208)* %l, align 4
+ br i1 %.not, label %exit, label %for.body.i
+
+exit:
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_event_debug.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_event_debug.ll
new file mode 100644
index 0000000..2ef072a
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_event_debug.ll
@@ -0,0 +1,59 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -disable-cgp -tpu-pipeliner-annotate-for-testing \
+; RUN: -tpu-latencies=%S/Inputs/long_load3.yml -enable-ordering-twist=false \
+; RUN: -stop-after=tpu-event-debug -tpu-enable-event-debug -tpu-event-debug-stores \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <8 x float> addrspace(201)* @llvm.tpu.inttoptr.p201v8float(i32) nounwind
+declare <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32) nounwind
+declare <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float>)
+declare <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float>)
+
+; Checks that the event debug pass populates event instructions as expected.
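+; Each vector store is expected to be followed by an EVENT pseudo that references a
+; per-function string symbol (@.str_loop_func_N), the stored register, and the same
+; predicate operands as the store, so the event records exactly what was stored and
+; under which predicate. The first two stores are unpredicated ($palways); the rest
+; use virtual predicate registers.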
+
+; CHECK-LABEL: loop_func
+; CHECK: scVSTi killed %[[v0:[0-9]+]], {{[0-9]+}}, $palways, [[pi0:[0-9]+]]
+; CHECK-NEXT: EVENT @.str_loop_func_{{[0-9]+}}, %[[v0]], $palways, [[pi0]]
+; CHECK: scVSTi killed %[[v1:[0-9]+]], {{[0-9]+}}, $palways, [[pi1:[0-9]+]]
+; CHECK-NEXT: EVENT @.str_loop_func_{{[0-9]+}}, %[[v1]], $palways, [[pi1]]
+; CHECK: scVSTi killed %[[v2:[0-9]+]], {{[0-9]+}}, %[[p2:[0-9]+]], [[pi2:[0-9]+]]
+; CHECK-NEXT: EVENT @.str_loop_func_{{[0-9]+}}, %[[v2]], %[[p2]], [[pi2]]
+; CHECK: scVSTi killed %[[v3:[0-9]+]], {{[0-9]+}}, %[[p3:[0-9]+]], [[pi3:[0-9]+]]
+; CHECK-NEXT: EVENT @.str_loop_func_{{[0-9]+}}, %[[v3]], %[[p3]], [[pi3]]
+; CHECK: scVSTi killed %[[v4:[0-9]+]], {{[0-9]+}}, %[[p4:[0-9]+]], [[pi4:[0-9]+]]
+; CHECK-NEXT: EVENT @.str_loop_func_{{[0-9]+}}, %[[v4]], %[[p4]], [[pi4]]
+; CHECK: scVSTi killed %[[v5:[0-9]+]], {{[0-9]+}}, %[[p5:[0-9]+]], [[pi5:[0-9]+]]
+; CHECK-NEXT: EVENT @.str_loop_func_{{[0-9]+}}, %[[v5]], %[[p5]], [[pi5]]
+
+define void @loop_func(<8 x float> %in1, i32 %u) {
+entry:
+ %a1 = call <8 x float> addrspace(201)* @llvm.tpu.inttoptr.p201v8float(i32 0)
+ %a2 = call <8 x float> addrspace(201)* @llvm.tpu.inttoptr.p201v8float(i32 32)
+ %a3 = call <8 x float> addrspace(201)* @llvm.tpu.inttoptr.p201v8float(i32 64)
+ br label %for.body.i
+
+for.body.i:
+ %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body.i ]
+ %v = load <8 x float>, <8 x float> addrspace(201)* %a1, align 32
+ %sqrt = tail call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %v)
+ %rcp = tail call <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float> %in1)
+ store <8 x float> %sqrt, <8 x float> addrspace(201)* %a2
+ store <8 x float> %rcp, <8 x float> addrspace(201)* %a3
+ %inc.i = add nuw nsw i32 %i, 1
+ %exitcond.i = icmp eq i32 %inc.i, %u
+ br i1 %exitcond.i, label %for.cond.loopexit.i, label %for.body.i, !llvm.loop !0
+
+for.cond.loopexit.i:
+ ret void
+}
+
+!0 = distinct !{!0, !1, !3, !4, !5, !6}
+!1 = !{!"llvm.loop.parallel_accesses", !2}
+!2 = distinct !{}
+!3 = !{!"llvm.loop.unroll.disable"}
+!4 = !{!"llvm.loop.vectorize.width", i32 1}
+!5 = !{!"llvm.loop.interleave.count", i32 1}
+!6 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_fifo_analysis_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_fifo_analysis_sc.ll
new file mode 100644
index 0000000..1781973
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_fifo_analysis_sc.ll
@@ -0,0 +1,170 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -stop-after=tpu-pipeliner -tpu-pipeliner-annotate-for-testing \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+; Tests whether both loops below pipeline. The test is not handwritten; it was derived by
+; clang for bug b/169101823. Checks that the fifo overflow check does not crash during
+; fifo analysis, which would happen if we attempted to analyze previously pipelined,
+; predicated code.
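+; The two loops correspond to the HistogramKeys and RankAndPermute inner loops of the
+; radix-sort kernel; the CHECK lines only require that scVUNIQUE and scVDUPCNT land in
+; some pipeline stage in their respective blocks, i.e. that both loops pipeline without
+; the fifo analysis asserting.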
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+@__sc_tile_execute_entry = alias i32, bitcast (void ()* @tile_execute to i32*)
+
+; CHECK-LABEL: bb.3.
+; CHECK: scVUNIQUE {{.*}} Stage-{{[0-9]+}}
+; CHECK-LABEL: bb.7.
+; CHECK: scVDUPCNT {{.*}} Stage-{{[0-9]+}}
+
+define void @tile_execute() #1 section ".text.tile_execute" {
+_ZN10embeddings29RadixSortTileSpmemToTileSpmemILi4EEC2EPNS_25SparsecoreMemoryAllocatorENS_20TileSpmemVectorArrayIiEE.exit:
+ %0 = tail call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 0)
+ %1 = load i32, i32* inttoptr (i32 256 to i32*), align 256, !tbaa !88
+ %2 = load i32, i32* inttoptr (i32 257 to i32*), align 4, !tbaa !88
+ %div = sdiv i32 %2, 8
+ %3 = ptrtoint <8 x i32> addrspace(201)* %0 to i32
+ %add.ptr.i157.i.i = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %0, i32 %div
+ %cmp20.i.i = icmp sgt i32 %2, 7
+ %add.ptr.i.i33.i.1 = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %add.ptr.i157.i.i, i32 1
+ br label %for.body.i
+
+for.body.i: ; preds = %_ZN10embeddings18RadixSortIterationIiLi4EE14RankAndPermuteEibNS_20TileSpmemVectorArrayIiEEPS3_.exit.i, %_ZN10embeddings29RadixSortTileSpmemToTileSpmemILi4EEC2EPNS_25SparsecoreMemoryAllocatorENS_20TileSpmemVectorArrayIiEE.exit
+ %sorter.sroa.13.sroa.19.1 = phi i32 [ %3, %_ZN10embeddings29RadixSortTileSpmemToTileSpmemILi4EEC2EPNS_25SparsecoreMemoryAllocatorENS_20TileSpmemVectorArrayIiEE.exit ], [ %sorter.sroa.4.sroa.8.0, %_ZN10embeddings18RadixSortIterationIiLi4EE14RankAndPermuteEibNS_20TileSpmemVectorArrayIiEEPS3_.exit.i ]
+ %sorter.sroa.4.sroa.8.0 = phi i32 [ %1, %_ZN10embeddings29RadixSortTileSpmemToTileSpmemILi4EEC2EPNS_25SparsecoreMemoryAllocatorENS_20TileSpmemVectorArrayIiEE.exit ], [ %sorter.sroa.13.sroa.19.1, %_ZN10embeddings18RadixSortIterationIiLi4EE14RankAndPermuteEibNS_20TileSpmemVectorArrayIiEEPS3_.exit.i ]
+ %digit.053.i = phi i32 [ 0, %_ZN10embeddings29RadixSortTileSpmemToTileSpmemILi4EEC2EPNS_25SparsecoreMemoryAllocatorENS_20TileSpmemVectorArrayIiEE.exit ], [ %inc.i, %_ZN10embeddings18RadixSortIterationIiLi4EE14RankAndPermuteEibNS_20TileSpmemVectorArrayIiEEPS3_.exit.i ]
+ store <8 x i32> zeroinitializer, <8 x i32> addrspace(201)* %add.ptr.i157.i.i, align 32, !tbaa !92
+ store <8 x i32> zeroinitializer, <8 x i32> addrspace(201)* %add.ptr.i.i33.i.1, align 32, !tbaa !92
+ %4 = inttoptr i32 %sorter.sroa.4.sroa.8.0 to <8 x i32> addrspace(201)*
+ br i1 %cmp20.i.i, label %for.body.lr.ph.i.i, label %_ZN10embeddings18RadixSortIterationIiLi4EE13HistogramKeysEiNS_20TileSpmemVectorArrayIiEE.exit.i, !llvm.loop !93
+
+for.body.lr.ph.i.i: ; preds = %for.body.i
+ %mul.i.i.i = shl nsw i32 %digit.053.i, 2
+ %splat.splatinsert.i.i.i = insertelement <8 x i32> undef, i32 %mul.i.i.i, i32 0
+ %splat.splat.i.i.i = shufflevector <8 x i32> %splat.splatinsert.i.i.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ br label %for.body.i22.i
+
+for.body.i22.i: ; preds = %for.body.i22.i, %for.body.lr.ph.i.i
+ %i.021.i.i = phi i32 [ 0, %for.body.lr.ph.i.i ], [ %add6.i.i, %for.body.i22.i ]
+ %add.ptr.i.i21.i = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %4, i32 %i.021.i.i
+ %5 = load <8 x i32>, <8 x i32> addrspace(201)* %add.ptr.i.i21.i, align 32, !tbaa !92, !alias.scope !123, !noalias !102, !llvm.access.group !95
+ %shr.i.i.i = ashr <8 x i32> %5, %splat.splat.i.i.i
+ %and.i.i.i = and <8 x i32> %shr.i.i.i, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+ %6 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.uniquei(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %and.i.i.i) #6, !noalias !104, !llvm.access.group !95
+ %n = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %6, 1
+ %nn = add <8 x i32> %and.i.i.i, %n
+ tail call void @llvm.tpu.vst.msk.idx.add.p201v8i32.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %add.ptr.i157.i.i, <8 x i32> %nn, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>), !alias.scope !123, !noalias !102, !llvm.access.group !95
+ %add6.i.i = add nuw nsw i32 %i.021.i.i, 1
+ %exitcond.not.i = icmp eq i32 %add6.i.i, %div
+ br i1 %exitcond.not.i, label %_ZN10embeddings18RadixSortIterationIiLi4EE13HistogramKeysEiNS_20TileSpmemVectorArrayIiEE.exit.i, label %for.body.i22.i, !llvm.loop !93
+
+_ZN10embeddings18RadixSortIterationIiLi4EE13HistogramKeysEiNS_20TileSpmemVectorArrayIiEE.exit.i: ; preds = %for.body.i22.i, %for.body.i
+ %7 = load <8 x i32>, <8 x i32> addrspace(201)* %add.ptr.i157.i.i, align 32, !tbaa !92
+ %8 = tail call { <8 x i32>, <8 x i1> } @llvm.tpu.add.scan1xNi(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %7) #6, !noalias !107
+ %9 = extractvalue { <8 x i32>, <8 x i1> } %8, 0
+ %10 = load <8 x i32>, <8 x i32> addrspace(201)* %add.ptr.i.i33.i.1, align 32, !tbaa !92
+ %11 = tail call { <8 x i32>, <8 x i1> } @llvm.tpu.add.scan1xNi(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %10) #6, !noalias !107
+ %12 = extractvalue { <8 x i32>, <8 x i1> } %11, 0
+ %splat.splat.i.i.1 = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ %add.i.i.1 = add <8 x i32> %12, %splat.splat.i.i.1
+ %13 = xor <8 x i32> %7, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %add.i.i.i.i.i = add <8 x i32> %9, %13
+ store <8 x i32> %add.i.i.i.i.i, <8 x i32> addrspace(201)* %add.ptr.i157.i.i, align 32, !tbaa !92
+ %14 = xor <8 x i32> %10, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %add.i.i.i.i.i.1 = add <8 x i32> %add.i.i.1, %14
+ store <8 x i32> %add.i.i.i.i.i.1, <8 x i32> addrspace(201)* %add.ptr.i.i33.i.1, align 32, !tbaa !92
+ br i1 %cmp20.i.i, label %for.body.lr.ph.i.i.i, label %_ZN10embeddings18RadixSortIterationIiLi4EE14RankAndPermuteEibNS_20TileSpmemVectorArrayIiEEPS3_.exit.i, !llvm.loop !110
+
+for.body.lr.ph.i.i.i: ; preds = %_ZN10embeddings18RadixSortIterationIiLi4EE13HistogramKeysEiNS_20TileSpmemVectorArrayIiEE.exit.i
+ %mul.i.i.i.i = shl nsw i32 %digit.053.i, 2
+ %splat.splatinsert.i.i.i.i = insertelement <8 x i32> undef, i32 %mul.i.i.i.i, i32 0
+ %splat.splat.i.i.i.i = shufflevector <8 x i32> %splat.splatinsert.i.i.i.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %15 = inttoptr i32 %sorter.sroa.13.sroa.19.1 to <8 x i32> addrspace(201)*
+ br label %for.body.i.i.i
+
+for.body.i.i.i: ; preds = %for.body.i.i.i, %for.body.lr.ph.i.i.i
+ %i.037.i.i.i = phi i32 [ 0, %for.body.lr.ph.i.i.i ], [ %add7.i.i.i, %for.body.i.i.i ]
+ %16 = tail call <8 x i32> addrspace(201)* @llvm.tpu.make.restrict.ptr(<8 x i32> addrspace(201)* %4), !alias.scope !124, !noalias !115, !llvm.access.group !112
+ %add.ptr.i.i.i = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %16, i32 %i.037.i.i.i
+ %17 = load <8 x i32>, <8 x i32> addrspace(201)* %add.ptr.i.i.i, align 32, !tbaa !92, !alias.scope !124, !noalias !115, !llvm.access.group !112
+ %shr.i.i.i.i = ashr <8 x i32> %17, %splat.splat.i.i.i.i
+ %and.i.i.i.i = and <8 x i32> %shr.i.i.i.i, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+ %18 = tail call <8 x i32> @llvm.tpu.vst.msk.idx.ret.add.np.v8i32.p201v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %add.ptr.i157.i.i, <8 x i32> %and.i.i.i.i, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>), !alias.scope !125, !noalias !119, !llvm.access.group !112
+ %19 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.dupcnti(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %and.i.i.i.i) #6, !noalias !120, !llvm.access.group !112
+ %20 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %19, 1
+ %add6.i.i.i = add <8 x i32> %20, %18
+ tail call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %15, <8 x i32> %add6.i.i.i, <8 x i32> %17), !alias.scope !124, !noalias !115, !llvm.access.group !112
+ %add7.i.i.i = add nuw nsw i32 %i.037.i.i.i, 1
+ %exitcond.not.i.i = icmp eq i32 %add7.i.i.i, %div
+ br i1 %exitcond.not.i.i, label %_ZN10embeddings18RadixSortIterationIiLi4EE14RankAndPermuteEibNS_20TileSpmemVectorArrayIiEEPS3_.exit.i, label %for.body.i.i.i, !llvm.loop !110
+
+_ZN10embeddings18RadixSortIterationIiLi4EE14RankAndPermuteEibNS_20TileSpmemVectorArrayIiEEPS3_.exit.i: ; preds = %for.body.i.i.i, %_ZN10embeddings18RadixSortIterationIiLi4EE13HistogramKeysEiNS_20TileSpmemVectorArrayIiEE.exit.i
+ %inc.i = add nuw nsw i32 %digit.053.i, 1
+ %exitcond56.not.i = icmp eq i32 %inc.i, 8
+ br i1 %exitcond56.not.i, label %_ZN10embeddings29RadixSortTileSpmemToTileSpmemILi4EE4SortEv.exit, label %for.body.i
+
+_ZN10embeddings29RadixSortTileSpmemToTileSpmemILi4EE4SortEv.exit: ; preds = %_ZN10embeddings18RadixSortIterationIiLi4EE14RankAndPermuteEibNS_20TileSpmemVectorArrayIiEEPS3_.exit.i
+ store i32 1, i32* inttoptr (i32 256 to i32*), align 256, !tbaa !88
+ ret void
+}
+
+declare void @llvm.tpu.vst.msk.idx.add.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, <8 x i32>)
+declare { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.uniquei(<8 x i1>, <8 x i32>)
+declare { <8 x i32>, <8 x i1> } @llvm.tpu.add.scan1xNi(<8 x i1>, <8 x i32>)
+declare noalias <8 x i32> addrspace(201)* @llvm.tpu.make.restrict.ptr(<8 x i32> addrspace(201)*)
+declare <8 x i32> @llvm.tpu.vst.msk.idx.ret.add.np.v8i32.p201v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, <8 x i32>)
+declare { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.dupcnti(<8 x i1>, <8 x i32>)
+declare void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, <8 x i32>)
+declare <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32)
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version google3-trunk (93fd30bac3345fea4f5beba3241f1ef4f2f5f419)"}
+!3 = !{void ()* @tile_execute}
+!82 = !{i32 0}
+!83 = !{i32 512}
+!84 = !{i32 1024}
+!85 = !{i32 8192}
+!86 = !{i32 65535}
+!87 = !{i32 131072}
+!88 = !{!89, !89, i64 0}
+!89 = !{!"int", !90, i64 0}
+!90 = !{!"omnipotent char", !91, i64 0}
+!91 = !{!"Simple C++ TBAA"}
+!92 = !{!90, !90, i64 0}
+!93 = distinct !{!93, !94, !96, !97, !98, !99}
+!94 = !{!"llvm.loop.parallel_accesses", !95}
+!95 = distinct !{}
+!96 = !{!"llvm.loop.unroll.disable"}
+!97 = !{!"llvm.loop.vectorize.width", i32 1}
+!98 = !{!"llvm.loop.interleave.count", i32 1}
+!99 = !{!"llvm.loop.vectorize.enable", i1 true}
+!100 = distinct !{!100, !101, !"loop.parallel"}
+!101 = distinct !{!101, !"for.cond"}
+!102 = !{!103}
+!103 = distinct !{!103, !101, !"loop.parallel"}
+!104 = !{!105}
+!105 = distinct !{!105, !106, !"_ZN10embeddings13VectorVuniqueIDv8_iEENS_13VuniqueResultIT_EES1_S3_: %agg.result"}
+!106 = distinct !{!106, !"_ZN10embeddings13VectorVuniqueIDv8_iEENS_13VuniqueResultIT_EES1_S3_"}
+!107 = !{!108}
+!108 = distinct !{!108, !109, !"_ZN10embeddings13VectorAddScanIDv8_iEENS_13AddScanResultIT_EES1_S3_: %agg.result"}
+!109 = distinct !{!109, !"_ZN10embeddings13VectorAddScanIDv8_iEENS_13AddScanResultIT_EES1_S3_"}
+!110 = distinct !{!110, !111, !96, !97, !98, !99}
+!111 = !{!"llvm.loop.parallel_accesses", !112}
+!112 = distinct !{}
+!113 = distinct !{!113, !114, !"loop.parallel"}
+!114 = distinct !{!114, !"for.cond"}
+!115 = !{!116, !117, !118}
+!116 = distinct !{!116, !114, !"loop.parallel"}
+!117 = distinct !{!117, !114, !"loop.parallel"}
+!118 = distinct !{!118, !114, !"loop.parallel"}
+!119 = !{!113, !116}
+!120 = !{!121}
+!121 = distinct !{!121, !122, !"_ZN10embeddings12VectorDupcntIDv8_iEENS_12DupcntResultIT_EES1_S3_: %agg.result"}
+!122 = distinct !{!122, !"_ZN10embeddings12VectorDupcntIDv8_iEENS_12DupcntResultIT_EES1_S3_"}
+!123 = !{!100}
+!124 = !{!113}
+!125 = !{!117}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_fifo_overflow_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_fifo_overflow_sc.ll
new file mode 100644
index 0000000..62e854b
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_fifo_overflow_sc.ll
@@ -0,0 +1,102 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -disable-cgp -tpu-latencies=%S/Inputs/long_push4.yml \
+; RUN: -tpu-pipeliner-annotate-for-testing -stop-after=tpu-pipeliner \
+; RUN: -enable-ordering-twist=false -calculate-branch-delay-ii=false \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <8 x float> addrspace(201)* @llvm.tpu.inttoptr.p201v8float(i32) nounwind
+declare <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32) nounwind
+declare { <8 x float>, <8 x i1> } @llvm.tpu.deprecated.segreduce.addf(<8 x i32>, <8 x float>) nounwind
+declare {<8 x i32>, <8 x i32>, <8 x i1>} @llvm.tpu.sort.ascdi.v8i32(<8 x i1>, <8 x i32>, <8 x i32>) nounwind
+
+; Checks that the fifo overflow check in the software pipeliner leads to a schedule
+; that does not overflow the fifo.
+
+; CHECK-LABEL: tight_loop_func_xrf0
+; CHECK-LABEL: bb.3.for.body.i:
+; CHECK: scVSEGREDUCEADDF
+; CHECK-LABEL: bb.4.for.body.i:
+; CHECK: scVSEGREDUCEADDF
+; CHECK-LABEL: bb.5.for.body.i:
+; CHECK: scVSEGREDUCEADDF
+; CHECK-LABEL: bb.6.for.body.i:
+; CHECK: scVSEGREDUCEADDF
+; CHECK-LABEL: bb.7.for.body.i:
+; CHECK: scVSEGREDUCEADDF
+; CHECK-LABEL: bb.1.for.body.i:
+; CHECK-NEXT: successors: {{.*}} %bb.1
+; CHECK: PHI
+; CHECK: scVSEGREDUCEADDF
+; CHECK: scVPOP3_XRF0 {{.*}} Stage-5
+
+define void @tight_loop_func_xrf0(<8 x i32> %0, <8 x float> %1) {
+entry:
+ %2 = call <8 x float> addrspace(201)* @llvm.tpu.inttoptr.p201v8float(i32 0)
+ br label %for.body.i
+
+for.body.i:
+ %j.0104.i = phi i32 [ 0, %entry ], [ %inc.i, %for.body.i ]
+ %3 = tail call { <8 x float>, <8 x i1> } @llvm.tpu.deprecated.segreduce.addf(<8 x i32> %0, <8 x float> %1)
+ %4 = extractvalue { <8 x float>, <8 x i1> } %3, 0
+ store <8 x float> %4, <8 x float> addrspace(201)* %2
+ %inc.i = add nuw nsw i32 %j.0104.i, 1
+ %exitcond.i = icmp eq i32 %inc.i, 32
+ br i1 %exitcond.i, label %for.cond.loopexit.i, label %for.body.i, !llvm.loop !0
+
+for.cond.loopexit.i:
+ ret void
+}
+
+; CHECK-LABEL: tight_loop_func_xrf1
+; CHECK-LABEL: bb.3.for.body.i:
+; CHECK: scVSORT
+; CHECK-LABEL: bb.4.for.body.i:
+; CHECK: scVSORT
+; CHECK-LABEL: bb.5.for.body.i:
+; CHECK: scVSORT
+; CHECK-LABEL: bb.6.for.body.i:
+; CHECK: scVSORT
+; CHECK-LABEL: bb.7.for.body.i:
+; CHECK: scVSORT
+; CHECK-LABEL: bb.8.for.body.i:
+; CHECK: scVSORT
+; CHECK-LABEL: bb.9.for.body.i:
+; CHECK: scVSORT
+; CHECK-LABEL: bb.10.for.body.i:
+; CHECK: scVSORT
+; CHECK-LABEL: bb.11.for.body.i:
+; CHECK: scVSORT
+; CHECK-LABEL: bb.1.for.body.i:
+; CHECK-NEXT: successors: {{.*}} %bb.1
+; CHECK: PHI
+; CHECK: scVSORT
+; CHECK: scVPOP3_XRF1 {{.*}} Stage-9
+
+define void @tight_loop_func_xrf1(<8 x i1> %0, <8 x i32> %1, <8 x i32> %2) {
+entry:
+ %3 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 0)
+ br label %for.body.i
+
+for.body.i:
+ %j.0104.i = phi i32 [ 0, %entry ], [ %inc.i, %for.body.i ]
+ %4 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> %0, <8 x i32> %1, <8 x i32> %2)
+ %5 = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %4, 0
+ store <8 x i32> %5, <8 x i32> addrspace(201)* %3
+ %inc.i = add nuw nsw i32 %j.0104.i, 1
+ %exitcond.i = icmp eq i32 %inc.i, 32
+ br i1 %exitcond.i, label %for.cond.loopexit.i, label %for.body.i, !llvm.loop !0
+
+for.cond.loopexit.i:
+ ret void
+}
+
+!0 = distinct !{!0, !1, !3, !4, !5, !6}
+!1 = !{!"llvm.loop.parallel_accesses", !2}
+!2 = distinct !{}
+!3 = !{!"llvm.loop.unroll.disable"}
+!4 = !{!"llvm.loop.vectorize.width", i32 1}
+!5 = !{!"llvm.loop.interleave.count", i32 1}
+!6 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_fifo_reorder_bf16_gl_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_fifo_reorder_bf16_gl_sc.ll
new file mode 100644
index 0000000..662e69e
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_fifo_reorder_bf16_gl_sc.ll
@@ -0,0 +1,60 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-gl -disable-cgp -tpu-latencies=%S/Inputs/long_load3.yml \
+; RUN: -tpu-pipeliner-annotate-for-testing -stop-after=tpu-pipeliner \
+; RUN: -tpu-enable-vliw-prep-postiv=false -tpu-enable-vliw-prep-post-addrinc=false \
+; RUN: -calculate-branch-delay-ii=false | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <8 x float> addrspace(201)* @llvm.tpu.inttoptr.p201v8float(i32) nounwind
+declare <32 x i8> addrspace(201)* @llvm.tpu.inttoptr.p201v32i8(i32) nounwind
+declare <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32) nounwind
+declare { <16 x bfloat>, <8 x i1> } @llvm.tpu.add.half.seg.scan2xNbf16(<8 x i1>, <16 x bfloat>)
+declare <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float>)
+declare <16 x bfloat> @llvm.tpu.unpack.c.l.u8.bf16(<32 x i8>)
+declare <32 x i8> @llvm.tpu.pack.c.bf16.u8(<16 x bfloat>, <16 x bfloat>)
+
+; Checks that the software pipeliner is able to reorder bf16 fifo instructions. This is the
+; same test as sw_pipeliner_fifo_reorder_sc.ll, just with bf16.
+
+; CHECK-LABEL: loop_func
+; CHECK-LABEL: bb.1.for.body.i
+; CHECK: VRCP {{.*}} Stage-0_Cycle-0
+; CHECK: scVADDSEGSCAN2XNHALFBF16 {{.*}} Stage-0_Cycle-4
+; CHECK: BRcond %bb.1
+
+define void @loop_func(<8 x float> %in1, <8 x i1> %m) #0 {
+entry:
+ %a1 = call <32 x i8> addrspace(201)* @llvm.tpu.inttoptr.p201v32i8(i32 0)
+ %a2 = call <32 x i8> addrspace(201)* @llvm.tpu.inttoptr.p201v32i8(i32 32)
+ %a3 = call <8 x float> addrspace(201)* @llvm.tpu.inttoptr.p201v8float(i32 64)
+ br label %for.body.i
+
+for.body.i:
+ %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body.i ]
+ %v = load <32 x i8>, <32 x i8> addrspace(201)* %a1, align 32
+ %vs = tail call <16 x bfloat> @llvm.tpu.unpack.c.l.u8.bf16(<32 x i8> %v)
+ %res = call { <16 x bfloat>, <8 x i1> } @llvm.tpu.add.half.seg.scan2xNbf16(<8 x i1> %m, <16 x bfloat> %vs)
+ %rcp = tail call <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float> %in1)
+ %p = extractvalue { <16 x bfloat>, <8 x i1> } %res, 0
+ %pr = tail call <32 x i8> @llvm.tpu.pack.c.bf16.u8(<16 x bfloat> %p, <16 x bfloat> %p)
+ store <32 x i8> %pr, <32 x i8> addrspace(201)* %a2
+ store <8 x float> %rcp, <8 x float> addrspace(201)* %a3
+ %inc.i = add nuw nsw i32 %i, 1
+ %exitcond.i = icmp eq i32 %inc.i, 32
+ br i1 %exitcond.i, label %for.cond.loopexit.i, label %for.body.i, !llvm.loop !0
+
+for.cond.loopexit.i:
+ ret void
+}
+
+!0 = distinct !{!0, !1, !3, !4, !5, !6}
+!1 = !{!"llvm.loop.parallel_accesses", !2}
+!2 = distinct !{}
+!3 = !{!"llvm.loop.unroll.disable"}
+!4 = !{!"llvm.loop.vectorize.width", i32 1}
+!5 = !{!"llvm.loop.interleave.count", i32 1}
+!6 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+attributes #0 = { "implicit-section-name"=".text.tile_execute" "target-cpu"="sparsecore-tec-gl" }
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_fifo_reorder_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_fifo_reorder_sc.ll
new file mode 100644
index 0000000..93614c4
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_fifo_reorder_sc.ll
@@ -0,0 +1,53 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -disable-cgp -tpu-latencies=%S/Inputs/long_load3.yml \
+; RUN: -tpu-pipeliner-annotate-for-testing -stop-after=tpu-pipeliner \
+; RUN: -tpu-enable-vliw-prep-postiv=false -tpu-enable-vliw-prep-post-addrinc=false \
+; RUN: -calculate-branch-delay-ii=false | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <8 x float> addrspace(201)* @llvm.tpu.inttoptr.p201v8float(i32) nounwind
+declare <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32) nounwind
+declare <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float>)
+declare <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float>)
+
+; Checks that the software pipeliner is able to reorder fifo instructions. This is a
+; preliminary test that produces invalid code; it simply checks that two fifo
+; instructions are reordered as expected.
+
+; CHECK-LABEL: loop_func
+; CHECK-LABEL: bb.1.for.body.i
+; CHECK: VRCP {{.*}} Stage-0
+; CHECK: VRSQRT {{.*}} Stage-3
+; CHECK: BRcond %bb.1
+
+define void @loop_func(<8 x float> %in1) {
+entry:
+ %a1 = call <8 x float> addrspace(201)* @llvm.tpu.inttoptr.p201v8float(i32 0)
+ %a2 = call <8 x float> addrspace(201)* @llvm.tpu.inttoptr.p201v8float(i32 32)
+ %a3 = call <8 x float> addrspace(201)* @llvm.tpu.inttoptr.p201v8float(i32 64)
+ br label %for.body.i
+
+for.body.i:
+ %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body.i ]
+ %v = load <8 x float>, <8 x float> addrspace(201)* %a1, align 32
+ %sqrt = tail call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %v)
+ %rcp = tail call <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float> %in1)
+ store <8 x float> %sqrt, <8 x float> addrspace(201)* %a2
+ store <8 x float> %rcp, <8 x float> addrspace(201)* %a3
+ %inc.i = add nuw nsw i32 %i, 1
+ %exitcond.i = icmp eq i32 %inc.i, 32
+ br i1 %exitcond.i, label %for.cond.loopexit.i, label %for.body.i, !llvm.loop !0
+
+for.cond.loopexit.i:
+ ret void
+}
+
+!0 = distinct !{!0, !1, !3, !4, !5, !6}
+!1 = !{!"llvm.loop.parallel_accesses", !2}
+!2 = distinct !{}
+!3 = !{!"llvm.loop.unroll.disable"}
+!4 = !{!"llvm.loop.vectorize.width", i32 1}
+!5 = !{!"llvm.loop.interleave.count", i32 1}
+!6 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_fifo_same_cycle_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_fifo_same_cycle_sc.ll
new file mode 100644
index 0000000..9d6a3ec
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_fifo_same_cycle_sc.ll
@@ -0,0 +1,55 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -disable-cgp -tpu-latencies=%S/Inputs/long_load2.yml \
+; RUN: -tpu-pipeliner-annotate-for-testing -stop-after=tpu-pipeliner \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <8 x float> addrspace(201)* @llvm.tpu.inttoptr.p201v8float(i32) nounwind
+declare <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32) nounwind
+declare <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float>)
+declare <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float>)
+
+; Checks that the software pipeliner will not schedule two erf fifo instructions in the
+; same cycle, which can happen unless the swing scheduler tracks resource usage.
+
+; CHECK-LABEL: loop_func
+; CHECK-LABEL: bb.1.for.body.i
+; CHECK-DAG: VRCP {{.*}} Stage-{{[0-9]+}}_Cycle-[[c:[0-9]+]]
+; CHECK-NOT: VRSQRT {{.*}} Stage-{{[0-9]+}}_Cycle-[[c]]
+; CHECK-NOT: VRSQRT {{.*}} Stage-{{[0-9]+}}_Cycle-[[c]]
+; CHECK: BRcond %bb.1
+
+define void @loop_func(<8 x float> %in1) {
+entry:
+ %a1 = call <8 x float> addrspace(201)* @llvm.tpu.inttoptr.p201v8float(i32 0)
+ %a2 = call <8 x float> addrspace(201)* @llvm.tpu.inttoptr.p201v8float(i32 32)
+ %a3 = call <8 x float> addrspace(201)* @llvm.tpu.inttoptr.p201v8float(i32 64)
+ %a4 = call <8 x float> addrspace(201)* @llvm.tpu.inttoptr.p201v8float(i32 96)
+ br label %for.body.i
+
+for.body.i:
+ %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body.i ]
+ %v = load <8 x float>, <8 x float> addrspace(201)* %a1, align 32
+ %sqrt0 = tail call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %v)
+ %sqrt1 = tail call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %in1)
+ %rcp = tail call <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float> %in1)
+ store <8 x float> %sqrt0, <8 x float> addrspace(201)* %a2
+ store <8 x float> %rcp, <8 x float> addrspace(201)* %a3
+ store <8 x float> %sqrt1, <8 x float> addrspace(201)* %a4
+ %inc.i = add nuw nsw i32 %i, 1
+ %exitcond.i = icmp eq i32 %inc.i, 32
+ br i1 %exitcond.i, label %for.cond.loopexit.i, label %for.body.i, !llvm.loop !0
+
+for.cond.loopexit.i:
+ ret void
+}
+
+!0 = distinct !{!0, !1, !3, !4, !5, !6}
+!1 = !{!"llvm.loop.parallel_accesses", !2}
+!2 = distinct !{}
+!3 = !{!"llvm.loop.unroll.disable"}
+!4 = !{!"llvm.loop.vectorize.width", i32 1}
+!5 = !{!"llvm.loop.interleave.count", i32 1}
+!6 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_fifo_two_pops_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_fifo_two_pops_sc.ll
new file mode 100644
index 0000000..8040e38
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_fifo_two_pops_sc.ll
@@ -0,0 +1,165 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -disable-cgp \
+; RUN: -tpu-pipeliner-annotate-for-testing -stop-after=tpu-pipeliner \
+; RUN: -add-backedges-to-dag=false \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; Tests that the pipeliner does not schedule scVPOP3_XRF0 and scVPOP3_XRF1 into the same bundle.
+; The test is derived and cut down from segmented_reduce_test.cc -DFEATURE_SIZE=8 -DPIPELINE_OUTER.
+
+; The switch -add-backedges-to-dag=false was added only to keep the original layout of the
+; test; the test itself is unrelated to DAG backedges.
+
+declare <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>)
+declare { <8 x float>, <8 x i1> } @llvm.tpu.deprecated.segreduce.addf(<8 x i32>, <8 x float>)
+declare void @llvm.tpu.vst.msk.idx.add.p201v8f32.v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>, <8 x float>)
+declare { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1>, <8 x i32>, <8 x i32>)
+
+; CHECK-LABEL: tile_execute
+; CHECK-LABEL: bb.2.
+; CHECK: scVPOP3_XRF1 killed %{{[0-9]+}}, $palways, 0, post-instr-symbol <mcsymbol Stage-{{[0-9]+}}_Cycle-[[c0:[0-9]+]]>
+; CHECK-NOT: scVPOP3_XRF0 killed %{{[0-9]+}}, $palways, 0, post-instr-symbol <mcsymbol Stage-{{[0-9]+}}_Cycle-[[c0]]>
+; CHECK: scVPOP3_XRF0 killed %{{[0-9]+}}, $palways, 0, post-instr-symbol <mcsymbol Stage-{{[0-9]+}}_Cycle-[[c1:[0-9]+]]>
+; CHECK-NOT: scVPOP3_XRF1 killed %{{[0-9]+}}, $palways, 0, post-instr-symbol <mcsymbol Stage-{{[0-9]+}}_Cycle-[[c1]]>
+; CHECK: scVPOP3_XRF0 killed %{{[0-9]+}}, $palways, 0, post-instr-symbol <mcsymbol Stage-{{[0-9]+}}_Cycle-[[c2:[0-9]+]]>
+; CHECK-NOT: scVPOP3_XRF1 killed %{{[0-9]+}}, $palways, 0, post-instr-symbol <mcsymbol Stage-{{[0-9]+}}_Cycle-[[c2]]>
+; CHECK: scVPOP3_XRF0 killed %{{[0-9]+}}, $palways, 0, post-instr-symbol <mcsymbol Stage-{{[0-9]+}}_Cycle-[[c3:[0-9]+]]>
+; CHECK-NOT: scVPOP3_XRF1 killed %{{[0-9]+}}, $palways, 0, post-instr-symbol <mcsymbol Stage-{{[0-9]+}}_Cycle-[[c3]]>
+; CHECK: scVPOP3_XRF0 killed %{{[0-9]+}}, $palways, 0, post-instr-symbol <mcsymbol Stage-{{[0-9]+}}_Cycle-[[c4:[0-9]+]]>
+; CHECK-NOT: scVPOP3_XRF1 killed %{{[0-9]+}}, $palways, 0, post-instr-symbol <mcsymbol Stage-{{[0-9]+}}_Cycle-[[c4]]>
+; CHECK: scVPOP3_XRF0 killed %{{[0-9]+}}, $palways, 0, post-instr-symbol <mcsymbol Stage-{{[0-9]+}}_Cycle-[[c5:[0-9]+]]>
+; CHECK-NOT: scVPOP3_XRF1 killed %{{[0-9]+}}, $palways, 0, post-instr-symbol <mcsymbol Stage-{{[0-9]+}}_Cycle-[[c5]]>
+; CHECK: scVPOP3_XRF0 killed %{{[0-9]+}}, $palways, 0, post-instr-symbol <mcsymbol Stage-{{[0-9]+}}_Cycle-[[c6:[0-9]+]]>
+; CHECK-NOT: scVPOP3_XRF1 killed %{{[0-9]+}}, $palways, 0, post-instr-symbol <mcsymbol Stage-{{[0-9]+}}_Cycle-[[c6]]>
+; CHECK: scVPOP3_XRF0 killed %{{[0-9]+}}, $palways, 0, post-instr-symbol <mcsymbol Stage-{{[0-9]+}}_Cycle-[[c7:[0-9]+]]>
+; CHECK-NOT: scVPOP3_XRF1 killed %{{[0-9]+}}, $palways, 0, post-instr-symbol <mcsymbol Stage-{{[0-9]+}}_Cycle-[[c7]]>
+; CHECK: scVPOP3_XRF0 killed %{{[0-9]+}}, $palways, 0, post-instr-symbol <mcsymbol Stage-{{[0-9]+}}_Cycle-[[c8:[0-9]+]]>
+; CHECK-NOT: scVPOP3_XRF1 killed %{{[0-9]+}}, $palways, 0, post-instr-symbol <mcsymbol Stage-{{[0-9]+}}_Cycle-[[c8]]>
+; CHECK: scVPOP3_XRF1 killed %{{[0-9]+}}, $palways, 0, post-instr-symbol <mcsymbol Stage-{{[0-9]+}}_Cycle-[[c9:[0-9]+]]>
+; CHECK-NOT: scVPOP3_XRF0 killed %{{[0-9]+}}, $palways, 0, post-instr-symbol <mcsymbol Stage-{{[0-9]+}}_Cycle-[[c9]]>
+; CHECK: BRcond %bb.2
+
+define void @tile_execute() {
+entry:
+ %0 = load i32, i32* inttoptr (i32 258 to i32*), align 4
+ %1 = load <8 x i32> addrspace(201)*, <8 x i32> addrspace(201)** inttoptr (i32 260 to <8 x i32> addrspace(201)**), align 4
+ %2 = load <8 x i32> addrspace(201)*, <8 x i32> addrspace(201)** inttoptr (i32 261 to <8 x i32> addrspace(201)**), align 4
+ %3 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** inttoptr (i32 262 to <8 x float> addrspace(201)**), align 4
+ %4 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** inttoptr (i32 263 to <8 x float> addrspace(201)**), align 4
+ %5 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** inttoptr (i32 264 to <8 x float> addrspace(201)**), align 8
+ %div = sdiv i32 %0, 8
+ %6 = load <8 x i32>, <8 x i32> addrspace(201)* %1, align 32
+ %7 = load <8 x i32>, <8 x i32> addrspace(201)* %2, align 32
+ %8 = bitcast <8 x float> addrspace(201)* %3 to <8 x i32> addrspace(201)*
+ %9 = load <8 x i32>, <8 x i32> addrspace(201)* %8, align 32
+ %10 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %6, <8 x i32> %7)
+ %11 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %6, <8 x i32> %9)
+ %cmp112.i = icmp sgt i32 %0, 7
+ br i1 %cmp112.i, label %for.body.i.preheader, label %_ZN10embeddings15SegmentedReduce7ComputeEiNS_20TileSpmemVectorArrayIiEES2_NS1_IfEES3_PS3_.exit, !llvm.loop !60
+
+for.body.i.preheader: ; preds = %entry
+ %sub.i = add nsw i32 %div, -1
+ br label %for.cond.loopexit.loopexit.unr-lcssa.i
+
+for.cond.loopexit.loopexit.unr-lcssa.i: ; preds = %for.cond.loopexit.loopexit.unr-lcssa.i, %for.body.i.preheader
+ %.pn.i = phi { <8 x i32>, <8 x i32>, <8 x i1> } [ %17, %for.cond.loopexit.loopexit.unr-lcssa.i ], [ %11, %for.body.i.preheader ]
+ %.pn117.i = phi { <8 x i32>, <8 x i32>, <8 x i1> } [ %16, %for.cond.loopexit.loopexit.unr-lcssa.i ], [ %10, %for.body.i.preheader ]
+ %i.0113.i = phi i32 [ %add.i, %for.cond.loopexit.loopexit.unr-lcssa.i ], [ 0, %for.body.i.preheader ]
+ %sorted_segments.0114.i = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %.pn117.i, 0
+ %sorted_gains.0116.in.i = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %.pn.i, 1
+ %sorted_gains.0116.i = bitcast <8 x i32> %sorted_gains.0116.in.i to <8 x float>
+ %cmp7.i = icmp eq i32 %i.0113.i, %sub.i
+ %add.i = add nuw nsw i32 %i.0113.i, 1
+ %cond.i = select i1 %cmp7.i, i32 %i.0113.i, i32 %add.i
+ %add.ptr.i81.i = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %1, i32 %cond.i
+ %12 = load <8 x i32>, <8 x i32> addrspace(201)* %add.ptr.i81.i, align 32
+ %add.ptr.i79.i = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %2, i32 %cond.i
+ %13 = load <8 x i32>, <8 x i32> addrspace(201)* %add.ptr.i79.i, align 32
+ %add.ptr.i.i = getelementptr inbounds <8 x float>, <8 x float> addrspace(201)* %3, i32 %cond.i
+ %14 = bitcast <8 x float> addrspace(201)* %add.ptr.i.i to <8 x i32> addrspace(201)*
+ %15 = load <8 x i32>, <8 x i32> addrspace(201)* %14, align 32
+ %16 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %12, <8 x i32> %13)
+ %17 = tail call { <8 x i32>, <8 x i32>, <8 x i1> } @llvm.tpu.sort.ascdi.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %12, <8 x i32> %15)
+ %sorted_indices.0115.i = extractvalue { <8 x i32>, <8 x i32>, <8 x i1> } %.pn117.i, 1
+ %mul.i = shl <8 x i32> %sorted_indices.0115.i, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %mul30.i = shl <8 x i32> %sorted_segments.0114.i, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %18 = tail call <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %4, <8 x i32> %mul.i), !llvm.access.group !62
+ %mul27.i = fmul <8 x float> %18, %sorted_gains.0116.i
+ %19 = tail call { <8 x float>, <8 x i1> } @llvm.tpu.deprecated.segreduce.addf(<8 x i32> %sorted_segments.0114.i, <8 x float> %mul27.i), !llvm.access.group !62
+ %20 = extractvalue { <8 x float>, <8 x i1> } %19, 0
+ %21 = extractvalue { <8 x float>, <8 x i1> } %19, 1
+ tail call void @llvm.tpu.vst.msk.idx.add.p201v8f32.v8f32(<8 x i1> %21, <8 x float> addrspace(201)* %5, <8 x i32> %mul30.i, <8 x float> %20), !llvm.access.group !62
+ %add25.1.i = or <8 x i32> %mul.i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %22 = tail call <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %4, <8 x i32> %add25.1.i), !llvm.access.group !62
+ %mul27.1.i = fmul <8 x float> %22, %sorted_gains.0116.i
+ %23 = tail call { <8 x float>, <8 x i1> } @llvm.tpu.deprecated.segreduce.addf(<8 x i32> %sorted_segments.0114.i, <8 x float> %mul27.1.i), !llvm.access.group !62
+ %24 = extractvalue { <8 x float>, <8 x i1> } %23, 0
+ %25 = extractvalue { <8 x float>, <8 x i1> } %23, 1
+ %add33.1.i = or <8 x i32> %mul30.i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ tail call void @llvm.tpu.vst.msk.idx.add.p201v8f32.v8f32(<8 x i1> %25, <8 x float> addrspace(201)* %5, <8 x i32> %add33.1.i, <8 x float> %24), !llvm.access.group !62
+ %add25.2.i = or <8 x i32> %mul.i, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ %26 = tail call <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %4, <8 x i32> %add25.2.i), !llvm.access.group !62
+ %mul27.2.i = fmul <8 x float> %26, %sorted_gains.0116.i
+ %27 = tail call { <8 x float>, <8 x i1> } @llvm.tpu.deprecated.segreduce.addf(<8 x i32> %sorted_segments.0114.i, <8 x float> %mul27.2.i), !llvm.access.group !62
+ %28 = extractvalue { <8 x float>, <8 x i1> } %27, 0
+ %29 = extractvalue { <8 x float>, <8 x i1> } %27, 1
+ %add33.2.i = or <8 x i32> %mul30.i, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ tail call void @llvm.tpu.vst.msk.idx.add.p201v8f32.v8f32(<8 x i1> %29, <8 x float> addrspace(201)* %5, <8 x i32> %add33.2.i, <8 x float> %28), !llvm.access.group !62
+ %add25.3.i = or <8 x i32> %mul.i, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %30 = tail call <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %4, <8 x i32> %add25.3.i), !llvm.access.group !62
+ %mul27.3.i = fmul <8 x float> %30, %sorted_gains.0116.i
+ %31 = tail call { <8 x float>, <8 x i1> } @llvm.tpu.deprecated.segreduce.addf(<8 x i32> %sorted_segments.0114.i, <8 x float> %mul27.3.i), !llvm.access.group !62
+ %32 = extractvalue { <8 x float>, <8 x i1> } %31, 0
+ %33 = extractvalue { <8 x float>, <8 x i1> } %31, 1
+ %add33.3.i = or <8 x i32> %mul30.i, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ tail call void @llvm.tpu.vst.msk.idx.add.p201v8f32.v8f32(<8 x i1> %33, <8 x float> addrspace(201)* %5, <8 x i32> %add33.3.i, <8 x float> %32), !llvm.access.group !62
+ %add25.4.i = or <8 x i32> %mul.i, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+ %34 = tail call <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %4, <8 x i32> %add25.4.i), !llvm.access.group !62
+ %mul27.4.i = fmul <8 x float> %34, %sorted_gains.0116.i
+ %35 = tail call { <8 x float>, <8 x i1> } @llvm.tpu.deprecated.segreduce.addf(<8 x i32> %sorted_segments.0114.i, <8 x float> %mul27.4.i), !llvm.access.group !62
+ %36 = extractvalue { <8 x float>, <8 x i1> } %35, 0
+ %37 = extractvalue { <8 x float>, <8 x i1> } %35, 1
+ %add33.4.i = or <8 x i32> %mul30.i, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+ tail call void @llvm.tpu.vst.msk.idx.add.p201v8f32.v8f32(<8 x i1> %37, <8 x float> addrspace(201)* %5, <8 x i32> %add33.4.i, <8 x float> %36), !llvm.access.group !62
+ %add25.5.i = or <8 x i32> %mul.i, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+ %38 = tail call <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %4, <8 x i32> %add25.5.i), !llvm.access.group !62
+ %mul27.5.i = fmul <8 x float> %38, %sorted_gains.0116.i
+ %39 = tail call { <8 x float>, <8 x i1> } @llvm.tpu.deprecated.segreduce.addf(<8 x i32> %sorted_segments.0114.i, <8 x float> %mul27.5.i), !llvm.access.group !62
+ %40 = extractvalue { <8 x float>, <8 x i1> } %39, 0
+ %41 = extractvalue { <8 x float>, <8 x i1> } %39, 1
+ %add33.5.i = or <8 x i32> %mul30.i, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+ tail call void @llvm.tpu.vst.msk.idx.add.p201v8f32.v8f32(<8 x i1> %41, <8 x float> addrspace(201)* %5, <8 x i32> %add33.5.i, <8 x float> %40), !llvm.access.group !62
+ %add25.6.i = or <8 x i32> %mul.i, <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
+ %42 = tail call <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %4, <8 x i32> %add25.6.i), !llvm.access.group !62
+ %mul27.6.i = fmul <8 x float> %42, %sorted_gains.0116.i
+ %43 = tail call { <8 x float>, <8 x i1> } @llvm.tpu.deprecated.segreduce.addf(<8 x i32> %sorted_segments.0114.i, <8 x float> %mul27.6.i), !llvm.access.group !62
+ %44 = extractvalue { <8 x float>, <8 x i1> } %43, 0
+ %45 = extractvalue { <8 x float>, <8 x i1> } %43, 1
+ %add33.6.i = or <8 x i32> %mul30.i, <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
+ tail call void @llvm.tpu.vst.msk.idx.add.p201v8f32.v8f32(<8 x i1> %45, <8 x float> addrspace(201)* %5, <8 x i32> %add33.6.i, <8 x float> %44), !llvm.access.group !62
+ %add25.7.i = or <8 x i32> %mul.i, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ %46 = tail call <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %4, <8 x i32> %add25.7.i), !llvm.access.group !62
+ %mul27.7.i = fmul <8 x float> %46, %sorted_gains.0116.i
+ %47 = tail call { <8 x float>, <8 x i1> } @llvm.tpu.deprecated.segreduce.addf(<8 x i32> %sorted_segments.0114.i, <8 x float> %mul27.7.i), !llvm.access.group !62
+ %48 = extractvalue { <8 x float>, <8 x i1> } %47, 0
+ %49 = extractvalue { <8 x float>, <8 x i1> } %47, 1
+ %add33.7.i = or <8 x i32> %mul30.i, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ tail call void @llvm.tpu.vst.msk.idx.add.p201v8f32.v8f32(<8 x i1> %49, <8 x float> addrspace(201)* %5, <8 x i32> %add33.7.i, <8 x float> %48), !llvm.access.group !62
+ %exitcond.not = icmp eq i32 %add.i, %div
+ br i1 %exitcond.not, label %_ZN10embeddings15SegmentedReduce7ComputeEiNS_20TileSpmemVectorArrayIiEES2_NS1_IfEES3_PS3_.exit, label %for.cond.loopexit.loopexit.unr-lcssa.i, !llvm.loop !60
+
+_ZN10embeddings15SegmentedReduce7ComputeEiNS_20TileSpmemVectorArrayIiEES2_NS1_IfEES3_PS3_.exit: ; preds = %for.cond.loopexit.loopexit.unr-lcssa.i, %entry
+ store i32 1, i32* inttoptr (i32 256 to i32*), align 256
+ ret void
+}
+
+!60 = distinct !{!60, !61, !63, !64, !65, !66}
+!61 = !{!"llvm.loop.parallel_accesses", !62}
+!62 = distinct !{}
+!63 = !{!"llvm.loop.unroll.disable"}
+!64 = !{!"llvm.loop.vectorize.width", i32 1}
+!65 = !{!"llvm.loop.interleave.count", i32 1}
+!66 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_ftrl_reorder_pred_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_ftrl_reorder_pred_sc.ll
new file mode 100644
index 0000000..9869485
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_ftrl_reorder_pred_sc.ll
@@ -0,0 +1,180 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -tpu-pipeliner-annotate-for-testing -stop-after=tpu-pipeliner \
+; RUN: -min-ii=8 -enable-split-live-ranges=false \
+; RUN: -tpu-latencies=third_party/llvm/llvm/test/CodeGen/GoogleTPU/Inputs/eup7.yml \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; Tests the proper generation of predicates in the prolog and their uses in the epilog.
+
+; CHECK-LABEL: tile_execute
+
+; CHECK-LABEL: bb.5.
+; CHECK-DAG: {{.*}} Stage-0
+; CHECK-NOT: {{.*}} Stage-1
+; CHECK-NOT: {{.*}} Stage-2
+; CHECK-NOT: {{.*}} Stage-3
+; CHECK-NOT: {{.*}} Stage-4
+; CHECK: %[[p0:[0-9]+]]:ppr = PORii 0, 0, $palways, 0
+; CHECK: %[[p1:[0-9]+]]:ppr = PORii 0, 0, $palways, 0
+; CHECK: %[[p2:[0-9]+]]:ppr = PORii 0, 0, $palways, 0
+; CHECK: BRcond %bb.13
+; CHECK-LABEL: bb.6.
+; CHECK-DAG: {{.*}} Stage-0
+; CHECK-DAG: {{.*}} Stage-1
+; CHECK-NOT: {{.*}} Stage-2
+; CHECK-NOT: {{.*}} Stage-3
+; CHECK-NOT: {{.*}} Stage-4
+; CHECK: %[[p3:[0-9]+]]:ppr = PORii 1, 1, $palways, 0
+; CHECK: BRcond %bb.13
+; CHECK-LABEL: bb.7.
+; CHECK-DAG: {{.*}} Stage-0
+; CHECK-DAG: {{.*}} Stage-1
+; CHECK-DAG: {{.*}} Stage-2
+; CHECK-NOT: {{.*}} Stage-3
+; CHECK-NOT: {{.*}} Stage-4
+; CHECK: %[[p4:[0-9]+]]:ppr = PORii 1, 1, $palways, 0
+; CHECK: BRcond %bb.13
+; CHECK-LABEL: bb.8.
+; CHECK-DAG: {{.*}} Stage-0
+; CHECK-DAG: {{.*}} Stage-1
+; CHECK-DAG: {{.*}} Stage-2
+; CHECK-DAG: {{.*}} Stage-3
+; CHECK-NOT: {{.*}} Stage-4
+; CHECK: %[[p5:[0-9]+]]:ppr = PORii 1, 1, $palways, 0
+; CHECK: BRcond %bb.13
+; CHECK-LABEL: bb.2.
+; CHECK: BR %bb.13
+; CHECK-LABEL: bb.13.
+; CHECK: %[[p6:[0-9]+]]:ppr = PHI %[[p3]], %bb.2, %[[p0]], %bb.5, %[[p3]], %bb.6, %[[p3]], %bb.7, %[[p3]], %bb.8
+; CHECK: %[[p7:[0-9]+]]:ppr = PHI %[[p4]], %bb.2, %[[p1]], %bb.5, %[[p1]], %bb.6, %[[p4]], %bb.7, %[[p4]], %bb.8
+; CHECK: %[[p8:[0-9]+]]:ppr = PHI %[[p5]], %bb.2, %[[p2]], %bb.5, %[[p2]], %bb.6, %[[p2]], %bb.7, %[[p5]], %bb.8
+; CHECK-NOT: {{.*}} Stage-0
+; CHECK-NOT: {{.*}} %[[p6]], {{.*}} Stage-1
+; CHECK-DAG: {{.*}} %[[p6]], {{.*}} Stage-2
+; CHECK-NOT: {{.*}} %[[p6]], {{.*}} Stage-3
+; CHECK-NOT: {{.*}} %[[p6]], {{.*}} Stage-4
+; CHECK-NOT: {{.*}} %[[p7]], {{.*}} Stage-1
+; CHECK-NOT: {{.*}} %[[p7]], {{.*}} Stage-2
+; CHECK-DAG: {{.*}} %[[p7]], {{.*}} Stage-3
+; CHECK-NOT: {{.*}} %[[p7]], {{.*}} Stage-4
+; CHECK-NOT: {{.*}} %[[p8]], {{.*}} Stage-1
+; CHECK-NOT: {{.*}} %[[p8]], {{.*}} Stage-2
+; CHECK-NOT: {{.*}} %[[p8]], {{.*}} Stage-3
+; CHECK-DAG: {{.*}} %[[p8]], {{.*}} Stage-4
+; CHECK-LABEL: bb.12.
+; CHECK-NOT: {{.*}} Stage-0
+; CHECK-NOT: {{.*}} Stage-1
+; CHECK-NOT: {{.*}} %[[p6]], {{.*}} Stage-2
+; CHECK-DAG: {{.*}} %[[p6]], {{.*}} Stage-3
+; CHECK-NOT: {{.*}} %[[p6]], {{.*}} Stage-4
+; CHECK-NOT: {{.*}} %[[p7]], {{.*}} Stage-2
+; CHECK-NOT: {{.*}} %[[p7]], {{.*}} Stage-3
+; CHECK-DAG: {{.*}} %[[p7]], {{.*}} Stage-4
+; CHECK-NOT: {{.*}} %[[p8]], {{.*}} Stage-2
+; CHECK-NOT: {{.*}} %[[p8]], {{.*}} Stage-3
+; CHECK-NOT: {{.*}} %[[p8]], {{.*}} Stage-4
+; CHECK-LABEL: bb.11.
+; CHECK-NOT: {{.*}} Stage-0
+; CHECK-NOT: {{.*}} Stage-1
+; CHECK-NOT: {{.*}} Stage-2
+; CHECK-NOT: {{.*}} %[[p6]], {{.*}} Stage-3
+; CHECK-DAG: {{.*}} %[[p6]], {{.*}} Stage-4
+; CHECK-NOT: {{.*}} %[[p7]], {{.*}} Stage-3
+; CHECK-NOT: {{.*}} %[[p7]], {{.*}} Stage-4
+; CHECK-NOT: {{.*}} %[[p8]], {{.*}} Stage-3
+; CHECK-NOT: {{.*}} %[[p8]], {{.*}} Stage-4
+; CHECK-LABEL: bb.10.
+; CHECK-NOT: {{.*}} Stage-0
+; CHECK-NOT: {{.*}} Stage-1
+; CHECK-NOT: {{.*}} Stage-2
+; CHECK-NOT: {{.*}} Stage-3
+; CHECK-NOT: {{.*}} %[[p6]], {{.*}} Stage-4
+; CHECK-NOT: {{.*}} %[[p7]], {{.*}} Stage-4
+; CHECK-NOT: {{.*}} %[[p8]], {{.*}} Stage-4
+
+define void @tile_execute() {
+entry:
+ %0 = load i32, i32* inttoptr (i32 256 to i32*), align 256
+ %div = sdiv i32 %0, 8
+ %cmp110.i = icmp sgt i32 %0, 7
+ br i1 %cmp110.i, label %for.body.lr.ph.i, label %exit, !llvm.loop !43
+
+for.body.lr.ph.i:
+ %1 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** inttoptr (i32 260 to <8 x float> addrspace(201)**), align 4
+ %2 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** inttoptr (i32 259 to <8 x float> addrspace(201)**), align 4
+ %3 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** inttoptr (i32 258 to <8 x float> addrspace(201)**), align 4
+ %4 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** inttoptr (i32 257 to <8 x float> addrspace(201)**), align 4
+ %5 = tail call <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>)
+ br label %exit.i
+
+exit.i:
+ %j.0111.i = phi i32 [ 0, %for.body.lr.ph.i ], [ %add27.i, %exit.i ]
+ %add.ptr.i98.i = getelementptr inbounds <8 x float>, <8 x float> addrspace(201)* %4, i32 %j.0111.i
+ %6 = load <8 x float>, <8 x float> addrspace(201)* %add.ptr.i98.i, align 32, !alias.scope !55, !noalias !53, !llvm.access.group !45
+ %add.ptr.i96.i = getelementptr inbounds <8 x float>, <8 x float> addrspace(201)* %1, i32 %j.0111.i
+ %7 = load <8 x float>, <8 x float> addrspace(201)* %add.ptr.i96.i, align 32, !alias.scope !55, !noalias !53, !llvm.access.group !45
+ %add.ptr.i94.i = getelementptr inbounds <8 x float>, <8 x float> addrspace(201)* %3, i32 %j.0111.i
+ %8 = load <8 x float>, <8 x float> addrspace(201)* %add.ptr.i94.i, align 32, !alias.scope !55, !noalias !53, !llvm.access.group !45
+ %add.ptr.i92.i = getelementptr inbounds <8 x float>, <8 x float> addrspace(201)* %2, i32 %j.0111.i
+ %9 = load <8 x float>, <8 x float> addrspace(201)* %add.ptr.i92.i, align 32, !alias.scope !55, !noalias !53, !llvm.access.group !45
+ %mul.i = fmul <8 x float> %7, %7
+ %add6.i = fadd <8 x float> %mul.i, %8
+ %add7.i = fadd <8 x float> %7, %9
+ %10 = tail call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %add6.i)
+ %11 = tail call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %8)
+ %sub.i = fsub <8 x float> %10, %11
+ %mul11.i = fmul <8 x float> %5, %6
+ %mul12.i = fmul <8 x float> %mul11.i, %sub.i
+ %sub13108.i = fsub <8 x float> %add7.i, %mul12.i
+ %mul16.i = fmul <8 x float> %5, %10
+ %add21.i = fadd <8 x float> %mul16.i, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
+ %12 = tail call <8 x float> @llvm.minimum.v8f32(<8 x float> %sub13108.i, <8 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>)
+ %13 = tail call <8 x float> @llvm.maximum.v8f32(<8 x float> %12, <8 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>)
+ %sub.i.i = fsub <8 x float> %13, %sub13108.i
+ %14 = tail call <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float> %add21.i)
+ %mul23.i = fmul <8 x float> %14, %sub.i.i
+ store <8 x float> %mul23.i, <8 x float> addrspace(201)* %add.ptr.i98.i, align 32, !alias.scope !55, !noalias !53, !llvm.access.group !45
+ store <8 x float> %add6.i, <8 x float> addrspace(201)* %add.ptr.i94.i, align 32, !alias.scope !55, !noalias !53, !llvm.access.group !45
+ store <8 x float> %sub13108.i, <8 x float> addrspace(201)* %add.ptr.i92.i, align 32, !alias.scope !55, !noalias !53, !llvm.access.group !45
+ %add27.i = add nuw nsw i32 %j.0111.i, 1
+ %exitcond.not.i = icmp eq i32 %add27.i, %div
+ br i1 %exitcond.not.i, label %exit, label %exit.i, !llvm.loop !43
+
+exit:
+ store i32 1, i32* inttoptr (i32 256 to i32*), align 256
+ ret void
+}
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float>) #3
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float>) #3
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare <8 x float> @llvm.minimum.v8f32(<8 x float>, <8 x float>) #3
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare <8 x float> @llvm.maximum.v8f32(<8 x float>, <8 x float>) #3
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version google3-trunk (24b62f28c5daa293a2602712e1eba82cb59f3a6f)"}
+!43 = distinct !{!43, !44, !46, !47, !48, !49}
+!44 = !{!"llvm.loop.parallel_accesses", !45}
+!45 = distinct !{}
+!46 = !{!"llvm.loop.unroll.disable"}
+!47 = !{!"llvm.loop.vectorize.width", i32 1}
+!48 = !{!"llvm.loop.interleave.count", i32 1}
+!49 = !{!"llvm.loop.vectorize.enable", i1 true}
+!51 = distinct !{!51, !52, !"loop.parallel"}
+!52 = distinct !{!52, !"for.cond"}
+!53 = !{!54}
+!54 = distinct !{!54, !52, !"loop.parallel"}
+!55 = !{!51}
\ No newline at end of file
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_ftrl_reorder_pred_static_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_ftrl_reorder_pred_static_sc.ll
new file mode 100644
index 0000000..95b318a
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_ftrl_reorder_pred_static_sc.ll
@@ -0,0 +1,155 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -tpu-pipeliner-annotate-for-testing -stop-after=tpu-pipeliner \
+; RUN: -min-ii=8 | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; Tests that predicates in the prolog and their uses in the epilog are properly
+; omitted when the trip count is statically known. This is the same loop and test as
+; sw_pipeliner_ftrl_reorder_pred_sc.ll, but with a statically known trip count.
+
+; CHECK-LABEL: tile_execute
+
+; CHECK-LABEL: bb.5.
+; CHECK-DAG: {{.*}} Stage-0
+; CHECK-NOT: {{.*}} Stage-1
+; CHECK-NOT: {{.*}} Stage-2
+; CHECK-NOT: {{.*}} Stage-3
+; CHECK-NOT: {{.*}} Stage-4
+; CHECK-NOT: %{{p[0-9]+}}:ppr = PORii
+; CHECK-NOT: BRcond
+; CHECK-LABEL: bb.6.
+; CHECK-DAG: {{.*}} Stage-0
+; CHECK-DAG: {{.*}} Stage-1
+; CHECK-NOT: {{.*}} Stage-2
+; CHECK-NOT: {{.*}} Stage-3
+; CHECK-NOT: {{.*}} Stage-4
+; CHECK-NOT: %{{p[0-9]+}}:ppr = PORii
+; CHECK-NOT: BRcond
+; CHECK-LABEL: bb.7.
+; CHECK-DAG: {{.*}} Stage-0
+; CHECK-DAG: {{.*}} Stage-1
+; CHECK-DAG: {{.*}} Stage-2
+; CHECK-NOT: {{.*}} Stage-3
+; CHECK-NOT: {{.*}} Stage-4
+; CHECK-NOT: %{{p[0-9]+}}:ppr = PORii
+; CHECK-NOT: BRcond
+; CHECK-LABEL: bb.8.
+; CHECK-DAG: {{.*}} Stage-0
+; CHECK-DAG: {{.*}} Stage-1
+; CHECK-DAG: {{.*}} Stage-2
+; CHECK-DAG: {{.*}} Stage-3
+; CHECK-NOT: {{.*}} Stage-4
+; CHECK-NOT: %{{p[0-9]+}}:ppr = PORii
+; CHECK-NOT: BRcond
+; CHECK-LABEL: bb.2.
+; CHECK: BR %bb.13
+; CHECK-LABEL: bb.13.
+; CHECK-NOT: %{{p[0-9]+}}:ppr = PHI
+; CHECK-NOT: {{.*}} Stage-0
+; CHECK-DAG: {{.*}} Stage-1
+; CHECK-DAG: {{.*}} Stage-2
+; CHECK-DAG: {{.*}} Stage-3
+; CHECK-DAG: {{.*}} Stage-4
+; CHECK-LABEL: bb.12.
+; CHECK-NOT: {{.*}} Stage-0
+; CHECK-NOT: {{.*}} Stage-1
+; CHECK-DAG: {{.*}} Stage-2
+; CHECK-DAG: {{.*}} Stage-3
+; CHECK-DAG: {{.*}} Stage-4
+; CHECK-LABEL: bb.11.
+; CHECK-NOT: {{.*}} Stage-0
+; CHECK-NOT: {{.*}} Stage-1
+; CHECK-NOT: {{.*}} Stage-2
+; CHECK-DAG: {{.*}} Stage-3
+; CHECK-DAG: {{.*}} Stage-4
+; CHECK-LABEL: bb.10.
+; CHECK-NOT: {{.*}} Stage-0
+; CHECK-NOT: {{.*}} Stage-1
+; CHECK-NOT: {{.*}} Stage-2
+; CHECK-NOT: {{.*}} Stage-3
+; CHECK-DAG: {{.*}} Stage-4
+
+define void @tile_execute() {
+entry:
+ %0 = load i32, i32* inttoptr (i32 256 to i32*), align 256
+ %cmp110.i = icmp sgt i32 %0, 7
+ br i1 %cmp110.i, label %for.body.lr.ph.i, label %exit, !llvm.loop !43
+
+for.body.lr.ph.i:
+ %1 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** inttoptr (i32 260 to <8 x float> addrspace(201)**), align 4
+ %2 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** inttoptr (i32 259 to <8 x float> addrspace(201)**), align 4
+ %3 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** inttoptr (i32 258 to <8 x float> addrspace(201)**), align 4
+ %4 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** inttoptr (i32 257 to <8 x float> addrspace(201)**), align 4
+ %5 = tail call <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>)
+ br label %exit.i
+
+exit.i:
+ %j.0111.i = phi i32 [ 0, %for.body.lr.ph.i ], [ %add27.i, %exit.i ]
+ %add.ptr.i98.i = getelementptr inbounds <8 x float>, <8 x float> addrspace(201)* %4, i32 %j.0111.i
+ %6 = load <8 x float>, <8 x float> addrspace(201)* %add.ptr.i98.i, align 32, !alias.scope !55, !noalias !53, !llvm.access.group !45
+ %add.ptr.i96.i = getelementptr inbounds <8 x float>, <8 x float> addrspace(201)* %1, i32 %j.0111.i
+ %7 = load <8 x float>, <8 x float> addrspace(201)* %add.ptr.i96.i, align 32, !alias.scope !55, !noalias !53, !llvm.access.group !45
+ %add.ptr.i94.i = getelementptr inbounds <8 x float>, <8 x float> addrspace(201)* %3, i32 %j.0111.i
+ %8 = load <8 x float>, <8 x float> addrspace(201)* %add.ptr.i94.i, align 32, !alias.scope !55, !noalias !53, !llvm.access.group !45
+ %add.ptr.i92.i = getelementptr inbounds <8 x float>, <8 x float> addrspace(201)* %2, i32 %j.0111.i
+ %9 = load <8 x float>, <8 x float> addrspace(201)* %add.ptr.i92.i, align 32, !alias.scope !55, !noalias !53, !llvm.access.group !45
+ %mul.i = fmul <8 x float> %7, %7
+ %add6.i = fadd <8 x float> %mul.i, %8
+ %add7.i = fadd <8 x float> %7, %9
+ %10 = tail call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %add6.i)
+ %11 = tail call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %8)
+ %sub.i = fsub <8 x float> %10, %11
+ %mul11.i = fmul <8 x float> %5, %6
+ %mul12.i = fmul <8 x float> %mul11.i, %sub.i
+ %sub13108.i = fsub <8 x float> %add7.i, %mul12.i
+ %mul16.i = fmul <8 x float> %5, %10
+ %add21.i = fadd <8 x float> %mul16.i, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
+ %12 = tail call <8 x float> @llvm.minimum.v8f32(<8 x float> %sub13108.i, <8 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>)
+ %13 = tail call <8 x float> @llvm.maximum.v8f32(<8 x float> %12, <8 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>)
+ %sub.i.i = fsub <8 x float> %13, %sub13108.i
+ %14 = tail call <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float> %add21.i)
+ %mul23.i = fmul <8 x float> %14, %sub.i.i
+ store <8 x float> %mul23.i, <8 x float> addrspace(201)* %add.ptr.i98.i, align 32, !alias.scope !55, !noalias !53, !llvm.access.group !45
+ store <8 x float> %add6.i, <8 x float> addrspace(201)* %add.ptr.i94.i, align 32, !alias.scope !55, !noalias !53, !llvm.access.group !45
+ store <8 x float> %sub13108.i, <8 x float> addrspace(201)* %add.ptr.i92.i, align 32, !alias.scope !55, !noalias !53, !llvm.access.group !45
+ %add27.i = add nuw nsw i32 %j.0111.i, 1
+ %exitcond.not.i = icmp eq i32 %add27.i, 32
+ br i1 %exitcond.not.i, label %exit, label %exit.i, !llvm.loop !43
+
+exit:
+ store i32 1, i32* inttoptr (i32 256 to i32*), align 256
+ ret void
+}
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float>) #3
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float>) #3
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare <8 x float> @llvm.minimum.v8f32(<8 x float>, <8 x float>) #3
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare <8 x float> @llvm.maximum.v8f32(<8 x float>, <8 x float>) #3
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version google3-trunk (24b62f28c5daa293a2602712e1eba82cb59f3a6f)"}
+!43 = distinct !{!43, !44, !46, !47, !48, !49}
+!44 = !{!"llvm.loop.parallel_accesses", !45}
+!45 = distinct !{}
+!46 = !{!"llvm.loop.unroll.disable"}
+!47 = !{!"llvm.loop.vectorize.width", i32 1}
+!48 = !{!"llvm.loop.interleave.count", i32 1}
+!49 = !{!"llvm.loop.vectorize.enable", i1 true}
+!51 = distinct !{!51, !52, !"loop.parallel"}
+!52 = distinct !{!52, !"for.cond"}
+!53 = !{!54}
+!54 = distinct !{!54, !52, !"loop.parallel"}
+!55 = !{!51}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_noparallel_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_noparallel_sc.ll
new file mode 100644
index 0000000..84dab3c
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_noparallel_sc.ll
@@ -0,0 +1,59 @@
+; RUN: llc -O2 < %s -mcpu=sparsecore-tec-vf -asm-verbose=false \
+; RUN: -tpu-latencies=%S/Inputs/long_load.yml -stop-after=tpu-pipeliner \
+; RUN: -tpu-pipeliner-annotate-for-testing -late-start-swing=false \
+; RUN: -instcombine-max-iterations=0 | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare void @llvm.tpu.loop.parallel()
+declare i32* @llvm.tpu.inttoptr.pi32(i32)
+declare void @llvm.tpu.vst.msk.idx.add.np.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, <8 x i32>)
+declare void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.tpu.vld.msk.idx.np.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>)
+
+; Tests that non-parallel instructions are not reordered.
+; This is a correctness check for the SparseCore radix sort kernel.
+
+; CHECK-LABEL: bb.1.for.body.i:
+; CHECK-DAG: scVST_IDX_MSK_ADD_NP {{.*}} Stage-1
+; CHECK-DAG: scVLD_IDX_MSK_NP {{.*}} Stage-0
+
+define void @func_vst_idx_ret_add_noparallel_loop(<8 x i32> addrspace(201)* %base, <8 x i32> %off, <8 x i32> %val, <8 x i1> %m) {
+entry:
+ %0 = call i32* @llvm.tpu.inttoptr.pi32(i32 0)
+ %1 = call i32* @llvm.tpu.inttoptr.pi32(i32 32)
+ %2 = call i32* @llvm.tpu.inttoptr.pi32(i32 32)
+ br label %for.body.i
+
+for.body.i:
+ %i = phi i32 [ 0, %entry ], [ %ic, %for.body.i ]
+ %idx0 = getelementptr inbounds i32, i32* %0, i32 %i
+ %3 = load i32, i32* %idx0, align 4
+ store i32 %3, i32* %1, align 4
+ %idx1 = getelementptr inbounds i32, i32* %0, i32 32
+ %4 = load i32, i32* %idx1, align 4
+ store i32 %4, i32* %2, align 4
+ %5 = tail call <8 x i32> @llvm.tpu.vld.msk.idx.np.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %base, <8 x i32> %off)
+ %idx2 = getelementptr inbounds i32, i32* %0, i32 16
+ %6 = load i32, i32* %idx2, align 4
+ store i32 %6, i32* %2, align 4
+ tail call void @llvm.tpu.vst.msk.idx.p201v8i32.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %base, <8 x i32> %off, <8 x i32> %5)
+ %cmp.i = icmp slt i32 %i, 15
+ %ic = add i32 %i, 1
+ tail call void @llvm.tpu.vst.msk.idx.add.np.p201v8i32.v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %base, <8 x i32> %off, <8 x i32> %val)
+ br i1 %cmp.i, label %for.body.i, label %exit, !llvm.loop !1
+
+exit:
+ ret void
+}
+
+!1 = distinct !{!1, !2, !4, !5, !6, !7}
+!2 = !{!"llvm.loop.parallel_accesses", !3}
+!3 = distinct !{}
+!4 = !{!"llvm.loop.unroll.disable"}
+!5 = !{!"llvm.loop.vectorize.width", i32 1}
+!6 = !{!"llvm.loop.interleave.count", i32 1}
+!7 = !{!"llvm.loop.vectorize.enable", i1 true}
+!8 = distinct !{!8, !4}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_postaddr_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_postaddr_sc.ll
new file mode 100644
index 0000000..a2be561
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_postaddr_sc.ll
@@ -0,0 +1,70 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -disable-cgp \
+; RUN: -tpu-latencies %S/Inputs/vld_4_cycle.yml -enable-ordering-twist=false \
+; RUN: -tpu-enable-vliw-prep-postiv=false -tpu-enable-vliw-prep-post-addrinc=false \
+; RUN: -calculate-branch-delay-ii=false | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>)
+declare { <8 x float>, <8 x i1> } @llvm.tpu.deprecated.segreduce.addf(<8 x i32>, <8 x float>)
+declare void @llvm.tpu.vst.msk.idx.add.p201v8f32.v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>, <8 x float>)
+
+; Verifies that the postaddr optimization leads to an optimized schedule of 5
+; bundles and that the result of the increment is used in the next iteration.
+; The function is derived from the embedding kernels.
+
+; The test was tailored assuming a vld latency of 4.
+
+; CHECK-LABEL: tight_loop_func:
+; CHECK-LABEL: .LBB0_1
+; CHECK: {
+; CHECK: vld.idx.msk [tilespmem:v[[REG2:[0-9]+]]
+; CHECK: v[[REG2]] = vadd
+; CHECK: {
+; CHECK: {
+; CHECK: {
+; CHECK: {
+; CHECK: // %bb.2:
+
+define void @tight_loop_func() {
+entry:
+ %a0 = load <8 x i32> addrspace(201)*, <8 x i32> addrspace(201)** inttoptr (i32 260 to <8 x i32> addrspace(201)**), align 4
+ %a1 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** inttoptr (i32 261 to <8 x float> addrspace(201)**), align 4
+ %a2 = load <8 x i32> addrspace(201)*, <8 x i32> addrspace(201)** inttoptr (i32 262 to <8 x i32> addrspace(201)**), align 4
+ %ss0 = load <8 x i32>, <8 x i32> addrspace(201)* %a0, align 32
+ %ss1 = load <8 x float>, <8 x float> addrspace(201)* %a1, align 32
+ %ss2 = load <8 x i32>, <8 x i32> addrspace(201)* %a2, align 32
+ %0 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** inttoptr (i32 263 to <8 x float> addrspace(201)**), align 4
+ %1 = load <8 x float> addrspace(201)*, <8 x float> addrspace(201)** inttoptr (i32 264 to <8 x float> addrspace(201)**), align 4
+ %m0 = shl <8 x i32> %ss2, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+ %m1 = shl <8 x i32> %ss0, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+ br label %for.body.i
+
+for.body.i:
+ %j.0104.i = phi i32 [ 0, %entry ], [ %inc.i, %for.body.i ]
+ %splat.splatinsert23.i = insertelement <8 x i32> undef, i32 %j.0104.i, i32 0
+ %splat.splat24.i = shufflevector <8 x i32> %splat.splatinsert23.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %add25.i = add <8 x i32> %splat.splat24.i, %m0
+ %2 = tail call <8 x float> @llvm.tpu.vld.msk.idx.v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %0, <8 x i32> %add25.i)
+ %mul27.i = fmul <8 x float> %2, %ss1
+ %3 = tail call { <8 x float>, <8 x i1> } @llvm.tpu.deprecated.segreduce.addf(<8 x i32> %ss0, <8 x float> %mul27.i)
+ %4 = extractvalue { <8 x float>, <8 x i1> } %3, 0
+ %add33.i = add <8 x i32> %splat.splat24.i, %m1
+ tail call void @llvm.tpu.vst.msk.idx.add.p201v8f32.v8f32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> addrspace(201)* %1, <8 x i32> %add33.i, <8 x float> %4)
+ %inc.i = add nuw nsw i32 %j.0104.i, 1
+ %exitcond.i = icmp eq i32 %inc.i, 32
+ br i1 %exitcond.i, label %for.cond.loopexit.i, label %for.body.i, !llvm.loop !0
+
+for.cond.loopexit.i:
+ ret void
+}
+
+!0 = distinct !{!0, !1, !3, !4, !5, !6}
+!1 = !{!"llvm.loop.parallel_accesses", !2}
+!2 = distinct !{}
+!3 = !{!"llvm.loop.unroll.disable"}
+!4 = !{!"llvm.loop.vectorize.width", i32 1}
+!5 = !{!"llvm.loop.interleave.count", i32 1}
+!6 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_top_copies_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_top_copies_sc.ll
new file mode 100644
index 0000000..2a586b7
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_top_copies_sc.ll
@@ -0,0 +1,58 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -disable-cgp \
+; RUN: -enable-split-live-ranges=true \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare void @llvm.tpu.vst.msk.idx.add.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, <8 x i32>)
+
+; Tests that we are able to schedule the loop in only 4 bundles. This only happens
+; if postiv successfully moves the iv update past the loop condition and the
+; software pipeliner manually inserts movs at the top of the loop.
+; If we continued with the current II instead of letting it fail, we would get 2 vmovs.
+
+; CHECK-LABEL: loop_4cycles:
+; CHECK-LABEL: .LBB0_1
+; CHECK: {
+; CHECK: .Ltmp0
+; CHECK: {
+; CHECK: sbr.rel {{.*}} .LBB0_1-.Ltmp0 }
+; CHECK-NOT: }
+; CHECK-NOT: v{{[0-9]+}} = vmov v{{[0-9]+}}
+; CHECK: {
+; CHECK: {
+
+define void @loop_4cycles(i32 %d.i, i32 %d2.i, [512 x <8 x i32>] addrspace(201)* %addr) {
+entry:
+ %mul.i.i.i = shl nsw i32 %d.i, 2
+ %splat.splatinsert.i.i.i = insertelement <8 x i32> undef, i32 %mul.i.i.i, i32 0
+ %splat.splat.i.i.i = shufflevector <8 x i32> %splat.splatinsert.i.i.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ br label %for.body.i130.i
+
+for.body.i130.i: ; preds = %for.body.i130.i, %for.body.lr.ph.i126.i
+ %i.016.i.i = phi i32 [ 0, %entry ], [ %inc.i128.i, %for.body.i130.i ]
+ %addr2 = getelementptr inbounds [512 x <8 x i32>], [512 x <8 x i32>] addrspace(201)* %addr, i32 0, i32 280
+ %add.ptr.i.i127.i = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %addr2, i32 %i.016.i.i
+ %l = load <8 x i32>, <8 x i32> addrspace(201)* %add.ptr.i.i127.i, align 32
+ %shr.i.i.i = ashr <8 x i32> %l, %splat.splat.i.i.i
+ %and.i.i.i = and <8 x i32> %shr.i.i.i, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+ %addr3 = getelementptr inbounds [512 x <8 x i32>], [512 x <8 x i32>] addrspace(201)* %addr, i32 0, i32 0
+ tail call void @llvm.tpu.vst.msk.idx.add.p201v8i32.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %addr3, <8 x i32> %and.i.i.i, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
+ %inc.i128.i = add nuw nsw i32 %i.016.i.i, 1
+ %exitcond.i = icmp eq i32 %inc.i128.i, 32
+ br i1 %exitcond.i, label %exit, label %for.body.i130.i, !llvm.loop !1
+
+exit:
+ ret void
+}
+
+!1 = distinct !{!1, !2, !4, !5, !6, !7}
+!2 = !{!"llvm.loop.parallel_accesses", !3}
+!3 = distinct !{}
+!4 = !{!"llvm.loop.unroll.disable"}
+!5 = !{!"llvm.loop.vectorize.width", i32 1}
+!6 = !{!"llvm.loop.interleave.count", i32 1}
+!7 = !{!"llvm.loop.vectorize.enable", i1 true}
+!8 = distinct !{!8, !4}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_trace_debug.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_trace_debug.ll
new file mode 100644
index 0000000..d4e43e9
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/sw_pipeliner_trace_debug.ll
@@ -0,0 +1,55 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -disable-cgp -tpu-pipeliner-annotate-for-testing \
+; RUN: -tpu-latencies=%S/Inputs/long_load3.yml -enable-ordering-twist=false \
+; RUN: -tpu-add-loop-hardware-tracing -tpu-hardware-tracing-tile-num=0 | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <8 x float> addrspace(201)* @llvm.tpu.inttoptr.p201v8float(i32) nounwind
+declare <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32) nounwind
+declare <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float>)
+declare <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float>)
+
+; Checks that the debug pass injects a strace before and after the loop with
+; the IDs [8|9] | 0xffff.
+
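+; A rough sketch of the encoding assumed by this test (not authoritative): the ID
+; appears to occupy the top nibble, so 0x8000ffff == (8 << 28) | 0xffff for the
+; pre-loop trace and 0x9000ffff == (9 << 28) | 0xffff for the post-loop trace,
+; matching the immediates in the CHECK lines below.
+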
+; CHECK-LABEL: loop_func
+; CHECK: s[[s0:[0-9]+]] = stileid.u32
+; CHECK: p[[p0:[0-9]+]] = seq.s32 s[[s0]], $0x0
+; CHECK: strace @p[[p0]] $0x8000ffff
+; CHECK-LABEL: .LBB0_5
+; CHECK: (pc) = sbr.rel @p2 .LBB0_5
+; CHECK: s[[s1:[0-9]+]] = stileid.u32
+; CHECK: p[[p1:[0-9]+]] = seq.s32 s[[s1]], $0x0
+; CHECK: strace @p[[p1]] $0x9000ffff
+
+define void @loop_func(<8 x float> %in1, i32 %u) {
+entry:
+ %a1 = call <8 x float> addrspace(201)* @llvm.tpu.inttoptr.p201v8float(i32 0)
+ %a2 = call <8 x float> addrspace(201)* @llvm.tpu.inttoptr.p201v8float(i32 32)
+ %a3 = call <8 x float> addrspace(201)* @llvm.tpu.inttoptr.p201v8float(i32 64)
+ br label %for.body.i
+
+for.body.i:
+ %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body.i ]
+ %v = load <8 x float>, <8 x float> addrspace(201)* %a1, align 32
+ %sqrt = tail call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %v)
+ %rcp = tail call <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float> %in1)
+ store <8 x float> %sqrt, <8 x float> addrspace(201)* %a2
+ store <8 x float> %rcp, <8 x float> addrspace(201)* %a3
+ %inc.i = add nuw nsw i32 %i, 1
+ %exitcond.i = icmp eq i32 %inc.i, %u
+ br i1 %exitcond.i, label %for.cond.loopexit.i, label %for.body.i, !llvm.loop !0
+
+for.cond.loopexit.i:
+ ret void
+}
+
+!0 = distinct !{!0, !1, !3, !4, !5, !6}
+!1 = !{!"llvm.loop.parallel_accesses", !2}
+!2 = distinct !{}
+!3 = !{!"llvm.loop.unroll.disable"}
+!4 = !{!"llvm.loop.vectorize.width", i32 1}
+!5 = !{!"llvm.loop.interleave.count", i32 1}
+!6 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/tac_scoped_alloc_error_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/tac_scoped_alloc_error_sc.ll
new file mode 100644
index 0000000..4b6c6f1
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/tac_scoped_alloc_error_sc.ll
@@ -0,0 +1,19 @@
+; RUN: opt -S -O2 -mcpu=sparsecore-tac-vf -tpu-fatal-mem-alloc-error=false < %s 2>&1 \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+; Test that smem allocation size is checked properly for sparsecore-tac-vf.
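+;
+; The allocation below requests 2049 words; presumably the scoped smem budget for
+; sparsecore-tac-vf is 2048 words, so the request overflows by one and triggers
+; the "Scoped allocation overflow." diagnostic checked below.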
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32* @llvm.tpu.alloca.smem(i32)
+declare <8 x i32> addrspace(202)* @llvm.tpu.alloca.spmem(i32)
+
+; CHECK: Scoped allocation overflow.
+define void @scoped_allocation_overflow_sparsecore_tac_smem(i32 %a) {
+ %mem = call i32* @llvm.tpu.alloca.smem(i32 2049)
+ %arrayidx = getelementptr inbounds i32, i32* %mem, i32 %a
+ store i32 0, i32* %arrayidx, align 4
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/task_dispatch.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/task_dispatch.ll
new file mode 100644
index 0000000..a9578ec
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/task_dispatch.ll
@@ -0,0 +1,111 @@
+; RUN: opt -S -O2 -mcpu=sparsecore-scs-vf < %s \
+; RUN: | llc -mcpu=sparsecore-scs-vf -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; This tests the code generation sequence of the SC task dispatch intrinsic.
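+;
+; A sketch of the descriptor layout this test builds, as inferred from the stores
+; and CHECK lines below (not an authoritative ABI description): dreg[0] holds the
+; TAC entry point, dreg[1] the TEC entry point, and dreg[2..] the task arguments.
+; The second intrinsic operand appears to encode (descriptor length << 16) | id,
+; e.g. (4 << 16) | 1 == 0x40001 == 262145.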
+
+declare void @llvm.tpu.task.dispatch(i32 addrspace(208)*, i32)
+declare void @llvm.tpu.task.dispatch.clear.ibuf(i32 addrspace(208)*, i32)
+declare i32 addrspace(208)* @llvm.tpu.alloca.dreg(i32)
+
+; CHECK-LABEL: SCS:
+; CHECK-DAG: [[TAC1:s[0-9]+]] = simm.s32 TAC1
+; CHECK-DAG: [dreg:$0x0] = wrdreg [[TAC1]]
+; CHECK-DAG: [[TEC1:s[0-9]+]] = simm.s32 TEC1
+; CHECK-DAG: [dreg:$0x1] = wrdreg [[TEC1]]
+; CHECK-DAG: [dreg:$0x2] = wrdreg $0xf
+; CHECK-DAG: [dreg:$0x3] = wrdreg $0x2
+; CHECK-DAG: [[S0:s[0-9]+]] = simm.s32 $0x0
+; CHECK-DAG: _ = task [dreg:[[S0]]], $0x40001
+
+; CHECK-DAG: [[TAC2:s[0-9]+]] = simm.s32 TAC2
+; CHECK-DAG: [dreg:$0x4] = wrdreg [[TAC2]]
+; CHECK-DAG: [[TEC2:s[0-9]+]] = simm.s32 TEC2
+; CHECK-DAG: [dreg:$0x5] = wrdreg [[TEC2]]
+; CHECK-DAG: [dreg:$0x6] = wrdreg $0x7
+; CHECK-DAG: [[S4:s[0-9]+]] = simm.s32 $0x4
+; CHECK-DAG: _ = task [dreg:[[S4]]], $0x30002
+; CHECK-DAG: _ = task.clear_ibuf [dreg:[[S4]]], $0x30002
+
+define void @SCS() #0 section ".text.scs" {
+entry:
+ %tac1 = ptrtoint void()* @TAC1 to i32
+ %tec1 = ptrtoint void()* @TEC1 to i32
+ %tac2 = ptrtoint void()* @TAC2 to i32
+ %tec2 = ptrtoint void()* @TEC2 to i32
+
+ ; Task Dispatch 1: Allocate descriptor in dreg address space.
+ %dmem0 = tail call i32 addrspace(208)* @llvm.tpu.alloca.dreg(i32 4)
+
+ ; Store '%tac1' and '%tec1' to first two descriptor dregs.
+ %idx0 = getelementptr inbounds i32, i32 addrspace(208)* %dmem0, i32 0
+ store i32 %tac1, i32 addrspace(208)* %idx0, align 4
+ %idx1 = getelementptr inbounds i32, i32 addrspace(208)* %dmem0, i32 1
+ store i32 %tec1, i32 addrspace(208)* %idx1, align 4
+
+ ; Store task arguments.
+ %idx2 = getelementptr inbounds i32, i32 addrspace(208)* %dmem0, i32 2
+ store i32 15, i32 addrspace(208)* %idx2, align 4
+ %idx3 = getelementptr inbounds i32, i32 addrspace(208)* %dmem0, i32 3
+ store i32 2, i32 addrspace(208)* %idx3, align 4
+
+ ; Call task dispatch intrinsic.
+ ; bitmap = 4 << 16 | 1 = 262145
+ call void @llvm.tpu.task.dispatch(i32 addrspace(208)* %dmem0, i32 262145)
+
+ ; Task Dispatch 2: Allocate descriptor in dreg address space.
+ %dmem1 = tail call i32 addrspace(208)* @llvm.tpu.alloca.dreg(i32 3)
+
+ ; Store '%tac2' and '%tec2' to first two descriptor dregs.
+ %idx4 = getelementptr inbounds i32, i32 addrspace(208)* %dmem1, i32 0
+ store i32 %tac2, i32 addrspace(208)* %idx4, align 4
+ %idx5 = getelementptr inbounds i32, i32 addrspace(208)* %dmem1, i32 1
+ store i32 %tec2, i32 addrspace(208)* %idx5, align 4
+
+ ; Store task argument.
+ %idx6 = getelementptr inbounds i32, i32 addrspace(208)* %dmem1, i32 2
+ store i32 7, i32 addrspace(208)* %idx6, align 4
+
+ ; Call task dispatch intrinsic.
+ ; bitmap = 3 << 16 | 2 = 196610
+ call void @llvm.tpu.task.dispatch(i32 addrspace(208)* %dmem1, i32 196610)
+
+ ; Call task dispatch clear_ibuf intrinsic.
+ ; bitmap = 3 << 16 | 2 = 196610
+ call void @llvm.tpu.task.dispatch.clear.ibuf(i32 addrspace(208)* %dmem1, i32 196610)
+
+ ret void
+}
+
+define void @TAC1() #1 section ".text.tile_access" {
+entry:
+ store i32 1, i32* inttoptr (i32 256 to i32*)
+ ret void
+}
+
+define void @TAC2() #1 section ".text.tile_access" {
+entry:
+ store i32 1, i32* inttoptr (i32 256 to i32*)
+ ret void
+}
+
+define void @TEC1() #2 section ".text.tile_execute" {
+entry:
+ store i32 1, i32* inttoptr (i32 256 to i32*)
+ ret void
+}
+
+define void @TEC2() #2 section ".text.tile_execute" {
+entry:
+ store i32 1, i32* inttoptr (i32 256 to i32*)
+ ret void
+}
+
+attributes #0 = { "target-cpu"="sparsecore-scs-vf" }
+attributes #1 = { "target-cpu"="sparsecore-tac-vf" }
+attributes #2 = { "target-cpu"="sparsecore-tec-vf" }
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/tmem_scoreboard_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/tmem_scoreboard_sc.ll
new file mode 100644
index 0000000..9a83680
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/tmem_scoreboard_sc.ll
@@ -0,0 +1,52 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; Test tilespmem hazard conditions
+
+declare <8 x i32> @llvm.tpu.vld.msk.idx.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>)
+
+; CHECK-LABEL: loadafterstore:
+; CHECK: { [tilespmem:s0+$0x0] = vst v{{[0-9]+}} }
+; CHECK-NEXT: { v{{[0-9]+}} = vld [tilespmem:s1+$0x0];
+define <8 x i32> @loadafterstore(<8 x i32> addrspace(201)* %ptr0, <8 x i32> addrspace(201)* %ptr1) {
+ store <8 x i32> zeroinitializer, <8 x i32> addrspace(201)* %ptr0
+ %res = load <8 x i32>, <8 x i32> addrspace(201)* %ptr1
+ ret <8 x i32> %res
+}
+
+; Test a case where the compiler can verify that the load and store don't alias
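+; The load below is offset by 3200 <8 x i32> vectors from %ptr; assuming tilespmem
+; displacements are in 32-bit words, that is 3200 * 8 = 25600 = 0x6400, which is
+; the displacement expected by the CHECK line.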
+; CHECK-LABEL: loadafterstorenoalias:
+; CHECK: v{{[0-9]+}} = vld [tilespmem:s0+$0x6400]
+; CHECK: sdelay
+; CHECK: [tilespmem:s0+$0x0] = vst v{{[0-9]+}}
+; CHECK: vadd
+define <8 x i32> @loadafterstorenoalias(<8 x i32> addrspace(201)* %ptr, <8 x i32> %add2) {
+ store <8 x i32> zeroinitializer, <8 x i32> addrspace(201)* %ptr
+ %addr = getelementptr <8 x i32>, <8 x i32> addrspace(201)* %ptr, i32 3200
+ %add = load <8 x i32>, <8 x i32> addrspace(201)* %addr
+ %res = add <8 x i32> %add, %add2
+ ret <8 x i32> %res
+}
+
+; CHECK-LABEL: load_intr_afterstore:
+; CHECK: { [tilespmem:s0+$0x0] = vst v{{[0-9]+}} }
+; CHECK-NEXT: { v{{[0-9]+}} = vld.idx.msk [tilespmem:v{{[0-9]+}}+s{{[0-9]+}}+$0x0]
+define <8 x i32> @load_intr_afterstore(<8 x i32> addrspace(201)* %ptr0, <8 x i32> addrspace(201)* %ptr1, <8 x i32> %offsets) {
+ store <8 x i32> zeroinitializer, <8 x i32> addrspace(201)* %ptr0
+ %res = call <8 x i32> @llvm.tpu.vld.msk.idx.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* %ptr1, <8 x i32> %offsets)
+ ret <8 x i32> %res
+}
+
+; CHECK-LABEL: back_to_back_load:
+; CHECK: v{{[0-9]+}} = vld.idx.msk [tilespmem:v0+s0+$0x0]
+; CHECK-NEXT: v{{[0-9]+}} = vld.idx.msk [tilespmem:v1+s1+$0x0]
+; CHECK: v{{[0-9]+}} = vadd.s32 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i32> @back_to_back_load(<8 x i32> addrspace(201)* %ptr0, <8 x i32> addrspace(201)* %ptr1, <8 x i32> %offsets1, <8 x i32> %offsets2) {
+ %l0 = tail call <8 x i32> @llvm.tpu.vld.msk.idx.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* nonnull %ptr0, <8 x i32> %offsets1)
+ %l1 = tail call <8 x i32> @llvm.tpu.vld.msk.idx.v8i32(<8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> addrspace(201)* nonnull %ptr1, <8 x i32> %offsets2)
+ %a0 = add <8 x i32> %l0, %l1
+ ret <8 x i32> %a0
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/trace_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/trace_sc.ll
new file mode 100644
index 0000000..8a65dc1
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/trace_sc.ll
@@ -0,0 +1,71 @@
+; RUN: llc < %s -mcpu=sparsecore-scs-vf -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare void @llvm.tpu.sc.ssettm(i32) nounwind
+declare void @llvm.tpu.sc.strace(i32) nounwind
+
+; CHECK-LABEL: strace_r:
+; CHECK: _ = strace s{{[0-9]+}}
+define void @strace_r(i32 %op) {
+ call void @llvm.tpu.sc.strace(i32 %op)
+ ret void
+}
+
+; CHECK-LABEL: strace_i:
+; CHECK: _ = strace $0x1
+define void @strace_i() {
+ call void @llvm.tpu.sc.strace(i32 1)
+ ret void
+}
+
+; CHECK-LABEL: ssettm_r:
+; CHECK: (tm) = ssettm s{{[0-9]+}}
+define void @ssettm_r(i32 %op) {
+ call void @llvm.tpu.sc.ssettm(i32 %op)
+ ret void
+}
+
+; CHECK-LABEL: ssettm_i:
+; CHECK: (tm) = ssettm $0x1
+define void @ssettm_i() {
+ call void @llvm.tpu.sc.ssettm(i32 1)
+ ret void
+}
+
+; CHECK-LABEL: ssettm_delay_trace:
+; CHECK: { (tm) = ssettm s{{[0-9]+}}
+; CHECK-NEXT: _ = sdelay $0x2 }
+; CHECK-NEXT: { _ = strace $0x1 }
+; CHECK-NEXT: { (tm) = ssettm s{{[0-9]+}}
+; CHECK-NEXT: _ = shalt }
+define void @ssettm_delay_trace(i32 %a, i32 %b) {
+ call void @llvm.tpu.sc.ssettm(i32 %a)
+ call void @llvm.tpu.sc.strace(i32 1)
+ call void @llvm.tpu.sc.ssettm(i32 %b)
+ ret void
+}
+
+; CHECK-LABEL: ssettm_delay_trace_three:
+; CHECK: { _ = strace $0x0 }
+; CHECK-NEXT: { (tm) = ssettm s{{[0-9]+}};
+; CHECK-NEXT: _ = sdelay $0x2 }
+; CHECK-NEXT: { _ = strace $0x1 }
+; CHECK-NEXT: { (tm) = ssettm s{{[0-9]+}};
+; CHECK-NEXT: _ = sdelay $0x2 }
+; CHECK-NEXT: { _ = strace $0x2;
+; CHECK-NEXT: _ = shalt }
+define void @ssettm_delay_trace_three(i32 %a, i32 %b) {
+ call void @llvm.tpu.sc.strace(i32 0)
+ call void @llvm.tpu.sc.ssettm(i32 %a)
+ call void @llvm.tpu.sc.strace(i32 1)
+ call void @llvm.tpu.sc.ssettm(i32 %b)
+ call void @llvm.tpu.sc.strace(i32 2)
+ ret void
+}
+
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/trap_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/trap_sc.ll
new file mode 100644
index 0000000..c032f20
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/trap_sc.ll
@@ -0,0 +1,166 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare void @llvm.tpu.halt.trap(i1)
+declare void @llvm.tpu.wait.trap(i8*, i32, i1)
+declare void @llvm.tpu.waiteq(i32 addrspace(204)*, i32)
+
+@.str = private unnamed_addr constant [13 x i8] c"hello, world\00", align 1
+
+; CHECK-LABEL: trap:
+; CHECK: { _ = shalt @p0 }
+; CHECK: { _ = shalt @p1 }
+; CHECK: { _ = shalt }
+define void @trap(i1 %f1, i1 %f2) {
+ call void @llvm.tpu.halt.trap(i1 %f1)
+ call void @llvm.tpu.halt.trap(i1 %f2)
+ ret void
+}
+
+; We're intentionally not printing the string on SparseCore.
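+;
+; A sketch of the lowering checked below, as read from the CHECK lines (not an
+; authoritative description): the trap materializes a sync-flag address with simm,
+; clears the flag with ssyncset, raises the interrupt with sint, and then blocks
+; on swait.eq until the flag becomes 1 (presumably set by whatever services the
+; interrupt).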
+
+; CHECK-LABEL: sc_trap_r:
+; CHECK: { s[[s:[0-9]+]] = simm.s32 @p[[p:[0-9]+]] $0x0 }
+; CHECK-NEXT: { [sflag:s[[s]]] = ssyncset.s32 @p[[p]] $0x0 }
+; CHECK-NEXT: { _ = sint @p[[p]] s0 }
+; CHECK-NEXT: { _ = swait.eq @p[[p]] [sflag:s[[s]]], $0x1
+define void @sc_trap_r(i32 %t, i1 %p) {
+ call void @llvm.tpu.wait.trap(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0), i32 %t, i1 %p)
+ ret void
+}
+
+; CHECK-LABEL: sc_trap_i:
+; CHECK: { s[[s:[0-9]+]] = simm.s32 @p[[p:[0-9]+]] $0x0 }
+; CHECK-NEXT: { [sflag:s[[s]]] = ssyncset.s32 @p[[p]] $0x0 }
+; CHECK-NEXT: { _ = sint @p[[p]] $0x1 }
+; CHECK-NEXT: { _ = swait.eq @p[[p]] [sflag:s[[s]]], $0x1
+define void @sc_trap_i(i1 %p) {
+ call void @llvm.tpu.wait.trap(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0), i32 1, i1 %p)
+ ret void
+}
+
+; Tests that simm, ssyncset, etc. bundle with other instructions, while
+; the trap's swait.eq is kept in a separate bundle right after the sint.
+
+; CHECK-LABEL: sc_trap_r_bundle:
+; CHECK: { s{{[0-9]+}} = sadd.s32 {{.*}}
+; CHECK-NEXT: s[[s:[0-9]+]] = simm.s32 @p[[p:[0-9]+]] $0x0 }
+; CHECK-NEXT: { s{{[0-9]+}} = sadd.s32 {{.*}}
+; CHECK-NEXT: [sflag:s[[s]]] = ssyncset.s32 @p[[p]] $0x0 }
+; CHECK-NEXT: { _ = sint @p[[p]] s0 }
+; CHECK-NEXT: { s{{[0-9]+}} = sadd.s32 {{.*}};
+; CHECK-NEXT: _ = swait.eq @p[[p]] [sflag:s[[s]]], $0x1 }
+; CHECK: _ = shalt }
+define i32 @sc_trap_r_bundle(i1 %p, i32 %t, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g) {
+ %i0 = add i32 %a, %b
+ %i1 = add i32 %i0, %c
+ %i2 = add i32 %i1, %d
+ %i3 = add i32 %i2, %e
+ %i4 = add i32 %i3, %f
+ %i5 = add i32 %i4, %g
+ call void @llvm.tpu.wait.trap(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0), i32 %t, i1 %p)
+ ret i32 %i5
+}
+
+; Same as above, but with the immediate version.
+
+; CHECK-LABEL: sc_trap_i_bundle:
+; CHECK: { s{{[0-9]+}} = sadd.s32 {{.*}};
+; CHECK-NEXT: s[[s:[0-9]+]] = simm.s32 @p[[p:[0-9]+]] $0x0 }
+; CHECK-NEXT: { s{{[0-9]+}} = sadd.s32 {{.*}};
+; CHECK-NEXT: [sflag:s[[s]]] = ssyncset.s32 @p[[p]] $0x0 }
+; CHECK-NEXT: { _ = sint @p[[p]] $0x3 }
+; CHECK-NEXT: { s{{[0-9]+}} = sadd.s32 {{.*}}
+; CHECK-NEXT: _ = swait.eq @p[[p]] [sflag:s[[s]]], $0x1 }
+; CHECK: _ = shalt }
+define i32 @sc_trap_i_bundle(i1 %p, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g) {
+ %i0 = add i32 %a, %b
+ %i1 = add i32 %i0, %c
+ %i2 = add i32 %i1, %d
+ %i3 = add i32 %i2, %e
+ %i4 = add i32 %i3, %f
+ %i5 = add i32 %i4, %g
+ call void @llvm.tpu.wait.trap(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0), i32 3, i1 %p)
+ ret i32 %i5
+}
+
+; Tests that statically known predicated traps are optimized away.
+
+; CHECK-LABEL: del_sc_trap:
+; CHECK-NOT: { _ = sint @p{{[0-9]+}}
+define void @del_sc_trap() {
+ call void @llvm.tpu.wait.trap(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0), i32 0, i1 0)
+ ret void
+}
+
+; Tests that we insert a separate bundle for the swait if it can't
+; bundle with the successor bundle of the trap.
+
+; CHECK-LABEL: sc_trap_r_alone_bundle:
+; CHECK: { s{{[0-9]+}} = simm.s32 @p0 $0x0 }
+; CHECK-NEXT: { [sflag:s{{[0-9]+}}] = ssyncset.s32 @p0 $0x0 }
+; CHECK-NEXT: { _ = sint @p0 s0 }
+; CHECK-NEXT: { _ = swait.eq @p0 [sflag:s{{[0-9]+}}], $0x1 }
+
+define void @sc_trap_r_alone_bundle(i1 %p, i32 %t, i32 %a, i32 %b, i32 %c, i32 %d, i32 addrspace(204)* %sflag0,
+ i32 addrspace(204)* %sflag1, i32 addrspace(204)* %sflag2) {
+ call void @llvm.tpu.wait.trap(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0), i32 %t, i1 %p)
+
+ call void @llvm.tpu.waiteq(i32 addrspace(204)* %sflag0, i32 0)
+ call void @llvm.tpu.waiteq(i32 addrspace(204)* %sflag1, i32 0)
+ call void @llvm.tpu.waiteq(i32 addrspace(204)* %sflag2, i32 0)
+ ret void
+}
+
+; Same as above, but with the immediate version.
+
+; CHECK-LABEL: sc_trap_i_alone_bundle:
+; CHECK: { s{{[0-9]+}} = simm.s32 @p0 $0x0 }
+; CHECK-NEXT: { [sflag:s{{[0-9]+}}] = ssyncset.s32 @p0 $0x0 }
+; CHECK-NEXT: { _ = sint @p0 $0x5 }
+; CHECK-NEXT: { _ = swait.eq @p0 [sflag:s{{[0-9]+}}], $0x1 }
+
+define void @sc_trap_i_alone_bundle(i1 %p, i32 %a, i32 %b, i32 %c, i32 %d, i32 addrspace(204)* %sflag0,
+ i32 addrspace(204)* %sflag1, i32 addrspace(204)* %sflag2) {
+ call void @llvm.tpu.wait.trap(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0), i32 5, i1 %p)
+
+ call void @llvm.tpu.waiteq(i32 addrspace(204)* %sflag0, i32 0)
+ call void @llvm.tpu.waiteq(i32 addrspace(204)* %sflag1, i32 0)
+ call void @llvm.tpu.waiteq(i32 addrspace(204)* %sflag2, i32 0)
+ ret void
+}
+
+; Tests that a trap in a block without a terminator (the .here block)
+; is expanded with the additional wait bundle.
+
+; CHECK-LABEL: sc_trap_r_block_end_bundle:
+; CHECK: { _ = sint @!p0 s{{[0-9]+}} }
+; CHECK-NEXT: { _ = swait.eq @!p0 [sflag:s{{[0-9]+}}], $0x1
+
+define void @sc_trap_r_block_end_bundle(i1 %p1, i1 %p2, i32 %t, i32 %a, i32 %b, i32 %c, i32 %d, i32 addrspace(204)* %sflag0,
+ i32 addrspace(204)* %sflag1, i32 addrspace(204)* %sflag2) {
+ br i1 %p2, label %.here, label %.there
+
+.here:
+ call void @llvm.tpu.wait.trap(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0), i32 %t, i1 %p1)
+ br label %.there
+
+.there:
+ call void @llvm.tpu.waiteq(i32 addrspace(204)* %sflag1, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: sc_trap_inv_pred:
+; CHECK: { s[[s:[0-9]+]] = simm.s32 @!p[[p:[0-9]+]] $0x0 }
+; CHECK-NEXT: { [sflag:s[[s]]] = ssyncset.s32 @!p[[p]] $0x0 }
+; CHECK-NEXT: { _ = sint @!p[[p]] s0 }
+; CHECK-NEXT: { _ = swait.eq @!p[[p]] [sflag:s[[s]]], $0x1
+define void @sc_trap_inv_pred(i32 %t, i1 %p) {
+ %pi = xor i1 %p, true
+ call void @llvm.tpu.wait.trap(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0), i32 %t, i1 %pi)
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/trap_simplify_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/trap_simplify_sc.ll
new file mode 100644
index 0000000..0174819
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/trap_simplify_sc.ll
@@ -0,0 +1,41 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare void @llvm.tpu.halt.trap(i1)
+
+; CHECK: trap_0
+; CHECK: shalt
+; CHECK-NOT: shalt
+define void @trap_0() {
+ call void @llvm.tpu.halt.trap(i1 0)
+ ret void
+}
+
+; CHECK: trap_1a
+; CHECK: shalt
+; CHECK-NOT: shalt
+define void @trap_1a() {
+ call void @llvm.tpu.halt.trap(i1 1)
+ ret void
+}
+
+; CHECK: trap_1b
+; CHECK-NOT: simm.s32 $0x1
+; CHECK: s0 = simm.s32 @p0 $0x2
+; CHECK: shalt @p0
+define i32 @trap_1b(i1 %p) {
+entry:
+ br i1 %p, label %bbr, label %bbt
+
+bbt:
+ call void @llvm.tpu.halt.trap(i1 1)
+ ret i32 1
+
+bbr:
+ ret i32 2
+}
+
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/v2s_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/v2s_sc.ll
new file mode 100644
index 0000000..aea181d
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/v2s_sc.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -stop-after=tpu-isel | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; Tests that we're matching the right vpush opcode on SparseCore.
+; CHECK-LABEL: v2s_test
+; CHECK: scVREADi
+define float @v2s_test(i32 %a) {
+ %1 = load <8 x float>, <8 x float> addrspace(201)* inttoptr (i32 8192 to <8 x float> addrspace(201)*), align 32
+ %r = extractelement <8 x float> %1, i32 0
+ ret float %r
+}
+
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/v2sf_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/v2sf_sc.ll
new file mode 100644
index 0000000..9bcd20b
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/v2sf_sc.ll
@@ -0,0 +1,49 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -tpu-no-push-pop-reordering | FileCheck --check-prefixes=CHECK %s
+; RUN: llc < %s -mcpu=sparsecore-tec-gl -asm-verbose=false -disable-cgp \
+; RUN: -tpu-no-push-pop-reordering | FileCheck --check-prefixes=CHECK %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; We generate many vpush/spop pairs and ensure they are properly interleaved.
+; If this test fails, it will actually cause a crash (as the register allocator
+; tries to find two registers for (v2sf)).
+
+; CHECK-LABEL: v2sf:
+; CHECK: vpush v0, $0x0
+; CHECK: spop
+; CHECK: vpush v0, $0x1
+; CHECK: spop
+; CHECK: vpush v0, $0x2
+; CHECK: spop
+; CHECK: vpush v0, $0x3
+; CHECK: spop
+; CHECK: vpush v0, $0x4
+; CHECK: spop
+; CHECK: vpush v0, $0x5
+; CHECK: spop
+define i32 @v2sf(<8 x i32> %a) {
+ %b = extractelement <8 x i32> %a, i32 0
+ %c = extractelement <8 x i32> %a, i32 1
+ %d = extractelement <8 x i32> %a, i32 2
+ %e = extractelement <8 x i32> %a, i32 3
+ %f = extractelement <8 x i32> %a, i32 4
+ %g = extractelement <8 x i32> %a, i32 5
+ ; Note, deliberately use items out of order.
+ %z = add i32 %f, %g
+ %y = add i32 %d, %e
+ %x = add i32 %b, %c
+ %w = add i32 %z, %y
+ %v = add i32 %w, %x
+ ret i32 %v
+}
+
+; CHECK-LABEL: v2sf_delay:
+; CHECK: vpush v0, $0x0
+; CHECK: spop
+define i32 @v2sf_delay(<8 x i32> %a) {
+ %v = extractelement <8 x i32> %a, i32 0
+ ret i32 %v
+}
\ No newline at end of file
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/v2sf_tc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/v2sf_tc.ll
new file mode 100644
index 0000000..b29e14c
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/v2sf_tc.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mcpu=tensorcore-pf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; CHECK-LABEL: v2sf:
+; CHECK: (v2sf) = vpush v0
+; CHECK: (v2sf) = vpush v1
+; CHECK: (v2sf) = vpush v2
+; CHECK: (v2sf) = vpush v3
+; CHECK: _ = vdelay $0x1a
+; CHECK: s{{[0-9]+}} = spop (v2sf)
+; CHECK: s{{[0-9]+}} = spop (v2sf)
+; CHECK: s{{[0-9]+}} = spop (v2sf)
+; CHECK: s{{[0-9]+}} = spop (v2sf)
+define i32 @v2sf(<1024 x i32> %a, <1024 x i32> %b, <1024 x i32> %c, <1024 x i32> %d) {
+ %e = extractelement <1024 x i32> %a, i32 0
+ %f = extractelement <1024 x i32> %b, i32 0
+ %g = extractelement <1024 x i32> %c, i32 0
+ %h = extractelement <1024 x i32> %d, i32 0
+ %x = add i32 %g, %h
+ %y = add i32 %e, %f
+ %z = add i32 %x, %y
+ ret i32 %z
+}
+
+; CHECK-LABEL: v2sf_i1:
+; CHECK: v{{[0-9]+}} = vimm.s32 $0x0
+; CHECK: v{{[0-9]+}} = vsel vm0, $0x1, v{{[0-9]+}}
+; CHECK: (v2sf) = vpush v{{[0-9]+}}
+; CHECK: _ = vdelay $0x1d
+; CHECK: s{{[0-9]+}} = spop (v2sf)
+; CHECK: p0 = sne.s32 s{{[0-9]+}}, $0x0;
+define i1 @v2sf_i1(<1024 x i1> %m) {
+ %e = extractelement <1024 x i1> %m, i32 0
+ ret i1 %e
+}
\ No newline at end of file
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/v_spill_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/v_spill_sc.ll
new file mode 100644
index 0000000..915aaf5
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/v_spill_sc.ll
@@ -0,0 +1,138 @@
+; RUN: llc -O2 < %s -mcpu=sparsecore-tec-vf -asm-verbose=false \
+; RUN: -tpu-fixed-vregs=32-63 -tpu-fixed-maskregs=8-15 | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+!tilespmem.funcs.spill = !{!0}
+!tilespmem.ranges.spill.start = !{!1}
+!tilespmem.ranges.spill.limit = !{!2}
+
+!0 = !{void (<8 x i32> addrspace(201)*)* @spill_t_to_vreg}
+!1 = !{i32 1000}
+!2 = !{i32 2000}
+
+; CHECK-LABEL: spill_t_to_vreg:
+; CHECK: [tilespmem:$0x7c0] = vst v{{[0-9]+}}
+; CHECK: [tilespmem:$0x7c8] = vst v{{[0-9]+}}
+; CHECK: v{{[0-9]+}} = vld [tilespmem:$0x7c0]
+; CHECK: v{{[0-9]+}} = vld [tilespmem:$0x7c8]
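+;
+; The spill addresses are consistent with slots carved from the top of the declared
+; spill range [1000, 2000): 2000 - 8 = 1992 = 0x7c8 and 2000 - 16 = 1984 = 0x7c0,
+; assuming one <8 x i32> spill slot occupies 8 words.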
+
+declare <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32) nounwind
+
+define void @spill_t_to_vreg(<8 x i32> addrspace(201)* %unknownptr) {
+llo-region-0:
+ br label %llo-region-1
+
+llo-region-1: ; preds = %llo-region-0
+ %addr00 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 000)
+ %addr01 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 010)
+ %addr02 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 020)
+ %addr03 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 030)
+ %addr04 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 040)
+ %addr05 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 050)
+ %addr06 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 060)
+ %addr07 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 070)
+ %addr08 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 080)
+ %addr09 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 090)
+ %addr10 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 100)
+ %addr11 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 110)
+ %addr12 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 120)
+ %addr13 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 130)
+ %addr14 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 140)
+ %addr15 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 150)
+ %addr16 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 160)
+ %addr17 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 170)
+ %addr18 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 180)
+ %addr19 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 190)
+ %addr20 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 200)
+ %addr21 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 210)
+ %addr22 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 220)
+ %addr23 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 230)
+ %addr24 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 240)
+ %addr25 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 250)
+ %addr26 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 260)
+ %addr27 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 270)
+ %addr28 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 280)
+ %addr29 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 290)
+ %addr30 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 300)
+ %addr31 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 310)
+ %addr32 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 320)
+ %addr33 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 330)
+ %addr34 = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 340)
+
+ %val00 = load <8 x i32>, <8 x i32> addrspace(201)* %addr34
+ %val01 = load <8 x i32>, <8 x i32> addrspace(201)* %addr33
+ %val02 = load <8 x i32>, <8 x i32> addrspace(201)* %addr32
+ %val03 = load <8 x i32>, <8 x i32> addrspace(201)* %addr31
+ %val04 = load <8 x i32>, <8 x i32> addrspace(201)* %addr30
+ %val05 = load <8 x i32>, <8 x i32> addrspace(201)* %addr29
+ %val06 = load <8 x i32>, <8 x i32> addrspace(201)* %addr28
+ %val07 = load <8 x i32>, <8 x i32> addrspace(201)* %addr27
+ %val08 = load <8 x i32>, <8 x i32> addrspace(201)* %addr26
+ %val09 = load <8 x i32>, <8 x i32> addrspace(201)* %addr25
+ %val10 = load <8 x i32>, <8 x i32> addrspace(201)* %addr24
+ %val11 = load <8 x i32>, <8 x i32> addrspace(201)* %addr23
+ %val12 = load <8 x i32>, <8 x i32> addrspace(201)* %addr22
+ %val13 = load <8 x i32>, <8 x i32> addrspace(201)* %addr21
+ %val14 = load <8 x i32>, <8 x i32> addrspace(201)* %addr20
+ %val15 = load <8 x i32>, <8 x i32> addrspace(201)* %addr19
+ %val16 = load <8 x i32>, <8 x i32> addrspace(201)* %addr18
+ %val17 = load <8 x i32>, <8 x i32> addrspace(201)* %addr17
+ %val18 = load <8 x i32>, <8 x i32> addrspace(201)* %addr16
+ %val19 = load <8 x i32>, <8 x i32> addrspace(201)* %addr15
+ %val20 = load <8 x i32>, <8 x i32> addrspace(201)* %addr14
+ %val21 = load <8 x i32>, <8 x i32> addrspace(201)* %addr13
+ %val22 = load <8 x i32>, <8 x i32> addrspace(201)* %addr12
+ %val23 = load <8 x i32>, <8 x i32> addrspace(201)* %addr11
+ %val24 = load <8 x i32>, <8 x i32> addrspace(201)* %addr10
+ %val25 = load <8 x i32>, <8 x i32> addrspace(201)* %addr09
+ %val26 = load <8 x i32>, <8 x i32> addrspace(201)* %addr08
+ %val27 = load <8 x i32>, <8 x i32> addrspace(201)* %addr07
+ %val28 = load <8 x i32>, <8 x i32> addrspace(201)* %addr06
+ %val29 = load <8 x i32>, <8 x i32> addrspace(201)* %addr05
+ %val30 = load <8 x i32>, <8 x i32> addrspace(201)* %addr04
+ %val31 = load <8 x i32>, <8 x i32> addrspace(201)* %addr03
+ %val32 = load <8 x i32>, <8 x i32> addrspace(201)* %addr02
+ %val33 = load <8 x i32>, <8 x i32> addrspace(201)* %addr01
+ %val34 = load <8 x i32>, <8 x i32> addrspace(201)* %addr00
+
+ store <8 x i32> %val34, <8 x i32> addrspace(201)* %unknownptr
+ store <8 x i32> %val33, <8 x i32> addrspace(201)* %addr33
+ store <8 x i32> %val32, <8 x i32> addrspace(201)* %addr32
+ store <8 x i32> %val31, <8 x i32> addrspace(201)* %addr31
+ store <8 x i32> %val30, <8 x i32> addrspace(201)* %addr30
+ store <8 x i32> %val29, <8 x i32> addrspace(201)* %addr29
+ store <8 x i32> %val28, <8 x i32> addrspace(201)* %addr28
+ store <8 x i32> %val27, <8 x i32> addrspace(201)* %addr27
+ store <8 x i32> %val26, <8 x i32> addrspace(201)* %addr26
+ store <8 x i32> %val25, <8 x i32> addrspace(201)* %addr25
+ store <8 x i32> %val24, <8 x i32> addrspace(201)* %addr24
+ store <8 x i32> %val23, <8 x i32> addrspace(201)* %addr23
+ store <8 x i32> %val22, <8 x i32> addrspace(201)* %addr22
+ store <8 x i32> %val21, <8 x i32> addrspace(201)* %addr21
+ store <8 x i32> %val20, <8 x i32> addrspace(201)* %addr20
+ store <8 x i32> %val19, <8 x i32> addrspace(201)* %addr19
+ store <8 x i32> %val18, <8 x i32> addrspace(201)* %addr18
+ store <8 x i32> %val17, <8 x i32> addrspace(201)* %addr17
+ store <8 x i32> %val16, <8 x i32> addrspace(201)* %addr16
+ store <8 x i32> %val15, <8 x i32> addrspace(201)* %addr15
+ store <8 x i32> %val14, <8 x i32> addrspace(201)* %addr14
+ store <8 x i32> %val13, <8 x i32> addrspace(201)* %addr13
+ store <8 x i32> %val12, <8 x i32> addrspace(201)* %addr12
+ store <8 x i32> %val11, <8 x i32> addrspace(201)* %addr11
+ store <8 x i32> %val00, <8 x i32> addrspace(201)* %addr10
+ store <8 x i32> %val09, <8 x i32> addrspace(201)* %addr09
+ store <8 x i32> %val08, <8 x i32> addrspace(201)* %addr08
+ store <8 x i32> %val07, <8 x i32> addrspace(201)* %addr07
+ store <8 x i32> %val06, <8 x i32> addrspace(201)* %addr06
+ store <8 x i32> %val05, <8 x i32> addrspace(201)* %addr05
+ store <8 x i32> %val04, <8 x i32> addrspace(201)* %addr04
+ store <8 x i32> %val03, <8 x i32> addrspace(201)* %addr03
+ store <8 x i32> %val02, <8 x i32> addrspace(201)* %addr02
+ store <8 x i32> %val01, <8 x i32> addrspace(201)* %addr01
+ store <8 x i32> %val00, <8 x i32> addrspace(201)* %addr00
+
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/v_spill_tc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/v_spill_tc.ll
new file mode 100644
index 0000000..0de510c
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/v_spill_tc.ll
@@ -0,0 +1,137 @@
+; RUN: llc < %s -mcpu=tensorcore-jf -asm-verbose=false -tpu-skip-fast-opt | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+!vmem.funcs.spill = !{!0}
+!vmem.ranges.spill.start = !{!1}
+!vmem.ranges.spill.limit = !{!2}
+
+!0 = !{void (<1024 x i32> addrspace(205)*)* @spill_v_to_vreg}
+!1 = !{i32 1000}
+!2 = !{i32 2000}
+
+; CHECK-LABEL: spill_v_to_vreg:
+; CHECK-DAG: [vmem:$0x7c8] = vst v{{[0-9]+}}
+; CHECK-DAG: [vmem:$0x7c0] = vst v{{[0-9]+}}
+; CHECK-DAG: v{{[0-9]+}} = vld [vmem:$0x7c0]
+; CHECK-DAG: v{{[0-9]+}} = vld [vmem:$0x7c8]
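+;
+; As in the SparseCore variant, the spill addresses 0x7c0 and 0x7c8 sit just below
+; the declared spill limit of 2000 (0x7d0), with consecutive spill slots 8
+; addresses apart.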
+
+declare <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32) nounwind
+
+define void @spill_v_to_vreg(<1024 x i32> addrspace(205)* %unknownptr) {
+llo-region-0:
+ br label %llo-region-1
+
+llo-region-1: ; preds = %llo-region-0
+ %addr00 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 000)
+ %addr01 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 010)
+ %addr02 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 020)
+ %addr03 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 030)
+ %addr04 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 040)
+ %addr05 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 050)
+ %addr06 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 060)
+ %addr07 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 070)
+ %addr08 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 080)
+ %addr09 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 090)
+ %addr10 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 100)
+ %addr11 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 110)
+ %addr12 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 120)
+ %addr13 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 130)
+ %addr14 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 140)
+ %addr15 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 150)
+ %addr16 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 160)
+ %addr17 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 170)
+ %addr18 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 180)
+ %addr19 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 190)
+ %addr20 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 200)
+ %addr21 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 210)
+ %addr22 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 220)
+ %addr23 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 230)
+ %addr24 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 240)
+ %addr25 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 250)
+ %addr26 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 260)
+ %addr27 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 270)
+ %addr28 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 280)
+ %addr29 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 290)
+ %addr30 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 300)
+ %addr31 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 310)
+ %addr32 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 320)
+ %addr33 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 330)
+ %addr34 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 340)
+
+ %val00 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr34
+ %val01 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr33
+ %val02 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr32
+ %val03 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr31
+ %val04 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr30
+ %val05 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr29
+ %val06 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr28
+ %val07 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr27
+ %val08 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr26
+ %val09 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr25
+ %val10 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr24
+ %val11 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr23
+ %val12 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr22
+ %val13 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr21
+ %val14 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr20
+ %val15 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr19
+ %val16 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr18
+ %val17 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr17
+ %val18 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr16
+ %val19 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr15
+ %val20 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr14
+ %val21 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr13
+ %val22 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr12
+ %val23 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr11
+ %val24 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr10
+ %val25 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr09
+ %val26 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr08
+ %val27 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr07
+ %val28 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr06
+ %val29 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr05
+ %val30 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr04
+ %val31 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr03
+ %val32 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr02
+ %val33 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr01
+ %val34 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr00
+
+ store <1024 x i32> %val34, <1024 x i32> addrspace(205)* %unknownptr
+ store <1024 x i32> %val33, <1024 x i32> addrspace(205)* %addr33
+ store <1024 x i32> %val32, <1024 x i32> addrspace(205)* %addr32
+ store <1024 x i32> %val31, <1024 x i32> addrspace(205)* %addr31
+ store <1024 x i32> %val30, <1024 x i32> addrspace(205)* %addr30
+ store <1024 x i32> %val29, <1024 x i32> addrspace(205)* %addr29
+ store <1024 x i32> %val28, <1024 x i32> addrspace(205)* %addr28
+ store <1024 x i32> %val27, <1024 x i32> addrspace(205)* %addr27
+ store <1024 x i32> %val26, <1024 x i32> addrspace(205)* %addr26
+ store <1024 x i32> %val25, <1024 x i32> addrspace(205)* %addr25
+ store <1024 x i32> %val24, <1024 x i32> addrspace(205)* %addr24
+ store <1024 x i32> %val23, <1024 x i32> addrspace(205)* %addr23
+ store <1024 x i32> %val22, <1024 x i32> addrspace(205)* %addr22
+ store <1024 x i32> %val21, <1024 x i32> addrspace(205)* %addr21
+ store <1024 x i32> %val20, <1024 x i32> addrspace(205)* %addr20
+ store <1024 x i32> %val19, <1024 x i32> addrspace(205)* %addr19
+ store <1024 x i32> %val18, <1024 x i32> addrspace(205)* %addr18
+ store <1024 x i32> %val17, <1024 x i32> addrspace(205)* %addr17
+ store <1024 x i32> %val16, <1024 x i32> addrspace(205)* %addr16
+ store <1024 x i32> %val15, <1024 x i32> addrspace(205)* %addr15
+ store <1024 x i32> %val14, <1024 x i32> addrspace(205)* %addr14
+ store <1024 x i32> %val13, <1024 x i32> addrspace(205)* %addr13
+ store <1024 x i32> %val12, <1024 x i32> addrspace(205)* %addr12
+ store <1024 x i32> %val11, <1024 x i32> addrspace(205)* %addr11
+ store <1024 x i32> %val00, <1024 x i32> addrspace(205)* %addr10
+ store <1024 x i32> %val09, <1024 x i32> addrspace(205)* %addr09
+ store <1024 x i32> %val08, <1024 x i32> addrspace(205)* %addr08
+ store <1024 x i32> %val07, <1024 x i32> addrspace(205)* %addr07
+ store <1024 x i32> %val06, <1024 x i32> addrspace(205)* %addr06
+ store <1024 x i32> %val05, <1024 x i32> addrspace(205)* %addr05
+ store <1024 x i32> %val04, <1024 x i32> addrspace(205)* %addr04
+ store <1024 x i32> %val03, <1024 x i32> addrspace(205)* %addr03
+ store <1024 x i32> %val02, <1024 x i32> addrspace(205)* %addr02
+ store <1024 x i32> %val01, <1024 x i32> addrspace(205)* %addr01
+ store <1024 x i32> %val00, <1024 x i32> addrspace(205)* %addr00
+
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/v_vm_spill_tc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/v_vm_spill_tc.ll
new file mode 100644
index 0000000..49d150c
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/v_vm_spill_tc.ll
@@ -0,0 +1,201 @@
+; RUN: llc -O2 < %s -mcpu=tensorcore-jf -asm-verbose=false | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+!vmem.funcs.spill = !{!0}
+!vmem.ranges.spill.start = !{!1}
+!vmem.ranges.spill.limit = !{!2}
+
+!0 = !{void (<1024 x i32> addrspace(205)*)* @spill_v_vm_to_vreg}
+!1 = !{i32 1000}
+!2 = !{i32 2000}
+
+; CHECK-LABEL: spill_v_vm_to_vreg:
+; No further checks are performed at present. This mainly tests whether
+; the register scavenger is able to spill both vms and vprs
+; at the same time.
+; TODO(hgreving): implement more checks once the frame indices have
+; been optimized.
+
+declare <1024 x i32> @llvm.tpu.vlaneseq() #0
+declare <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32) nounwind
+
+define void @spill_v_vm_to_vreg(<1024 x i32> addrspace(205)* %unknownptr) {
+llo-region-0:
+ br label %llo-region-1
+
+llo-region-1: ; preds = %llo-region-0
+ %addr00 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 000)
+ %addr01 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 010)
+ %addr02 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 020)
+ %addr03 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 030)
+ %addr04 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 040)
+ %addr05 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 050)
+ %addr06 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 060)
+ %addr07 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 070)
+ %addr08 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 080)
+ %addr09 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 090)
+ %addr10 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 100)
+ %addr11 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 110)
+ %addr12 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 120)
+ %addr13 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 130)
+ %addr14 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 140)
+ %addr15 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 150)
+ %addr16 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 160)
+ %addr17 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 170)
+ %addr18 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 180)
+ %addr19 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 190)
+ %addr20 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 200)
+ %addr21 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 210)
+ %addr22 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 220)
+ %addr23 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 230)
+ %addr24 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 240)
+ %addr25 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 250)
+ %addr26 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 260)
+ %addr27 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 270)
+ %addr28 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 280)
+ %addr29 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 290)
+ %addr30 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 300)
+ %addr31 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 310)
+ %addr32 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 320)
+ %addr33 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 330)
+ %addr34 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 340)
+
+ %val00 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr34
+ %val01 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr33
+ %val02 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr32
+ %val03 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr31
+ %val04 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr30
+ %val05 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr29
+ %val06 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr28
+ %val07 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr27
+ %val08 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr26
+ %val09 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr25
+ %val10 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr24
+ %val11 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr23
+ %val12 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr22
+ %val13 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr21
+ %val14 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr20
+ %val15 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr19
+ %val16 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr18
+ %val17 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr17
+ %val18 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr16
+ %val19 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr15
+ %val20 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr14
+ %val21 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr13
+ %val22 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr12
+ %val23 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr11
+ %val24 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr10
+ %val25 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr09
+ %val26 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr08
+ %val27 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr07
+ %val28 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr06
+ %val29 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr05
+ %val30 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr04
+ %val31 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr03
+ %val32 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr02
+ %val33 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr01
+ %val34 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr00
+
+ %laneseq = call <1024 x i32> @llvm.tpu.vlaneseq()
+ %.splatinsert = insertelement <1024 x i32> undef, i32 127, i32 0
+ %.splat = shufflevector <1024 x i32> %.splatinsert, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %base_rec = and <1024 x i32> %laneseq, %.splat
+
+ %.splatinsert1 = insertelement <1024 x i32> undef, i32 10, i32 0
+ %.splat2 = shufflevector <1024 x i32> %.splatinsert1, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %.splatinsert3 = insertelement <1024 x i32> undef, i32 11, i32 0
+ %.splat4 = shufflevector <1024 x i32> %.splatinsert3, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %cmp1 = icmp eq <1024 x i32> %base_rec, %.splat4
+ %.splatinsert5 = insertelement <1024 x i32> undef, i32 1, i32 0
+ %.splat6 = shufflevector <1024 x i32> %.splatinsert5, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %cmp2 = icmp eq <1024 x i32> %base_rec, %.splat6
+ %.splatinsert7 = insertelement <1024 x i32> undef, i32 2, i32 0
+ %.splat8 = shufflevector <1024 x i32> %.splatinsert7, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %cmp3 = icmp eq <1024 x i32> %base_rec, %.splat8
+ %.splatinsert9 = insertelement <1024 x i32> undef, i32 3, i32 0
+ %.splat10 = shufflevector <1024 x i32> %.splatinsert9, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %cmp4 = icmp eq <1024 x i32> %base_rec, %.splat10
+ %.splatinsert11 = insertelement <1024 x i32> undef, i32 4, i32 0
+ %.splat12 = shufflevector <1024 x i32> %.splatinsert11, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %cmp5 = icmp eq <1024 x i32> %base_rec, %.splat12
+ %.splatinsert13 = insertelement <1024 x i32> undef, i32 5, i32 0
+ %.splat14 = shufflevector <1024 x i32> %.splatinsert13, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %cmp6 = icmp eq <1024 x i32> %base_rec, %.splat14
+ %.splatinsert15 = insertelement <1024 x i32> undef, i32 6, i32 0
+ %.splat16 = shufflevector <1024 x i32> %.splatinsert15, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %cmp7 = icmp eq <1024 x i32> %base_rec, %.splat16
+ %.splatinsert17 = insertelement <1024 x i32> undef, i32 7, i32 0
+ %.splat18 = shufflevector <1024 x i32> %.splatinsert17, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %cmp8 = icmp eq <1024 x i32> %base_rec, %.splat18
+ %.splatinsert19 = insertelement <1024 x i32> undef, i32 8, i32 0
+ %.splat20 = shufflevector <1024 x i32> %.splatinsert19, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %cmp9 = icmp eq <1024 x i32> %base_rec, %.splat20
+ %.splatinsert23 = insertelement <1024 x i32> undef, i32 9, i32 0
+ %.splat24 = shufflevector <1024 x i32> %.splatinsert23, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %cmp10 = icmp sgt <1024 x i32> %base_rec, %.splat24
+
+ %sel0 = select <1024 x i1> %cmp1, <1024 x i32> %base_rec, <1024 x i32> %.splat2
+ %sel1 = select <1024 x i1> %cmp10, <1024 x i32> %base_rec, <1024 x i32> %.splat2
+ %sel2 = select <1024 x i1> %cmp2, <1024 x i32> %base_rec, <1024 x i32> %.splat2
+ %add0 = add <1024 x i32> %sel1, %sel0
+ %sel3 = select <1024 x i1> %cmp3, <1024 x i32> %base_rec, <1024 x i32> %.splat2
+ %add1 = add <1024 x i32> %add0, %sel2
+ %sel4 = select <1024 x i1> %cmp4, <1024 x i32> %base_rec, <1024 x i32> %.splat2
+ %add2 = add <1024 x i32> %add1, %sel3
+ %sel5 = select <1024 x i1> %cmp5, <1024 x i32> %base_rec, <1024 x i32> %.splat2
+ %add3 = add <1024 x i32> %add2, %sel4
+ %sel6 = select <1024 x i1> %cmp6, <1024 x i32> %base_rec, <1024 x i32> %.splat2
+ %add4 = add <1024 x i32> %add3, %sel5
+ %sel7 = select <1024 x i1> %cmp7, <1024 x i32> %base_rec, <1024 x i32> %.splat2
+ %add5 = add <1024 x i32> %add4, %sel6
+ %sel8 = select <1024 x i1> %cmp8, <1024 x i32> %base_rec, <1024 x i32> %.splat2
+ %add6 = add <1024 x i32> %add5, %sel7
+ %sel9 = select <1024 x i1> %cmp9, <1024 x i32> %base_rec, <1024 x i32> %.splat2
+ %add7 = add <1024 x i32> %add6, %sel8
+ %add8 = add <1024 x i32> %add7, %sel9
+ %result = sub <1024 x i32> %add8, %base_rec
+
+ %result_addr = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 0)
+ store <1024 x i32> %result, <1024 x i32> addrspace(205)* %result_addr
+
+ store <1024 x i32> %val34, <1024 x i32> addrspace(205)* %unknownptr
+ store <1024 x i32> %val33, <1024 x i32> addrspace(205)* %addr33
+ store <1024 x i32> %val32, <1024 x i32> addrspace(205)* %addr32
+ store <1024 x i32> %val31, <1024 x i32> addrspace(205)* %addr31
+ store <1024 x i32> %val30, <1024 x i32> addrspace(205)* %addr30
+ store <1024 x i32> %val29, <1024 x i32> addrspace(205)* %addr29
+ store <1024 x i32> %val28, <1024 x i32> addrspace(205)* %addr28
+ store <1024 x i32> %val27, <1024 x i32> addrspace(205)* %addr27
+ store <1024 x i32> %val26, <1024 x i32> addrspace(205)* %addr26
+ store <1024 x i32> %val25, <1024 x i32> addrspace(205)* %addr25
+ store <1024 x i32> %val24, <1024 x i32> addrspace(205)* %addr24
+ store <1024 x i32> %val23, <1024 x i32> addrspace(205)* %addr23
+ store <1024 x i32> %val22, <1024 x i32> addrspace(205)* %addr22
+ store <1024 x i32> %val21, <1024 x i32> addrspace(205)* %addr21
+ store <1024 x i32> %val20, <1024 x i32> addrspace(205)* %addr20
+ store <1024 x i32> %val19, <1024 x i32> addrspace(205)* %addr19
+ store <1024 x i32> %val18, <1024 x i32> addrspace(205)* %addr18
+ store <1024 x i32> %val17, <1024 x i32> addrspace(205)* %addr17
+ store <1024 x i32> %val16, <1024 x i32> addrspace(205)* %addr16
+ store <1024 x i32> %val15, <1024 x i32> addrspace(205)* %addr15
+ store <1024 x i32> %val14, <1024 x i32> addrspace(205)* %addr14
+ store <1024 x i32> %val13, <1024 x i32> addrspace(205)* %addr13
+ store <1024 x i32> %val12, <1024 x i32> addrspace(205)* %addr12
+ store <1024 x i32> %val11, <1024 x i32> addrspace(205)* %addr11
+ store <1024 x i32> %val10, <1024 x i32> addrspace(205)* %addr10
+ store <1024 x i32> %val09, <1024 x i32> addrspace(205)* %addr09
+ store <1024 x i32> %val08, <1024 x i32> addrspace(205)* %addr08
+ store <1024 x i32> %val07, <1024 x i32> addrspace(205)* %addr07
+ store <1024 x i32> %val06, <1024 x i32> addrspace(205)* %addr06
+ store <1024 x i32> %val05, <1024 x i32> addrspace(205)* %addr05
+ store <1024 x i32> %val04, <1024 x i32> addrspace(205)* %addr04
+ store <1024 x i32> %val03, <1024 x i32> addrspace(205)* %addr03
+ store <1024 x i32> %val02, <1024 x i32> addrspace(205)* %addr02
+ store <1024 x i32> %val01, <1024 x i32> addrspace(205)* %addr01
+ store <1024 x i32> %val00, <1024 x i32> addrspace(205)* %addr00
+
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vcmask_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vcmask_sc.ll
new file mode 100644
index 0000000..37e8dae
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vcmask_sc.ll
@@ -0,0 +1,111 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -tpu-enable-vcmasks -tpu-enable-embedded-masks=false \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; Tests that we're generating vcmask instructions for supported mask immediates.
+
+declare <8 x i32> @llvm.tpu.vld.msk.v8i32.p201v8i32(<8 x i1>, <8 x i32> addrspace(201)*)
+declare <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32)
+
+; CHECK-LABEL: f_vld_vcmsk_0xf:
+; CHECK: vm{{[0-9]+}} = vcmask $0xf00
+define <8 x i32> @f_vld_vcmsk_0xf() {
+entry:
+ %a = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 1)
+ %b = call <8 x i32> @llvm.tpu.vld.msk.v8i32.p201v8i32(<8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x i32> addrspace(201)* %a)
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: f_vld_vcmsk_0xf0:
+; CHECK: vm{{[0-9]+}} = vcmask $0x1f10
+define <8 x i32> @f_vld_vcmsk_0xf0() {
+entry:
+ %a = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 1)
+ %b = call <8 x i32> @llvm.tpu.vld.msk.v8i32.p201v8i32(<8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <8 x i32> addrspace(201)* %a)
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: f_vld_vcmsk_0x3c:
+; CHECK: vm{{[0-9]+}} = vcmask $0x1708
+define <8 x i32> @f_vld_vcmsk_0x3c() {
+entry:
+ %a = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 1)
+ %b = call <8 x i32> @llvm.tpu.vld.msk.v8i32.p201v8i32(<8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x i32> addrspace(201)* %a)
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: f_vld_vcmsk_0xc3:
+; CHECK: vm{{[0-9]+}} = vcmask $0x718
+define <8 x i32> @f_vld_vcmsk_0xc3() {
+entry:
+ %a = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 1)
+ %b = call <8 x i32> @llvm.tpu.vld.msk.v8i32.p201v8i32(<8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x i32> addrspace(201)* %a)
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: f_vld_vcmsk_0xe7:
+; CHECK: vm{{[0-9]+}} = vcmask $0xb14
+define <8 x i32> @f_vld_vcmsk_0xe7() {
+entry:
+ %a = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 1)
+ %b = call <8 x i32> @llvm.tpu.vld.msk.v8i32.p201v8i32(<8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>, <8 x i32> addrspace(201)* %a)
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: f_vld_vcmsk_0xef:
+; CHECK: vm{{[0-9]+}} = vcmask $0xf14
+define <8 x i32> @f_vld_vcmsk_0xef() {
+entry:
+ %a = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 1)
+ %b = call <8 x i32> @llvm.tpu.vld.msk.v8i32.p201v8i32(<8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x i32> addrspace(201)* %a)
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: f_vld_vcmsk_0xf7:
+; CHECK: vm{{[0-9]+}} = vcmask $0xb10
+define <8 x i32> @f_vld_vcmsk_0xf7() {
+entry:
+ %a = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 1)
+ %b = call <8 x i32> @llvm.tpu.vld.msk.v8i32.p201v8i32(<8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1>, <8 x i32> addrspace(201)* %a)
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: f_vld_vcmsk_0x80:
+; CHECK: vm{{[0-9]+}} = vcmask $0x1f1c
+define <8 x i32> @f_vld_vcmsk_0x80() {
+entry:
+ %a = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 1)
+ %b = call <8 x i32> @llvm.tpu.vld.msk.v8i32.p201v8i32(<8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i32> addrspace(201)* %a)
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: f_vld_vcmsk_0x1:
+; CHECK: vm{{[0-9]+}} = vcmask $0x300
+define <8 x i32> @f_vld_vcmsk_0x1() {
+entry:
+ %a = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 1)
+ %b = call <8 x i32> @llvm.tpu.vld.msk.v8i32.p201v8i32(<8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i32> addrspace(201)* %a)
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: f_vld_vcmsk_0x18:
+; CHECK: vm{{[0-9]+}} = vcmask $0x130c
+define <8 x i32> @f_vld_vcmsk_0x18() {
+entry:
+ %a = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 1)
+ %b = call <8 x i32> @llvm.tpu.vld.msk.v8i32.p201v8i32(<8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0>, <8 x i32> addrspace(201)* %a)
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: f_vld_vcmsk_0xff:
+; CHECK: vm{{[0-9]+}} = vcmask $0x1f00
+define <8 x i32> @f_vld_vcmsk_0xff() {
+entry:
+ %a = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 1)
+ %b = call <8 x i32> @llvm.tpu.vld.msk.v8i32.p201v8i32(<8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i32> addrspace(201)* %a)
+ ret <8 x i32> %b
+}
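The vcmask immediates in the CHECK lines above are not the 8-bit lane masks themselves. Reading the cases together, they appear to encode an inclusive (and possibly wrapping) byte-lane range: the low byte of the immediate is the first set lane times 4, the high byte is the last set lane times 4 plus 3. This is an inference from the expectations above, not something stated in the CL:

; mask 0x3c (lanes 2..5)                      -> start = 2*4 = 0x08, end = 5*4+3 = 0x17  => vcmask $0x1708
; mask 0xc3 (lanes 6,7,0,1 = wrapped range 6..1) -> start = 6*4 = 0x18, end = 1*4+3 = 0x07 => vcmask $0x718
; mask 0xff (all lanes)                       -> start = 0x00,        end = 0x1f          => vcmask $0x1f00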
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_bf16_gl_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_bf16_gl_sc.ll
new file mode 100644
index 0000000..e8d1a5a
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_bf16_gl_sc.ll
@@ -0,0 +1,98 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-gl -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+; Tests low precision vector ALU instructions.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <16 x bfloat> @llvm.tpu.add.low.f32.bf16(<8 x float> %x, <8 x float> %y) readnone
+declare <16 x bfloat> @llvm.tpu.add.high.f32.bf16(<8 x float> %x, <8 x float> %y) readnone
+declare <16 x bfloat> @llvm.minimum.v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) readnone
+declare <16 x bfloat> @llvm.maximum.v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) readnone
+
+; CHECK-LABEL: vadd_low_f32_bf16:
+; CHECK: v{{[0-9]+}} = vadd.low.f32.bf16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x bfloat> @vadd_low_f32_bf16(<8 x float> %x, <8 x float> %y) {
+ %a = tail call <16 x bfloat> @llvm.tpu.add.low.f32.bf16(<8 x float> %x, <8 x float> %y)
+ ret <16 x bfloat> %a
+}
+
+; CHECK-LABEL: vadd_high_f32_bf16:
+; CHECK: v{{[0-9]+}} = vadd.high.f32.bf16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x bfloat> @vadd_high_f32_bf16(<8 x float> %x, <8 x float> %y) {
+ %a = tail call <16 x bfloat> @llvm.tpu.add.high.f32.bf16(<8 x float> %x, <8 x float> %y)
+ ret <16 x bfloat> %a
+}
+
+; CHECK-LABEL: vadd_bf16:
+; CHECK: v{{[0-9]+}} = vadd.bf16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x bfloat> @vadd_bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
+ %a = fadd <16 x bfloat> %x, %y
+ ret <16 x bfloat> %a
+}
+
+; CHECK-LABEL: vsub_bf16:
+; CHECK: v{{[0-9]+}} = vsub.bf16 v0, v1
+define <16 x bfloat> @vsub_bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
+ %a = fsub <16 x bfloat> %x, %y
+ ret <16 x bfloat> %a
+}
+
+; CHECK-LABEL: vmul_bf16:
+; CHECK: v{{[0-9]+}} = vmul.bf16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x bfloat> @vmul_bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
+ %a = fmul <16 x bfloat> %x, %y
+ ret <16 x bfloat> %a
+}
+
+; CHECK-LABEL: vmax_bf16:
+; CHECK: v{{[0-9]+}} = vmax.bf16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x bfloat> @vmax_bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
+ %a = call <16 x bfloat> @llvm.maximum.v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) readnone
+ ret <16 x bfloat> %a
+}
+
+; CHECK-LABEL: vmin_bf16:
+; CHECK: v{{[0-9]+}} = vmin.bf16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x bfloat> @vmin_bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
+ %a = call <16 x bfloat> @llvm.minimum.v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) readnone
+ ret <16 x bfloat> %a
+}
+
+declare <16 x bfloat> @llvm.tpu.clamp.v16bf16(<16 x bfloat>, <16 x bfloat>, <16 x bfloat>)
+
+; CHECK-LABEL: clampi
+; CHECK: v[[v:[0-9]+]] = vmin.bf16 v2, v1
+; CHECK: v0 = vmax.bf16 v[[v]], v0
+define <16 x bfloat> @clampi(<16 x bfloat> %min, <16 x bfloat> %max, <16 x bfloat> %x) {
+entry:
+ %0 = tail call <16 x bfloat> @llvm.tpu.clamp.v16bf16(<16 x bfloat> %min, <16 x bfloat> %x, <16 x bfloat> %max)
+ ret <16 x bfloat> %0
+}
+
+; CHECK: clamps_ii
+; CHECK: v0 = vclamps.bf16 v0, $1.0
+define <16 x bfloat> @clamps_ii(<16 x bfloat> %x) {
+entry:
+ %0 = tail call <16 x bfloat> @llvm.tpu.clamp.v16bf16(<16 x bfloat> <bfloat -1.0, bfloat -1.0, bfloat -1.0, bfloat -1.0, bfloat -1.0, bfloat -1.0, bfloat -1.0, bfloat -1.0, bfloat -1.0, bfloat -1.0, bfloat -1.0, bfloat -1.0, bfloat -1.0, bfloat -1.0, bfloat -1.0, bfloat -1.0>, <16 x bfloat> %x, <16 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>)
+ ret <16 x bfloat> %0
+}
+
+; CHECK: clamps_rr
+; CHECK: v0 = vclamps.bf16 v0, v1
+define <16 x bfloat> @clamps_rr(<16 x bfloat> %x, <16 x bfloat> %max) {
+entry:
+ %min = fneg <16 x bfloat> %max
+ %0 = tail call <16 x bfloat> @llvm.tpu.clamp.v16bf16(<16 x bfloat> %min, <16 x bfloat> %x, <16 x bfloat> %max)
+ ret <16 x bfloat> %0
+}
+
+; CHECK: relu_r
+; CHECK: v0 = vclamp.gez.bf16 v0, v1
+define <16 x bfloat> @relu_r(<16 x bfloat> %x, <16 x bfloat> %max) {
+entry:
+ %0 = tail call <16 x bfloat> @llvm.tpu.clamp.v16bf16(<16 x bfloat> zeroinitializer, <16 x bfloat> %x, <16 x bfloat> %max)
+ ret <16 x bfloat> %0
+}
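Per the clampi CHECK lines above, @llvm.tpu.clamp.v16bf16(%min, %x, %max) appears to lower like max(min(%x, %max), %min). A minimal stand-alone sketch of that same computation, written with the generic min/max intrinsics this file already declares (illustrative only, not part of the CL):

declare <16 x bfloat> @llvm.minimum.v16bf16(<16 x bfloat>, <16 x bfloat>)
declare <16 x bfloat> @llvm.maximum.v16bf16(<16 x bfloat>, <16 x bfloat>)

define <16 x bfloat> @clamp_via_minmax(<16 x bfloat> %min, <16 x bfloat> %max, <16 x bfloat> %x) {
entry:
  ; Apply the upper bound first: min(%x, %max) ...
  %lo = call <16 x bfloat> @llvm.minimum.v16bf16(<16 x bfloat> %x, <16 x bfloat> %max)
  ; ... then the lower bound: max(., %min), matching the vmin/vmax pair in the clampi CHECKs.
  %r = call <16 x bfloat> @llvm.maximum.v16bf16(<16 x bfloat> %lo, <16 x bfloat> %min)
  ret <16 x bfloat> %r
}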
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_compare_bf16_gl_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_compare_bf16_gl_sc.ll
new file mode 100644
index 0000000..2dbb3bb
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_compare_bf16_gl_sc.ll
@@ -0,0 +1,218 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-gl -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+; Tests low precision vector compare instructions.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <32 x i1> @llvm.tpu.byte.not.zero.v32i1.v32i8(<32 x i8>) readnone
+
+; CHECK-LABEL: cmpeq_s16:
+; CHECK: vm{{[0-9]+}} = veq.s16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i1> @cmpeq_s16(<16 x i16> %x, <16 x i16> %y) {
+ %a = icmp eq <16 x i16> %x, %y
+ ret <16 x i1> %a
+}
+
+; CHECK-LABEL: cmpne_s16:
+; CHECK: vm{{[0-9]+}} = vne.s16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i1> @cmpne_s16(<16 x i16> %x, <16 x i16> %y) {
+ %a = icmp ne <16 x i16> %x, %y
+ ret <16 x i1> %a
+}
+
+; CHECK-LABEL: cmpgt_s16:
+; CHECK: vm{{[0-9]+}} = vgt.s16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i1> @cmpgt_s16(<16 x i16> %x, <16 x i16> %y) {
+ %a = icmp sgt <16 x i16> %x, %y
+ ret <16 x i1> %a
+}
+
+; CHECK-LABEL: cmpge_s16:
+; CHECK: vm{{[0-9]+}} = vge.s16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i1> @cmpge_s16(<16 x i16> %x, <16 x i16> %y) {
+ %a = icmp sge <16 x i16> %x, %y
+ ret <16 x i1> %a
+}
+
+; CHECK-LABEL: cmplt_s16:
+; CHECK: vm{{[0-9]+}} = vlt.s16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i1> @cmplt_s16(<16 x i16> %x, <16 x i16> %y) {
+ %a = icmp slt <16 x i16> %x, %y
+ ret <16 x i1> %a
+}
+
+; CHECK-LABEL: cmple_s16:
+; CHECK: vm{{[0-9]+}} = vle.s16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i1> @cmple_s16(<16 x i16> %x, <16 x i16> %y) {
+ %a = icmp sle <16 x i16> %x, %y
+ ret <16 x i1> %a
+}
+
+; CHECK-LABEL: cmpgt_u16:
+; CHECK: vm{{[0-9]+}} = vgt.u16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i1> @cmpgt_u16(<16 x i16> %x, <16 x i16> %y) {
+ %a = icmp ugt <16 x i16> %x, %y
+ ret <16 x i1> %a
+}
+
+; CHECK-LABEL: cmpge_u16:
+; CHECK: vm{{[0-9]+}} = vge.u16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i1> @cmpge_u16(<16 x i16> %x, <16 x i16> %y) {
+ %a = icmp uge <16 x i16> %x, %y
+ ret <16 x i1> %a
+}
+
+; CHECK-LABEL: cmplt_u16:
+; CHECK: vm{{[0-9]+}} = vlt.u16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i1> @cmplt_u16(<16 x i16> %x, <16 x i16> %y) {
+ %a = icmp ult <16 x i16> %x, %y
+ ret <16 x i1> %a
+}
+
+; CHECK-LABEL: cmple_u16:
+; CHECK: vm{{[0-9]+}} = vle.u16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i1> @cmple_u16(<16 x i16> %x, <16 x i16> %y) {
+ %a = icmp ule <16 x i16> %x, %y
+ ret <16 x i1> %a
+}
+
+; CHECK-LABEL: cmpoeq_bf16:
+; CHECK: vm{{[0-9]+}} = veq.bf16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i1> @cmpoeq_bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
+ %a = fcmp oeq <16 x bfloat> %x, %y
+ ret <16 x i1> %a
+}
+
+; CHECK-LABEL: cmpune_bf16:
+; CHECK: vm{{[0-9]+}} = vne.bf16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i1> @cmpune_bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
+ %a = fcmp une <16 x bfloat> %x, %y
+ ret <16 x i1> %a
+}
+
+; CHECK-LABEL: cmpogt_bf16:
+; CHECK: vm{{[0-9]+}} = vgt.bf16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i1> @cmpogt_bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
+ %a = fcmp ogt <16 x bfloat> %x, %y
+ ret <16 x i1> %a
+}
+
+; CHECK-LABEL: cmpoge_bf16:
+; CHECK: vm{{[0-9]+}} = vge.bf16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i1> @cmpoge_bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
+ %a = fcmp oge <16 x bfloat> %x, %y
+ ret <16 x i1> %a
+}
+
+; CHECK-LABEL: cmpolt_bf16:
+; CHECK: vm{{[0-9]+}} = vlt.bf16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i1> @cmpolt_bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
+ %a = fcmp olt <16 x bfloat> %x, %y
+ ret <16 x i1> %a
+}
+
+; CHECK-LABEL: cmpole_bf16:
+; CHECK: vm{{[0-9]+}} = vle.bf16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i1> @cmpole_bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
+ %a = fcmp ole <16 x bfloat> %x, %y
+ ret <16 x i1> %a
+}
+
+; CHECK-LABEL: bytenotzero:
+; CHECK: { vm{{[0-9]+}} = vnez.u8 v{{[0-9]+}};
+define <32 x i1> @bytenotzero(<32 x i8> %y) {
+ %a = call <32 x i1> @llvm.tpu.byte.not.zero.v32i1.v32i8(<32 x i8> %y)
+ ret <32 x i1> %a
+}
+
+; CHECK-LABEL: bytenotzero_opt_i8:
+; CHECK: { vm{{[0-9]+}} = vnez.u8 v{{[0-9]+}};
+define <32 x i1> @bytenotzero_opt_i8(<32 x i8> %y) {
+ %a = icmp ne <32 x i8> %y, <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
+ i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
+ i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
+ i8 0, i8 0>
+ ret <32 x i1> %a
+}
+
+; CHECK-LABEL: cmpueq_bf16
+; CHECK-DAG: vm{{[0-9]+}} = vlt.bf16 v0, v1
+; CHECK-DAG: vm{{[0-9]+}} = vgt.bf16 v0, v1
+; CHECK-DAG: vm{{[0-9]+}} = vmor vm1, vm0
+; CHECK: vm{{[0-9]+}} = vmneg vm0
+define <16 x i1> @cmpueq_bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
+ %a = fcmp ueq <16 x bfloat> %x, %y
+ ret <16 x i1> %a
+}
+
+; CHECK-LABEL: cmpone_bf16:
+; CHECK: { vm0 = vlt.bf16 v0, v1;
+; CHECK-NEXT: vm1 = vgt.bf16 v0, v1 }
+; CHECK-NEXT: { vm0 = vmor vm1, vm0;
+define <16 x i1> @cmpone_bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
+ %a = fcmp one <16 x bfloat> %x, %y
+ ret <16 x i1> %a
+}
+
+; CHECK-LABEL: not_16i1:
+; CHECK: { vm{{[0-9]+}} = vmneg vm{{[0-9]+}}
+define <16 x i1> @not_16i1(<16 x i1> %x, <16 x i1> %y) {
+ %m = icmp eq <16 x i1> %x, <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0,
+ i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>
+ ret <16 x i1> %m
+}
+
+; CHECK-LABEL: and_16i1:
+; CHECK: { vm{{[0-9]+}} = vmand vm{{[0-9]+}}, vm{{[0-9]+}}
+define <16 x i1> @and_16i1(<16 x i1> %x, <16 x i1> %y) {
+ %m = and <16 x i1> %x, %y
+ ret <16 x i1> %m
+}
+
+; CHECK-LABEL: or_16i1:
+; CHECK: { vm{{[0-9]+}} = vmor vm{{[0-9]+}}, vm{{[0-9]+}}
+define <16 x i1> @or_16i1(<16 x i1> %x, <16 x i1> %y) {
+ %m = or <16 x i1> %x, %y
+ ret <16 x i1> %m
+}
+
+; CHECK-LABEL: xor_16i1:
+; CHECK: { vm{{[0-9]+}} = vmxor vm{{[0-9]+}}, vm{{[0-9]+}}
+define <16 x i1> @xor_16i1(<16 x i1> %x, <16 x i1> %y) {
+ %m = xor <16 x i1> %x, %y
+ ret <16 x i1> %m
+}
+
+; CHECK-LABEL: not_32i1:
+; CHECK: { vm{{[0-9]+}} = vmneg vm{{[0-9]+}}
+define <32 x i1> @not_32i1(<32 x i1> %x, <32 x i1> %y) {
+ %m = icmp eq <32 x i1> %x, <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0,
+ i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0,
+ i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0,
+ i1 0, i1 0>
+ ret <32 x i1> %m
+}
+
+; CHECK-LABEL: and_32i1:
+; CHECK: { vm{{[0-9]+}} = vmand vm{{[0-9]+}}, vm{{[0-9]+}}
+define <32 x i1> @and_32i1(<32 x i1> %x, <32 x i1> %y) {
+ %m = and <32 x i1> %x, %y
+ ret <32 x i1> %m
+}
+
+; CHECK-LABEL: or_32i1:
+; CHECK: { vm{{[0-9]+}} = vmor vm{{[0-9]+}}, vm{{[0-9]+}}
+define <32 x i1> @or_32i1(<32 x i1> %x, <32 x i1> %y) {
+ %m = or <32 x i1> %x, %y
+ ret <32 x i1> %m
+}
+
+; CHECK-LABEL: xor_32i1:
+; CHECK: { vm{{[0-9]+}} = vmxor vm{{[0-9]+}}, vm{{[0-9]+}}
+define <32 x i1> @xor_32i1(<32 x i1> %x, <32 x i1> %y) {
+ %m = xor <32 x i1> %x, %y
+ ret <32 x i1> %m
+}
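The cmpueq_bf16/cmpone_bf16 expectations above encode the usual predicate algebra: `one` (ordered and not equal) is `olt | ogt`, and `ueq` is its negation, which is where the extra vmneg comes from. The same algebra written out directly in IR, as a reference sketch rather than part of the CL:

define <16 x i1> @ueq_expanded(<16 x bfloat> %x, <16 x bfloat> %y) {
entry:
  ; one(x, y) == olt(x, y) | ogt(x, y)   -> vlt, vgt, vmor
  %lt = fcmp olt <16 x bfloat> %x, %y
  %gt = fcmp ogt <16 x bfloat> %x, %y
  %one = or <16 x i1> %lt, %gt
  ; ueq(x, y) == !one(x, y)              -> the trailing vmneg in the ueq case
  %ueq = xor <16 x i1> %one, <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
  ret <16 x i1> %ueq
}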
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_compare_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_compare_sc.ll
new file mode 100644
index 0000000..36a3dc7
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_compare_sc.ll
@@ -0,0 +1,240 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -tpu-enable-vcmasks=false | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; CHECK-LABEL: cmpeq:
+; CHECK: vm{{[0-9]+}} = veq.s32 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i1> @cmpeq(<8 x i32> %x, <8 x i32> %y) {
+ %a = icmp eq <8 x i32> %x, %y
+ ret <8 x i1> %a
+}
+
+; CHECK-LABEL: cmpne:
+; CHECK: vm{{[0-9]+}} = vne.s32 v{{[0-9]+}}, $0x2a
+define <8 x i1> @cmpne(<8 x i32> %x, <8 x i32> %y) {
+ %a = icmp ne <8 x i32> %x, <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
+ ret <8 x i1> %a
+}
+
+; CHECK-LABEL: cmpgt:
+; CHECK: vm{{[0-9]+}} = vgt.s32 v{{[0-9]+}}, $0x2a
+define <8 x i1> @cmpgt(<8 x i32> %x, <8 x i32> %y) {
+ %a = icmp sgt <8 x i32> %x, <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
+ ret <8 x i1> %a
+}
+
+; CHECK-LABEL: cmpugt:
+; CHECK: vm0 = vgt.u32 v0, v1
+define <8 x i1> @cmpugt(<8 x i32> %x, <8 x i32> %y) {
+ %a = icmp ugt <8 x i32> %x, %y
+ ret <8 x i1> %a
+}
+
+; CHECK-LABEL: cmpge:
+; CHECK: vm{{[0-9]+}} = vge.s32 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i1> @cmpge(<8 x i32> %x, <8 x i32> %y) {
+ %a = icmp sge <8 x i32> %x, %y
+ ret <8 x i1> %a
+}
+
+; CHECK-LABEL: cmpuge:
+; CHECK: vm{{[0-9]+}} = vge.u32 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i1> @cmpuge(<8 x i32> %x, <8 x i32> %y) {
+ %a = icmp uge <8 x i32> %x, %y
+ ret <8 x i1> %a
+}
+
+; CHECK-LABEL: cmplt:
+; CHECK: vm{{[0-9]+}} = vlt.s32 v{{[0-9]+}}, $0x2a
+define <8 x i1> @cmplt(<8 x i32> %x, <8 x i32> %y) {
+ %a = icmp slt <8 x i32> %x, <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
+ ret <8 x i1> %a
+}
+
+; CHECK-LABEL: cmpult:
+; CHECK: vm{{[0-9]+}} = vlt.u32 v{{[0-9]+}}, $0x2a
+define <8 x i1> @cmpult(<8 x i32> %x) {
+ %a = icmp ult <8 x i32> %x, <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
+ ret <8 x i1> %a
+}
+
+; CHECK-LABEL: cmple:
+; CHECK: vm{{[0-9]+}} = vle.s32 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i1> @cmple(<8 x i32> %x, <8 x i32> %y) {
+ %a = icmp sle <8 x i32> %x, %y
+ ret <8 x i1> %a
+}
+
+; CHECK-LABEL: cmpule:
+; CHECK: vm{{[0-9]+}} = vle.u32 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i1> @cmpule(<8 x i32> %x, <8 x i32> %y) {
+ %a = icmp ule <8 x i32> %x, %y
+ ret <8 x i1> %a
+}
+
+; CHECK-LABEL: fcmple:
+; CHECK: vm{{[0-9]+}} = vle.f32 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i1> @fcmple(<8 x float> %x, <8 x float> %y) {
+ %a = fcmp ole <8 x float> %x, %y
+ ret <8 x i1> %a
+}
+
+; CHECK-LABEL: fcmplt:
+; CHECK: vm{{[0-9]+}} = vlt.f32 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i1> @fcmplt(<8 x float> %x, <8 x float> %y) {
+ %a = fcmp olt <8 x float> %x, %y
+ ret <8 x i1> %a
+}
+
+; CHECK-LABEL: fcmpge:
+; CHECK: vm{{[0-9]+}} = vge.f32 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i1> @fcmpge(<8 x float> %x, <8 x float> %y) {
+ %a = fcmp oge <8 x float> %x, %y
+ ret <8 x i1> %a
+}
+
+; CHECK-LABEL: fcmpgt:
+; CHECK: vm{{[0-9]+}} = vgt.f32 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i1> @fcmpgt(<8 x float> %x, <8 x float> %y) {
+ %a = fcmp ogt <8 x float> %x, %y
+ ret <8 x i1> %a
+}
+
+; CHECK-LABEL: fcmpeq:
+; CHECK: vm{{[0-9]+}} = veq.f32 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i1> @fcmpeq(<8 x float> %x, <8 x float> %y) {
+ %a = fcmp oeq <8 x float> %x, %y
+ ret <8 x i1> %a
+}
+
+; Verify that we check the NaN cases for v0 and v1.
+; Note that LLVM's SelectionDAG combines the compare such that there are no
+; vne instructions left to check, as this test did in earlier versions.
+; CHECK-LABEL: fcmpeq_unordered:
+; CHECK-DAG: vm{{[0-9]+}} = vlt.f32 v0, v1
+; CHECK-DAG: vm{{[0-9]+}} = vgt.f32 v0, v1
+; CHECK-DAG: vm{{[0-9]+}} = vmor vm1, vm0
+; CHECK: vm{{[0-9]+}} = vmneg vm0
+define <8 x i1> @fcmpeq_unordered(<8 x float> %x, <8 x float> %y) {
+ %a = fcmp ueq <8 x float> %x, %y
+ ret <8 x i1> %a
+}
+
+; Verify that we check the NaN cases for v0. This test also checks for an efficient
+; setcc, without the unnecessary ordering checks that LLVM would insert if we didn't
+; manually combine setcc with splat inputs.
+; CHECK-LABEL: fcmpne:
+; CHECK: { vm0 = vlt.f32 v0, $42.0;
+; CHECK-NEXT: vm1 = vgt.f32 v0, $42.0 }
+; CHECK-NEXT: { vm0 = vmor vm1, vm0;
+define <8 x i1> @fcmpne(<8 x float> %x, <8 x float> %y) {
+ %a = fcmp one <8 x float> %x, <float 42.0, float 42.0, float 42.0, float 42.0, float 42.0, float 42.0, float 42.0, float 42.0>
+ ret <8 x i1> %a
+}
+
+; CHECK-LABEL: fcmpne_unordered:
+; CHECK: vm{{[0-9]+}} = vne.f32 v{{[0-9]+}}, $42.0;
+define <8 x i1> @fcmpne_unordered(<8 x float> %x, <8 x float> %y) {
+ %a = fcmp une <8 x float> %x, <float 42.0, float 42.0, float 42.0, float 42.0, float 42.0, float 42.0, float 42.0, float 42.0>
+ ret <8 x i1> %a
+}
+
+declare <8 x i1> @llvm.tpu.weird.v8f32(<8 x float>) readnone
+
+; CHECK-LABEL: weird:
+; CHECK: vm{{[0-9]+}} = vweird.f32 v{{[0-9]+}}
+define <8 x i1> @weird(<8 x float> %x, <8 x float> %y) {
+ %a = call <8 x i1> @llvm.tpu.weird.v8f32(<8 x float> %x)
+ ret <8 x i1> %a
+}
+
+; CHECK-LABEL: vmimm0
+; CHECK: vm0 = vmxor vm0
+define <8 x i1> @vmimm0() {
+ ret <8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>;
+}
+
+; CHECK-LABEL: vmimm1
+; CHECK: vm0 = vmmov $0xff
+define <8 x i1> @vmimm1() {
+ ret <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>;
+}
+
+; CHECK-LABEL: vmselect
+; CHECK-NOT: vm0 =
+define <8 x i1> @vmselect(<8 x i1> %c) {
+ %v0 = insertelement <8 x i1> undef, i1 -1, i32 0
+ %v1 = shufflevector <8 x i1> %v0, <8 x i1> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %v2 = insertelement <8 x i1> undef, i1 0, i32 0
+ %v3 = shufflevector <8 x i1> %v2, <8 x i1> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %r = select <8 x i1> %c, <8 x i1> %v1, <8 x i1> %v3
+ ret <8 x i1> %r
+}
+
+; CHECK-LABEL: vmselectneg
+; CHECK: vm0 = vmneg vm0
+define <8 x i1> @vmselectneg(<8 x i1> %c) {
+ %r = select <8 x i1> %c, <8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i1> <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+ ret <8 x i1> %r
+}
+
+; CHECK-LABEL: vmimm2
+; CHECK: v0 = vlaneseq.u32
+; CHECK: vm0 = veq.s32 v0, $0x1
+; CHECK: vm0 = vmneg vm0
+define <8 x i1> @vmimm2(<8 x i1> %c) {
+ ret <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>
+}
+
+; CHECK-LABEL: vmimm3
+; CHECK: vm0 = vmand
+; CHECK: vm[[x:[0-9]+]] = vmand
+; CHECK: vm[[y:[0-9]+]] = vmor vm[[y]], vm[[x]]
+define <8 x i1> @vmimm3(<8 x i1> %c) {
+ ret <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>
+}
+
+; CHECK-LABEL: sextvm:
+; CHECK: [[vmr:vm[0-9]+]] = vne.s32
+; CHECK: [[vr:v[0-9]+]] = vimm.s32 $0x0
+; CHECK: = vsel [[vmr]], $-0x1, [[vr]]
+define <8 x i32> @sextvm(<8 x i32> %a, <8 x i32> %b) {
+ %1 = icmp ne <8 x i32> %a, %b
+ %2 = sext <8 x i1> %1 to <8 x i32>
+ ret <8 x i32> %2
+}
+
+; CHECK-LABEL: zextvm:
+; CHECK: [[vmr:vm[0-9]+]] = vne.s32
+; CHECK: [[vr:v[0-9]+]] = vimm.s32 $0x0
+; CHECK: = vsel [[vmr]], $0x1, [[vr]]
+define <8 x i32> @zextvm(<8 x i32> %a, <8 x i32> %b) {
+ %1 = icmp ne <8 x i32> %a, %b
+ %2 = zext <8 x i1> %1 to <8 x i32>
+ ret <8 x i32> %2
+}
+
+; Tests that we can lower insertelement with immediate index.
+
+; CHECK-LABEL: vmlane_i
+; CHECK: v[[v:[0-9]+]] = vlaneseq.u32
+; CHECK: vm[[vm:[0-9]+]] = veq.s32 v[[v]], $0x4
+; CHECK: v{{[0-9]+}} = vsel vm[[vm]], s{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i32> @vmlane_i(<8 x i32> %v, i32 %a) {
+ %vr = insertelement <8 x i32> %v, i32 %a, i32 4
+ ret <8 x i32> %vr
+}
+
+; Tests that we can lower insertelement with non-constant index.
+
+; CHECK-LABEL: vmlane_r
+; CHECK: v[[v:[0-9]+]] = vlaneseq.u32
+; CHECK: vm[[vm:[0-9]+]] = veq.s32 v[[v]], s{{[0-9]+}}
+; CHECK: v{{[0-9]+}} = vsel vm[[vm]], s{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i32> @vmlane_r(<8 x i32> %v, i32 %a, i32 %i) {
+ %vr = insertelement <8 x i32> %v, i32 %a, i32 %i
+ ret <8 x i32> %vr
+}
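The vmlane_i expectations above lower a constant-index insertelement to a lane-index compare plus vsel. The equivalent select form of that lane-4 insert, spelled out directly in IR (an illustrative sketch, not part of the CL):

define <8 x i32> @insert_lane4_as_select(<8 x i32> %v, i32 %a) {
entry:
  ; Splat the scalar, then take it only in lane 4 -- the same result the
  ; vlaneseq/veq/vsel sequence in the CHECK lines computes.
  %s = insertelement <8 x i32> undef, i32 %a, i32 0
  %splat = shufflevector <8 x i32> %s, <8 x i32> undef, <8 x i32> zeroinitializer
  %r = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i32> %splat, <8 x i32> %v
  ret <8 x i32> %r
}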
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_compare_tc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_compare_tc.ll
new file mode 100644
index 0000000..acb36c5
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_compare_tc.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -mcpu=tensorcore-vf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; Tests Tensorcore VF unsigned compare operations.
+
+; CHECK-LABEL: cmpugt:
+; CHECK-DAG: vm0 = vgt.s32 v0, v1
+; CHECK-DAG: v[[y:[0-9]+]] = vxor.u32 v1, v0
+; CHECK-NEXT: vm[[z:[0-9]+]] = vlt.s32 v[[y]], $0x0
+; CHECK-NEXT: vm[[w:[0-9]+]] = vmxor vm[[w]], vm[[z]]
+define <1024 x i1> @cmpugt(<1024 x i32> %x, <1024 x i32> %y) {
+ %a = icmp ugt <1024 x i32> %x, %y
+ ret <1024 x i1> %a
+}
+
+; CHECK-LABEL: cmpuge:
+; CHECK: { v[[x:[0-9]+]] = vxor.u32 v1, v0 }
+; CHECK: { vm[[z:[0-9]+]] = vge.s32 v0, v1;
+; CHECK: vm[[w:[0-9]+]] = vlt.s32 v[[x]], $0x0 }
+; CHECK: { vm0 = vmxor vm[[z]], vm[[w]];
+define <1024 x i1> @cmpuge(<1024 x i32> %x, <1024 x i32> %y) {
+ %a = icmp uge <1024 x i32> %x, %y
+ ret <1024 x i1> %a
+}
+
+; CHECK-LABEL: cmpult:
+; CHECK: { v[[x:[0-9]+]] = vshrl.u32 v0, $0x1 }
+; CHECK: { vm0 = vlt.s32 v[[x]], $0x15;
+define <1024 x i1> @cmpult(<1024 x i32> %x) {
+ %v0 = insertelement <1024 x i32> undef, i32 42, i32 0
+ %y = shufflevector <1024 x i32> %v0, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %a = icmp ult <1024 x i32> %x, %y
+ ret <1024 x i1> %a
+}
+
+; CHECK-LABEL: cmpule:
+; CHECK: { v[[x:[0-9]+]] = vxor.u32 v1, v0 }
+; CHECK: { vm[[z:[0-9]+]] = vle.s32 v0, v1;
+; CHECK: vm[[w:[0-9]+]] = vlt.s32 v[[x]], $0x0 }
+; CHECK: { vm0 = vmxor vm[[z]], vm[[w]];
+define <1024 x i1> @cmpule(<1024 x i32> %x, <1024 x i32> %y) {
+ %a = icmp ule <1024 x i32> %x, %y
+ ret <1024 x i1> %a
+}
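The cmpugt/cmpuge/cmpule expectations above reflect the standard expansion for a target with only signed vector compares: the unsigned result equals the signed result XORed with "the sign bits differ", and x^y is negative exactly when the signs differ. The same identity written directly in IR (a reference sketch, not part of the CL):

define <1024 x i1> @ugt_expanded(<1024 x i32> %x, <1024 x i32> %y) {
entry:
  ; x >u y  ==  (x >s y) xor (sign(x) != sign(y))
  %sgt = icmp sgt <1024 x i32> %x, %y
  %xy = xor <1024 x i32> %x, %y
  ; (x ^ y) <s 0 is true exactly when the sign bits of x and y differ.
  %signdiff = icmp slt <1024 x i32> %xy, zeroinitializer
  %ugt = xor <1024 x i1> %sgt, %signdiff
  ret <1024 x i1> %ugt
}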
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_gf_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_gf_sc.ll
new file mode 100644
index 0000000..42a8dd2
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_gf_sc.ll
@@ -0,0 +1,688 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-gf -asm-verbose=false -disable-cgp \
+; RUN: -instcombine-max-iterations=0 | FileCheck %s
+; REQUIRES: tpu
+
+; Tests that basic 32-bit vector integer and floating-point operations assemble as expected.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; CHECK-LABEL: vaddi:
+; CHECK: v{{[0-9]+}} = vadd.s32 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i32> @vaddi(<16 x i32> %x, <16 x i32> %y) {
+ %a = add <16 x i32> %x, %y
+ ret <16 x i32> %a
+}
+
+; CHECK-LABEL: vaddf:
+; CHECK: v{{[0-9]+}} = vadd.f32 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x float> @vaddf(<16 x float> %x, <16 x float> %y) {
+ %a = fadd <16 x float> %x, %y
+ ret <16 x float> %a
+}
+
+; CHECK-LABEL: vaddi_splat:
+; CHECK: v{{[0-9]+}} = vadd.s32 $0x1, v{{[0-9]+}}
+define <16 x i32> @vaddi_splat(<16 x i32> %x, <16 x i32> %y) {
+ %a = add <16 x i32> %x, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1,
+ i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <16 x i32> %a
+}
+
+; CHECK-LABEL: vaddf_splat:
+; CHECK: v{{[0-9]+}} = vadd.f32 $1.0, v{{[0-9]+}}
+define <16 x float> @vaddf_splat(<16 x float> %x, <16 x float> %y) {
+ %a = fadd <16 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0,
+ float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
+ ret <16 x float> %a
+}
+
+; CHECK-LABEL: vaddf_splat_sreg:
+; CHECK-NOT: v2f
+; CHECK: v{{[0-9]+}} = vadd.f32 s{{[0-9]+}}, v{{[0-9]+}}
+define <16 x float> @vaddf_splat_sreg(<16 x float> %x, float %y) {
+ %v0 = insertelement <16 x float> undef, float %y, i32 0
+ %v1 = shufflevector <16 x float> %v0, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
+ i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %a = fadd <16 x float> %x, %v1
+ ret <16 x float> %a
+}
+
+; CHECK-LABEL: vaddi_splat_sreg:
+; CHECK: v{{[0-9]+}} = vadd.s32 s{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i32> @vaddi_splat_sreg(<16 x i32> %x, i32 %y) {
+ %v0 = insertelement <16 x i32> undef, i32 %y, i32 0
+ %v1 = shufflevector <16 x i32> %v0, <16 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
+ i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %a = add <16 x i32> %x, %v1
+ ret <16 x i32> %a
+}
+
+; CHECK-LABEL: vsubi_splat_sreg:
+; CHECK: v{{[0-9]+}} = vsub.s32 s{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i32> @vsubi_splat_sreg(<16 x i32> %x, i32 %y) {
+ %v0 = insertelement <16 x i32> undef, i32 %y, i32 0
+ %v1 = shufflevector <16 x i32> %v0, <16 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
+ i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %a = sub <16 x i32> %v1, %x
+ ret <16 x i32> %a
+}
+
+; CHECK-LABEL: vsubf_splat_sreg:
+; CHECK: v{{[0-9]+}} = vsub.f32 s{{[0-9]+}}, v{{[0-9]+}}
+define <16 x float> @vsubf_splat_sreg(<16 x float> %x, float %y) {
+ %v0 = insertelement <16 x float> undef, float %y, i32 0
+ %v1 = shufflevector <16 x float> %v0, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
+ i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %a = fsub <16 x float> %v1, %x
+ ret <16 x float> %a
+}
+
+; CHECK-LABEL: vmovf_splat_sreg:
+; CHECK: v{{[0-9]+}} = vmov s{{[0-9]+}}
+define <16 x float> @vmovf_splat_sreg(float %y) {
+ %v0 = insertelement <16 x float> undef, float %y, i32 0
+ %v1 = shufflevector <16 x float> %v0, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
+ i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x float> %v1
+}
+
+; CHECK-LABEL: vmovf_scalar_to_vector_sreg:
+; CHECK: s[[sreg:[0-9]+]] = sld
+; CHECK: v{{[0-9]+}} = vmov s[[sreg]]
+; CHECK: v{{[0-9]+}} = vbroadcast v{{[0-9]+}}
+define <16 x i32> @vmovf_scalar_to_vector_sreg(i32* %a) {
+ %x = load i32, i32* %a, align 4
+ %v0 = insertelement <2 x i32> undef, i32 %x, i32 0
+ %v1 = insertelement <2 x i32> %v0, i32 %x, i32 1
+ %y = shufflevector <2 x i32> %v1, <2 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
+ i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i32> %y
+}
+
+; CHECK-LABEL: vmovi_splat_sreg:
+; CHECK: v{{[0-9]+}} = vmov s{{[0-9]+}}
+define <16 x i32> @vmovi_splat_sreg(i32 %y) {
+ %v0 = insertelement <16 x i32> undef, i32 %y, i32 0
+ %v1 = shufflevector <16 x i32> %v0, <16 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
+ i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i32> %v1
+}
+
+; CHECK-LABEL: vseli:
+; CHECK: v{{[0-9]+}} = vsel vm{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i32> @vseli(<16 x i1> %m, <16 x i32> %x, <16 x i32> %y) {
+ %a = select <16 x i1> %m, <16 x i32> %x, <16 x i32> %y
+ ret <16 x i32> %a
+}
+
+; CHECK-LABEL: vseli_splat_sreg:
+; CHECK: v{{[0-9]+}} = vsel vm{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i32> @vseli_splat_sreg(<16 x i1> %m, i32 %y, <16 x i32> %x) {
+ %y1 = insertelement <16 x i32> undef, i32 %y, i32 0
+ %y2 = shufflevector <16 x i32> %y1, <16 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
+ i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %a = select <16 x i1> %m, <16 x i32> %y2, <16 x i32> %x
+ ret <16 x i32> %a
+}
+
+; CHECK-LABEL: vseli_splat_imm:
+; CHECK: v{{[0-9]+}} = vsel vm{{[0-9]+}}, $0x5, v{{[0-9]+}}
+define <16 x i32> @vseli_splat_imm(<16 x i1> %m, <16 x i32> %x) {
+ %a = select <16 x i1> %m, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5,
+ i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>,
+ <16 x i32> %x
+ ret <16 x i32> %a
+}
+
+; CHECK-LABEL: vself:
+; CHECK: v{{[0-9]+}} = vsel vm{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x float> @vself(<16 x i1> %m, <16 x float> %x, <16 x float> %y) {
+ %a = select <16 x i1> %m, <16 x float> %x, <16 x float> %y
+ ret <16 x float> %a
+}
+
+; CHECK-LABEL: vself_splat_sreg:
+; CHECK: v{{[0-9]+}} = vsel vm{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+define <16 x float> @vself_splat_sreg(<16 x i1> %m, float %y, <16 x float> %x) {
+ %y1 = insertelement <16 x float> undef, float %y, i32 0
+ %y2 = shufflevector <16 x float> %y1, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
+ i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %a = select <16 x i1> %m, <16 x float> %y2, <16 x float> %x
+ ret <16 x float> %a
+}
+
+; CHECK-LABEL: vself_splat_imm:
+; CHECK: v{{[0-9]+}} = vsel vm{{[0-9]+}}, $0x40a00000, v{{[0-9]+}}
+define <16 x float> @vself_splat_imm(<16 x i1> %m, <16 x float> %x) {
+ %a = select <16 x i1> %m, <16 x float> <float 5.0, float 5.0, float 5.0, float 5.0, float 5.0, float 5.0, float 5.0, float 5.0,
+ float 5.0, float 5.0, float 5.0, float 5.0, float 5.0, float 5.0, float 5.0, float 5.0>,
+ <16 x float> %x
+ ret <16 x float> %a
+}
+
+; CHECK: v{{[0-9]+}} = vnsel vm{{[0-9]+}}, $0x2a, v{{[0-9]+}}
+define <16 x i32> @vnselri_i32(<16 x i1> %mask, <16 x i32> %x) {
+ %y0 = insertelement <16 x i32> undef, i32 42, i32 0
+ %y = shufflevector <16 x i32> %y0, <16 x i32> undef, <16 x i32> zeroinitializer
+
+ %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y
+ ret <16 x i32> %r
+}
+
+; CHECK: v{{[0-9]+}} = vnsel vm{{[0-9]+}}, $0x42280000, v{{[0-9]+}}
+define <16 x float> @vnselri_float(<16 x i1> %mask, <16 x float> %x) {
+ %y0 = insertelement <16 x float> undef, float 42.0, i32 0
+ %y = shufflevector <16 x float> %y0, <16 x float> undef, <16 x i32> zeroinitializer
+
+ %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
+ ret <16 x float> %r
+}
+
+; CHECK: v{{[0-9]+}} = vnsel vm{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i32> @vnselrs_i32(<16 x i1> %mask, <16 x i32> %x, i32 %s) {
+ %y0 = insertelement <16 x i32> undef, i32 %s, i32 0
+ %y = shufflevector <16 x i32> %y0, <16 x i32> undef, <16 x i32> zeroinitializer
+
+ %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y
+ ret <16 x i32> %r
+}
+
+; CHECK: v{{[0-9]+}} = vnsel vm{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+define <16 x float> @vnselrs_float(<16 x i1> %mask, <16 x float> %x, float %s) {
+ %y0 = insertelement <16 x float> undef, float %s, i32 0
+ %y = shufflevector <16 x float> %y0, <16 x float> undef, <16 x i32> zeroinitializer
+
+ %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
+ ret <16 x float> %r
+}
+
+; CHECK-LABEL: vsitofp:
+; CHECK: v{{[0-9]+}} = vcvt.s32.f32 v{{[0-9]+}}
+define <16 x float> @vsitofp(<16 x i32> %x) {
+ %a = sitofp <16 x i32> %x to <16 x float>
+ ret <16 x float> %a
+}
+
+; CHECK-LABEL: vfptosi:
+; CHECK: v{{[0-9]+}} = vcvt.f32.s32 v{{[0-9]+}}
+define <16 x i32> @vfptosi(<16 x float> %x) {
+ %a = fptosi <16 x float> %x to <16 x i32>
+ ret <16 x i32> %a
+}
+
+; CHECK-LABEL: vroti:
+; CHECK: v{{[0-9]+}} = vrot.slane.down v{{[0-9]+}}
+define <16 x float> @vroti(<16 x float> %x) {
+ %a = shufflevector <16 x float> %x, <16 x float> undef, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8,
+ i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0>
+ ret <16 x float> %a
+}
+
+declare <16 x float> @llvm.minimum.f32(<16 x float> %x, <16 x float> %y) readnone
+declare <16 x float> @llvm.maximum.f32(<16 x float> %x, <16 x float> %y) readnone
+declare <16 x i32> @llvm.smax.v16i32(<16 x i32> %x, <16 x i32> %y) readnone
+declare <16 x i32> @llvm.smin.v16i32(<16 x i32> %x, <16 x i32> %y) readnone
+declare <16 x i32> @llvm.umax.v16i32(<16 x i32> %x, <16 x i32> %y) readnone
+declare <16 x i32> @llvm.umin.v16i32(<16 x i32> %x, <16 x i32> %y) readnone
+
+; CHECK-LABEL: vrelu:
+; CHECK: v{{[0-9]+}} = vclamp.gez.f32 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x float> @vrelu(<16 x float> %x, <16 x float> %y) {
+ %a = call <16 x float> @llvm.minimum.f32(<16 x float> %x, <16 x float> %y) readnone
+ %b = call <16 x float> @llvm.maximum.f32(<16 x float> %a, <16 x float> <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0,
+ float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>) readnone
+ ret <16 x float> %b
+}
+
+; CHECK-LABEL: vrelu_s:
+; CHECK: v{{[0-9]+}} = vclamp.gez.f32 v{{[0-9]+}}, s{{[0-9]+}}
+define <16 x float> @vrelu_s(<16 x float> %x, float %y) {
+ %y1 = insertelement <16 x float> undef, float %y, i32 0
+ %y2 = shufflevector <16 x float> %y1, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
+ i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %a = call <16 x float> @llvm.minimum.f32(<16 x float> %x, <16 x float> %y2) readnone
+ %b = call <16 x float> @llvm.maximum.f32(<16 x float> %a, <16 x float> <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0,
+ float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>) readnone
+ ret <16 x float> %b
+}
+
+; CHECK-LABEL: vrelu_i:
+; CHECK: v{{[0-9]+}} = vclamp.gez.f32 v{{[0-9]+}}, $4
+define <16 x float> @vrelu_i(<16 x float> %x) {
+ %y1 = insertelement <16 x float> undef, float 4.0, i32 0
+ %y2 = shufflevector <16 x float> %y1, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
+ i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %a = call <16 x float> @llvm.minimum.f32(<16 x float> %x, <16 x float> %y2) readnone
+ %b = call <16 x float> @llvm.maximum.f32(<16 x float> %a, <16 x float> <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0,
+ float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>) readnone
+ ret <16 x float> %b
+}
+
+declare <16 x i32> @llvm.ctpop.i32(<16 x i32> %x) readnone
+; CHECK-LABEL: vpopcnt:
+; CHECK: vpcnt
+define <16 x i32> @vpopcnt(<16 x i32> %x) {
+ %a = call <16 x i32> @llvm.ctpop.i32(<16 x i32> %x) readnone
+ ret <16 x i32> %a
+}
+
+declare <16 x i32> @llvm.ctlz.i32(<16 x i32> %x) readnone
+; CHECK-LABEL: vclz:
+; CHECK: vclz
+define <16 x i32> @vclz(<16 x i32> %x) {
+ %a = call <16 x i32> @llvm.ctlz.i32(<16 x i32> %x) readnone
+ ret <16 x i32> %a
+}
+
+; CHECK-LABEL: shuffle_splat:
+; CHECK: v0 = vbroadcast v0, $0x1
+define <16 x i32> @shuffle_splat(<16 x i32> %x) {
+ %a = shufflevector <16 x i32> %x, <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1,
+ i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <16 x i32> %a
+}
+
+; CHECK-LABEL: build_vector:
+; CHECK: { v[[v0:[0-9]+]] = vmov s0;
+; CHECK: vm[[vm0:[0-9]+]] = vcmask $0x300 }
+; CHECK: { vm[[vm1:[0-9]+]] = vcmask $0x704;
+; CHECK: v[[v1:[0-9]+]] = vnsel vm[[vm0]], $0x35, v[[v0]] }
+; CHECK: { vm[[vm2:[0-9]+]] = vcmask $0xb08;
+; CHECK: v[[v2:[0-9]+]] = vsel vm[[vm1]], s1, v[[v1]] }
+; CHECK: { vm[[vm3:[0-9]+]] = vcmask $0xf0c;
+; CHECK: v[[v3:[0-9]+]] = vsel vm[[vm2]], s2, v[[v2]] }
+; CHECK: { vm[[vm4:[0-9]+]] = vcmask $0x1310;
+; CHECK: v[[v4:[0-9]+]] = vsel vm[[vm3]], s3, v[[v3]] }
+; CHECK: { vm[[vm5:[0-9]+]] = vcmask $0x1714;
+; CHECK: v[[v5:[0-9]+]] = vsel vm[[vm4]], $0x43, v[[v4]] }
+; CHECK: { vm[[vm6:[0-9]+]] = vcmask $0x1b18;
+; CHECK: v[[v6:[0-9]+]] = vsel vm[[vm5]], $0x2f, v[[v5]] }
+; CHECK: { vm[[vm7:[0-9]+]] = vcmask $0x1f1c;
+; CHECK: v[[v7:[0-9]+]] = vsel vm[[vm6]], $0x25, v[[v6]] }
+; CHECK: { vm[[vm8:[0-9]+]] = vcmask $0x2320;
+; CHECK: v[[v8:[0-9]+]] = vsel vm[[vm7]], $0x17, v[[v7]] }
+; CHECK: { vm[[vm9:[0-9]+]] = vcmask $0x2724;
+; CHECK: v[[v9:[0-9]+]] = vsel vm[[vm8]], $0x15, v[[v8]] }
+; CHECK: { vm[[vm10:[0-9]+]] = vcmask $0x2b28;
+; CHECK: v[[v10:[0-9]+]] = vsel vm[[vm9]], $0x42, v[[v9]] }
+; CHECK: { vm[[vm11:[0-9]+]] = vcmask $0x2f2c;
+; CHECK: v[[v11:[0-9]+]] = vsel vm[[vm10]], $0x59, v[[v10]] }
+; CHECK: { vm[[vm12:[0-9]+]] = vcmask $0x3330;
+; CHECK: v[[v12:[0-9]+]] = vsel vm[[vm11]], $0x3, v[[v11]] }
+; CHECK: { vm[[vm13:[0-9]+]] = vcmask $0x3734;
+; CHECK: v[[v13:[0-9]+]] = vsel vm[[vm12]], $0x65, v[[v12]] }
+; CHECK: { vm[[vm14:[0-9]+]] = vcmask $0x3b38;
+; CHECK: v[[v14:[0-9]+]] = vsel vm[[vm13]], $0x64, v[[v13]] }
+; CHECK: { v{{[0-9]+}} = vsel vm[[vm14]], $0x63, v0;
+define <16 x i32> @build_vector(i32 %a, i32 %b, i32 %c, i32 %d) {
+ %v1 = insertelement <16 x i32> undef, i32 %a, i32 0
+ %v2 = insertelement <16 x i32> %v1, i32 %b, i32 1
+ %v3 = insertelement <16 x i32> %v2, i32 %c, i32 2
+ %v4 = insertelement <16 x i32> %v3, i32 %d, i32 3
+ %v5 = insertelement <16 x i32> %v4, i32 67, i32 4
+ %v6 = insertelement <16 x i32> %v5, i32 47, i32 5
+ %v7 = insertelement <16 x i32> %v6, i32 37, i32 6
+ %v8 = insertelement <16 x i32> %v7, i32 23, i32 7
+ %v9 = insertelement <16 x i32> %v8, i32 21, i32 8
+ %v10 = insertelement <16 x i32> %v9, i32 66, i32 9
+ %v11 = insertelement <16 x i32> %v10, i32 89, i32 10
+ %v12 = insertelement <16 x i32> %v11, i32 3, i32 11
+ %v13 = insertelement <16 x i32> %v12, i32 101, i32 12
+ %v14 = insertelement <16 x i32> %v13, i32 100, i32 13
+ %v15 = insertelement <16 x i32> %v14, i32 99, i32 14
+ %v16 = insertelement <16 x i32> %v15, i32 53, i32 15
+ ret <16 x i32> %v16
+}
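The build_vector expansion above materializes one element per step: each bundle produces a single-lane vcmask and vsels the next scalar into that lane (with the last element handled up front by the initial vmov/vnsel pair). The vcmask immediates follow the same start/end byte-lane reading inferred after vcmask_sc.ll, for example:

; lane 0  -> byte range  0..3   => vcmask $0x300
; lane 1  -> byte range  4..7   => vcmask $0x704
; lane 14 -> byte range 56..59  => vcmask $0x3b38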
+
+; CHECK-LABEL: build_vector2:
+; CHECK: vlaneseq
+define <16 x i32> @build_vector2() {
+ %v1 = insertelement <16 x i32> undef, i32 0, i32 0
+ %v2 = insertelement <16 x i32> %v1, i32 1, i32 1
+ %v3 = insertelement <16 x i32> %v2, i32 2, i32 2
+ %v4 = insertelement <16 x i32> %v3, i32 3, i32 3
+ %v5 = insertelement <16 x i32> %v4, i32 4, i32 4
+ %v6 = insertelement <16 x i32> %v5, i32 5, i32 5
+ %v7 = insertelement <16 x i32> %v6, i32 6, i32 6
+ %v8 = insertelement <16 x i32> %v7, i32 7, i32 7
+ %v9 = insertelement <16 x i32> %v8, i32 8, i32 8
+ %v10 = insertelement <16 x i32> %v9, i32 9, i32 9
+ %v11 = insertelement <16 x i32> %v10, i32 10, i32 10
+ %v12 = insertelement <16 x i32> %v11, i32 11, i32 11
+ %v13 = insertelement <16 x i32> %v12, i32 12, i32 12
+ %v14 = insertelement <16 x i32> %v13, i32 13, i32 13
+ %v15 = insertelement <16 x i32> %v14, i32 14, i32 14
+ %v16 = insertelement <16 x i32> %v15, i32 15, i32 15
+ ret <16 x i32> %v16
+}
+
+; CHECK-LABEL: build_vector3:
+; CHECK: vlaneseq
+define <16 x i32> @build_vector3() {
+ ret <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+ i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+}
+
+; CHECK-LABEL: build_vector4:
+; CHECK: { v[[v0:[0-9]+]] = vimm.s32 $0x5f;
+; CHECK: vm[[vm0:[0-9]+]] = vcmask $0x300 }
+; CHECK: { vm[[vm1:[0-9]+]] = vcmask $0x704;
+; CHECK: v[[v1:[0-9]+]] = vsel vm[[vm0]], $0x50, v[[v0]] }
+; CHECK: { vm[[vm2:[0-9]+]] = vcmask $0xb08;
+; CHECK: v[[v2:[0-9]+]] = vsel vm[[vm1]], $0x51, v[[v1]] }
+; CHECK: { vm[[vm3:[0-9]+]] = vcmask $0xf0c;
+; CHECK: v[[v3:[0-9]+]] = vsel vm[[vm2]], $0x52, v[[v2]] }
+; CHECK: { vm[[vm4:[0-9]+]] = vcmask $0x1310;
+; CHECK: v[[v4:[0-9]+]] = vsel vm[[vm3]], $0x53, v[[v3]] }
+; CHECK: { vm[[vm5:[0-9]+]] = vcmask $0x1714;
+; CHECK: v[[v5:[0-9]+]] = vsel vm[[vm4]], $0x54, v[[v4]] }
+; CHECK: { vm[[vm6:[0-9]+]] = vcmask $0x1b18;
+; CHECK: v[[v6:[0-9]+]] = vsel vm[[vm5]], $0x55, v[[v5]] }
+; CHECK: { vm[[vm7:[0-9]+]] = vcmask $0x1f1c;
+; CHECK: v[[v7:[0-9]+]] = vsel vm[[vm6]], $0x56, v[[v6]] }
+; CHECK: { vm[[vm8:[0-9]+]] = vcmask $0x2320;
+; CHECK: v[[v8:[0-9]+]] = vsel vm[[vm7]], $0x57, v[[v7]] }
+; CHECK: { vm[[vm9:[0-9]+]] = vcmask $0x2724;
+; CHECK: v[[v9:[0-9]+]] = vsel vm[[vm8]], $0x58, v[[v8]] }
+; CHECK: { vm[[vm10:[0-9]+]] = vcmask $0x2b28;
+; CHECK: v[[v10:[0-9]+]] = vsel vm[[vm9]], $0x59, v[[v9]] }
+; CHECK: { vm[[vm11:[0-9]+]] = vcmask $0x2f2c;
+; CHECK: v[[v11:[0-9]+]] = vsel vm[[vm10]], $0x5a, v[[v10]] }
+; CHECK: { vm[[vm12:[0-9]+]] = vcmask $0x3330;
+; CHECK: v[[v12:[0-9]+]] = vsel vm[[vm11]], $0x5b, v[[v11]] }
+; CHECK: { vm[[vm13:[0-9]+]] = vcmask $0x3734;
+; CHECK: v[[v13:[0-9]+]] = vsel vm[[vm12]], $0x5c, v[[v12]] }
+; CHECK: { vm[[vm14:[0-9]+]] = vcmask $0x3b38;
+; CHECK: v[[v14:[0-9]+]] = vsel vm[[vm13]], $0x5d, v[[v13]] }
+; CHECK: { v[[v15:[0-9]+]] = vsel vm[[vm14]], $0x5e, v[[v14]] }
+; CHECK: { [tilespmem:$0x50] = vst v[[v15]]
+define void @build_vector4(<16 x float> %x, <16 x i32> %m) {
+ store <16 x i32> <i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87,
+ i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>,
+ <16 x i32> addrspace(201)* inttoptr (i32 80 to <16 x i32> addrspace(201)*), align 32
+ ret void
+}
+
+; CHECK-LABEL: insert_element:
+; CHECK: vlaneseq
+; CHECK: veq.s32 v1, $0x3
+; CHECK: vsel vm0, s0, v0
+define <16 x i32> @insert_element(<16 x i32> %v, i32 %b) {
+ %v2 = insertelement <16 x i32> %v, i32 %b, i32 3
+ ret <16 x i32> %v2
+}
+
+; CHECK-LABEL: vmread:
+; CHECK: v{{[0-9]+}} = vimm.s32 $0x0
+; CHECK: vsel
+; CHECK: shalt
+define <16 x i32> @vmread(<16 x i1> %a) {
+entry:
+ %b = zext <16 x i1> %a to <16 x i32>
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: scalar_sel:
+; CHECK: v{{[0-9]+}} = vpsel p{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i32> @scalar_sel(i1 %m, <16 x i32> %x, <16 x i32> %y) {
+ %a = select i1 %m, <16 x i32> %x, <16 x i32> %y
+ ret <16 x i32> %a
+}
+
+; CHECK-LABEL: scalar_sel_imm:
+; CHECK: v{{[0-9]+}} = vpsel !p{{[0-9]+}}, $0x1, v{{[0-9]+}}
+define <16 x i32> @scalar_sel_imm(i1 %m, <16 x i32> %x) {
+ %v0 = insertelement <16 x i32> undef, i32 1, i32 0
+ %v1 = shufflevector <16 x i32> %v0, <16 x i32> undef, <16 x i32> zeroinitializer ; create vector of all 1
+ %a = select i1 %m, <16 x i32> %x, <16 x i32> %v1
+ ret <16 x i32> %a
+}
+
+; CHECK-LABEL: scalar_sel_f:
+; CHECK: v{{[0-9]+}} = vpsel p{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x float> @scalar_sel_f(i1 %m, <16 x float> %x, <16 x float> %y) {
+ %a = select i1 %m, <16 x float> %x, <16 x float> %y
+ ret <16 x float> %a
+}
+
+; CHECK-LABEL: scalar_sel_imm_f:
+; CHECK: v{{[0-9]+}} = vpsel !p{{[0-9]+}}, $0x3f800000, v{{[0-9]+}}
+define <16 x float> @scalar_sel_imm_f(i1 %m, <16 x float> %x) {
+ %v0 = insertelement <16 x float> undef, float 1.0, i32 0
+ %v1 = shufflevector <16 x float> %v0, <16 x float> undef, <16 x i32> zeroinitializer ; create vector of all 1
+ %a = select i1 %m, <16 x float> %x, <16 x float> %v1
+ ret <16 x float> %a
+}
+
+; CHECK-LABEL: scalar_sel_imm_f2:
+; CHECK: v{{[0-9]+}} = vpsel p{{[0-9]+}}, $0x3f800000, v{{[0-9]+}}
+define <16 x float> @scalar_sel_imm_f2(i1 %m, <16 x float> %x) {
+ %v0 = insertelement <16 x float> undef, float 1.0, i32 0
+ %v1 = shufflevector <16 x float> %v0, <16 x float> undef, <16 x i32> zeroinitializer ; create vector of all 1
+ %a = select i1 %m, <16 x float> %v1, <16 x float> %x
+ ret <16 x float> %a
+}
+
+; CHECK-LABEL: scalar_sel_vs_v_s:
+; CHECK: v{{[0-9]+}} = vpsel !p{{[0-9]+}}, s0, v{{[0-9]+}}
+define <16 x i32> @scalar_sel_vs_v_s(i1 %m, <16 x i32> %x, i32 %y) {
+ %v0 = insertelement <16 x i32> undef, i32 %y, i32 0
+ %v1 = shufflevector <16 x i32> %v0, <16 x i32> undef, <16 x i32> zeroinitializer
+ %a = select i1 %m, <16 x i32> %x, <16 x i32> %v1
+ ret <16 x i32> %a
+}
+
+; CHECK-LABEL: scalar_sel_vs_s_v:
+; CHECK: v{{[0-9]+}} = vpsel p{{[0-9]+}}, s0, v{{[0-9]+}}
+define <16 x i32> @scalar_sel_vs_s_v(i1 %m, <16 x i32> %x, i32 %y) {
+ %v0 = insertelement <16 x i32> undef, i32 %y, i32 0
+ %v1 = shufflevector <16 x i32> %v0, <16 x i32> undef, <16 x i32> zeroinitializer ; splat %y into every lane
+ %a = select i1 %m, <16 x i32> %v1, <16 x i32> %x
+ ret <16 x i32> %a
+}
+
+; CHECK-LABEL: bitcast1:
+; CHECK: { v{{[0-9]+}} = vadd.f32
+; CHECK-NEXT: v{{[0-9]+}} = vsub.f32 v{{[0-9]+}}, v{{[0-9]+}};
+; CHECK-NEXT: _ = sdelay $0x1 }
+; CHECK-NEXT: v0 = vand.u32
+; CHECK-NEXT: shalt
+define <16 x float> @bitcast1(<16 x float> %x, <16 x float> %y) {
+ %a = fadd <16 x float> %x, %y
+ %b = fsub <16 x float> %x, %y
+ %c = bitcast <16 x float> %a to <16 x i32>
+ %d = bitcast <16 x float> %b to <16 x i32>
+ %e = and <16 x i32> %c, %d
+ %f = bitcast <16 x i32> %e to <16 x float>
+ ret <16 x float> %f
+}
+
+; CHECK-LABEL: bitcast2:
+; CHECK-DAG: vadd.s32
+; CHECK-DAG: vsub.s32
+; CHECK-NEXT: v0 = vadd.f32
+; CHECK: shalt
+define <16 x i32> @bitcast2(<16 x i32> %x, <16 x i32> %y) {
+ %a = add <16 x i32> %x, %y
+ %b = sub <16 x i32> %x, %y
+ %c = bitcast <16 x i32> %a to <16 x float>
+ %d = bitcast <16 x i32> %b to <16 x float>
+ %e = fadd <16 x float> %c, %d
+ %f = bitcast <16 x float> %e to <16 x i32>
+ ret <16 x i32> %f
+}
+
+; CHECK-LABEL: int_pat_vshl:
+; CHECK: v{{[0-9]+}} = vshll.u32 v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK-NEXT: shalt
+define <16 x i32> @int_pat_vshl(<16 x i32> %x, <16 x i32> %y) {
+ %a = icmp ult <16 x i32> %y, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32,
+ i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ %b = shl <16 x i32> %x, %y
+ %c = select <16 x i1> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
+ i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i32> %c
+}
+
+; CHECK-LABEL: int_pat_vlshr:
+; CHECK: v{{[0-9]+}} = vshrl.u32 v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK-NEXT: shalt
+define <16 x i32> @int_pat_vlshr(<16 x i32> %x, <16 x i32> %y) {
+ %a = icmp ult <16 x i32> %y, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32,
+ i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ %b = lshr <16 x i32> %x, %y
+ %c = select <16 x i1> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
+ i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i32> %c
+}
+
+; CHECK-LABEL: int_pat_vashr:
+; CHECK: v{{[0-9]+}} = vshra.s32 v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK-NEXT: shalt
+define <16 x i32> @int_pat_vashr(<16 x i32> %x, <16 x i32> %y) {
+ %a = icmp ult <16 x i32> %y, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31,
+ i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+ %b = select <16 x i1> %a, <16 x i32> %y, <16 x i32> <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31,
+ i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+ %c = ashr <16 x i32> %x, %b
+ ret <16 x i32> %c
+}
+
+; CHECK-LABEL: vmaxf_v:
+; CHECK: v{{[0-9]+}} = vmax.f32 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x float> @vmaxf_v(<16 x float> %x, <16 x float> %y) {
+ %a = call <16 x float> @llvm.maximum.f32(<16 x float> %x, <16 x float> %y) readnone
+ ret <16 x float> %a
+}
+
+; CHECK-LABEL: vminf_v:
+; CHECK: v{{[0-9]+}} = vmin.f32 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x float> @vminf_v(<16 x float> %x, <16 x float> %y) {
+ %a = call <16 x float> @llvm.minimum.f32(<16 x float> %x, <16 x float> %y) readnone
+ ret <16 x float> %a
+}
+
+; CHECK-LABEL: vsmax_v:
+; CHECK: vm[[vm:[0-9]+]] = vgt.s32 v0, v1
+; CHECK: v0 = vsel vm[[vm]], v0, v1
+define <16 x i32> @vsmax_v(<16 x i32> %x, <16 x i32> %y) {
+ %a = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %x, <16 x i32> %y) readnone
+ ret <16 x i32> %a
+}
+
+; CHECK-LABEL: vsmin_v:
+; CHECK: vm[[vm:[0-9]+]] = vlt.s32 v0, v1
+; CHECK: v0 = vsel vm[[vm]], v0, v1
+define <16 x i32> @vsmin_v(<16 x i32> %x, <16 x i32> %y) {
+ %a = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %x, <16 x i32> %y) readnone
+ ret <16 x i32> %a
+}
+
+; CHECK-LABEL: vumax_v:
+; CHECK: v0 = vmax.u32 v0, v1
+define <16 x i32> @vumax_v(<16 x i32> %x, <16 x i32> %y) {
+ %a = call <16 x i32> @llvm.umax.v16i32(<16 x i32> %x, <16 x i32> %y) readnone
+ ret <16 x i32> %a
+}
+
+; CHECK-LABEL: vumin_v:
+; CHECK: v0 = vmin.u32 v0, v1
+define <16 x i32> @vumin_v(<16 x i32> %x, <16 x i32> %y) {
+ %a = call <16 x i32> @llvm.umin.v16i32(<16 x i32> %x, <16 x i32> %y) readnone
+ ret <16 x i32> %a
+}
+
+; vceil
+declare <16 x float> @llvm.ceil.v16f32(<16 x float> %Val)
+
+; CHECK-LABEL: vceilf:
+; CHECK: v{{[0-9]+}} = vceil.f32 v{{[0-9]+}}
+define <16 x float> @vceilf(<16 x float> %y) {
+ %res = call <16 x float> @llvm.ceil.v16f32(<16 x float> %y)
+ ret <16 x float> %res
+}
+
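+; The argument is a constant splat, so the ceil is constant-folded:
+; ceil(12.5) = 13.0 becomes a single vector immediate.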
+; CHECK-LABEL: vceilf_splat:
+; CHECK: v{{[0-9]+}} = vimm.f32 $13.0
+define <16 x float> @vceilf_splat() {
+ %v0 = insertelement <16 x float> undef, float 12.5, i32 0
+ %v1 = shufflevector <16 x float> %v0, <16 x float> undef, <16 x i32> zeroinitializer
+ %res = call <16 x float> @llvm.ceil.v16f32(<16 x float> %v1)
+ ret <16 x float> %res
+}
+
+; CHECK-LABEL: vceilf_splat_sreg:
+; CHECK: v[[x:[0-9]+]] = vmov s0
+; CHECK: v{{[0-9]+}} = vceil.f32 v[[x]];
+define <16 x float> @vceilf_splat_sreg(float %y) {
+ %v0 = insertelement <16 x float> undef, float %y, i32 0
+ %v1 = shufflevector <16 x float> %v0, <16 x float> undef, <16 x i32> zeroinitializer
+ %res = call <16 x float> @llvm.ceil.v16f32(<16 x float> %v1)
+ ret <16 x float> %res
+}
+
+; vfloor
+declare <16 x float> @llvm.floor.v16f32(<16 x float> %Val)
+
+; CHECK-LABEL: vfloorf:
+; CHECK: v{{[0-9]+}} = vfloor.f32 v{{[0-9]+}}
+define <16 x float> @vfloorf(<16 x float> %y) {
+ %res = call <16 x float> @llvm.floor.v16f32(<16 x float> %y)
+ ret <16 x float> %res
+}
+
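+; Likewise the constant splat is folded: floor(12.5) = 12.0 becomes a vector
+; immediate.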
+; CHECK-LABEL: vfloorf_splat:
+; CHECK: v{{[0-9]+}} = vimm.f32 $12.0
+define <16 x float> @vfloorf_splat() {
+ %v0 = insertelement <16 x float> undef, float 12.5, i32 0
+ %v1 = shufflevector <16 x float> %v0, <16 x float> undef, <16 x i32> zeroinitializer
+ %res = call <16 x float> @llvm.floor.v16f32(<16 x float> %v1)
+ ret <16 x float> %res
+}
+
+; CHECK-LABEL: vfloorf_splat_sreg:
+; CHECK: v[[x:[0-9]+]] = vmov s0
+; CHECK: v{{[0-9]+}} = vfloor.f32 v[[x]];
+define <16 x float> @vfloorf_splat_sreg(float %y) {
+ %v0 = insertelement <16 x float> undef, float %y, i32 0
+ %v1 = shufflevector <16 x float> %v0, <16 x float> undef, <16 x i32> zeroinitializer
+ %res = call <16 x float> @llvm.floor.v16f32(<16 x float> %v1)
+ ret <16 x float> %res
+}
+
+; CHECK-LABEL: shuffle_permute:
+; CHECK: v{{[0-9]+}} = vperm.xlane v0, v1
+define <16 x i32> @shuffle_permute(<16 x i32> %x, <16 x i32> %m) {
+ %r = shufflevector <16 x i32> %x, <16 x i32> undef, <16 x i32> <i32 1, i32 2, i32 0, i32 6, i32 5, i32 4, i32 3, i32 7,
+ i32 1, i32 2, i32 0, i32 6, i32 5, i32 4, i32 3, i32 7>
+ ret <16 x i32> %r
+}
+
+; CHECK-LABEL: shuffle_permutef:
+; CHECK: v{{[0-9]+}} = vperm.xlane v0, v1
+define <16 x float> @shuffle_permutef(<16 x float> %x, <16 x i32> %m) {
+ %r = shufflevector <16 x float> %x, <16 x float> undef, <16 x i32> <i32 1, i32 2, i32 0, i32 6, i32 5, i32 4, i32 3, i32 7,
+ i32 1, i32 2, i32 0, i32 6, i32 5, i32 4, i32 3, i32 7>
+ ret <16 x float> %r
+}
+
+declare <16 x i32> @llvm.tpu.vmpcnt.ones.v16i32(<16 x i1> %m) readnone
+; CHECK-LABEL: vmpcnt_ones:
+; CHECK: v{{[0-9]+}} = vmpcnt.ones.xlane
+define <16 x i32> @vmpcnt_ones(<16 x i1> %m) {
+ %a = call <16 x i32> @llvm.tpu.vmpcnt.ones.v16i32(<16 x i1> %m) readnone
+ ret <16 x i32> %a
+}
+
+declare <16 x i32> @llvm.tpu.vmctz.v16i32(<16 x i1> %m) readnone
+
+; CHECK-LABEL: vmctz:
+; CHECK: v{{[0-9]+}} = vmctz.xlane
+define <16 x i32> @vmctz(<16 x i1> %m) {
+ %a = call <16 x i32> @llvm.tpu.vmctz.v16i32(<16 x i1> %m) readnone
+ ret <16 x i32> %a
+}
+
+; FIXME(b/245600024): Migrate the same tests as in vector_sc.ll.
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_i16_gl_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_i16_gl_sc.ll
new file mode 100644
index 0000000..0cfb0d7
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_i16_gl_sc.ll
@@ -0,0 +1,64 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-gl -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+; Tests low-precision integer vector ALU instructions.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <16 x i16> @llvm.smax.v16i16(<16 x i16> %x, <16 x i16> %y) readnone
+declare <16 x i16> @llvm.smin.v16i16(<16 x i16> %x, <16 x i16> %y) readnone
+declare <16 x i16> @llvm.umax.v16i16(<16 x i16> %x, <16 x i16> %y) readnone
+declare <16 x i16> @llvm.umin.v16i16(<16 x i16> %x, <16 x i16> %y) readnone
+
+; CHECK-LABEL: vadd_s16:
+; CHECK: v{{[0-9]+}} = vadd.s16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i16> @vadd_s16(<16 x i16> %x, <16 x i16> %y) {
+ %a = add <16 x i16> %x, %y
+ ret <16 x i16> %a
+}
+
+; CHECK-LABEL: vsub_s16:
+; CHECK: v{{[0-9]+}} = vsub.s16 v0, v1
+define <16 x i16> @vsub_s16(<16 x i16> %x, <16 x i16> %y) {
+ %a = sub <16 x i16> %x, %y
+ ret <16 x i16> %a
+}
+
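+; An LLVM mul is sign-agnostic (the low 16 bits of the product are the same
+; for signed and unsigned inputs), so the "signed" mul selects vmul.u16.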
+; CHECK-LABEL: vmul_s16:
+; CHECK: v{{[0-9]+}} = vmul.u16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i16> @vmul_s16(<16 x i16> %x, <16 x i16> %y) {
+ %a = mul <16 x i16> %x, %y
+ ret <16 x i16> %a
+}
+
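+; Signed 16-bit max/min lower to a compare-and-select here, while the
+; unsigned forms below map directly to vmax.u16/vmin.u16.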
+; CHECK-LABEL: vsmax_v:
+; CHECK: vm[[vm:[0-9]+]] = vgt.s16 v0, v1
+; CHECK: v0 = vsel vm[[vm]], v0, v1
+define <16 x i16> @vsmax_v(<16 x i16> %x, <16 x i16> %y) {
+ %a = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %x, <16 x i16> %y) readnone
+ ret <16 x i16> %a
+}
+
+; CHECK-LABEL: vsmin_v:
+; CHECK: vm[[vm:[0-9]+]] = vlt.s16 v0, v1
+; CHECK: v0 = vsel vm[[vm]], v0, v1
+define <16 x i16> @vsmin_v(<16 x i16> %x, <16 x i16> %y) {
+ %a = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %x, <16 x i16> %y) readnone
+ ret <16 x i16> %a
+}
+
+; CHECK-LABEL: vmax_u16:
+; CHECK: v{{[0-9]+}} = vmax.u16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i16> @vmax_u16(<16 x i16> %x, <16 x i16> %y) {
+ %a = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %x, <16 x i16> %y) readnone
+ ret <16 x i16> %a
+}
+
+; CHECK-LABEL: vmin_u16:
+; CHECK: v{{[0-9]+}} = vmin.u16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i16> @vmin_u16(<16 x i16> %x, <16 x i16> %y) {
+ %a = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %x, <16 x i16> %y) readnone
+ ret <16 x i16> %a
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_i16_opt_gl_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_i16_opt_gl_sc.ll
new file mode 100644
index 0000000..d89070c
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_i16_opt_gl_sc.ll
@@ -0,0 +1,25 @@
+; RUN: opt -S -O2 -mcpu=sparsecore-tec-gl < %s | \
+; RUN: llc -mcpu=sparsecore-tec-gl -asm-verbose=false \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+; Tests optimizations of low-precision integer vector ALU instructions.
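+; After -O2, the icmp+select idioms below should be matched to single
+; vmax.u16/vmin.u16 instructions.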
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; CHECK-LABEL: vmax_u16:
+; CHECK: v{{[0-9]+}} = vmax.u16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i16> @vmax_u16(<16 x i16> %x, <16 x i16> %y) {
+ %c = icmp ugt <16 x i16> %x, %y
+ %r = select <16 x i1> %c, <16 x i16> %x, <16 x i16> %y
+ ret <16 x i16> %r
+}
+
+; CHECK-LABEL: vmin_u16:
+; CHECK: v{{[0-9]+}} = vmin.u16 v{{[0-9]+}}, v{{[0-9]+}}
+define <16 x i16> @vmin_u16(<16 x i16> %x, <16 x i16> %y) {
+ %c = icmp ult <16 x i16> %x, %y
+ %r = select <16 x i1> %c, <16 x i16> %x, <16 x i16> %y
+ ret <16 x i16> %r
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_aliasing.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_aliasing.ll
new file mode 100644
index 0000000..fee5d5d
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_aliasing.ll
@@ -0,0 +1,168 @@
+; RUN: llc < %s -mcpu=tensorcore-pf -tpu-critical-path-sched -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+; This test uses the critical path scheduler to make sure instructions are
+; pulled down when they are not blocked by aliasing.
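+; In the CHECK lines below, "vld ... vld ... vst" means the scheduler proved
+; the store independent and sank it below the loads, while "vst ... vld ... vld"
+; means a (potential) alias kept the store above them.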
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <1024 x i32> @llvm.tpu.vld.shuffle.i32(<1024 x i32> addrspace(205)*, i32, i32)
+declare <1024 x float> @llvm.tpu.vld.shuffle.f32(<1024 x float> addrspace(205)*, i32, i32)
+declare <1024 x i32> @llvm.tpu.vld.strided.i32(<1024 x i32> addrspace(205)*, i32, i32)
+declare <1024 x float> @llvm.tpu.vld.strided.f32(<1024 x float> addrspace(205)*, i32, i32)
+declare void @llvm.tpu.vst.strided.i32(<1024 x i32>, <1024 x i32> addrspace(205)*, i32, i32, <1024 x i1>)
+declare void @llvm.tpu.vst.strided.f32(<1024 x float>, <1024 x float> addrspace(205)*, i32, i32, <1024 x i1>)
+declare <1024 x i32> @llvm.tpu.vld.indexed.i32(<1024 x i32> addrspace(205)*, i32, i32, i32, i32)
+declare <1024 x float> @llvm.tpu.vld.indexed.f32(<1024 x float> addrspace(205)*, i32, i32, i32, i32)
+declare void @llvm.tpu.vst.indexed.i32(<1024 x i32>, <1024 x i32> addrspace(205)*, i32, i32, <1024 x i1>, i32, i32)
+declare void @llvm.tpu.vst.indexed.f32(<1024 x float>, <1024 x float> addrspace(205)*, i32, i32, <1024 x i1>, i32, i32)
+declare void @llvm.tpu.vst.evenodd.sublanes.i32(<1024 x i32>, <1024 x i32> addrspace(205)*, i32, i32, <1024 x i1>, i32)
+declare <1024 x i32> @llvm.tpu.vld.replicate.evenodd.sublanes.i32(<1024 x i32> addrspace(205)*, i32, i32, i32)
+
+declare i32 @llvm.tpu.set.lane.indexed(<1024 x i32>, i32)
+declare i32 @llvm.tpu.set.sublane.indexed(<1024 x i32>, i32)
+declare i32 @llvm.tpu.set.iar.raw(<1024 x i32>, i32)
+declare <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32) nounwind
+
+; Test that the backend can correctly analyse shuffle and masked ld/st.
+; CHECK-LABEL: alias_mask_shuffle:
+; CHECK: vld
+; CHECK: vld
+; CHECK: vst
+define <1024 x i32> @alias_mask_shuffle(<1024 x i32> addrspace(205)* readonly %ptr, <1024 x i1> %m, <1024 x i32> %x) {
+entry:
+ call void @llvm.tpu.vst.strided.i32(<1024 x i32> %x, <1024 x i32> addrspace(205)* %ptr, i32 4, i32 1, <1024 x i1> %m)
+ %0 = call <1024 x i32> @llvm.tpu.vld.shuffle.i32(<1024 x i32> addrspace(205)* %ptr, i32 255, i32 0)
+ %1 = call <1024 x i32> @llvm.tpu.vld.strided.i32(<1024 x i32> addrspace(205)* %ptr, i32 3, i32 1)
+ %a = add <1024 x i32> %0, %1
+ ret <1024 x i32> %a
+}
+
+; Test that the backend can correctly analyse shuffle and masked ld/st.
+; CHECK-LABEL: alias_dyn_mask_stride:
+; CHECK: vld
+; CHECK: vld
+; CHECK: vst
+define <1024 x i32> @alias_dyn_mask_stride(<1024 x i32> addrspace(205)* readonly %ptr, i32 %mask, <1024 x i1> %m, <1024 x i32> %x) {
+entry:
+ %ptr1 = getelementptr inbounds <1024 x i32>, <1024 x i32> addrspace(205)* %ptr, i32 1
+ call void @llvm.tpu.vst.strided.i32(<1024 x i32> %x, <1024 x i32> addrspace(205)* %ptr1, i32 2, i32 1, <1024 x i1> %m)
+ %0 = call <1024 x i32> @llvm.tpu.vld.shuffle.i32(<1024 x i32> addrspace(205)* %ptr, i32 %mask, i32 0)
+ %1 = call <1024 x i32> @llvm.tpu.vld.strided.i32(<1024 x i32> addrspace(205)* %ptr, i32 3, i32 2)
+ %a = add <1024 x i32> %0, %1
+ ret <1024 x i32> %a
+}
+
+; Same case but with a dynamic stride; this case cannot be analysed.
+; CHECK-LABEL: alias_dyn_stride:
+; CHECK: vst
+; CHECK: vld
+; CHECK: vld
+define <1024 x i32> @alias_dyn_stride(<1024 x i32> addrspace(205)* readonly %ptr, i32 %stride, i32 %mask, <1024 x i1> %m, <1024 x i32> %x) {
+entry:
+ %ptr1 = getelementptr inbounds <1024 x i32>, <1024 x i32> addrspace(205)* %ptr, i32 1
+ call void @llvm.tpu.vst.strided.i32(<1024 x i32> %x, <1024 x i32> addrspace(205)* %ptr1, i32 2, i32 1, <1024 x i1> %m)
+ %0 = call <1024 x i32> @llvm.tpu.vld.shuffle.i32(<1024 x i32> addrspace(205)* %ptr, i32 %mask, i32 0)
+ %1 = call <1024 x i32> @llvm.tpu.vld.strided.i32(<1024 x i32> addrspace(205)* %ptr, i32 3, i32 %stride)
+ %a = add <1024 x i32> %0, %1
+ ret <1024 x i32> %a
+}
+
+; Case with a dynamic stride, the same offset, and a disjoint mask.
+; CHECK-LABEL: alias_dyn_stride_disjoint_mask:
+; CHECK: vld
+; CHECK: vld
+; CHECK: vst
+define <1024 x i32> @alias_dyn_stride_disjoint_mask(<1024 x i32> addrspace(205)* readonly %ptr, i32 %stride, i32 %mask, <1024 x i1> %m, <1024 x i32> %x, <1024 x i32> %x1) {
+entry:
+ %x2 = add <1024 x i32> %x, %x1 ; this adds a little bit more latency to push the store down.
+ call void @llvm.tpu.vst.strided.i32(<1024 x i32> %x2, <1024 x i32> addrspace(205)* %ptr, i32 4, i32 %stride, <1024 x i1> %m)
+ %0 = call <1024 x i32> @llvm.tpu.vld.shuffle.i32(<1024 x i32> addrspace(205)* %ptr, i32 %mask, i32 0)
+ %1 = call <1024 x i32> @llvm.tpu.vld.strided.i32(<1024 x i32> addrspace(205)* %ptr, i32 3, i32 %stride)
+ %a = add <1024 x i32> %0, %1
+ ret <1024 x i32> %a
+}
+
+; Negative stride: the strided load will alias with the store at offset -1.
+; CHECK-LABEL: alias_negative_stride:
+; CHECK: vst
+; CHECK: vld
+; CHECK: vld
+define <1024 x i32> @alias_negative_stride(<1024 x i32> addrspace(205)* readonly %ptr, <1024 x i1> %m, <1024 x i32> %x) {
+entry:
+ %ptr1 = getelementptr inbounds <1024 x i32>, <1024 x i32> addrspace(205)* %ptr, i32 -1
+ call void @llvm.tpu.vst.strided.i32(<1024 x i32> %x, <1024 x i32> addrspace(205)* %ptr1, i32 255, i32 1, <1024 x i1> %m)
+ %0 = call <1024 x i32> @llvm.tpu.vld.shuffle.i32(<1024 x i32> addrspace(205)* %ptr, i32 255, i32 0)
+ %1 = call <1024 x i32> @llvm.tpu.vld.strided.i32(<1024 x i32> addrspace(205)* %ptr, i32 255, i32 -1)
+ %a = add <1024 x i32> %0, %1
+ ret <1024 x i32> %a
+}
+
+; Negative stride. Since the store only writes one element, it won't alias
+; with the load with a negative stride.
+; CHECK-LABEL: alias_negative_stride2:
+; CHECK: vld
+; CHECK: vld
+; CHECK: vst
+define <1024 x i32> @alias_negative_stride2(<1024 x i32> addrspace(205)* readonly %ptr, <1024 x i1> %m, <1024 x i32> %x) {
+entry:
+ %ptr1 = getelementptr inbounds <1024 x i32>, <1024 x i32> addrspace(205)* %ptr, i32 -1
+ call void @llvm.tpu.vst.strided.i32(<1024 x i32> %x, <1024 x i32> addrspace(205)* %ptr1, i32 1, i32 1, <1024 x i1> %m)
+ %0 = call <1024 x i32> @llvm.tpu.vld.shuffle.i32(<1024 x i32> addrspace(205)* %ptr, i32 255, i32 0)
+ %1 = call <1024 x i32> @llvm.tpu.vld.strided.i32(<1024 x i32> addrspace(205)* %ptr, i32 255, i32 -1)
+ %a = add <1024 x i32> %0, %1
+ ret <1024 x i32> %a
+}
+
+; The store writes with a mask of 0xF0, so it doesn't alias the ld.evenodd.
+; CHECK-LABEL: alias_ld_evenodd:
+; CHECK: vld
+; CHECK: vst
+define <1024 x i32> @alias_ld_evenodd(<1024 x i32> addrspace(205)* readonly %ptr, <1024 x i1> %m, <1024 x i32> %x, <1024 x i32> %index) {
+entry:
+ %iar = call i32 @llvm.tpu.set.iar.raw(<1024 x i32> %index, i32 1)
+ call void @llvm.tpu.vst.strided.i32(<1024 x i32> %x, <1024 x i32> addrspace(205)* %ptr, i32 16, i32 1, <1024 x i1> %m)
+ %0 = call <1024 x i32> @llvm.tpu.vld.replicate.evenodd.sublanes.i32(<1024 x i32> addrspace(205)* %ptr, i32 255, i32 1, i32 %iar)
+ %a = add <1024 x i32> %0, %x
+ ret <1024 x i32> %a
+}
+
+; The store writes with a mask that overlaps the evenodd ld.
+; CHECK-LABEL: alias_ld_evenodd2:
+; CHECK: vst
+; CHECK: vld
+define <1024 x i32> @alias_ld_evenodd2(<1024 x i32> addrspace(205)* readonly %ptr, <1024 x i1> %m, <1024 x i32> %x, <1024 x i32> %index) {
+entry:
+ %iar = call i32 @llvm.tpu.set.iar.raw(<1024 x i32> %index, i32 1)
+ call void @llvm.tpu.vst.strided.i32(<1024 x i32> %x, <1024 x i32> addrspace(205)* %ptr, i32 4, i32 1, <1024 x i1> %m)
+ %0 = call <1024 x i32> @llvm.tpu.vld.replicate.evenodd.sublanes.i32(<1024 x i32> addrspace(205)* %ptr, i32 255, i32 1, i32 %iar)
+ %a = add <1024 x i32> %0, %x
+ ret <1024 x i32> %a
+}
+
+; Store followed by a load with a disjoint mask.
+; CHECK-LABEL: alias_vst_evenodd:
+; CHECK: vld
+; CHECK: vst
+define <1024 x i32> @alias_vst_evenodd(<1024 x i32> addrspace(205)* readonly %ptr, <1024 x i1> %m, <1024 x i32> %x, <1024 x i32> %index, <1024 x i32> %data) {
+entry:
+ %iar = call i32 @llvm.tpu.set.iar.raw(<1024 x i32> %index, i32 0)
+ call void @llvm.tpu.vst.evenodd.sublanes.i32(<1024 x i32> %data, <1024 x i32> addrspace(205)* %ptr, i32 15, i32 1, <1024 x i1> %m, i32 %iar)
+ %0 = call <1024 x i32> @llvm.tpu.vld.shuffle.i32(<1024 x i32> addrspace(205)* %ptr, i32 15, i32 30001)
+ %a = add <1024 x i32> %0, %x
+ ret <1024 x i32> %a
+}
+
+; Store followed by a load with an overlapping mask.
+; CHECK-LABEL: alias_vst_evenodd2:
+; CHECK: vst
+; CHECK: vld
+define <1024 x i32> @alias_vst_evenodd2(<1024 x i32> addrspace(205)* readonly %ptr, <1024 x i1> %m, <1024 x i32> %x, <1024 x i32> %index, <1024 x i32> %data) {
+entry:
+ %iar = call i32 @llvm.tpu.set.iar.raw(<1024 x i32> %index, i32 0)
+ call void @llvm.tpu.vst.evenodd.sublanes.i32(<1024 x i32> %data, <1024 x i32> addrspace(205)* %ptr, i32 15, i32 1, <1024 x i1> %m, i32 %iar)
+ %0 = call <1024 x i32> @llvm.tpu.vld.shuffle.i32(<1024 x i32> addrspace(205)* %ptr, i32 15, i32 30002)
+ %a = add <1024 x i32> %0, %x
+ ret <1024 x i32> %a
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_bf16_gl_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_bf16_gl_sc.ll
new file mode 100644
index 0000000..81a11e0
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_bf16_gl_sc.ll
@@ -0,0 +1,257 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-gl -asm-verbose=false \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "googletpu"
+
+@a = external addrspace(201) global <16 x bfloat>, align 32
+
+declare void @llvm.tpu.vst.msk.v8i1.v32i8(<8 x i1>, <32 x i8> addrspace(201)*, <32 x i8>)
+declare <8 x i1> @llvm.tpu.16i1.to.8i1(<16 x i1>)
+declare <16 x bfloat> @llvm.tpu.vld.msk.idx.v16bf16.v8i1.p201v16bf16(<8 x i1>, <16 x bfloat> addrspace(201)*, <8 x i32>)
+declare <16 x bfloat> @llvm.tpu.vld.msk.v16bf16.v8i1.p201v16bf16(<8 x i1>, <16 x bfloat> addrspace(201)*)
+declare void @llvm.tpu.vst.msk.add.v8i1.p201v16bf16.v16bf16(<8 x i1>, <16 x bfloat> addrspace(201)*, <16 x bfloat>)
+declare void @llvm.tpu.vst.msk.add.strided.v8i1.p201v16bf16.v16bf16(<8 x i1>, <16 x bfloat> addrspace(201)*, i32, <16 x bfloat>)
+declare void @llvm.tpu.vst.msk.idx.add.v8i1.p201v16bf16.v16bf16(<8 x i1>, <16 x bfloat> addrspace(201)*, <8 x i32>, <16 x bfloat>)
+declare void @llvm.tpu.vst.msk.idx.add.np.v8i1.p201v16bf16.v16bf16(<8 x i1>, <16 x bfloat> addrspace(201)*, <8 x i32>, <16 x bfloat>)
+declare void @llvm.tpu.vst.cb.msk.idx.add.v8i1.v16bf16(<8 x i1>, x86_mmx, i32, <8 x i32>, <16 x bfloat>)
+declare void @llvm.tpu.vst.cb.msk.idx.add.np.v8i1.v16bf16(<8 x i1>, x86_mmx, i32, <8 x i32>, <16 x bfloat>)
+declare void @llvm.tpu.vst.cb.msk.add.v8i1.v16bf16(<8 x i1>, x86_mmx, i32, <16 x bfloat>)
+declare void @llvm.tpu.vst.cb.upd.msk.add.v8i1.v16bf16(<8 x i1>, x86_mmx, i32, <16 x bfloat>)
+declare void @llvm.tpu.vst.cb.msk.add.strided.v8i1.v16bf16(<8 x i1>, x86_mmx, i32, i32, <16 x bfloat>)
+declare void @llvm.tpu.vst.cb.upd.msk.add.strided.v8i1.v16bf16(<8 x i1>, x86_mmx, i32, i32, <16 x bfloat>)
+declare <16 x bfloat> @llvm.tpu.vst.msk.idx.ret.add.np.v16bf16.v8i1.p201v16bf16(<8 x i1>, <16 x bfloat> addrspace(201)*, <8 x i32>, <16 x bfloat>)
+declare <16 x bfloat> addrspace(201)* @llvm.tpu.inttoptr.p201v16bf16(i32) nounwind
+
+; CHECK-LABEL: vldi_bf16:
+; CHECK: v0 = vld [tilespmem:s0+$0x0]
+define <16 x bfloat> @vldi_bf16(<16 x bfloat> addrspace(201)* %a) {
+ %b = load <16 x bfloat>, <16 x bfloat> addrspace(201)* %a
+ ret <16 x bfloat> %b
+}
+
+; CHECK-LABEL: vldi2_bf16:
+; CHECK: v0 = vld [tilespmem:s0+$0x8]
+define <16 x bfloat> @vldi2_bf16(<16 x bfloat> addrspace(201)* %a) {
+ %addr = getelementptr <16 x bfloat>, <16 x bfloat> addrspace(201)* %a, i32 1
+ %b = load <16 x bfloat>, <16 x bfloat> addrspace(201)* %addr
+ ret <16 x bfloat> %b
+}
+
+; CHECK-LABEL: load_bf16
+; CHECK: {{v[0-9]+}} = vld [tilespmem:a]
+define <16 x bfloat> @load_bf16() #0 {
+ %1 = load <16 x bfloat>, <16 x bfloat> addrspace(201)* @a, align 32
+ ret <16 x bfloat> %1
+}
+
+; CHECK-LABEL: vld_idx_bf16
+; CHECK: {{v[0-9]+}} = vld.idx.msk [tilespmem:{{v[0-9]+.*}}], vm0
+define <16 x bfloat> @vld_idx_bf16(<8 x i32> %idx, <8 x i1> %msk) #0 {
+ %1 = call <16 x bfloat> @llvm.tpu.vld.msk.idx.v16bf16.v8i1.p201v16bf16(<8 x i1> %msk, <16 x bfloat> addrspace(201)* @a, <8 x i32> %idx)
+ ret <16 x bfloat> %1
+}
+
+; CHECK-LABEL: vld_msk_immad_bf16:
+; CHECK: v0 = vld.msk [tilespmem:$0x64], vm0
+define <16 x bfloat> @vld_msk_immad_bf16(<16 x bfloat> addrspace(201)* %b, <8 x i1> %m) {
+ %a = call <16 x bfloat> addrspace(201)* @llvm.tpu.inttoptr.p201v16bf16(i32 100)
+ %r = tail call <16 x bfloat> @llvm.tpu.vld.msk.v16bf16.v8i1.p201v16bf16(<8 x i1> %m,
+ <16 x bfloat> addrspace(201)* %a)
+ ret <16 x bfloat> %r
+}
+
+; CHECK-LABEL: vld_msk_bf16
+; CHECK: {{v[0-9]+}} = vld.msk [tilespmem:{{.*}}], vm{{[0-9]+}};
+define <16 x bfloat> @vld_msk_bf16(<8 x i1> %msk) #0 {
+ %1 = call <16 x bfloat> @llvm.tpu.vld.msk.v16bf16.v8i1.p201v16bf16(<8 x i1> %msk, <16 x bfloat> addrspace(201)* @a)
+ ret <16 x bfloat> %1
+}
+
+; CHECK-LABEL: vstbf16:
+; CHECK: [tilespmem:s0+$0x8] = vst v0
+define void @vstbf16(<16 x bfloat> %a, <16 x bfloat> addrspace(201)* %b) {
+ %addr = getelementptr <16 x bfloat>, <16 x bfloat> addrspace(201)* %b, i32 1
+ store <16 x bfloat> %a, <16 x bfloat> addrspace(201)* %addr
+ ret void
+}
+
+; CHECK-LABEL: vstmskaddbf16:
+; CHECK: [tilespmem:s0+$0x0] = vst.add.bf16.msk vm0, v0;
+define void @vstmskaddbf16(<16 x bfloat> addrspace(201)* %base, <16 x bfloat> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.msk.add.v8i1.p201v16bf16.v16bf16(<8 x i1> %m, <16 x bfloat> addrspace(201)* %base, <16 x bfloat> %val)
+ ret void
+}
+
+; A <16 x bfloat> vector is 32 bytes = 8 words, so the GEP of 1024 vectors adds 1024 * 8 = 0x2000 words.
+; CHECK-LABEL: vstmskaddbf16_disp:
+; CHECK: [tilespmem:s0+$0x2000] = vst.add.bf16.msk vm0, v0;
+define void @vstmskaddbf16_disp(<16 x bfloat> addrspace(201)* %base, <16 x bfloat> %val, <8 x i1> %m) {
+entry:
+ %b = getelementptr inbounds <16 x bfloat>, <16 x bfloat> addrspace(201)* %base, i32 1024
+ tail call void @llvm.tpu.vst.msk.add.v8i1.p201v16bf16.v16bf16(<8 x i1> %m, <16 x bfloat> addrspace(201)* %b, <16 x bfloat> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstmskadd_stridebf16_0:
+; CHECK: [tilespmem:s0+$0x0 ss:s1] = vst.add.bf16.msk vm0, v0;
+define void @vstmskadd_stridebf16_0(<16 x bfloat> addrspace(201)* %base, <16 x bfloat> %val, <8 x i1> %m, i32 %s) {
+entry:
+ tail call void @llvm.tpu.vst.msk.add.strided.v8i1.p201v16bf16.v16bf16(<8 x i1> %m, <16 x bfloat> addrspace(201)* %base, i32 %s, <16 x bfloat> %val)
+ ret void
+}
+
+; A <16 x bfloat> vector is 32 bytes = 8 words, so the GEP of 1024 vectors adds 1024 * 8 = 0x2000 words.
+; CHECK-LABEL: vstmskadd_disp_stridebf16_0:
+; CHECK: [tilespmem:s0+$0x2000 ss:s1] = vst.add.bf16.msk vm0, v0;
+define void @vstmskadd_disp_stridebf16_0(<16 x bfloat> addrspace(201)* %base, <16 x bfloat> %val, <8 x i1> %m, i32 %s) {
+entry:
+ %b = getelementptr inbounds <16 x bfloat>, <16 x bfloat> addrspace(201)* %base, i32 1024
+ tail call void @llvm.tpu.vst.msk.add.strided.v8i1.p201v16bf16.v16bf16(<8 x i1> %m, <16 x bfloat> addrspace(201)* %b, i32 %s, <16 x bfloat> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstmskadd_stridebf16_1:
+; CHECK: [tilespmem:s0+$0x0 ss:$0x8] = vst.add.bf16.msk vm0, v0;
+define void @vstmskadd_stridebf16_1(<16 x bfloat> addrspace(201)* %base, <16 x bfloat> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.msk.add.strided.v8i1.p201v16bf16.v16bf16(<8 x i1> %m, <16 x bfloat> addrspace(201)* %base, i32 8, <16 x bfloat> %val)
+ ret void
+}
+
+; A <16 x bfloat> vector is 32 bytes = 8 words, so the GEP of 1024 vectors adds 1024 * 8 = 0x2000 words.
+; CHECK-LABEL: vstmskadd_disp_stridebf16_1:
+; CHECK: [tilespmem:s0+$0x2000 ss:$0x400] = vst.add.bf16.msk vm0, v0;
+define void @vstmskadd_disp_stridebf16_1(<16 x bfloat> addrspace(201)* %base, <16 x bfloat> %val, <8 x i1> %m) {
+entry:
+ %b = getelementptr inbounds <16 x bfloat>, <16 x bfloat> addrspace(201)* %base, i32 1024
+ tail call void @llvm.tpu.vst.msk.add.strided.v8i1.p201v16bf16.v16bf16(<8 x i1> %m, <16 x bfloat> addrspace(201)* %b, i32 1024, <16 x bfloat> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstidxaddbf16:
+; CHECK: [tilespmem:v0+s0+$0x0] = vst.idx.add.bf16.msk vm0, v1;
+define void @vstidxaddbf16(<16 x bfloat> addrspace(201)* %base, <8 x i32> %off, <16 x bfloat> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.msk.idx.add.v8i1.p201v16bf16.v16bf16(<8 x i1> %m, <16 x bfloat> addrspace(201)* %base, <8 x i32> %off, <16 x bfloat> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstidxaddbf16_np:
+; CHECK: [tilespmem:v0+s0+$0x0] = vst.idx.add.bf16.msk vm0, v1;
+define void @vstidxaddbf16_np(<16 x bfloat> addrspace(201)* %base, <8 x i32> %off, <16 x bfloat> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.msk.idx.add.np.v8i1.p201v16bf16.v16bf16(<8 x i1> %m, <16 x bfloat> addrspace(201)* %base, <8 x i32> %off, <16 x bfloat> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbidxaddbf16:
+; CHECK: [tilespmem:v0+s0+$0x0 cbreg:$0x0] = vst.idx.cb.add.bf16.msk vm0, v1;
+define void @vstcbidxaddbf16(<8 x i1> %m, i32 %base, <8 x i32> %off, <16 x bfloat> %val) {
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.idx.add.v8i1.v16bf16(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ <8 x i32> %off, <16 x bfloat> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbidxaddbf16_np:
+; CHECK: [tilespmem:v0+s0+$0x0 cbreg:$0x0] = vst.idx.cb.add.bf16.msk vm0, v1;
+define void @vstcbidxaddbf16_np(<8 x i1> %m, i32 %base, <8 x i32> %off, <16 x bfloat> %val) {
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.idx.add.np.v8i1.v16bf16(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ <8 x i32> %off, <16 x bfloat> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbmskaddbf16:
+; CHECK: [tilespmem:s0+$0x0 cbreg:$0x0] = vst.cb.add.bf16.msk vm0, v0;
+define void @vstcbmskaddbf16(i32 %base, <16 x bfloat> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.add.v8i1.v16bf16(<8 x i1> %m, x86_mmx undef,
+ i32 %base, <16 x bfloat> %val)
+ ret void
+}
+
+; The circular-buffer base is an i32 word offset added directly (no GEP scaling), so 1024 = 0x400.
+; CHECK-LABEL: vstcbmskaddbf16_disp:
+; CHECK: [tilespmem:s0+$0x400 cbreg:$0x0] = vst.cb.add.bf16.msk vm0, v0;
+define void @vstcbmskaddbf16_disp(i32 %base, <16 x bfloat> %val, <8 x i1> %m) {
+entry:
+ %b = add i32 %base, 1024
+ tail call void @llvm.tpu.vst.cb.msk.add.v8i1.v16bf16(<8 x i1> %m, x86_mmx undef, i32 %b, <16 x bfloat> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbupdmskaddbf16:
+; CHECK: [tilespmem:s0+$0x0 cbreg:$0x0] = vst.cb.upd.add.bf16.msk vm0, v0;
+define void @vstcbupdmskaddbf16(i32 %base, <16 x bfloat> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.cb.upd.msk.add.v8i1.v16bf16(<8 x i1> %m, x86_mmx undef,
+ i32 %base, <16 x bfloat> %val)
+ ret void
+}
+
+; The circular-buffer base is an i32 word offset added directly (no GEP scaling), so 1024 = 0x400.
+; CHECK-LABEL: vstcbupdmskaddbf16_disp:
+; CHECK: [tilespmem:s0+$0x400 cbreg:$0x0] = vst.cb.upd.add.bf16.msk vm0, v0;
+define void @vstcbupdmskaddbf16_disp(i32 %base, <16 x bfloat> %val, <8 x i1> %m) {
+entry:
+ %b = add i32 %base, 1024
+ tail call void @llvm.tpu.vst.cb.upd.msk.add.v8i1.v16bf16(<8 x i1> %m, x86_mmx undef, i32 %b, <16 x bfloat> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbmskaddstridedbf16:
+; CHECK: [tilespmem:s0+$0x0 ss:$0x7d cbreg:$0x0] = vst.cb.add.bf16.msk vm0, v0
+; CHECK: [tilespmem:s0+$0x0 ss:s1 cbreg:$0x0] = vst.cb.add.bf16.msk vm0, v0
+define void @vstcbmskaddstridedbf16(i32 %base, <16 x bfloat> %val, <8 x i1> %m, i32 %ss) {
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.add.strided.v8i1.v16bf16(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ i32 125,
+ <16 x bfloat> %val)
+ tail call void @llvm.tpu.vst.cb.msk.add.strided.v8i1.v16bf16(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ i32 %ss,
+ <16 x bfloat> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbupdmskaddstridedbf16:
+; CHECK: [tilespmem:s0+$0x0 ss:$0x7d cbreg:$0x0] = vst.cb.upd.add.bf16.msk vm0, v0
+; CHECK: [tilespmem:s0+$0x0 ss:s1 cbreg:$0x0] = vst.cb.upd.add.bf16.msk vm0, v0
+define void @vstcbupdmskaddstridedbf16(i32 %base, <16 x bfloat> %val, <8 x i1> %m, i32 %ss) {
+entry:
+ tail call void @llvm.tpu.vst.cb.upd.msk.add.strided.v8i1.v16bf16(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ i32 125,
+ <16 x bfloat> %val)
+ tail call void @llvm.tpu.vst.cb.upd.msk.add.strided.v8i1.v16bf16(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ i32 %ss,
+ <16 x bfloat> %val)
+ ret void
+}
+
+; CHECK-LABEL: func_vld_vst_msk_idx_addbf16_np:
+; CHECK: [tilespmem:v0+s0+$0x0], v0 = vst.idx.ret.add.bf16.msk vm0, v1;
+define <16 x bfloat> @func_vld_vst_msk_idx_addbf16_np(<16 x bfloat> addrspace(201)* %base, <8 x i32> %idx, <16 x bfloat> %val, <8 x i1> %m) {
+ %r = tail call <16 x bfloat> @llvm.tpu.vst.msk.idx.ret.add.np.v16bf16.v8i1.p201v16bf16(<8 x i1> %m, <16 x bfloat> addrspace(201)* %base,
+ <8 x i32> %idx, <16 x bfloat> %val)
+ ret <16 x bfloat> %r
+}
+
+attributes #0 = { "implicit-section-name"=".text.tile_execute" "target-cpu"="sparsecore-tec-gl" }
+attributes #1 = { "implicit-section-name"=".text.tile_access" "target-cpu"="sparsecore-tac-gl" }
+attributes #2 = { "implicit-section-name"=".text.scs" "target-cpu"="sparsecore-scs-gl" }
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_bf16_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_bf16_sc.ll
new file mode 100644
index 0000000..6128972
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_bf16_sc.ll
@@ -0,0 +1,74 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false \
+; RUN: | FileCheck %s
+; RUN: llc < %s -mcpu=sparsecore-tec-gl -asm-verbose=false \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "googletpu"
+
+@a = external addrspace(201) global <16 x bfloat>, align 32
+
+declare void @llvm.tpu.vst.msk.v8i1.v32i8(<8 x i1>, <32 x i8> addrspace(201)*, <32 x i8>)
+declare <8 x i1> @llvm.tpu.16i1.to.8i1(<16 x i1>)
+declare <16 x bfloat> @llvm.tpu.vld.msk.idx.v16bf16.v8i1.p201v16bf16(<8 x i1>, <16 x bfloat> addrspace(201)*, <8 x i32>)
+declare <16 x bfloat> @llvm.tpu.vld.msk.v16bf16.v8i1.p201v16bf16(<8 x i1>, <16 x bfloat> addrspace(201)*)
+
+declare <16 x bfloat> addrspace(201)* @llvm.tpu.inttoptr.p201v16bf16(i32) nounwind
+
+; CHECK-LABEL: vldi_bf16:
+; CHECK: v0 = vld [tilespmem:s0+$0x0]
+define <16 x bfloat> @vldi_bf16(<16 x bfloat> addrspace(201)* %a) {
+ %b = load <16 x bfloat>, <16 x bfloat> addrspace(201)* %a
+ ret <16 x bfloat> %b
+}
+
+; CHECK-LABEL: vldi2_bf16:
+; CHECK: v0 = vld [tilespmem:s0+$0x8]
+define <16 x bfloat> @vldi2_bf16(<16 x bfloat> addrspace(201)* %a) {
+ %addr = getelementptr <16 x bfloat>, <16 x bfloat> addrspace(201)* %a, i32 1
+ %b = load <16 x bfloat>, <16 x bfloat> addrspace(201)* %addr
+ ret <16 x bfloat> %b
+}
+
+; CHECK-LABEL: load_bf16
+; CHECK: {{v[0-9]+}} = vld [tilespmem:a]
+define <16 x bfloat> @load_bf16() #0 {
+ %1 = load <16 x bfloat>, <16 x bfloat> addrspace(201)* @a, align 32
+ ret <16 x bfloat> %1
+}
+
+; CHECK-LABEL: vld_idx_bf16
+; CHECK: {{v[0-9]+}} = vld.idx.msk [tilespmem:{{v[0-9]+.*}}], vm0
+define <16 x bfloat> @vld_idx_bf16(<8 x i32> %idx, <8 x i1> %msk) #0 {
+ %1 = call <16 x bfloat> @llvm.tpu.vld.msk.idx.v16bf16.v8i1.p201v16bf16(<8 x i1> %msk, <16 x bfloat> addrspace(201)* @a, <8 x i32> %idx)
+ ret <16 x bfloat> %1
+}
+
+; CHECK-LABEL: vld_msk_immad_bf16:
+; CHECK: v0 = vld.msk [tilespmem:$0x64], vm0
+define <16 x bfloat> @vld_msk_immad_bf16(<16 x bfloat> addrspace(201)* %b, <8 x i1> %m) {
+ %a = call <16 x bfloat> addrspace(201)* @llvm.tpu.inttoptr.p201v16bf16(i32 100)
+ %r = tail call <16 x bfloat> @llvm.tpu.vld.msk.v16bf16.v8i1.p201v16bf16(<8 x i1> %m,
+ <16 x bfloat> addrspace(201)* %a)
+ ret <16 x bfloat> %r
+}
+
+; CHECK-LABEL: vld_msk_bf16
+; CHECK: {{v[0-9]+}} = vld.msk [tilespmem:{{.*}}], vm{{[0-9]+}};
+define <16 x bfloat> @vld_msk_bf16(<8 x i1> %msk) #0 {
+ %1 = call <16 x bfloat> @llvm.tpu.vld.msk.v16bf16.v8i1.p201v16bf16(<8 x i1> %msk, <16 x bfloat> addrspace(201)* @a)
+ ret <16 x bfloat> %1
+}
+
+; CHECK-LABEL: vstbf16:
+; CHECK: [tilespmem:s0+$0x8] = vst v0
+define void @vstbf16(<16 x bfloat> %a, <16 x bfloat> addrspace(201)* %b) {
+ %addr = getelementptr <16 x bfloat>, <16 x bfloat> addrspace(201)* %b, i32 1
+ store <16 x bfloat> %a, <16 x bfloat> addrspace(201)* %addr
+ ret void
+}
+
+attributes #0 = { "implicit-section-name"=".text.tile_execute" "target-cpu"="sparsecore-tec-gl" }
+attributes #1 = { "implicit-section-name"=".text.tile_access" "target-cpu"="sparsecore-tac-gl" }
+attributes #2 = { "implicit-section-name"=".text.scs" "target-cpu"="sparsecore-scs-gl" }
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_gf_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_gf_sc.ll
new file mode 100644
index 0000000..c6e265a
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_gf_sc.ll
@@ -0,0 +1,758 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-gf -asm-verbose=false -disable-cgp \
+; RUN: -tpu-allow-global-offset-for-test | FileCheck %s
+; RUN: llc < %s -mcpu=sparsecore-tec-gf -asm-verbose=false -disable-cgp \
+; RUN: -opaque-pointers -tpu-allow-global-offset-for-test | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+%struct.V = type { <16 x i32>, <16 x i32> }
+
+declare <16 x i32> @llvm.tpu.vld.msk.v16i32.v16i1.p201v16i32(<16 x i1>, <16 x i32> addrspace(201)*)
+declare <16 x i32> @llvm.tpu.vld.msk.idx.v16i32.v16i1.p201v16i32(<16 x i1>, <16 x i32> addrspace(201)*, <16 x i32>)
+declare <16 x float> @llvm.tpu.vld.msk.idx.v16f32.v16i1.p201v16f32(<16 x i1>, <16 x float> addrspace(201)*, <16 x i32>)
+declare void @llvm.tpu.vst.msk.idx.v16i1.p201v16i32.v16i32(<16 x i1>, <16 x i32> addrspace(201)*, <16 x i32>, <16 x i32>) #1
+declare void @llvm.tpu.vst.msk.idx.v16i1.p201v16f32.v16f32(<16 x i1>, <16 x float> addrspace(201)*, <16 x i32>, <16 x float>) #1
+declare void @llvm.tpu.vst.msk.v16i1.p201v16i32.v16i32(<16 x i1>, <16 x i32> addrspace(201)*, <16 x i32>) argmemonly nounwind
+declare void @llvm.tpu.vst.msk.idx.add.v16i1.p201v16i32.v16i32(<16 x i1>, <16 x i32> addrspace(201)*, <16 x i32>, <16 x i32>)
+declare void @llvm.tpu.vst.msk.idx.add.v16i1.p201v16f32.v16f32(<16 x i1>, <16 x float> addrspace(201)*, <16 x i32>, <16 x float>)
+declare void @llvm.tpu.vst.msk.idx.add.np.v16i1.p201v16i32.v16i32(<16 x i1>, <16 x i32> addrspace(201)*, <16 x i32>, <16 x i32>)
+declare void @llvm.tpu.vst.msk.idx.add.np.v16i1.p201v16f32.v16f32(<16 x i1>, <16 x float> addrspace(201)*, <16 x i32>, <16 x float>)
+declare <16 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v16i32(i32) nounwind
+declare <16 x float> addrspace(201)* @llvm.tpu.inttoptr.p201v16float(i32) nounwind
+declare <16 x i32> @llvm.tpu.vld.cb.msk.v16i32.v16i1(<16 x i1>, x86_mmx, i32)
+declare <16 x i32> @llvm.tpu.vld.cb.msk.idx.v16i32.v16i1(<16 x i1>, x86_mmx, i32, <16 x i32>)
+declare <16 x float> @llvm.tpu.vld.cb.msk.idx.v16f32.v16i1(<16 x i1>, x86_mmx, i32, <16 x i32>)
+declare <16 x i32> @llvm.tpu.vld.cb.msk.idx.np.v16i32.v16i1(<16 x i1>, x86_mmx, i32, <16 x i32>)
+declare <16 x float> @llvm.tpu.vld.cb.msk.idx.np.v16f32.v16i1(<16 x i1>, x86_mmx, i32, <16 x i32>)
+declare void @llvm.tpu.vst.cb.msk.v16i1.v16i32(<16 x i1>, x86_mmx, i32, <16 x i32>)
+declare void @llvm.tpu.vst.cb.msk.idx.v16i1.v16i32(<16 x i1>, x86_mmx, i32, <16 x i32>, <16 x i32>)
+declare void @llvm.tpu.vst.cb.msk.idx.v16i1.v16f32(<16 x i1>, x86_mmx, i32, <16 x i32>, <16 x float>)
+declare void @llvm.tpu.vst.cb.msk.idx.add.v16i1.v16i32(<16 x i1>, x86_mmx, i32, <16 x i32>, <16 x i32>)
+declare void @llvm.tpu.vst.cb.msk.idx.add.v16i1.v16f32(<16 x i1>, x86_mmx, i32, <16 x i32>, <16 x float>)
+declare void @llvm.tpu.vst.cb.msk.idx.add.np.v16i1.v16i32(<16 x i1>, x86_mmx, i32, <16 x i32>, <16 x i32>)
+declare void @llvm.tpu.vst.cb.msk.idx.add.np.v16i1.v16f32(<16 x i1>, x86_mmx, i32, <16 x i32>, <16 x float>)
+declare <16 x i32> @llvm.tpu.vld.cb.upd.msk.v16i32.v16i1(<16 x i1>, x86_mmx, i32)
+declare void @llvm.tpu.vst.cb.upd.msk.v16i1.v16i32(<16 x i1>, x86_mmx, i32, <16 x i32>)
+declare <16 x i32> @llvm.tpu.vld.msk.strided.v16i32.p201v16i32.v16i1(<16 x i1>, <16 x i32> addrspace(201)*, i32)
+declare void @llvm.tpu.vst.msk.strided.v16i1.p201v16i32.v16i32(<16 x i1>, <16 x i32> addrspace(201)*, i32, <16 x i32>)
+declare void @llvm.tpu.vst.msk.add.strided.v16i1.p201v16i32.v16i32(<16 x i1>, <16 x i32> addrspace(201)*, i32, <16 x i32>)
+declare void @llvm.tpu.vst.msk.add.strided.v16i1.p201v16f32.v16f32(<16 x i1>, <16 x float> addrspace(201)*, i32, <16 x float>)
+declare <16 x i32> @llvm.tpu.vld.msk.idx.strided.v16i32.v16i1.p201v16i32(<16 x i1>, <16 x i32> addrspace(201)*, <16 x i32>, i32)
+declare <16 x float> @llvm.tpu.vld.msk.idx.strided.v16f32.v16i1.p201v16f32(<16 x i1>, <16 x float> addrspace(201)*, <16 x i32>, i32)
+declare void @llvm.tpu.vst.msk.idx.strided.v16i1.p201v16i32.v16i32(<16 x i1>, <16 x i32> addrspace(201)*, <16 x i32>, i32, <16 x i32>)
+declare void @llvm.tpu.vst.msk.idx.strided.v16i1.p201v16f32.v16f32(<16 x i1>, <16 x float> addrspace(201)*, <16 x i32>, i32, <16 x float>)
+
+attributes #1 = { argmemonly nounwind }
+
+@garr = addrspace(201) global <16 x i32> zeroinitializer
+@garrf = addrspace(201) global <16 x float> zeroinitializer
+
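+; A <16 x i32> vector is 16 words, so GEP displacements below are scaled by
+; 16 (0x10) words per vector (e.g. a GEP of 1 becomes +$0x10 and a GEP of 100
+; becomes +$0x640).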
+; CHECK-LABEL: vldi:
+; CHECK: v0 = vld [tilespmem:s0+$0x0]
+define <16 x i32> @vldi(<16 x i32> addrspace(201)* %a) {
+ %b = load <16 x i32>, <16 x i32> addrspace(201)* %a
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vldi2:
+; CHECK: v0 = vld [tilespmem:s0+$0x10]
+define <16 x i32> @vldi2(<16 x i32> addrspace(201)* %a) {
+ %addr = getelementptr <16 x i32>, <16 x i32> addrspace(201)* %a, i32 1
+ %b = load <16 x i32>, <16 x i32> addrspace(201)* %addr
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vld_immad:
+; CHECK: v0 = vld [tilespmem:$0x10]
+define <16 x i32> @vld_immad(<16 x i32> %a) {
+ %addr = call <16 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v16i32(i32 16)
+ %b = load <16 x i32>, <16 x i32> addrspace(201)* %addr
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vld_immadf:
+; CHECK: v0 = vld [tilespmem:$0x10]
+define <16 x float> @vld_immadf(<16 x float> %a) {
+ %addr = call <16 x float> addrspace(201)* @llvm.tpu.inttoptr.p201v16float(i32 16)
+ %b = load <16 x float>, <16 x float> addrspace(201)* %addr
+ ret <16 x float> %b
+}
+
+; CHECK-LABEL: vld_msk_immad:
+; CHECK: v0 = vld.msk [tilespmem:$0x64], vm0
+define <16 x i32> @vld_msk_immad(<16 x i32> addrspace(201)* %b, <16 x i1> %m) {
+ %a = call <16 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v16i32(i32 100)
+ %r = tail call <16 x i32> @llvm.tpu.vld.msk.v16i32.v16i1.p201v16i32(<16 x i1> %m,
+ <16 x i32> addrspace(201)* %a)
+ ret <16 x i32> %r
+}
+
+; CHECK-LABEL: vld_msk_disp:
+; CHECK: v0 = vld.msk [tilespmem:s0+$0x640], vm0
+define <16 x i32> @vld_msk_disp(<16 x i32> addrspace(201)* %b, <16 x i1> %m) {
+ %a = getelementptr inbounds <16 x i32>, <16 x i32> addrspace(201)* %b, i32 100
+ %r = tail call <16 x i32> @llvm.tpu.vld.msk.v16i32.v16i1.p201v16i32(<16 x i1> %m,
+ <16 x i32> addrspace(201)* %a)
+ ret <16 x i32> %r
+}
+
+; CHECK-LABEL: vld_msk_idxi:
+; CHECK: v0 = vld.idx.msk [tilespmem:v0+s0+$0x0], vm0;
+define <16 x i32> @vld_msk_idxi(<16 x i32> addrspace(201)* %b, <16 x i32> %off, <16 x i1> %m) {
+entry:
+ %0 = tail call <16 x i32> @llvm.tpu.vld.msk.idx.v16i32.v16i1.p201v16i32(<16 x i1> %m, <16 x i32> addrspace(201)* %b, <16 x i32> %off)
+ ret <16 x i32> %0
+}
+
+; CHECK-LABEL: vld_msk_idxi_disp:
+; CHECK: v0 = vld.idx.msk [tilespmem:v0+s0+$0x800], vm0;
+define <16 x i32> @vld_msk_idxi_disp(<16 x i32> addrspace(201)* %b, <16 x i32> %off, <16 x i1> %m) {
+entry:
+ %a = getelementptr inbounds <16 x i32>, <16 x i32> addrspace(201)* %b, i32 128
+ %0 = tail call <16 x i32> @llvm.tpu.vld.msk.idx.v16i32.v16i1.p201v16i32(<16 x i1> %m, <16 x i32> addrspace(201)* %a, <16 x i32> %off)
+ ret <16 x i32> %0
+}
+
+; CHECK-LABEL: vld_msk_idxf:
+; CHECK: v0 = vld.idx.msk [tilespmem:v0+s0+$0x0], vm0;
+define <16 x float> @vld_msk_idxf(<16 x float> addrspace(201)* %b, <16 x i32> %off, <16 x i1> %m) {
+entry:
+ %0 = tail call <16 x float> @llvm.tpu.vld.msk.idx.v16f32.v16i1.p201v16f32(<16 x i1> %m, <16 x float> addrspace(201)* %b, <16 x i32> %off)
+ ret <16 x float> %0
+}
+
+; CHECK-LABEL: vld_msk_idxf_disp:
+; CHECK: v0 = vld.idx.msk [tilespmem:v0+s0+$0x800], vm0;
+define <16 x float> @vld_msk_idxf_disp(<16 x float> addrspace(201)* %b, <16 x i32> %off, <16 x i1> %m) {
+entry:
+ %a = getelementptr inbounds <16 x float>, <16 x float> addrspace(201)* %b, i32 128
+ %0 = tail call <16 x float> @llvm.tpu.vld.msk.idx.v16f32.v16i1.p201v16f32(<16 x i1> %m, <16 x float> addrspace(201)* %a, <16 x i32> %off)
+ ret <16 x float> %0
+}
+
+; CHECK-LABEL: vldcbmsk_disp:
+; CHECK: v0 = vld.cb.msk [tilespmem:s0+$0x10 cbreg:$0x0], vm0;
+define <16 x i32> @vldcbmsk_disp(i32 %base, <16 x i1> %m) {
+entry:
+ %a = add i32 %base, 16
+ %0 = tail call <16 x i32> @llvm.tpu.vld.cb.msk.v16i32.v16i1(<16 x i1> %m,
+ x86_mmx undef,
+ i32 %a)
+ ret <16 x i32> %0
+}
+
+; CHECK-LABEL: vldcbidxmski_disp:
+; CHECK: v0 = vld.idx.cb.msk [tilespmem:v0+s0+$0x80 cbreg:$0x0], vm0;
+define <16 x i32> @vldcbidxmski_disp(i32 %base, <16 x i32> %off, <16 x i1> %m) {
+entry:
+ %a = add i32 %base, 128
+ %0 = tail call <16 x i32> @llvm.tpu.vld.cb.msk.idx.v16i32.v16i1(<16 x i1> %m,
+ x86_mmx undef,
+ i32 %a,
+ <16 x i32> %off)
+ ret <16 x i32> %0
+}
+
+; CHECK-LABEL: vldcbidxmskf_disp:
+; CHECK: v0 = vld.idx.cb.msk [tilespmem:v0+s0+$0x100 cbreg:$0x0], vm0;
+define <16 x float> @vldcbidxmskf_disp(i32 %base, <16 x i32> %off, <16 x i1> %m) {
+entry:
+ %a = add i32 %base, 256
+ %0 = tail call <16 x float> @llvm.tpu.vld.cb.msk.idx.v16f32.v16i1(<16 x i1> %m,
+ x86_mmx undef,
+ i32 %a,
+ <16 x i32> %off)
+ ret <16 x float> %0
+}
+
+; CHECK-LABEL: vldcbidxmski_np_disp:
+; CHECK: v0 = vld.idx.cb.msk [tilespmem:v0+s0+$0x800 cbreg:$0x0], vm0;
+define <16 x i32> @vldcbidxmski_np_disp(i32 %base, <16 x i32> %off, <16 x i1> %m) {
+entry:
+ %a = add i32 %base, 2048
+ %0 = tail call <16 x i32> @llvm.tpu.vld.cb.msk.idx.np.v16i32.v16i1(<16 x i1> %m,
+ x86_mmx undef,
+ i32 %a,
+ <16 x i32> %off)
+ ret <16 x i32> %0
+}
+
+; CHECK-LABEL: vldcbidxmskf_np_disp:
+; CHECK: v0 = vld.idx.cb.msk [tilespmem:v0+s0+$0x1000 cbreg:$0x0], vm0;
+define <16 x float> @vldcbidxmskf_np_disp(i32 %base, <16 x i32> %off, <16 x i1> %m) {
+entry:
+ %a = add i32 %base, 4096
+ %0 = tail call <16 x float> @llvm.tpu.vld.cb.msk.idx.np.v16f32.v16i1(<16 x i1> %m,
+ x86_mmx undef,
+ i32 %a,
+ <16 x i32> %off)
+ ret <16 x float> %0
+}
+
+; CHECK-LABEL: vstcbmsk_disp:
+; CHECK: [tilespmem:s0+$0x2000 cbreg:$0x0] = vst.cb.msk vm0, v0;
+define void @vstcbmsk_disp(i32 %base, <16 x i32> %val, <16 x i1> %m) {
+entry:
+ %a = add i32 %base, 8192
+ tail call void @llvm.tpu.vst.cb.msk.v16i1.v16i32(<16 x i1> %m,
+ x86_mmx undef,
+ i32 %a,
+ <16 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbidxmski_disp:
+; CHECK: [tilespmem:v0+s0+$0x4000 cbreg:$0x0] = vst.idx.cb.msk vm0, v1;
+define void @vstcbidxmski_disp(i32 %base, <16 x i32> %off, <16 x i32> %val, <16 x i1> %m) {
+entry:
+ %a = add i32 %base, 16384
+ tail call void @llvm.tpu.vst.cb.msk.idx.v16i1.v16i32(<16 x i1> %m,
+ x86_mmx undef,
+ i32 %a,
+ <16 x i32> %off, <16 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbidxmskf_disp:
+; CHECK: [tilespmem:v0+s0+$0x8000 cbreg:$0x0] = vst.idx.cb.msk vm0, v1;
+define void @vstcbidxmskf_disp(i32 %base, <16 x i32> %off, <16 x float> %val, <16 x i1> %m) {
+entry:
+ %a = add i32 %base, 32768
+ tail call void @llvm.tpu.vst.cb.msk.idx.v16i1.v16f32(<16 x i1> %m,
+ x86_mmx undef,
+ i32 %a,
+ <16 x i32> %off, <16 x float> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbidxaddi_disp:
+; CHECK: [tilespmem:v0+s0+$0x40000 cbreg:$0x0] = vst.idx.cb.add.s32.msk vm0, v1;
+define void @vstcbidxaddi_disp(<16 x i1> %m, i32 %base, <16 x i32> %off, <16 x i32> %val) {
+entry:
+ %a = add i32 %base, 262144
+ tail call void @llvm.tpu.vst.cb.msk.idx.add.v16i1.v16i32(<16 x i1> %m,
+ x86_mmx undef,
+ i32 %a,
+ <16 x i32> %off, <16 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbidxaddf_disp:
+; CHECK: [tilespmem:v0+s0+$0x4 cbreg:$0x0] = vst.idx.cb.add.f32.msk vm0, v1;
+define void @vstcbidxaddf_disp(<16 x i1> %m, i32 %base, <16 x i32> %off, <16 x float> %val) {
+entry:
+ %a = add i32 %base, 4
+ tail call void @llvm.tpu.vst.cb.msk.idx.add.v16i1.v16f32(<16 x i1> %m,
+ x86_mmx undef,
+ i32 %a,
+ <16 x i32> %off, <16 x float> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbidxaddi_np_disp:
+; CHECK: [tilespmem:v0+s0+$0x8 cbreg:$0x0] = vst.idx.cb.add.s32.msk vm0, v1;
+define void @vstcbidxaddi_np_disp(<16 x i1> %m, i32 %base, <16 x i32> %off, <16 x i32> %val) {
+entry:
+ %a = add i32 %base, 8
+ tail call void @llvm.tpu.vst.cb.msk.idx.add.np.v16i1.v16i32(<16 x i1> %m,
+ x86_mmx undef,
+ i32 %a,
+ <16 x i32> %off, <16 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbidxaddf_np_disp:
+; CHECK: [tilespmem:v0+s0+$0x10 cbreg:$0x0] = vst.idx.cb.add.f32.msk vm0, v1;
+define void @vstcbidxaddf_np_disp(<16 x i1> %m, i32 %base, <16 x i32> %off, <16 x float> %val) {
+entry:
+ %a = add i32 %base, 16
+ tail call void @llvm.tpu.vst.cb.msk.idx.add.np.v16i1.v16f32(<16 x i1> %m,
+ x86_mmx undef,
+ i32 %a,
+ <16 x i32> %off, <16 x float> %val)
+ ret void
+}
+
+; CHECK-LABEL: vldcbupdmsk_disp:
+; CHECK: v0 = vld.cb.upd.msk [tilespmem:s0+$0x20 cbreg:$0x0], vm0;
+define <16 x i32> @vldcbupdmsk_disp(i32 %base, <16 x i1> %m) {
+entry:
+ %a = add i32 %base, 32
+ %0 = tail call <16 x i32> @llvm.tpu.vld.cb.upd.msk.v16i32.v16i1(<16 x i1> %m,
+ x86_mmx undef,
+ i32 %a)
+ ret <16 x i32> %0
+}
+
+; CHECK-LABEL: vstcbupdmsk_disp:
+; CHECK: [tilespmem:s0+$0x40 cbreg:$0x0] = vst.cb.upd.msk vm0, v0;
+define void @vstcbupdmsk_disp(i32 %base, <16 x i32> %val, <16 x i1> %m) {
+entry:
+ %a = add i32 %base, 64
+ tail call void @llvm.tpu.vst.cb.upd.msk.v16i1.v16i32(<16 x i1> %m,
+ x86_mmx undef,
+ i32 %a,
+ <16 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vsti:
+; CHECK: [tilespmem:s0+$0x0] = vst v0
+define void @vsti(<16 x i32> %a, <16 x i32> addrspace(201)* %b) {
+ store <16 x i32> %a, <16 x i32> addrspace(201)* %b
+ ret void
+}
+
+; CHECK-LABEL: vsti2:
+; CHECK: [tilespmem:s0+$0x10] = vst v0
+define void @vsti2(<16 x i32> %a, <16 x i32> addrspace(201)* %b) {
+ %addr = getelementptr <16 x i32>, <16 x i32> addrspace(201)* %b, i32 1
+ store <16 x i32> %a, <16 x i32> addrspace(201)* %addr
+ ret void
+}
+
+; CHECK-LABEL: vstf2:
+; CHECK: [tilespmem:s0+$0x10] = vst v0
+define void @vstf2(<16 x float> %a, <16 x float> addrspace(201)* %b) {
+ %addr = getelementptr <16 x float>, <16 x float> addrspace(201)* %b, i32 1
+ store <16 x float> %a, <16 x float> addrspace(201)* %addr
+ ret void
+}
+
+; CHECK-LABEL: vst_gep:
+; CHECK: s0 = sadd.s32 s1, s0
+; CHECK: [tilespmem:s0+$0x0] = vst v0
+define void @vst_gep(<16 x float> %a, <16 x float> addrspace(201)* %b, i32 %c) {
+ %addr = getelementptr <16 x float>, <16 x float> addrspace(201)* %b, i32 %c
+ store <16 x float> %a, <16 x float> addrspace(201)* %addr
+ ret void
+}
+
+; CHECK-LABEL: vld_struct:
+; CHECK: v0 = vld [tilespmem:s0+$0x10]
+define <16 x i32> @vld_struct(%struct.V addrspace(201)* %v) {
+ %p = getelementptr inbounds %struct.V, %struct.V addrspace(201)* %v, i32 0, i32 1
+ %l = load <16 x i32>, <16 x i32> addrspace(201)* %p
+ ret <16 x i32> %l
+}
+
+; CHECK-LABEL: vst_global:
+; CHECK: [tilespmem:garr+32] = vst v0
+define void @vst_global(<16 x i32> %a) {
+ %addr = getelementptr <16 x i32>, <16 x i32> addrspace(201)* @garr, i32 2
+ store <16 x i32> %a, <16 x i32> addrspace(201)* %addr
+ ret void
+}
+
+; CHECK-LABEL: vst_globalf:
+; CHECK: [tilespmem:garrf+32] = vst v0
+define void @vst_globalf(<16 x float> %a) {
+ %addr = getelementptr <16 x float>, <16 x float> addrspace(201)* @garrf, i32 2
+ store <16 x float> %a, <16 x float> addrspace(201)* %addr
+ ret void
+}
+
+; CHECK-LABEL: vst_immad:
+; CHECK: [tilespmem:$0x10] = vst v0
+define void @vst_immad(<16 x i32> %a) {
+ %addr = call <16 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v16i32(i32 16)
+ store <16 x i32> %a, <16 x i32> addrspace(201)* %addr
+ ret void
+}
+
+; CHECK-LABEL: vst_immadf:
+; CHECK: [tilespmem:$0x10] = vst v0
+define void @vst_immadf(<16 x float> %a) {
+ %addr = call <16 x float> addrspace(201)* @llvm.tpu.inttoptr.p201v16float(i32 16)
+ store <16 x float> %a, <16 x float> addrspace(201)* %addr
+ ret void
+}
+
+; CHECK-LABEL: vst_msk_idxi:
+; CHECK: [tilespmem:v0+s0+$0x0] = vst.idx.msk vm0, v1
+define void @vst_msk_idxi(<16 x i1> %m, <16 x i32> %offs, <16 x i32> %v) {
+ tail call void @llvm.tpu.vst.msk.idx.v16i1.p201v16i32.v16i32(<16 x i1> %m,
+ <16 x i32> addrspace(201)* @garr,
+ <16 x i32> %offs,
+ <16 x i32> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vst_msk_idxi_disp:
+; CHECK: [tilespmem:v0+s0+$0x400] = vst.idx.msk vm0, v1
+define void @vst_msk_idxi_disp(<16 x i32> addrspace(201)* %b, <16 x i1> %m, <16 x i32> %offs, <16 x i32> %v) {
+ %a = getelementptr inbounds <16 x i32>, <16 x i32> addrspace(201)* %b, i32 64
+ tail call void @llvm.tpu.vst.msk.idx.v16i1.p201v16i32.v16i32(<16 x i1> %m,
+ <16 x i32> addrspace(201)* %a,
+ <16 x i32> %offs,
+ <16 x i32> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vst_msk_idxf:
+; CHECK: [tilespmem:v0+s0+$0x0] = vst.idx.msk vm0, v1
+define void @vst_msk_idxf(<16 x i1> %m, <16 x i32> %offs, <16 x float> %v) {
+ tail call void @llvm.tpu.vst.msk.idx.v16i1.p201v16f32.v16f32(<16 x i1> %m,
+ <16 x float> addrspace(201)* @garrf,
+ <16 x i32> %offs,
+ <16 x float> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vst_msk_idxf_disp:
+; CHECK: [tilespmem:v0+s0+$0x400] = vst.idx.msk vm0, v1
+define void @vst_msk_idxf_disp(<16 x float> addrspace(201)* %b, <16 x i1> %m, <16 x i32> %offs, <16 x float> %v) {
+ %a = getelementptr inbounds <16 x float>, <16 x float> addrspace(201)* %b, i32 64
+ tail call void @llvm.tpu.vst.msk.idx.v16i1.p201v16f32.v16f32(<16 x i1> %m,
+ <16 x float> addrspace(201)* %a,
+ <16 x i32> %offs,
+ <16 x float> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vst_msk_idx_addi:
+; CHECK: [tilespmem:v0+s0+$0x0] = vst.idx.add.s32.msk vm0, v1
+define void @vst_msk_idx_addi(<16 x i1> %m, <16 x i32> %offs, <16 x i32> %v) {
+ tail call void @llvm.tpu.vst.msk.idx.add.v16i1.p201v16i32.v16i32(<16 x i1> %m,
+ <16 x i32> addrspace(201)* @garr,
+ <16 x i32> %offs,
+ <16 x i32> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vst_msk_idx_addi_disp:
+; CHECK: [tilespmem:v0+s0+$0x400] = vst.idx.add.s32.msk vm0, v1
+define void @vst_msk_idx_addi_disp(<16 x i32> addrspace(201)* %b, <16 x i1> %m, <16 x i32> %offs, <16 x i32> %v) {
+ %a = getelementptr inbounds <16 x i32>, <16 x i32> addrspace(201)* %b, i32 64
+ tail call void @llvm.tpu.vst.msk.idx.add.v16i1.p201v16i32.v16i32(<16 x i1> %m,
+ <16 x i32> addrspace(201)* %a,
+ <16 x i32> %offs,
+ <16 x i32> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vst_msk_idx_addi_np_disp:
+; CHECK: [tilespmem:v0+s0+$0x400] = vst.idx.add.s32.msk vm0, v1
+define void @vst_msk_idx_addi_np_disp(<16 x i32> addrspace(201)* %b, <16 x i1> %m, <16 x i32> %offs, <16 x i32> %v) {
+ %a = getelementptr inbounds <16 x i32>, <16 x i32> addrspace(201)* %b, i32 64
+ tail call void @llvm.tpu.vst.msk.idx.add.np.v16i1.p201v16i32.v16i32(<16 x i1> %m,
+ <16 x i32> addrspace(201)* %a,
+ <16 x i32> %offs,
+ <16 x i32> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vst_msk_idx_addf:
+; CHECK: [tilespmem:v0+s0+$0x0] = vst.idx.add.f32.msk vm0, v1
+define void @vst_msk_idx_addf(<16 x i1> %m, <16 x i32> %offs, <16 x float> %v) {
+ tail call void @llvm.tpu.vst.msk.idx.add.v16i1.p201v16f32.v16f32(<16 x i1> %m,
+ <16 x float> addrspace(201)* @garrf,
+ <16 x i32> %offs,
+ <16 x float> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vst_msk_idx_addf_disp:
+; CHECK: [tilespmem:v0+s0+$0x400] = vst.idx.add.f32.msk vm0, v1
+define void @vst_msk_idx_addf_disp(<16 x float> addrspace(201)* %b, <16 x i1> %m, <16 x i32> %offs, <16 x float> %v) {
+ %a = getelementptr inbounds <16 x float>, <16 x float> addrspace(201)* %b, i32 64
+ tail call void @llvm.tpu.vst.msk.idx.add.v16i1.p201v16f32.v16f32(<16 x i1> %m,
+ <16 x float> addrspace(201)* %a,
+ <16 x i32> %offs,
+ <16 x float> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vst_msk_idx_addf_np_disp:
+; CHECK: [tilespmem:v0+s0+$0x400] = vst.idx.add.f32.msk vm0, v1
+define void @vst_msk_idx_addf_np_disp(<16 x float> addrspace(201)* %b, <16 x i1> %m, <16 x i32> %offs, <16 x float> %v) {
+ %a = getelementptr inbounds <16 x float>, <16 x float> addrspace(201)* %b, i32 64
+ tail call void @llvm.tpu.vst.msk.idx.add.np.v16i1.p201v16f32.v16f32(<16 x i1> %m,
+ <16 x float> addrspace(201)* %a,
+ <16 x i32> %offs,
+ <16 x float> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vst_msk:
+; CHECK: [tilespmem:s0+$0x0] = vst.msk vm0, v0
+define void @vst_msk(<16 x i1> %m, <16 x i32> %v) {
+ tail call void @llvm.tpu.vst.msk.v16i1.p201v16i32.v16i32(<16 x i1> %m,
+ <16 x i32> addrspace(201)* @garr,
+ <16 x i32> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vst_msk_disp:
+; CHECK: [tilespmem:s0+$0x200] = vst.msk vm0, v0
+define void @vst_msk_disp(<16 x i32> addrspace(201)* %b,
+ <16 x i1> %m, <16 x i32> %v) {
+ %a = getelementptr inbounds <16 x i32>, <16 x i32> addrspace(201)* %b, i32 32
+ tail call void @llvm.tpu.vst.msk.v16i1.p201v16i32.v16i32(<16 x i1> %m,
+ <16 x i32> addrspace(201)* %a,
+ <16 x i32> %v)
+ ret void
+}
+
+; TODO(hgreving): we only support 19-bit immediates right now because we're
+; conflating signed and unsigned immediates. See comment in TPUInstrFormat.td.
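+; Worked example (assuming 16 words per <16 x i32> and a 19-bit displacement
+; limit of 0x7ffff): a GEP of 8388608 vectors scales to 8388608 * 16 =
+; 0x8000000 words, which does not fit and needs a separate sadd.s32, while a
+; GEP of 32767 vectors scales to 32767 * 16 = 0x7fff0 words and folds into
+; the vld/vst displacement.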
+
+; CHECK-LABEL: vld_32bitimm_0
+; CHECK: { s[[s:[0-9]+]] = sadd.s32 $0x8000000, s{{[0-9]+}}
+; CHECK-NEXT: { v{{[0-9]+}} = vld [tilespmem:s[[s]]+$0x0]
+define <16 x i32> @vld_32bitimm_0(<16 x i32> addrspace(201)* %a) {
+ %addr = getelementptr <16 x i32>, <16 x i32> addrspace(201)* %a, i32 8388608
+ %b = load <16 x i32>, <16 x i32> addrspace(201)* %addr
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vld_32bitimm_1
+; CHECK: { s[[s:[0-9]+]] = simm.s32 $0xffffff
+; CHECK-NEXT: { v{{[0-9]+}} = vld [tilespmem:s[[s]]+$0x0]
+define <16 x i32> @vld_32bitimm_1(<16 x i32> addrspace(201)* %a) {
+ %b = load <16 x i32>, <16 x i32> addrspace(201)* inttoptr (i32 16777215 to <16 x i32> addrspace(201)*)
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vld_19bitimm_0
+; CHECK: { v{{[0-9]+}} = vld [tilespmem:s{{[0-9]+}}+$0x7fff0]
+define <16 x i32> @vld_19bitimm_0(<16 x i32> addrspace(201)* %a) {
+ %addr = getelementptr <16 x i32>, <16 x i32> addrspace(201)* %a, i32 32767
+ %b = load <16 x i32>, <16 x i32> addrspace(201)* %addr
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vld_19bitimm_1
+; CHECK: { v{{[0-9]+}} = vld [tilespmem:$0x7ffff]
+define <16 x i32> @vld_19bitimm_1(<16 x i32> addrspace(201)* %a) {
+ %b = load <16 x i32>, <16 x i32> addrspace(201)* inttoptr (i32 524287 to <16 x i32> addrspace(201)*)
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vst_32bitimm_0
+; CHECK: { s[[s:[0-9]+]] = sadd.s32 $0x7fffff0, s{{[0-9]+}}
+; CHECK-NEXT: { [tilespmem:s[[s]]+$0x0] = vst
+define void @vst_32bitimm_0(<16 x i32> addrspace(201)* %a, <16 x i32> %b) {
+ %addr = getelementptr <16 x i32>, <16 x i32> addrspace(201)* %a, i32 8388607
+ store <16 x i32> %b, <16 x i32> addrspace(201)* %addr
+ ret void
+}
+
+; CHECK-LABEL: vst_32bitimm_1
+; CHECK: { s[[s:[0-9]+]] = simm.s32 $0xffffff
+; CHECK-NEXT: { [tilespmem:s[[s]]+$0x0] = vst
+define void @vst_32bitimm_1(<16 x i32> addrspace(201)* %a, <16 x i32> %b) {
+ store <16 x i32> %b, <16 x i32> addrspace(201)* inttoptr (i32 16777215 to <16 x i32> addrspace(201)*)
+ ret void
+}
+
+; CHECK-LABEL: vst_19bitimm_0
+; CHECK: { [tilespmem:s{{[0-9]+}}+$0x7fff0] = vst
+define void @vst_19bitimm_0(<16 x i32> addrspace(201)* %a, <16 x i32> %b) {
+ %addr = getelementptr <16 x i32>, <16 x i32> addrspace(201)* %a, i32 32767
+ store <16 x i32> %b, <16 x i32> addrspace(201)* %addr
+ ret void
+}
+
+; CHECK-LABEL: vst_19bitimm_1
+; CHECK: { [tilespmem:$0x7ffff] = vst
+define void @vst_19bitimm_1(<16 x i32> addrspace(201)* %a, <16 x i32> %b) {
+ store <16 x i32> %b, <16 x i32> addrspace(201)* inttoptr (i32 524287 to <16 x i32> addrspace(201)*)
+ ret void
+}
+
+; We provide these patterns right now, though we may decide to pass
+; -disable-vector-combine in the future.
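+; In the tests below, a scalar extract from a vector load is kept as a full
+; vld followed by a vpush/spop pair that moves the selected lane into a
+; scalar register.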
+
+; CHECK-LABEL: svldi:
+; CHECK: v0 = vld [tilespmem:$0x8]
+; CHECK: (v2sf) = vpush v0, $0x3
+; CHECK: s0 = spop (v2sf)
+define i32 @svldi(<16 x i32> addrspace(201)* %a) {
+ %v = load <16 x i32>, <16 x i32> addrspace(201)* inttoptr (i32 8 to <16 x i32> addrspace(201)*)
+ %b = extractelement <16 x i32> %v, i32 3
+ ret i32 %b
+}
+
+; CHECK-LABEL: svlda:
+; CHECK: v0 = vld [tilespmem:s0+$0x0]
+; CHECK: (v2sf) = vpush v0, $0x1
+; CHECK: s0 = spop (v2sf)
+define i32 @svlda(<16 x i32> addrspace(201)* %a) {
+ %addr = getelementptr <16 x i32>, <16 x i32> addrspace(201)* %a, i32 0
+ %v = load <16 x i32>, <16 x i32> addrspace(201)* %addr
+ %b = extractelement <16 x i32> %v, i32 1
+ ret i32 %b
+}
+
+; CHECK-LABEL: svldfi:
+; CHECK: v0 = vld [tilespmem:$0x8]
+; CHECK: (v2sf) = vpush v0, $0x3
+; CHECK: s0 = spop (v2sf)
+define float @svldfi(<16 x float> addrspace(201)* %a) {
+ %v = load <16 x float>, <16 x float> addrspace(201)* inttoptr (i32 8 to <16 x float> addrspace(201)*)
+ %b = extractelement <16 x float> %v, i32 3
+ ret float %b
+}
+
+; CHECK-LABEL: svldfa:
+; CHECK: v0 = vld [tilespmem:s0+$0x0]
+; CHECK: (v2sf) = vpush v0, $0x1
+; CHECK: s0 = spop (v2sf)
+define float @svldfa(<16 x float> addrspace(201)* %a) {
+ %addr = getelementptr <16 x float>, <16 x float> addrspace(201)* %a, i32 0
+ %v = load <16 x float>, <16 x float> addrspace(201)* %addr
+ %b = extractelement <16 x float> %v, i32 1
+ ret float %b
+}
+
+; CHECK-LABEL: svlds:
+; CHECK: v0 = vld [tilespmem:s0+$0x10]
+; CHECK: (v2sf) = vpush v0, $0x1
+; CHECK: s0 = spop (v2sf)
+define i32 @svlds(%struct.V addrspace(201)* %s) {
+ %p = getelementptr inbounds %struct.V, %struct.V addrspace(201)* %s, i32 0, i32 1
+ %v = load <16 x i32>, <16 x i32> addrspace(201)* %p
+ %b = extractelement <16 x i32> %v, i32 1
+ ret i32 %b
+}
+
+; CHECK-LABEL: scalarized_gep:
+; CHECK: s0 = sadd.s32 s1, s0
+; CHECK: v0 = vld [tilespmem:s0+$0x0]
+; CHECK: (v2sf) = vpush v0, $0x0
+; CHECK: s0 = spop (v2sf)
+define i32 @scalarized_gep(i32 addrspace(201)* %a, i32 %b) {
+ %1 = getelementptr i32, i32 addrspace(201)* %a, i32 %b
+ %2 = load i32, i32 addrspace(201)* %1, align 32
+ ret i32 %2
+}
+
+; CHECK-LABEL: scalarized_gep2:
+; CHECK: v0 = vld [tilespmem:s0+$0x8]
+; CHECK: (v2sf) = vpush v0, $0x0
+; CHECK: s0 = spop (v2sf)
+define i32 @scalarized_gep2(<16 x i32> addrspace(201)* %a) {
+ %1 = getelementptr <16 x i32>, <16 x i32> addrspace(201)* %a, i32 0, i32 8
+ %2 = bitcast i32 addrspace(201)* %1 to <16 x i32> addrspace(201)*
+ %3 = load <16 x i32>, <16 x i32> addrspace(201)* %2, align 32
+ %r = extractelement <16 x i32> %3, i32 0
+ ret i32 %r
+}
+
+; CHECK-LABEL: scalarized_gep3:
+; CHECK: v0 = vld [tilespmem:s0+$0x8]
+; CHECK: (v2sf) = vpush v0, $0x0
+; CHECK: s0 = spop (v2sf)
+define i32 @scalarized_gep3(i32 addrspace(201)* %a) {
+ %1 = getelementptr i32, i32 addrspace(201)* %a, i32 8
+ %r = load i32, i32 addrspace(201)* %1, align 32
+ ret i32 %r
+}
+
+@a = addrspace(201) global <16 x i32> <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10,
+ i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>, align 32
+
+; CHECK-LABEL: scalarized_gep4:
+; CHECK: v0 = vld [tilespmem:a+2]
+; CHECK: (v2sf) = vpush v0, $0x0
+; CHECK: s{{[0-9]+}} = spop (v2sf)
+define void @scalarized_gep4(i32 %b, i32 %c) {
+entry:
+ %vecext = load i32, i32 addrspace(201)* getelementptr (<16 x i32>, <16 x i32> addrspace(201)* @a, i32 0, i32 2), align 8
+ %0 = load i32, i32* inttoptr (i32 256 to i32*), align 256
+ %add = add nsw i32 %vecext, %0
+ store i32 %add, i32* inttoptr (i32 256 to i32*), align 256
+ ret void
+}
+
+; CHECK-LABEL: vldi_align:
+; CHECK: v0 = vld [tilespmem:s0+$0x0]
+define <16 x i32> @vldi_align(<16 x i32> addrspace(201)* %a) {
+ %b = load <16 x i32>, <16 x i32> addrspace(201)* %a, align 4
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: vsti_align:
+; CHECK: [tilespmem:s0+$0x0] = vst v0
+define void @vsti_align(<16 x i32> %a, <16 x i32> addrspace(201)* %b) {
+ store <16 x i32> %a, <16 x i32> addrspace(201)* %b, align 4
+ ret void
+}
+
+; CHECK-LABEL: vldmsk_strided:
+; CHECK: v0 = vld.msk [tilespmem:s0+$0x0 ss:s1], vm0;
+define <16 x i32> @vldmsk_strided(<16 x i32> addrspace(201)* %base, <16 x i1> %m, i32 %s) {
+entry:
+ %0 = tail call <16 x i32> @llvm.tpu.vld.msk.strided.v16i32.p201v16i32.v16i1(<16 x i1> %m, <16 x i32> addrspace(201)* %base, i32 %s)
+ ret <16 x i32> %0
+}
+
+; CHECK-LABEL: vstmsk_strided:
+; CHECK: [tilespmem:s0+$0x0 ss:s1] = vst.msk vm0, v0;
+define void @vstmsk_strided(<16 x i32> addrspace(201)* %base, <16 x i32> %val, <16 x i1> %m, i32 %s) {
+entry:
+ tail call void @llvm.tpu.vst.msk.strided.v16i1.p201v16i32.v16i32(<16 x i1> %m, <16 x i32> addrspace(201)* %base, i32 %s, <16 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstmskadd_stridedi:
+; CHECK: [tilespmem:s0+$0x0 ss:s1] = vst.add.s32.msk vm0, v0;
+define void @vstmskadd_stridedi(<16 x i32> addrspace(201)* %base, <16 x i32> %val, <16 x i1> %m, i32 %s) {
+entry:
+ tail call void @llvm.tpu.vst.msk.add.strided.v16i1.p201v16i32.v16i32(<16 x i1> %m, <16 x i32> addrspace(201)* %base, i32 %s, <16 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstmskadd_stridedf:
+; CHECK: [tilespmem:s0+$0x0 ss:s1] = vst.add.f32.msk vm0, v0;
+define void @vstmskadd_stridedf(<16 x float> addrspace(201)* %base, <16 x float> %val, <16 x i1> %m, i32 %s) {
+entry:
+ tail call void @llvm.tpu.vst.msk.add.strided.v16i1.p201v16f32.v16f32(<16 x i1> %m, <16 x float> addrspace(201)* %base, i32 %s, <16 x float> %val)
+ ret void
+}
+
+; CHECK-LABEL: vldmskidx_stridedi:
+; CHECK: v0 = vld.idx.msk [tilespmem:v0+s0+$0x0 ss:s1], vm0;
+define <16 x i32> @vldmskidx_stridedi(<16 x i32> addrspace(201)* %base, <16 x i32> %off, <16 x i1> %m, i32 %s) {
+entry:
+ %0 = tail call <16 x i32> @llvm.tpu.vld.msk.idx.strided.v16i32.v16i1.p201v16i32(<16 x i1> %m, <16 x i32> addrspace(201)* %base, <16 x i32> %off, i32 %s)
+ ret <16 x i32> %0
+}
+
+; CHECK-LABEL: vldmskidx_stridedf:
+; CHECK: v0 = vld.idx.msk [tilespmem:v0+s0+$0x0 ss:s1], vm0;
+define <16 x float> @vldmskidx_stridedf(<16 x float> addrspace(201)* %base, <16 x i32> %off, <16 x i1> %m, i32 %s) {
+entry:
+ %0 = tail call <16 x float> @llvm.tpu.vld.msk.idx.strided.v16f32.v16i1.p201v16f32(<16 x i1> %m, <16 x float> addrspace(201)* %base, <16 x i32> %off, i32 %s)
+ ret <16 x float> %0
+}
+
+; CHECK-LABEL: vstmskidx_stridedi:
+; CHECK: [tilespmem:v0+s0+$0x0 ss:s1] = vst.idx.msk vm0, v1;
+define void @vstmskidx_stridedi(<16 x i32> addrspace(201)* %base, <16 x i32> %off, <16 x i32> %val, <16 x i1> %m, i32 %s) {
+entry:
+ tail call void @llvm.tpu.vst.msk.idx.strided.v16i1.p201v16i32.v16i32(<16 x i1> %m, <16 x i32> addrspace(201)* %base, <16 x i32> %off, i32 %s, <16 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstmskidx_stridedf:
+; CHECK: [tilespmem:v0+s0+$0x0 ss:s1] = vst.idx.msk vm0, v1;
+define void @vstmskidx_stridedf(<16 x float> addrspace(201)* %base, <16 x i32> %off, <16 x float> %val, <16 x i1> %m, i32 %s) {
+entry:
+ tail call void @llvm.tpu.vst.msk.idx.strided.v16i1.p201v16f32.v16f32(<16 x i1> %m, <16 x float> addrspace(201)* %base, <16 x i32> %off, i32 %s, <16 x float> %val)
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_i16_gl_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_i16_gl_sc.ll
new file mode 100644
index 0000000..19c6378
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_i16_gl_sc.ll
@@ -0,0 +1,249 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-gl -asm-verbose=false \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "googletpu"
+
+@a = external addrspace(201) global <16 x bfloat>, align 32
+@ai = external addrspace(201) global <16 x i16>, align 32
+
+declare void @llvm.tpu.vst.msk.v8i1.v32i8(<8 x i1>, <32 x i8> addrspace(201)*, <32 x i8>)
+declare <8 x i1> @llvm.tpu.16i1.to.8i1(<16 x i1>)
+declare <16 x i16> @llvm.tpu.vld.msk.idx.v16i16.v8i1.p201v16i16(<8 x i1>, <16 x i16> addrspace(201)*, <8 x i32>)
+declare <16 x i16> @llvm.tpu.vld.msk.v16i16.v8i1.p201v16i16(<8 x i1>, <16 x i16> addrspace(201)*)
+declare void @llvm.tpu.vst.msk.add.v8i1.p201v16i16.v16i16(<8 x i1>, <16 x i16> addrspace(201)*, <16 x i16>)
+declare void @llvm.tpu.vst.msk.add.strided.v8i1.p201v16i16.v16i16(<8 x i1>, <16 x i16> addrspace(201)*, i32, <16 x i16>)
+declare void @llvm.tpu.vst.msk.idx.add.v8i1.p201v16i16.v16i16(<8 x i1>, <16 x i16> addrspace(201)*, <8 x i32>, <16 x i16>)
+declare void @llvm.tpu.vst.msk.idx.add.np.v8i1.p201v16i16.v16i16(<8 x i1>, <16 x i16> addrspace(201)*, <8 x i32>, <16 x i16>)
+declare void @llvm.tpu.vst.cb.msk.idx.add.v8i1.v16i16(<8 x i1>, x86_mmx, i32, <8 x i32>, <16 x i16>)
+declare void @llvm.tpu.vst.cb.msk.idx.add.np.v8i1.v16i16(<8 x i1>, x86_mmx, i32, <8 x i32>, <16 x i16>)
+declare void @llvm.tpu.vst.cb.msk.add.v8i1.v16i16(<8 x i1>, x86_mmx, i32, <16 x i16>)
+declare void @llvm.tpu.vst.cb.upd.msk.add.v8i1.v16i16(<8 x i1>, x86_mmx, i32, <16 x i16>)
+declare void @llvm.tpu.vst.cb.msk.add.strided.v8i1.v16i16(<8 x i1>, x86_mmx, i32, i32, <16 x i16>)
+declare void @llvm.tpu.vst.cb.upd.msk.add.strided.v8i1.v16i16(<8 x i1>, x86_mmx, i32, i32, <16 x i16>)
+declare <16 x i16> @llvm.tpu.vst.msk.idx.ret.add.np.v16i16.v8i1.p201v16i16(<8 x i1>, <16 x i16> addrspace(201)*, <8 x i32>, <16 x i16>)
+declare <16 x i16> addrspace(201)* @llvm.tpu.inttoptr.p201v16i16(i32) nounwind
+
+; CHECK-LABEL: vstmskadds16:
+; CHECK: [tilespmem:s0+$0x0] = vst.add.s16.msk vm0, v0;
+define void @vstmskadds16(<16 x i16> addrspace(201)* %base, <16 x i16> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.msk.add.v8i1.p201v16i16.v16i16(<8 x i1> %m, <16 x i16> addrspace(201)* %base, <16 x i16> %val)
+ ret void
+}
+
+; 1024 * 8 (vector size) * 4 (bytes) / 4 (TPU GEP lowering adjust for word) = 0x2000.
+; CHECK-LABEL: vstmskadds16_disp:
+; CHECK: [tilespmem:s0+$0x2000] = vst.add.s16.msk vm0, v0;
+define void @vstmskadds16_disp(<16 x i16> addrspace(201)* %base, <16 x i16> %val, <8 x i1> %m) {
+entry:
+ %b = getelementptr inbounds <16 x i16>, <16 x i16> addrspace(201)* %base, i32 1024
+ tail call void @llvm.tpu.vst.msk.add.v8i1.p201v16i16.v16i16(<8 x i1> %m, <16 x i16> addrspace(201)* %b, <16 x i16> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstmskadd_strideds16_0:
+; CHECK: [tilespmem:s0+$0x0 ss:s1] = vst.add.s16.msk vm0, v0;
+define void @vstmskadd_strideds16_0(<16 x i16> addrspace(201)* %base, <16 x i16> %val, <8 x i1> %m, i32 %s) {
+entry:
+ tail call void @llvm.tpu.vst.msk.add.strided.v8i1.p201v16i16.v16i16(<8 x i1> %m, <16 x i16> addrspace(201)* %base, i32 %s, <16 x i16> %val)
+ ret void
+}
+
+; 1024 * 8 (vector size) * 4 (bytes) / 4 (TPU GEP lowering adjust for word) = 0x2000.
+; CHECK-LABEL: vstmskadd_disp_strideds16_0:
+; CHECK: [tilespmem:s0+$0x2000 ss:s1] = vst.add.s16.msk vm0, v0;
+define void @vstmskadd_disp_strideds16_0(<16 x i16> addrspace(201)* %base, <16 x i16> %val, <8 x i1> %m, i32 %s) {
+entry:
+ %b = getelementptr inbounds <16 x i16>, <16 x i16> addrspace(201)* %base, i32 1024
+ tail call void @llvm.tpu.vst.msk.add.strided.v8i1.p201v16i16.v16i16(<8 x i1> %m, <16 x i16> addrspace(201)* %b, i32 %s, <16 x i16> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstmskadd_strideds16_1:
+; CHECK: [tilespmem:s0+$0x0 ss:$0x8] = vst.add.s16.msk vm0, v0;
+define void @vstmskadd_strideds16_1(<16 x i16> addrspace(201)* %base, <16 x i16> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.msk.add.strided.v8i1.p201v16i16.v16i16(<8 x i1> %m, <16 x i16> addrspace(201)* %base, i32 8, <16 x i16> %val)
+ ret void
+}
+
+; 1024 * 8 (vector size) * 4 (bytes) / 4 (TPU GEP lowering adjust for word) = 0x2000.
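+; The stride operand, by contrast, is passed through unscaled, so the i32
+; stride of 1024 appears directly as ss:$0x400.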
+; CHECK-LABEL: vstmskadd_disp_strideds16_1:
+; CHECK: [tilespmem:s0+$0x2000 ss:$0x400] = vst.add.s16.msk vm0, v0;
+define void @vstmskadd_disp_strideds16_1(<16 x i16> addrspace(201)* %base, <16 x i16> %val, <8 x i1> %m) {
+entry:
+ %b = getelementptr inbounds <16 x i16>, <16 x i16> addrspace(201)* %base, i32 1024
+ tail call void @llvm.tpu.vst.msk.add.strided.v8i1.p201v16i16.v16i16(<8 x i1> %m, <16 x i16> addrspace(201)* %b, i32 1024, <16 x i16> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstidxadds16:
+; CHECK: [tilespmem:v0+s0+$0x0] = vst.idx.add.s16.msk vm0, v1;
+define void @vstidxadds16(<16 x i16> addrspace(201)* %base, <8 x i32> %off, <16 x i16> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.msk.idx.add.v8i1.p201v16i16.v16i16(<8 x i1> %m, <16 x i16> addrspace(201)* %base, <8 x i32> %off, <16 x i16> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstidxadds16_np:
+; CHECK: [tilespmem:v0+s0+$0x0] = vst.idx.add.s16.msk vm0, v1;
+define void @vstidxadds16_np(<16 x i16> addrspace(201)* %base, <8 x i32> %off, <16 x i16> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.msk.idx.add.np.v8i1.p201v16i16.v16i16(<8 x i1> %m, <16 x i16> addrspace(201)* %base, <8 x i32> %off, <16 x i16> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbidxadds16:
+; CHECK: [tilespmem:v0+s0+$0x0 cbreg:$0x0] = vst.idx.cb.add.s16.msk vm0, v1;
+define void @vstcbidxadds16(<8 x i1> %m, i32 %base, <8 x i32> %off, <16 x i16> %val) {
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.idx.add.v8i1.v16i16(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ <8 x i32> %off, <16 x i16> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbidxadds16_np:
+; CHECK: [tilespmem:v0+s0+$0x0 cbreg:$0x0] = vst.idx.cb.add.s16.msk vm0, v1;
+define void @vstcbidxadds16_np(<8 x i1> %m, i32 %base, <8 x i32> %off, <16 x i16> %val) {
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.idx.add.np.v8i1.v16i16(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ <8 x i32> %off, <16 x i16> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbmskadds16:
+; CHECK: [tilespmem:s0+$0x0 cbreg:$0x0] = vst.cb.add.s16.msk vm0, v0;
+define void @vstcbmskadds16(i32 %base, <16 x i16> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.add.v8i1.v16i16(<8 x i1> %m, x86_mmx undef,
+ i32 %base, <16 x i16> %val)
+ ret void
+}
+
+; The base is a plain i32 offset here (no GEP scaling), so adding 1024 appears unscaled as $0x400.
+; CHECK-LABEL: vstcbmskadds16_disp:
+; CHECK: [tilespmem:s0+$0x400 cbreg:$0x0] = vst.cb.add.s16.msk vm0, v0;
+define void @vstcbmskadds16_disp(i32 %base, <16 x i16> %val, <8 x i1> %m) {
+entry:
+ %b = add i32 %base, 1024
+ tail call void @llvm.tpu.vst.cb.msk.add.v8i1.v16i16(<8 x i1> %m, x86_mmx undef, i32 %b, <16 x i16> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbupdmskadds16:
+; CHECK: [tilespmem:s0+$0x0 cbreg:$0x0] = vst.cb.upd.add.s16.msk vm0, v0;
+define void @vstcbupdmskadds16(i32 %base, <16 x i16> %val, <8 x i1> %m) {
+entry:
+ tail call void @llvm.tpu.vst.cb.upd.msk.add.v8i1.v16i16(<8 x i1> %m, x86_mmx undef,
+ i32 %base, <16 x i16> %val)
+ ret void
+}
+
+; The base is a plain i32 offset here (no GEP scaling), so adding 1024 appears unscaled as $0x400.
+; CHECK-LABEL: vstcbupdmskadds16_disp:
+; CHECK: [tilespmem:s0+$0x400 cbreg:$0x0] = vst.cb.upd.add.s16.msk vm0, v0;
+define void @vstcbupdmskadds16_disp(i32 %base, <16 x i16> %val, <8 x i1> %m) {
+entry:
+ %b = add i32 %base, 1024
+ tail call void @llvm.tpu.vst.cb.upd.msk.add.v8i1.v16i16(<8 x i1> %m, x86_mmx undef, i32 %b, <16 x i16> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbmskaddstrideds16:
+; CHECK: [tilespmem:s0+$0x0 ss:$0x7d cbreg:$0x0] = vst.cb.add.s16.msk vm0, v0
+; CHECK: [tilespmem:s0+$0x0 ss:s1 cbreg:$0x0] = vst.cb.add.s16.msk vm0, v0
+define void @vstcbmskaddstrideds16(i32 %base, <16 x i16> %val, <8 x i1> %m, i32 %ss) {
+entry:
+ tail call void @llvm.tpu.vst.cb.msk.add.strided.v8i1.v16i16(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ i32 125,
+ <16 x i16> %val)
+ tail call void @llvm.tpu.vst.cb.msk.add.strided.v8i1.v16i16(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ i32 %ss,
+ <16 x i16> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbupdmskaddstrideds16:
+; CHECK: [tilespmem:s0+$0x0 ss:$0x7d cbreg:$0x0] = vst.cb.upd.add.s16.msk vm0, v0
+; CHECK: [tilespmem:s0+$0x0 ss:s1 cbreg:$0x0] = vst.cb.upd.add.s16.msk vm0, v0
+define void @vstcbupdmskaddstrideds16(i32 %base, <16 x i16> %val, <8 x i1> %m, i32 %ss) {
+entry:
+ tail call void @llvm.tpu.vst.cb.upd.msk.add.strided.v8i1.v16i16(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ i32 125,
+ <16 x i16> %val)
+ tail call void @llvm.tpu.vst.cb.upd.msk.add.strided.v8i1.v16i16(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %base,
+ i32 %ss,
+ <16 x i16> %val)
+ ret void
+}
+
+; CHECK-LABEL: func_vld_vst_msk_idx_adds16_np:
+; CHECK: [tilespmem:v0+s0+$0x0], v0 = vst.idx.ret.add.s16.msk vm0, v1;
+define <16 x i16> @func_vld_vst_msk_idx_adds16_np(<16 x i16> addrspace(201)* %base, <8 x i32> %idx, <16 x i16> %val, <8 x i1> %m) {
+ %r = tail call <16 x i16> @llvm.tpu.vst.msk.idx.ret.add.np.v16i16.v8i1.p201v16i16(<8 x i1> %m, <16 x i16> addrspace(201)* %base,
+ <8 x i32> %idx, <16 x i16> %val)
+ ret <16 x i16> %r
+}
+
+; CHECK-LABEL: vldi_s16:
+; CHECK: v0 = vld [tilespmem:s0+$0x0]
+define <16 x i16> @vldi_s16(<16 x i16> addrspace(201)* %a) {
+ %b = load <16 x i16>, <16 x i16> addrspace(201)* %a
+ ret <16 x i16> %b
+}
+
+; CHECK-LABEL: vldi2_s16:
+; CHECK: v0 = vld [tilespmem:s0+$0x8]
+define <16 x i16> @vldi2_s16(<16 x i16> addrspace(201)* %a) {
+ %addr = getelementptr <16 x i16>, <16 x i16> addrspace(201)* %a, i32 1
+ %b = load <16 x i16>, <16 x i16> addrspace(201)* %addr
+ ret <16 x i16> %b
+}
+
+; CHECK-LABEL: load_s16
+; CHECK: {{v[0-9]+}} = vld [tilespmem:ai]
+define <16 x i16> @load_s16() #0 {
+ %1 = load <16 x i16>, <16 x i16> addrspace(201)* @ai, align 32
+ ret <16 x i16> %1
+}
+
+; CHECK-LABEL: vld_idx_s16
+; CHECK: {{v[0-9]+}} = vld.idx.msk [tilespmem:{{v[0-9]+.*}}], vm0
+define <16 x i16> @vld_idx_s16(<8 x i32> %idx, <8 x i1> %msk) #0 {
+ %1 = call <16 x i16> @llvm.tpu.vld.msk.idx.v16i16.v8i1.p201v16i16(<8 x i1> %msk, <16 x i16> addrspace(201)* @ai, <8 x i32> %idx)
+ ret <16 x i16> %1
+}
+
+; CHECK-LABEL: vld_msk_immad_s16:
+; CHECK: v0 = vld.msk [tilespmem:$0x64], vm0
+define <16 x i16> @vld_msk_immad_s16(<16 x i16> addrspace(201)* %b, <8 x i1> %m) {
+ %a = call <16 x i16> addrspace(201)* @llvm.tpu.inttoptr.p201v16i16(i32 100)
+ %r = tail call <16 x i16> @llvm.tpu.vld.msk.v16i16.v8i1.p201v16i16(<8 x i1> %m,
+ <16 x i16> addrspace(201)* %a)
+ ret <16 x i16> %r
+}
+
+; CHECK-LABEL: vld_msk_s16
+; CHECK: {{v[0-9]+}} = vld.msk [tilespmem:{{.*}}], vm{{[0-9]+}};
+define <16 x i16> @vld_msk_s16(<8 x i1> %msk) #0 {
+ %1 = call <16 x i16> @llvm.tpu.vld.msk.v16i16.v8i1.p201v16i16(<8 x i1> %msk, <16 x i16> addrspace(201)* @ai)
+ ret <16 x i16> %1
+}
+
+attributes #0 = { "implicit-section-name"=".text.tile_execute" "target-cpu"="sparsecore-tec-gl" }
+attributes #1 = { "implicit-section-name"=".text.tile_access" "target-cpu"="sparsecore-tac-gl" }
+attributes #2 = { "implicit-section-name"=".text.scs" "target-cpu"="sparsecore-scs-gl" }
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_i16_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_i16_sc.ll
new file mode 100644
index 0000000..17cc849
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_i16_sc.ll
@@ -0,0 +1,66 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false \
+; RUN: | FileCheck %s
+; RUN: llc < %s -mcpu=sparsecore-tec-gl -asm-verbose=false \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "googletpu"
+
+@a = external addrspace(201) global <16 x bfloat>, align 32
+@ai = external addrspace(201) global <16 x i16>, align 32
+
+declare void @llvm.tpu.vst.msk.v8i1.v32i8(<8 x i1>, <32 x i8> addrspace(201)*, <32 x i8>)
+declare <8 x i1> @llvm.tpu.16i1.to.8i1(<16 x i1>)
+declare <16 x i16> @llvm.tpu.vld.msk.idx.v16i16.v8i1.p201v16i16(<8 x i1>, <16 x i16> addrspace(201)*, <8 x i32>)
+declare <16 x i16> @llvm.tpu.vld.msk.v16i16.v8i1.p201v16i16(<8 x i1>, <16 x i16> addrspace(201)*)
+declare <16 x i16> addrspace(201)* @llvm.tpu.inttoptr.p201v16i16(i32) nounwind
+
+; CHECK-LABEL: vldi_s16:
+; CHECK: v0 = vld [tilespmem:s0+$0x0]
+define <16 x i16> @vldi_s16(<16 x i16> addrspace(201)* %a) {
+ %b = load <16 x i16>, <16 x i16> addrspace(201)* %a
+ ret <16 x i16> %b
+}
+
+; CHECK-LABEL: vldi2_s16:
+; CHECK: v0 = vld [tilespmem:s0+$0x8]
+define <16 x i16> @vldi2_s16(<16 x i16> addrspace(201)* %a) {
+ %addr = getelementptr <16 x i16>, <16 x i16> addrspace(201)* %a, i32 1
+ %b = load <16 x i16>, <16 x i16> addrspace(201)* %addr
+ ret <16 x i16> %b
+}
+
+; CHECK-LABEL: load_s16
+; CHECK: {{v[0-9]+}} = vld [tilespmem:ai]
+define <16 x i16> @load_s16() #0 {
+ %1 = load <16 x i16>, <16 x i16> addrspace(201)* @ai, align 32
+ ret <16 x i16> %1
+}
+
+; CHECK-LABEL: vld_idx_s16
+; CHECK: {{v[0-9]+}} = vld.idx.msk [tilespmem:{{v[0-9]+.*}}], vm0
+define <16 x i16> @vld_idx_s16(<8 x i32> %idx, <8 x i1> %msk) #0 {
+ %1 = call <16 x i16> @llvm.tpu.vld.msk.idx.v16i16.v8i1.p201v16i16(<8 x i1> %msk, <16 x i16> addrspace(201)* @ai, <8 x i32> %idx)
+ ret <16 x i16> %1
+}
+
+; CHECK-LABEL: vld_msk_immad_s16:
+; CHECK: v0 = vld.msk [tilespmem:$0x64], vm0
+define <16 x i16> @vld_msk_immad_s16(<16 x i16> addrspace(201)* %b, <8 x i1> %m) {
+ %a = call <16 x i16> addrspace(201)* @llvm.tpu.inttoptr.p201v16i16(i32 100)
+ %r = tail call <16 x i16> @llvm.tpu.vld.msk.v16i16.v8i1.p201v16i16(<8 x i1> %m,
+ <16 x i16> addrspace(201)* %a)
+ ret <16 x i16> %r
+}
+
+; CHECK-LABEL: vld_msk_s16
+; CHECK: {{v[0-9]+}} = vld.msk [tilespmem:{{.*}}], vm{{[0-9]+}};
+define <16 x i16> @vld_msk_s16(<8 x i1> %msk) #0 {
+ %1 = call <16 x i16> @llvm.tpu.vld.msk.v16i16.v8i1.p201v16i16(<8 x i1> %msk, <16 x i16> addrspace(201)* @ai)
+ ret <16 x i16> %1
+}
+
+attributes #0 = { "implicit-section-name"=".text.tile_execute" "target-cpu"="sparsecore-tec-gl" }
+attributes #1 = { "implicit-section-name"=".text.tile_access" "target-cpu"="sparsecore-tac-gl" }
+attributes #2 = { "implicit-section-name"=".text.scs" "target-cpu"="sparsecore-scs-gl" }
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_imm_tc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_imm_tc.ll
new file mode 100644
index 0000000..7bbbed7
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_imm_tc.ll
@@ -0,0 +1,291 @@
+; RUN: llc < %s -mcpu=tensorcore-vf -asm-verbose=false -disable-cgp | FileCheck --check-prefixes=CHECK,CHECK-VF %s
+; RUN: llc < %s -mcpu=tensorcore-pf -asm-verbose=false -disable-cgp | FileCheck --check-prefixes=CHECK,CHECK-PF %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; Tests that 16- and 20-bit immediate operands are selected (or not) based on
+; the PF and VF sub-targets for vector-load/vector-store instruction variants.
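+; The offsets below are chosen around those limits: 0x1 and 0x7b fit both
+; ranges, 0x44444 (279620, 19 bits) fits the 20-bit VF range but not the
+; 16-bit PF range, and 0x444444 (4473924, 23 bits) fits neither, so it is
+; always materialized into a scalar register with simm.s32.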
+
+declare <1024 x i32> @llvm.tpu.vld.shuffle.i32(<1024 x i32> addrspace(205)*, i32, i32)
+declare <1024 x i32> @llvm.tpu.vld.strided.i32(<1024 x i32> addrspace(205)*, i32, i32)
+declare void @llvm.tpu.vst.strided.i32(<1024 x i32>, <1024 x i32> addrspace(205)*, i32, i32, <1024 x i1>)
+declare <1024 x i32> @llvm.tpu.vld.indexed.i32(<1024 x i32> addrspace(205)*, i32, i32, i32, i32)
+
+declare i32 @llvm.tpu.set.lane.indexed(<1024 x i32>, i32)
+declare <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32) nounwind
+
+; CHECK-LABEL: ld_isel_imm_vs_reg_based_on_offset:
+; CHECK-VF-DAG: v{{[0-9]+}} = vld [vmem:$0x1]
+; CHECK-VF-DAG: v{{[0-9]+}} = vld [vmem:$0x44444]
+; CHECK-VF-DAG: s[[x:[0-9]+]] = simm.s32 $0x444444
+; CHECK-VF-DAG: v{{[0-9]+}} = vld [vmem:s[[x]]+$0x0]
+
+; CHECK-PF-DAG: v{{[0-9]+}} = vld [vmem:$0x1]
+; CHECK-PF-DAG: s[[x:[0-9]+]] = simm.s32 $0x44444
+; CHECK-PF-DAG: v{{[0-9]+}} = vld [vmem:s[[x]]+$0x0]
+; CHECK-PF-DAG: s[[y:[0-9]+]] = simm.s32 $0x444444
+; CHECK-PF-DAG: v{{[0-9]+}} = vld [vmem:s[[y]]+$0x0]
+define <1024 x i32> @ld_isel_imm_vs_reg_based_on_offset() {
+entry:
+ %a0 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 1)
+ %b0 = load <1024 x i32>, <1024 x i32> addrspace(205)* %a0
+
+ %a1 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 279620)
+ %b1 = load <1024 x i32>, <1024 x i32> addrspace(205)* %a1
+
+ %a2 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 4473924)
+ %b2 = load <1024 x i32>, <1024 x i32> addrspace(205)* %a2
+
+ %0 = add <1024 x i32> %b0, %b1
+ %1 = add <1024 x i32> %0, %b2
+
+ ret <1024 x i32> %1
+}
+
+; CHECK-LABEL: ld_isel_imm_vs_reg_based_on_sublanemask:
+; CHECK-VF-DAG: v{{[0-9]+}} = vld.sshfl [vmem:$0x1 sm:$0x7b], $0x1234567
+; CHECK-VF-DAG: v{{[0-9]+}} = vld.sshfl [vmem:$0x1 sm:$0x44444], $0x1234567
+; CHECK-VF-DAG: s[[x:[0-9]+]] = simm.s32 $0x444444
+; CHECK-VF-DAG: v{{[0-9]+}} = vld.sshfl [vmem:$0x1 sm:s[[x]]], $0x1234567
+
+; CHECK-PF-DAG: v{{[0-9]+}} = vld.sshfl [vmem:$0x1 sm:$0x7b], $0x1234567
+; CHECK-PF-DAG: s[[x:[0-9]+]] = simm.s32 $0x44444
+; CHECK-PF-DAG: v{{[0-9]+}} = vld.sshfl [vmem:$0x1 sm:s[[x]]], $0x1234567
+; CHECK-PF-DAG: s[[y:[0-9]+]] = simm.s32 $0x444444
+; CHECK-PF-DAG: v{{[0-9]+}} = vld.sshfl [vmem:$0x1 sm:s[[y]]], $0x1234567
+define <1024 x i32> @ld_isel_imm_vs_reg_based_on_sublanemask() {
+entry:
+ %0 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 1)
+ %1 = call <1024 x i32> @llvm.tpu.vld.shuffle.i32(<1024 x i32> addrspace(205)* nonnull %0, i32 123, i32 19088743)
+ %2 = call <1024 x i32> @llvm.tpu.vld.shuffle.i32(<1024 x i32> addrspace(205)* nonnull %0, i32 279620, i32 19088743)
+ %3 = call <1024 x i32> @llvm.tpu.vld.shuffle.i32(<1024 x i32> addrspace(205)* nonnull %0, i32 4473924, i32 19088743)
+
+ %4 = add <1024 x i32> %1, %2
+ %5 = add <1024 x i32> %4, %3
+
+ ret <1024 x i32> %5
+}
+
+; CHECK-LABEL: ld_isel_with_base_imm_vs_reg_based_on_sublanemask:
+; CHECK-VF-DAG: v{{[0-9]+}} = vld.sshfl [vmem:s0+$0x0 sm:$0x7b], $0x1234567
+; CHECK-VF-DAG: v{{[0-9]+}} = vld.sshfl [vmem:s0+$0x0 sm:$0x44444], $0x1234567
+; CHECK-VF-DAG: s[[x:[0-9]+]] = simm.s32 $0x444444
+; CHECK-VF-DAG: v{{[0-9]+}} = vld.sshfl [vmem:s0+$0x0 sm:s[[x]]], $0x1234567
+
+; CHECK-PF-DAG: v{{[0-9]+}} = vld.sshfl [vmem:s0+$0x0 sm:$0x7b], $0x1234567
+; CHECK-PF-DAG: s[[x:[0-9]+]] = simm.s32 $0x44444
+; CHECK-PF-DAG: v{{[0-9]+}} = vld.sshfl [vmem:s0+$0x0 sm:s[[x]]], $0x1234567
+; CHECK-PF-DAG: s[[y:[0-9]+]] = simm.s32 $0x444444
+; CHECK-PF-DAG: v{{[0-9]+}} = vld.sshfl [vmem:s0+$0x0 sm:s[[y]]], $0x1234567
+define <1024 x i32> @ld_isel_with_base_imm_vs_reg_based_on_sublanemask(<1024 x i32> addrspace(205)* readonly %ptr) {
+entry:
+ %0 = call <1024 x i32> @llvm.tpu.vld.shuffle.i32(<1024 x i32> addrspace(205)* %ptr, i32 123, i32 19088743)
+ %1 = call <1024 x i32> @llvm.tpu.vld.shuffle.i32(<1024 x i32> addrspace(205)* %ptr, i32 279620, i32 19088743)
+ %2 = call <1024 x i32> @llvm.tpu.vld.shuffle.i32(<1024 x i32> addrspace(205)* %ptr, i32 4473924, i32 19088743)
+
+ %3 = add <1024 x i32> %0, %1
+ %4 = add <1024 x i32> %3, %2
+
+ ret <1024 x i32> %4
+}
+
+; CHECK-LABEL: ld_isel_imm_vs_reg_based_on_stride:
+; CHECK-VF-DAG: v{{[0-9]+}} = vld [vmem:$0x1 ss:$0x5 sm:$0x7b]
+; CHECK-VF-DAG: v{{[0-9]+}} = vld [vmem:$0x1 ss:$0x44444 sm:$0x7b]
+; CHECK-VF-DAG: s[[x:[0-9]+]] = simm.s32 $0x444444
+; CHECK-VF-DAG: v{{[0-9]+}} = vld [vmem:$0x1 ss:s[[x]] sm:$0x7b]
+
+; CHECK-PF-DAG: v{{[0-9]+}} = vld [vmem:$0x1 ss:$0x5 sm:$0x7b]
+; CHECK-PF-DAG: s[[x:[0-9]+]] = simm.s32 $0x44444
+; CHECK-PF-DAG: v{{[0-9]+}} = vld [vmem:$0x1 ss:s[[x]] sm:$0x7b]
+; CHECK-PF-DAG: s[[y:[0-9]+]] = simm.s32 $0x444444
+; CHECK-PF-DAG: v{{[0-9]+}} = vld [vmem:$0x1 ss:s[[y]] sm:$0x7b]
+define <1024 x i32> @ld_isel_imm_vs_reg_based_on_stride() {
+entry:
+ %0 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 1)
+ %1 = call <1024 x i32> @llvm.tpu.vld.strided.i32(<1024 x i32> addrspace(205)* nonnull %0, i32 123, i32 5)
+ %2 = call <1024 x i32> @llvm.tpu.vld.strided.i32(<1024 x i32> addrspace(205)* nonnull %0, i32 123, i32 279620)
+ %3 = call <1024 x i32> @llvm.tpu.vld.strided.i32(<1024 x i32> addrspace(205)* nonnull %0, i32 123, i32 4473924)
+
+ %4 = add <1024 x i32> %1, %2
+ %5 = add <1024 x i32> %4, %3
+
+ ret <1024 x i32> %5
+}
+
+; CHECK-LABEL: ld_with_base_isel_imm_vs_reg_based_on_stride:
+; CHECK-VF-DAG: v{{[0-9]+}} = vld [vmem:s0+$0x8 ss:$0x5 sm:$0x7b]
+; CHECK-VF-DAG: v{{[0-9]+}} = vld [vmem:s0+$0x8 ss:$0x44444 sm:$0x7b]
+; CHECK-VF-DAG: s[[x:[0-9]+]] = simm.s32 $0x444444
+; CHECK-VF-DAG: v{{[0-9]+}} = vld [vmem:s0+$0x8 ss:s[[x]] sm:$0x7b]
+
+; CHECK-PF-DAG: v{{[0-9]+}} = vld [vmem:s0+$0x8 ss:$0x5 sm:$0x7b]
+; CHECK-PF-DAG: s[[x:[0-9]+]] = simm.s32 $0x44444
+; CHECK-PF-DAG: v{{[0-9]+}} = vld [vmem:s0+$0x8 ss:s[[x]] sm:$0x7b]
+; CHECK-PF-DAG: s[[y:[0-9]+]] = simm.s32 $0x444444
+; CHECK-PF-DAG: v{{[0-9]+}} = vld [vmem:s0+$0x8 ss:s[[y]] sm:$0x7b]
+define <1024 x i32> @ld_with_base_isel_imm_vs_reg_based_on_stride(<1024 x i32> addrspace(205)* readonly %ptr) {
+entry:
+ %add.ptr = getelementptr inbounds <1024 x i32>, <1024 x i32> addrspace(205)* %ptr, i32 1
+ %0 = call <1024 x i32> @llvm.tpu.vld.strided.i32(<1024 x i32> addrspace(205)* %add.ptr, i32 123, i32 5)
+ %1 = call <1024 x i32> @llvm.tpu.vld.strided.i32(<1024 x i32> addrspace(205)* %add.ptr, i32 123, i32 279620)
+ %2 = call <1024 x i32> @llvm.tpu.vld.strided.i32(<1024 x i32> addrspace(205)* %add.ptr, i32 123, i32 4473924)
+
+ %3 = add <1024 x i32> %0, %1
+ %4 = add <1024 x i32> %3, %2
+
+ ret <1024 x i32> %4
+}
+
+; CHECK-LABEL: st_isel_imm_vs_reg_based_on_offset:
+; CHECK-VF-DAG: [vmem:$0x1] = vst v{{[0-9]+}}
+; CHECK-VF-DAG: [vmem:$0x44444] = vst v{{[0-9]+}}
+; CHECK-VF-DAG: s[[x:[0-9]+]] = simm.s32 $0x444444
+; CHECK-VF-DAG: [vmem:s[[x]]+$0x0] = vst v{{[0-9]+}}
+
+; CHECK-PF-DAG: [vmem:$0x1] = vst v{{[0-9]+}}
+; CHECK-PF-DAG: s[[x:[0-9]+]] = simm.s32 $0x44444
+; CHECK-PF-DAG: [vmem:s[[x]]+$0x0] = vst v{{[0-9]+}}
+; CHECK-PF-DAG: s[[y:[0-9]+]] = simm.s32 $0x444444
+; CHECK-PF-DAG: [vmem:s[[y]]+$0x0] = vst v{{[0-9]+}}
+define void @st_isel_imm_vs_reg_based_on_offset(<1024 x i32> %data) {
+entry:
+ %a0 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 1)
+ store <1024 x i32> %data, <1024 x i32> addrspace(205)* %a0
+
+ %a1 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 279620)
+ store <1024 x i32> %data, <1024 x i32> addrspace(205)* %a1
+
+ %a2 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 4473924)
+ store <1024 x i32> %data, <1024 x i32> addrspace(205)* %a2
+
+ ret void
+}
+
+; CHECK-LABEL: st_isel_imm_vs_reg_based_on_stride:
+; CHECK-VF-DAG: [vmem:$0x1 ss:$0x5 sm:$0x7b] = vst.msk vm{{[0-9]+}}, v{{[0-9]+}}
+; CHECK-VF-DAG: [vmem:$0x1 ss:$0x44444 sm:$0x7b] = vst.msk vm{{[0-9]+}}, v{{[0-9]+}}
+; CHECK-VF-DAG: s[[x:[0-9]+]] = simm.s32 $0x444444
+; CHECK-VF-DAG: [vmem:$0x1 ss:s[[x]] sm:$0x7b] = vst.msk vm{{[0-9]+}}, v{{[0-9]+}}
+
+; CHECK-PF-DAG: [vmem:$0x1 ss:$0x5 sm:$0x7b] = vst.msk vm{{[0-9]+}}, v{{[0-9]+}}
+; CHECK-PF-DAG: s[[x:[0-9]+]] = simm.s32 $0x44444
+; CHECK-PF-DAG: [vmem:$0x1 ss:s[[x]] sm:$0x7b] = vst.msk vm{{[0-9]+}}, v{{[0-9]+}}
+; CHECK-PF-DAG: s[[y:[0-9]+]] = simm.s32 $0x444444
+; CHECK-PF-DAG: [vmem:$0x1 ss:s[[y]] sm:$0x7b] = vst.msk vm{{[0-9]+}}, v{{[0-9]+}}
+define void @st_isel_imm_vs_reg_based_on_stride(<1024 x i32> %data, <1024 x i1> %m, <1024 x i32> addrspace(205)* readonly %ptr, i32 %mask, i32 %strided){
+entry:
+ %0 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 1)
+ call void @llvm.tpu.vst.strided.i32(<1024 x i32> %data, <1024 x i32> addrspace(205)* nonnull %0, i32 123, i32 5, <1024 x i1> %m)
+ call void @llvm.tpu.vst.strided.i32(<1024 x i32> %data, <1024 x i32> addrspace(205)* nonnull %0, i32 123, i32 279620, <1024 x i1> %m)
+ call void @llvm.tpu.vst.strided.i32(<1024 x i32> %data, <1024 x i32> addrspace(205)* nonnull %0, i32 123, i32 4473924, <1024 x i1> %m)
+ ret void
+}
+
+; CHECK-LABEL: st_isel_imm_vs_reg_based_on_sublanemask:
+; CHECK-VF-DAG: [vmem:$0x1 ss:$0x5 sm:$0x7b] = vst.msk vm{{[0-9]+}}, v{{[0-9]+}}
+; CHECK-VF-DAG: [vmem:$0x1 ss:$0x5 sm:$0x44444] = vst.msk vm{{[0-9]+}}, v{{[0-9]+}}
+; CHECK-VF-DAG: s[[x:[0-9]+]] = simm.s32 $0x444444
+; CHECK-VF-DAG: [vmem:$0x1 ss:$0x5 sm:s[[x]]] = vst.msk vm{{[0-9]+}}, v{{[0-9]+}}
+
+; CHECK-PF-DAG: [vmem:$0x1 ss:$0x5 sm:$0x7b] = vst.msk vm{{[0-9]+}}, v{{[0-9]+}}
+; CHECK-PF-DAG: s[[x:[0-9]+]] = simm.s32 $0x44444
+; CHECK-PF-DAG: [vmem:$0x1 ss:$0x5 sm:s[[x]]] = vst.msk vm{{[0-9]+}}, v{{[0-9]+}}
+; CHECK-PF-DAG: s[[y:[0-9]+]] = simm.s32 $0x444444
+; CHECK-PF-DAG: [vmem:$0x1 ss:$0x5 sm:s[[y]]] = vst.msk vm{{[0-9]+}}, v{{[0-9]+}}
+define void @st_isel_imm_vs_reg_based_on_sublanemask(<1024 x i32> %data, <1024 x i1> %m, <1024 x i32> addrspace(205)* readonly %ptr, i32 %mask, i32 %strided){
+entry:
+ %0 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 1)
+ call void @llvm.tpu.vst.strided.i32(<1024 x i32> %data, <1024 x i32> addrspace(205)* nonnull %0, i32 123, i32 5, <1024 x i1> %m)
+ call void @llvm.tpu.vst.strided.i32(<1024 x i32> %data, <1024 x i32> addrspace(205)* nonnull %0, i32 279620, i32 5, <1024 x i1> %m)
+ call void @llvm.tpu.vst.strided.i32(<1024 x i32> %data, <1024 x i32> addrspace(205)* nonnull %0, i32 4473924, i32 5, <1024 x i1> %m)
+ ret void
+}
+
+; CHECK-LABEL: ld_indexed_isel_imm_vs_reg_based_on_offset:
+; CHECK-VF-DAG: s[[x:[0-9]+]] = simm.s32 $0x1234567
+; CHECK-VF-DAG: v{{[0-9]+}} = vld.iar0 [vmem:$0x1 ss:s[[x]] sm:$0x7b]
+; CHECK-VF-DAG: v{{[0-9]+}} = vld.iar0 [vmem:$0x44444 ss:s[[x]] sm:$0x7b]
+; CHECK-VF-DAG: s[[y:[0-9]+]] = simm.s32 $0x444444
+; CHECK-VF-DAG: v{{[0-9]+}} = vld.iar0 [vmem:s[[y]]+$0x0 ss:s[[x]] sm:$0x7b]
+
+; CHECK-PF-DAG: s[[x:[0-9]+]] = simm.s32 $0x1234567
+; CHECK-PF-DAG: v{{[0-9]+}} = vld.iar0 [vmem:$0x1 ss:s[[x]] sm:$0x7b]
+; CHECK-PF-DAG: s[[z:[0-9]+]] = simm.s32 $0x44444
+; CHECK-PF-DAG: v{{[0-9]+}} = vld.iar0 [vmem:s[[z]]+$0x0 ss:s[[x]] sm:$0x7b]
+; CHECK-PF-DAG: s[[y:[0-9]+]] = simm.s32 $0x444444
+; CHECK-PF-DAG: v{{[0-9]+}} = vld.iar0 [vmem:s[[y]]+$0x0 ss:s[[x]] sm:$0x7b]
+define <1024 x i32> @ld_indexed_isel_imm_vs_reg_based_on_offset(<1024 x i32> %index) {
+entry:
+ %iar = call i32 @llvm.tpu.set.lane.indexed(<1024 x i32> %index, i32 0)
+
+ %a0 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 1)
+ %a1 = call <1024 x i32> @llvm.tpu.vld.indexed.i32(<1024 x i32> addrspace(205)* nonnull %a0, i32 123, i32 19088743, i32 %iar, i32 0)
+
+ %b0 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 279620)
+ %b1 = call <1024 x i32> @llvm.tpu.vld.indexed.i32(<1024 x i32> addrspace(205)* nonnull %b0, i32 123, i32 19088743, i32 %iar, i32 0)
+
+ %c0 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 4473924)
+ %c1 = call <1024 x i32> @llvm.tpu.vld.indexed.i32(<1024 x i32> addrspace(205)* nonnull %c0, i32 123, i32 19088743, i32 %iar, i32 0)
+
+ %0 = add <1024 x i32> %a1, %b1
+ %1 = add <1024 x i32> %0, %c1
+
+ ret <1024 x i32> %1
+}
+
+; CHECK-LABEL: ld_indexed_isel_imm_vs_reg_based_on_sublanemask:
+; CHECK-VF-DAG: s[[x:[0-9]+]] = simm.s32 $0x1234567
+; CHECK-VF-DAG: v{{[0-9]+}} = vld.iar0 [vmem:$0x1 ss:s[[x]] sm:$0x7b]
+; CHECK-VF-DAG: v{{[0-9]+}} = vld.iar0 [vmem:$0x1 ss:s[[x]] sm:$0x44444]
+; CHECK-VF-DAG: s[[y:[0-9]+]] = simm.s32 $0x444444
+; CHECK-VF-DAG: v{{[0-9]+}} = vld.iar0 [vmem:$0x1 ss:s[[x]] sm:s[[y]]]
+
+; CHECK-PF-DAG: s[[x:[0-9]+]] = simm.s32 $0x1234567
+; CHECK-PF-DAG: v{{[0-9]+}} = vld.iar0 [vmem:$0x1 ss:s[[x]] sm:$0x7b]
+; CHECK-PF-DAG: s[[z:[0-9]+]] = simm.s32 $0x44444
+; CHECK-PF-DAG: v{{[0-9]+}} = vld.iar0 [vmem:$0x1 ss:s[[x]] sm:s[[z]]]
+; CHECK-PF-DAG: s[[y:[0-9]+]] = simm.s32 $0x444444
+; CHECK-PF-DAG: v{{[0-9]+}} = vld.iar0 [vmem:$0x1 ss:s[[x]] sm:s[[y]]]
+define <1024 x i32> @ld_indexed_isel_imm_vs_reg_based_on_sublanemask(<1024 x i32> %index) {
+entry:
+ %iar = call i32 @llvm.tpu.set.lane.indexed(<1024 x i32> %index, i32 0)
+
+ %a0 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 1)
+ %a1 = call <1024 x i32> @llvm.tpu.vld.indexed.i32(<1024 x i32> addrspace(205)* nonnull %a0, i32 123, i32 19088743, i32 %iar, i32 0)
+ %b1 = call <1024 x i32> @llvm.tpu.vld.indexed.i32(<1024 x i32> addrspace(205)* nonnull %a0, i32 279620, i32 19088743, i32 %iar, i32 0)
+ %c1 = call <1024 x i32> @llvm.tpu.vld.indexed.i32(<1024 x i32> addrspace(205)* nonnull %a0, i32 4473924, i32 19088743, i32 %iar, i32 0)
+
+ %0 = add <1024 x i32> %a1, %b1
+ %1 = add <1024 x i32> %0, %c1
+
+ ret <1024 x i32> %1
+}
+
+; CHECK-LABEL: ld_indexed_isel_imm_vs_reg_based_on_stride:
+; CHECK-VF-DAG: v{{[0-9]+}} = vld.iar0 [vmem:$0x1 ss:$0x5 sm:$0x7b]
+; CHECK-VF-DAG: v{{[0-9]+}} = vld.iar0 [vmem:$0x1 ss:$0x44444 sm:$0x7b]
+; CHECK-VF-DAG: s[[x:[0-9]+]] = simm.s32 $0x444444
+; CHECK-VF-DAG: v{{[0-9]+}} = vld.iar0 [vmem:$0x1 ss:s[[x]] sm:$0x7b]
+
+; CHECK-PF-DAG: v{{[0-9]+}} = vld.iar0 [vmem:$0x1 ss:$0x5 sm:$0x7b]
+; CHECK-PF-DAG: s[[z:[0-9]+]] = simm.s32 $0x44444
+; CHECK-PF-DAG: v{{[0-9]+}} = vld.iar0 [vmem:$0x1 ss:s[[z]] sm:$0x7b]
+; CHECK-PF-DAG: s[[y:[0-9]+]] = simm.s32 $0x444444
+; CHECK-PF-DAG: v{{[0-9]+}} = vld.iar0 [vmem:$0x1 ss:s[[y]] sm:$0x7b]
+define <1024 x i32> @ld_indexed_isel_imm_vs_reg_based_on_stride(<1024 x i32> %index) {
+entry:
+ %iar = call i32 @llvm.tpu.set.lane.indexed(<1024 x i32> %index, i32 0)
+
+ %a0 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 1)
+ %a1 = call <1024 x i32> @llvm.tpu.vld.indexed.i32(<1024 x i32> addrspace(205)* nonnull %a0, i32 123, i32 5, i32 %iar, i32 0)
+ %b1 = call <1024 x i32> @llvm.tpu.vld.indexed.i32(<1024 x i32> addrspace(205)* nonnull %a0, i32 123, i32 279620, i32 %iar, i32 0)
+ %c1 = call <1024 x i32> @llvm.tpu.vld.indexed.i32(<1024 x i32> addrspace(205)* nonnull %a0, i32 123, i32 4473924, i32 %iar, i32 0)
+
+ %0 = add <1024 x i32> %a1, %b1
+ %1 = add <1024 x i32> %0, %c1
+
+ ret <1024 x i32> %1
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_peep_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_peep_sc.ll
new file mode 100644
index 0000000..c4188c6
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_peep_sc.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -opaque-pointers | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32) nounwind
+
+; Tests that we peephole vector load + broadcast sequences into simple strided loads.
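+; The broadcast is written as a shufflevector with a zeroinitializer mask;
+; the peephole folds the load and the shuffle into a single vld.msk with a
+; zero stride (ss:$0x0), which yields the broadcast directly.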
+
+; CHECK-LABEL: vldi_bcasti:
+; CHECK: v0 = vld.msk [tilespmem:$0x2000 ss:$0x0], $0xff
+define <8 x i32> @vldi_bcasti() {
+ %a = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 8192)
+ %b = load <8 x i32>, <8 x i32> addrspace(201)* %a
+ %r = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
+ ret <8 x i32> %r
+}
+
+; CHECK-LABEL: vldr_bcasti:
+; CHECK: v0 = vld.msk [tilespmem:s0+$0x0 ss:$0x0], $0xff
+define <8 x i32> @vldr_bcasti(<8 x i32> addrspace(201)* %a) {
+ %b = load <8 x i32>, <8 x i32> addrspace(201)* %a
+ %r = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
+ ret <8 x i32> %r
+}
+
+; CHECK-LABEL: vldri_bcasti:
+; CHECK: v0 = vld.msk [tilespmem:s0+$0x8 ss:$0x0], $0xff
+define <8 x i32> @vldri_bcasti(<8 x i32> addrspace(201)* %a) {
+ %addr = getelementptr <8 x i32>, <8 x i32> addrspace(201)* %a, i32 1
+ %b = load <8 x i32>, <8 x i32> addrspace(201)* %addr
+ %r = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
+ ret <8 x i32> %r
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_sc.ll
new file mode 100644
index 0000000..bfaed65
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_sc.ll
@@ -0,0 +1,757 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -tpu-allow-global-offset-for-test | FileCheck %s
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -opaque-pointers -tpu-allow-global-offset-for-test | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+%struct.V = type { <8 x i32>, <8 x i32> }
+
+declare <8 x i32> @llvm.tpu.vld.msk.v8i32.v8i1.p201v8i32(<8 x i1>, <8 x i32> addrspace(201)*)
+declare <8 x i32> @llvm.tpu.vld.msk.idx.v8i32.v8i1.p201v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>)
+declare <8 x float> @llvm.tpu.vld.msk.idx.v8f32.v8i1.p201v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>)
+declare void @llvm.tpu.vst.msk.idx.v8i1.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, <8 x i32>) #1
+declare void @llvm.tpu.vst.msk.idx.v8i1.p201v8f32.v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>, <8 x float>) #1
+declare void @llvm.tpu.vst.msk.idx.add.v8i1.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, <8 x i32>)
+declare void @llvm.tpu.vst.msk.idx.add.v8i1.p201v8f32.v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>, <8 x float>)
+declare void @llvm.tpu.vst.msk.idx.add.np.v8i1.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, <8 x i32>)
+declare void @llvm.tpu.vst.msk.idx.add.np.v8i1.p201v8f32.v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>, <8 x float>)
+declare void @llvm.tpu.vst.msk.v8i1.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>) argmemonly nounwind
+declare <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32) nounwind
+declare <8 x float> addrspace(201)* @llvm.tpu.inttoptr.p201v8float(i32) nounwind
+declare <8 x i32> @llvm.tpu.vld.cb.msk.v8i32.v8i1(<8 x i1>, x86_mmx, i32)
+declare <8 x i32> @llvm.tpu.vld.cb.msk.idx.v8i32.v8i1(<8 x i1>, x86_mmx, i32, <8 x i32>)
+declare <8 x float> @llvm.tpu.vld.cb.msk.idx.v8f32.v8i1(<8 x i1>, x86_mmx, i32, <8 x i32>)
+declare <8 x i32> @llvm.tpu.vld.cb.msk.idx.np.v8i32.v8i1(<8 x i1>, x86_mmx, i32, <8 x i32>)
+declare <8 x float> @llvm.tpu.vld.cb.msk.idx.np.v8f32.v8i1(<8 x i1>, x86_mmx, i32, <8 x i32>)
+declare void @llvm.tpu.vst.cb.msk.v8i1.v8i32(<8 x i1>, x86_mmx, i32, <8 x i32>)
+declare void @llvm.tpu.vst.cb.msk.idx.v8i1.v8i32(<8 x i1>, x86_mmx, i32, <8 x i32>, <8 x i32>)
+declare void @llvm.tpu.vst.cb.msk.idx.v8i1.v8f32(<8 x i1>, x86_mmx, i32, <8 x i32>, <8 x float>)
+declare void @llvm.tpu.vst.cb.msk.idx.add.v8i1.v8i32(<8 x i1>, x86_mmx, i32, <8 x i32>, <8 x i32>)
+declare void @llvm.tpu.vst.cb.msk.idx.add.v8i1.v8f32(<8 x i1>, x86_mmx, i32, <8 x i32>, <8 x float>)
+declare void @llvm.tpu.vst.cb.msk.idx.add.np.v8i1.v8i32(<8 x i1>, x86_mmx, i32, <8 x i32>, <8 x i32>)
+declare void @llvm.tpu.vst.cb.msk.idx.add.np.v8i1.v8f32(<8 x i1>, x86_mmx, i32, <8 x i32>, <8 x float>)
+declare <8 x i32> @llvm.tpu.vld.cb.upd.msk.v8i32.v8i1(<8 x i1>, x86_mmx, i32)
+declare void @llvm.tpu.vst.cb.upd.msk.v8i1.v8i32(<8 x i1>, x86_mmx, i32, <8 x i32>)
+declare <8 x i32> @llvm.tpu.vld.msk.strided.v8i32.p201v8i32.v8i1(<8 x i1>, <8 x i32> addrspace(201)*, i32)
+declare void @llvm.tpu.vst.msk.strided.v8i1.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, i32, <8 x i32>)
+declare void @llvm.tpu.vst.msk.add.strided.v8i1.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, i32, <8 x i32>)
+declare void @llvm.tpu.vst.msk.add.strided.v8i1.p201v8f32.v8f32(<8 x i1>, <8 x float> addrspace(201)*, i32, <8 x float>)
+declare <8 x i32> @llvm.tpu.vld.msk.idx.strided.v8i32.v8i1.p201v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, i32)
+declare <8 x float> @llvm.tpu.vld.msk.idx.strided.v8f32.v8i1.p201v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>, i32)
+declare void @llvm.tpu.vst.msk.idx.strided.v8i1.p201v8i32.v8i32(<8 x i1>, <8 x i32> addrspace(201)*, <8 x i32>, i32, <8 x i32>)
+declare void @llvm.tpu.vst.msk.idx.strided.v8i1.p201v8f32.v8f32(<8 x i1>, <8 x float> addrspace(201)*, <8 x i32>, i32, <8 x float>)
+
+attributes #1 = { argmemonly nounwind }
+
+@garr = addrspace(201) global <8 x i32> zeroinitializer
+@garrf = addrspace(201) global <8 x float> zeroinitializer
+
+; CHECK-LABEL: vldi:
+; CHECK: v0 = vld [tilespmem:s0+$0x0]
+define <8 x i32> @vldi(<8 x i32> addrspace(201)* %a) {
+ %b = load <8 x i32>, <8 x i32> addrspace(201)* %a
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vldi2:
+; CHECK: v0 = vld [tilespmem:s0+$0x8]
+define <8 x i32> @vldi2(<8 x i32> addrspace(201)* %a) {
+ %addr = getelementptr <8 x i32>, <8 x i32> addrspace(201)* %a, i32 1
+ %b = load <8 x i32>, <8 x i32> addrspace(201)* %addr
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vld_immad:
+; CHECK: v0 = vld [tilespmem:$0x10]
+define <8 x i32> @vld_immad(<8 x i32> %a) {
+ %addr = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 16)
+ %b = load <8 x i32>, <8 x i32> addrspace(201)* %addr
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vld_immadf:
+; CHECK: v0 = vld [tilespmem:$0x10]
+define <8 x float> @vld_immadf(<8 x float> %a) {
+ %addr = call <8 x float> addrspace(201)* @llvm.tpu.inttoptr.p201v8float(i32 16)
+ %b = load <8 x float>, <8 x float> addrspace(201)* %addr
+ ret <8 x float> %b
+}
+
+; CHECK-LABEL: vld_msk_immad:
+; CHECK: v0 = vld.msk [tilespmem:$0x64], vm0
+define <8 x i32> @vld_msk_immad(<8 x i32> addrspace(201)* %b, <8 x i1> %m) {
+ %a = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 100)
+ %r = tail call <8 x i32> @llvm.tpu.vld.msk.v8i32.v8i1.p201v8i32(<8 x i1> %m,
+ <8 x i32> addrspace(201)* %a)
+ ret <8 x i32> %r
+}
+
+; CHECK-LABEL: vld_msk_disp:
+; CHECK: v0 = vld.msk [tilespmem:s0+$0x320], vm0
+define <8 x i32> @vld_msk_disp(<8 x i32> addrspace(201)* %b, <8 x i1> %m) {
+ %a = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %b, i32 100
+ %r = tail call <8 x i32> @llvm.tpu.vld.msk.v8i32.v8i1.p201v8i32(<8 x i1> %m,
+ <8 x i32> addrspace(201)* %a)
+ ret <8 x i32> %r
+}
+
+; CHECK-LABEL: vld_msk_idxi:
+; CHECK: v0 = vld.idx.msk [tilespmem:v0+s0+$0x0], vm0;
+define <8 x i32> @vld_msk_idxi(<8 x i32> addrspace(201)* %b, <8 x i32> %off, <8 x i1> %m) {
+entry:
+ %0 = tail call <8 x i32> @llvm.tpu.vld.msk.idx.v8i32.v8i1.p201v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %b, <8 x i32> %off)
+ ret <8 x i32> %0
+}
+
+; CHECK-LABEL: vld_msk_idxi_disp:
+; CHECK: v0 = vld.idx.msk [tilespmem:v0+s0+$0x400], vm0;
+define <8 x i32> @vld_msk_idxi_disp(<8 x i32> addrspace(201)* %b, <8 x i32> %off, <8 x i1> %m) {
+entry:
+ %a = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %b, i32 128
+ %0 = tail call <8 x i32> @llvm.tpu.vld.msk.idx.v8i32.v8i1.p201v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %a, <8 x i32> %off)
+ ret <8 x i32> %0
+}
+
+; CHECK-LABEL: vld_msk_idxf:
+; CHECK: v0 = vld.idx.msk [tilespmem:v0+s0+$0x0], vm0;
+define <8 x float> @vld_msk_idxf(<8 x float> addrspace(201)* %b, <8 x i32> %off, <8 x i1> %m) {
+entry:
+ %0 = tail call <8 x float> @llvm.tpu.vld.msk.idx.v8f32.v8i1.p201v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %b, <8 x i32> %off)
+ ret <8 x float> %0
+}
+
+; CHECK-LABEL: vld_msk_idxf_disp:
+; CHECK: v0 = vld.idx.msk [tilespmem:v0+s0+$0x400], vm0;
+define <8 x float> @vld_msk_idxf_disp(<8 x float> addrspace(201)* %b, <8 x i32> %off, <8 x i1> %m) {
+entry:
+ %a = getelementptr inbounds <8 x float>, <8 x float> addrspace(201)* %b, i32 128
+ %0 = tail call <8 x float> @llvm.tpu.vld.msk.idx.v8f32.v8i1.p201v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %a, <8 x i32> %off)
+ ret <8 x float> %0
+}
+
+; CHECK-LABEL: vldcbmsk_disp:
+; CHECK: v0 = vld.cb.msk [tilespmem:s0+$0x10 cbreg:$0x0], vm0;
+define <8 x i32> @vldcbmsk_disp(i32 %base, <8 x i1> %m) {
+entry:
+ %a = add i32 %base, 16
+ %0 = tail call <8 x i32> @llvm.tpu.vld.cb.msk.v8i32.v8i1(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %a)
+ ret <8 x i32> %0
+}
+
+; CHECK-LABEL: vldcbidxmski_disp:
+; CHECK: v0 = vld.idx.cb.msk [tilespmem:v0+s0+$0x80 cbreg:$0x0], vm0;
+define <8 x i32> @vldcbidxmski_disp(i32 %base, <8 x i32> %off, <8 x i1> %m) {
+entry:
+ %a = add i32 %base, 128
+ %0 = tail call <8 x i32> @llvm.tpu.vld.cb.msk.idx.v8i32.v8i1(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %a,
+ <8 x i32> %off)
+ ret <8 x i32> %0
+}
+
+; CHECK-LABEL: vldcbidxmskf_disp:
+; CHECK: v0 = vld.idx.cb.msk [tilespmem:v0+s0+$0x100 cbreg:$0x0], vm0;
+define <8 x float> @vldcbidxmskf_disp(i32 %base, <8 x i32> %off, <8 x i1> %m) {
+entry:
+ %a = add i32 %base, 256
+ %0 = tail call <8 x float> @llvm.tpu.vld.cb.msk.idx.v8f32.v8i1(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %a,
+ <8 x i32> %off)
+ ret <8 x float> %0
+}
+
+; CHECK-LABEL: vldcbidxmski_np_disp:
+; CHECK: v0 = vld.idx.cb.msk [tilespmem:v0+s0+$0x800 cbreg:$0x0], vm0;
+define <8 x i32> @vldcbidxmski_np_disp(i32 %base, <8 x i32> %off, <8 x i1> %m) {
+entry:
+ %a = add i32 %base, 2048
+ %0 = tail call <8 x i32> @llvm.tpu.vld.cb.msk.idx.np.v8i32.v8i1(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %a,
+ <8 x i32> %off)
+ ret <8 x i32> %0
+}
+
+; CHECK-LABEL: vldcbidxmskf_np_disp:
+; CHECK: v0 = vld.idx.cb.msk [tilespmem:v0+s0+$0x1000 cbreg:$0x0], vm0;
+define <8 x float> @vldcbidxmskf_np_disp(i32 %base, <8 x i32> %off, <8 x i1> %m) {
+entry:
+ %a = add i32 %base, 4096
+ %0 = tail call <8 x float> @llvm.tpu.vld.cb.msk.idx.np.v8f32.v8i1(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %a,
+ <8 x i32> %off)
+ ret <8 x float> %0
+}
+
+; CHECK-LABEL: vstcbmsk_disp:
+; CHECK: [tilespmem:s0+$0x2000 cbreg:$0x0] = vst.cb.msk vm0, v0;
+define void @vstcbmsk_disp(i32 %base, <8 x i32> %val, <8 x i1> %m) {
+entry:
+ %a = add i32 %base, 8192
+ tail call void @llvm.tpu.vst.cb.msk.v8i1.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %a,
+ <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbidxmski_disp:
+; CHECK: [tilespmem:v0+s0+$0x4000 cbreg:$0x0] = vst.idx.cb.msk vm0, v1;
+define void @vstcbidxmski_disp(i32 %base, <8 x i32> %off, <8 x i32> %val, <8 x i1> %m) {
+entry:
+ %a = add i32 %base, 16384
+ tail call void @llvm.tpu.vst.cb.msk.idx.v8i1.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %a,
+ <8 x i32> %off, <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbidxmskf_disp:
+; CHECK: [tilespmem:v0+s0+$0x8000 cbreg:$0x0] = vst.idx.cb.msk vm0, v1;
+define void @vstcbidxmskf_disp(i32 %base, <8 x i32> %off, <8 x float> %val, <8 x i1> %m) {
+entry:
+ %a = add i32 %base, 32768
+ tail call void @llvm.tpu.vst.cb.msk.idx.v8i1.v8f32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %a,
+ <8 x i32> %off, <8 x float> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbidxaddi_disp:
+; CHECK: [tilespmem:v0+s0+$0x40000 cbreg:$0x0] = vst.idx.cb.add.s32.msk vm0, v1;
+define void @vstcbidxaddi_disp(<8 x i1> %m, i32 %base, <8 x i32> %off, <8 x i32> %val) {
+entry:
+ %a = add i32 %base, 262144
+ tail call void @llvm.tpu.vst.cb.msk.idx.add.v8i1.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %a,
+ <8 x i32> %off, <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbidxaddf_disp:
+; CHECK: [tilespmem:v0+s0+$0x4 cbreg:$0x0] = vst.idx.cb.add.f32.msk vm0, v1;
+define void @vstcbidxaddf_disp(<8 x i1> %m, i32 %base, <8 x i32> %off, <8 x float> %val) {
+entry:
+ %a = add i32 %base, 4
+ tail call void @llvm.tpu.vst.cb.msk.idx.add.v8i1.v8f32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %a,
+ <8 x i32> %off, <8 x float> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbidxaddi_np_disp:
+; CHECK: [tilespmem:v0+s0+$0x8 cbreg:$0x0] = vst.idx.cb.add.s32.msk vm0, v1;
+define void @vstcbidxaddi_np_disp(<8 x i1> %m, i32 %base, <8 x i32> %off, <8 x i32> %val) {
+entry:
+ %a = add i32 %base, 8
+ tail call void @llvm.tpu.vst.cb.msk.idx.add.np.v8i1.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %a,
+ <8 x i32> %off, <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstcbidxaddf_np_disp:
+; CHECK: [tilespmem:v0+s0+$0x10 cbreg:$0x0] = vst.idx.cb.add.f32.msk vm0, v1;
+define void @vstcbidxaddf_np_disp(<8 x i1> %m, i32 %base, <8 x i32> %off, <8 x float> %val) {
+entry:
+ %a = add i32 %base, 16
+ tail call void @llvm.tpu.vst.cb.msk.idx.add.np.v8i1.v8f32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %a,
+ <8 x i32> %off, <8 x float> %val)
+ ret void
+}
+
+; CHECK-LABEL: vldcbupdmsk_disp:
+; CHECK: v0 = vld.cb.upd.msk [tilespmem:s0+$0x20 cbreg:$0x0], vm0;
+define <8 x i32> @vldcbupdmsk_disp(i32 %base, <8 x i1> %m) {
+entry:
+ %a = add i32 %base, 32
+ %0 = tail call <8 x i32> @llvm.tpu.vld.cb.upd.msk.v8i32.v8i1(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %a)
+ ret <8 x i32> %0
+}
+
+; CHECK-LABEL: vstcbupdmsk_disp:
+; CHECK: [tilespmem:s0+$0x40 cbreg:$0x0] = vst.cb.upd.msk vm0, v0;
+define void @vstcbupdmsk_disp(i32 %base, <8 x i32> %val, <8 x i1> %m) {
+entry:
+ %a = add i32 %base, 64
+ tail call void @llvm.tpu.vst.cb.upd.msk.v8i1.v8i32(<8 x i1> %m,
+ x86_mmx undef,
+ i32 %a,
+ <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vsti:
+; CHECK: [tilespmem:s0+$0x0] = vst v0
+define void @vsti(<8 x i32> %a, <8 x i32> addrspace(201)* %b) {
+ store <8 x i32> %a, <8 x i32> addrspace(201)* %b
+ ret void
+}
+
+; CHECK-LABEL: vsti2:
+; CHECK: [tilespmem:s0+$0x8] = vst v0
+define void @vsti2(<8 x i32> %a, <8 x i32> addrspace(201)* %b) {
+ %addr = getelementptr <8 x i32>, <8 x i32> addrspace(201)* %b, i32 1
+ store <8 x i32> %a, <8 x i32> addrspace(201)* %addr
+ ret void
+}
+
+; CHECK-LABEL: vstf2:
+; CHECK: [tilespmem:s0+$0x8] = vst v0
+define void @vstf2(<8 x float> %a, <8 x float> addrspace(201)* %b) {
+ %addr = getelementptr <8 x float>, <8 x float> addrspace(201)* %b, i32 1
+ store <8 x float> %a, <8 x float> addrspace(201)* %addr
+ ret void
+}
+
+; CHECK-LABEL: vst_gep:
+; CHECK: s0 = sadd.s32 s1, s0
+; CHECK: [tilespmem:s0+$0x0] = vst v0
+define void @vst_gep(<8 x float> %a, <8 x float> addrspace(201)* %b, i32 %c) {
+ %addr = getelementptr <8 x float>, <8 x float> addrspace(201)* %b, i32 %c
+ store <8 x float> %a, <8 x float> addrspace(201)* %addr
+ ret void
+}
+
+; CHECK-LABEL: vld_struct:
+; CHECK: v0 = vld [tilespmem:s0+$0x8]
+define <8 x i32> @vld_struct(%struct.V addrspace(201)* %v) {
+ %p = getelementptr inbounds %struct.V, %struct.V addrspace(201)* %v, i32 0, i32 1
+ %l = load <8 x i32>, <8 x i32> addrspace(201)* %p
+ ret <8 x i32> %l
+}
+
+; CHECK-LABEL: vst_global:
+; CHECK: [tilespmem:garr+16] = vst v0
+define void @vst_global(<8 x i32> %a) {
+ %addr = getelementptr <8 x i32>, <8 x i32> addrspace(201)* @garr, i32 2
+ store <8 x i32> %a, <8 x i32> addrspace(201)* %addr
+ ret void
+}
+
+; CHECK-LABEL: vst_globalf:
+; CHECK: [tilespmem:garrf+16] = vst v0
+define void @vst_globalf(<8 x float> %a) {
+ %addr = getelementptr <8 x float>, <8 x float> addrspace(201)* @garrf, i32 2
+ store <8 x float> %a, <8 x float> addrspace(201)* %addr
+ ret void
+}
+
+; CHECK-LABEL: vst_immad:
+; CHECK: [tilespmem:$0x10] = vst v0
+define void @vst_immad(<8 x i32> %a) {
+ %addr = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 16)
+ store <8 x i32> %a, <8 x i32> addrspace(201)* %addr
+ ret void
+}
+
+; CHECK-LABEL: vst_immadf:
+; CHECK: [tilespmem:$0x10] = vst v0
+define void @vst_immadf(<8 x float> %a) {
+ %addr = call <8 x float> addrspace(201)* @llvm.tpu.inttoptr.p201v8float(i32 16)
+ store <8 x float> %a, <8 x float> addrspace(201)* %addr
+ ret void
+}
+
+; CHECK-LABEL: vst_msk_idxi:
+; CHECK: [tilespmem:v0+s0+$0x0] = vst.idx.msk vm0, v1
+define void @vst_msk_idxi(<8 x i1> %m, <8 x i32> %offs, <8 x i32> %v) {
+ tail call void @llvm.tpu.vst.msk.idx.v8i1.p201v8i32.v8i32(<8 x i1> %m,
+ <8 x i32> addrspace(201)* @garr,
+ <8 x i32> %offs,
+ <8 x i32> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vst_msk_idxi_disp:
+; CHECK: [tilespmem:v0+s0+$0x200] = vst.idx.msk vm0, v1
+define void @vst_msk_idxi_disp(<8 x i32> addrspace(201)* %b, <8 x i1> %m, <8 x i32> %offs, <8 x i32> %v) {
+ %a = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %b, i32 64
+ tail call void @llvm.tpu.vst.msk.idx.v8i1.p201v8i32.v8i32(<8 x i1> %m,
+ <8 x i32> addrspace(201)* %a,
+ <8 x i32> %offs,
+ <8 x i32> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vst_msk_idxf:
+; CHECK: [tilespmem:v0+s0+$0x0] = vst.idx.msk vm0, v1
+define void @vst_msk_idxf(<8 x i1> %m, <8 x i32> %offs, <8 x float> %v) {
+ tail call void @llvm.tpu.vst.msk.idx.v8i1.p201v8f32.v8f32(<8 x i1> %m,
+ <8 x float> addrspace(201)* @garrf,
+ <8 x i32> %offs,
+ <8 x float> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vst_msk_idxf_disp:
+; CHECK: [tilespmem:v0+s0+$0x200] = vst.idx.msk vm0, v1
+define void @vst_msk_idxf_disp(<8 x float> addrspace(201)* %b, <8 x i1> %m, <8 x i32> %offs, <8 x float> %v) {
+ %a = getelementptr inbounds <8 x float>, <8 x float> addrspace(201)* %b, i32 64
+ tail call void @llvm.tpu.vst.msk.idx.v8i1.p201v8f32.v8f32(<8 x i1> %m,
+ <8 x float> addrspace(201)* %a,
+ <8 x i32> %offs,
+ <8 x float> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vst_msk_idx_addi:
+; CHECK: [tilespmem:v0+s0+$0x0] = vst.idx.add.s32.msk vm0, v1
+define void @vst_msk_idx_addi(<8 x i1> %m, <8 x i32> %offs, <8 x i32> %v) {
+ tail call void @llvm.tpu.vst.msk.idx.add.v8i1.p201v8i32.v8i32(<8 x i1> %m,
+ <8 x i32> addrspace(201)* @garr,
+ <8 x i32> %offs,
+ <8 x i32> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vst_msk_idx_addi_disp:
+; CHECK: [tilespmem:v0+s0+$0x200] = vst.idx.add.s32.msk vm0, v1
+define void @vst_msk_idx_addi_disp(<8 x i32> addrspace(201)* %b, <8 x i1> %m, <8 x i32> %offs, <8 x i32> %v) {
+ %a = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %b, i32 64
+ tail call void @llvm.tpu.vst.msk.idx.add.v8i1.p201v8i32.v8i32(<8 x i1> %m,
+ <8 x i32> addrspace(201)* %a,
+ <8 x i32> %offs,
+ <8 x i32> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vst_msk_idx_addi_np_disp:
+; CHECK: [tilespmem:v0+s0+$0x200] = vst.idx.add.s32.msk vm0, v1
+define void @vst_msk_idx_addi_np_disp(<8 x i32> addrspace(201)* %b, <8 x i1> %m, <8 x i32> %offs, <8 x i32> %v) {
+ %a = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %b, i32 64
+ tail call void @llvm.tpu.vst.msk.idx.add.np.v8i1.p201v8i32.v8i32(<8 x i1> %m,
+ <8 x i32> addrspace(201)* %a,
+ <8 x i32> %offs,
+ <8 x i32> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vst_msk_idx_addf:
+; CHECK: [tilespmem:v0+s0+$0x0] = vst.idx.add.f32.msk vm0, v1
+define void @vst_msk_idx_addf(<8 x i1> %m, <8 x i32> %offs, <8 x float> %v) {
+ tail call void @llvm.tpu.vst.msk.idx.add.v8i1.p201v8f32.v8f32(<8 x i1> %m,
+ <8 x float> addrspace(201)* @garrf,
+ <8 x i32> %offs,
+ <8 x float> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vst_msk_idx_addf_disp:
+; CHECK: [tilespmem:v0+s0+$0x200] = vst.idx.add.f32.msk vm0, v1
+define void @vst_msk_idx_addf_disp(<8 x float> addrspace(201)* %b, <8 x i1> %m, <8 x i32> %offs, <8 x float> %v) {
+ %a = getelementptr inbounds <8 x float>, <8 x float> addrspace(201)* %b, i32 64
+ tail call void @llvm.tpu.vst.msk.idx.add.v8i1.p201v8f32.v8f32(<8 x i1> %m,
+ <8 x float> addrspace(201)* %a,
+ <8 x i32> %offs,
+ <8 x float> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vst_msk_idx_addf_np_disp:
+; CHECK: [tilespmem:v0+s0+$0x200] = vst.idx.add.f32.msk vm0, v1
+define void @vst_msk_idx_addf_np_disp(<8 x float> addrspace(201)* %b, <8 x i1> %m, <8 x i32> %offs, <8 x float> %v) {
+ %a = getelementptr inbounds <8 x float>, <8 x float> addrspace(201)* %b, i32 64
+ tail call void @llvm.tpu.vst.msk.idx.add.np.v8i1.p201v8f32.v8f32(<8 x i1> %m,
+ <8 x float> addrspace(201)* %a,
+ <8 x i32> %offs,
+ <8 x float> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vst_msk:
+; CHECK: [tilespmem:s0+$0x0] = vst.msk vm0, v0
+define void @vst_msk(<8 x i1> %m, <8 x i32> %v) {
+ tail call void @llvm.tpu.vst.msk.v8i1.p201v8i32.v8i32(<8 x i1> %m,
+ <8 x i32> addrspace(201)* @garr,
+ <8 x i32> %v)
+
+ ret void
+}
+
+; CHECK-LABEL: vst_msk_disp:
+; CHECK: [tilespmem:s0+$0x100] = vst.msk vm0, v0
+define void @vst_msk_disp(<8 x i32> addrspace(201)* %b,
+ <8 x i1> %m, <8 x i32> %v) {
+ %a = getelementptr inbounds <8 x i32>, <8 x i32> addrspace(201)* %b, i32 32
+ tail call void @llvm.tpu.vst.msk.v8i1.p201v8i32.v8i32(<8 x i1> %m,
+ <8 x i32> addrspace(201)* %a,
+ <8 x i32> %v)
+ ret void
+}
+
+; TODO(hgreving): we only support 19-bit immediates right now because we're
+; conflating signed and unsigned immediates. See comment in TPUInstrFormat.td.
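+;
+; For reference, a rough sketch of the arithmetic behind the checks below
+; (assuming the word-addressed displacements the mnemonics suggest): a GEP
+; index of 16777215 over <8 x i32> elements is 16777215 * 8 = 0x7fffff8 words,
+; which exceeds the 19-bit displacement field and has to be materialized via
+; sadd.s32/simm.s32, whereas 65535 * 8 = 0x7fff8 and the raw address
+; 524287 = 0x7ffff still fold into the vld/vst displacement.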
+
+; CHECK-LABEL: vld_32bitimm_0
+; CHECK: { s[[s:[0-9]+]] = sadd.s32 $0x7fffff8, s{{[0-9]+}}
+; CHECK-NEXT: { v{{[0-9]+}} = vld [tilespmem:s[[s]]+$0x0]
+define <8 x i32> @vld_32bitimm_0(<8 x i32> addrspace(201)* %a) {
+ %addr = getelementptr <8 x i32>, <8 x i32> addrspace(201)* %a, i32 16777215
+ %b = load <8 x i32>, <8 x i32> addrspace(201)* %addr
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vld_32bitimm_1
+; CHECK: { s[[s:[0-9]+]] = simm.s32 $0xffffff
+; CHECK-NEXT: { v{{[0-9]+}} = vld [tilespmem:s[[s]]+$0x0]
+define <8 x i32> @vld_32bitimm_1(<8 x i32> addrspace(201)* %a) {
+ %b = load <8 x i32>, <8 x i32> addrspace(201)* inttoptr (i32 16777215 to <8 x i32> addrspace(201)*)
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vld_19bitimm_0
+; CHECK: { v{{[0-9]+}} = vld [tilespmem:s{{[0-9]+}}+$0x7fff8]
+define <8 x i32> @vld_19bitimm_0(<8 x i32> addrspace(201)* %a) {
+ %addr = getelementptr <8 x i32>, <8 x i32> addrspace(201)* %a, i32 65535
+ %b = load <8 x i32>, <8 x i32> addrspace(201)* %addr
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vld_19bitimm_1
+; CHECK: { v{{[0-9]+}} = vld [tilespmem:$0x7ffff]
+define <8 x i32> @vld_19bitimm_1(<8 x i32> addrspace(201)* %a) {
+ %b = load <8 x i32>, <8 x i32> addrspace(201)* inttoptr (i32 524287 to <8 x i32> addrspace(201)*)
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vst_32bitimm_0
+; CHECK: { s[[s:[0-9]+]] = sadd.s32 $0x7fffff8, s{{[0-9]+}}
+; CHECK-NEXT: { [tilespmem:s[[s]]+$0x0] = vst
+define void @vst_32bitimm_0(<8 x i32> addrspace(201)* %a, <8 x i32> %b) {
+ %addr = getelementptr <8 x i32>, <8 x i32> addrspace(201)* %a, i32 16777215
+ store <8 x i32> %b, <8 x i32> addrspace(201)* %addr
+ ret void
+}
+
+; CHECK-LABEL: vst_32bitimm_1
+; CHECK: { s[[s:[0-9]+]] = simm.s32 $0xffffff
+; CHECK-NEXT: { [tilespmem:s[[s]]+$0x0] = vst
+define void @vst_32bitimm_1(<8 x i32> addrspace(201)* %a, <8 x i32> %b) {
+ store <8 x i32> %b, <8 x i32> addrspace(201)* inttoptr (i32 16777215 to <8 x i32> addrspace(201)*)
+ ret void
+}
+
+; CHECK-LABEL: vst_19bitimm_0
+; CHECK: { [tilespmem:s{{[0-9]+}}+$0x7fff8] = vst
+define void @vst_19bitimm_0(<8 x i32> addrspace(201)* %a, <8 x i32> %b) {
+ %addr = getelementptr <8 x i32>, <8 x i32> addrspace(201)* %a, i32 65535
+ store <8 x i32> %b, <8 x i32> addrspace(201)* %addr
+ ret void
+}
+
+; CHECK-LABEL: vst_19bitimm_1
+; CHECK: { [tilespmem:$0x7ffff] = vst
+define void @vst_19bitimm_1(<8 x i32> addrspace(201)* %a, <8 x i32> %b) {
+ store <8 x i32> %b, <8 x i32> addrspace(201)* inttoptr (i32 524287 to <8 x i32> addrspace(201)*)
+ ret void
+}
+
+; We're providing these patterns right now, though we may decide to -disable-vector-combine
+; in the future.
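+;
+; As the checks below suggest, a scalar extract from a tilespmem vector is
+; lowered as a full vld followed by a vpush/spop pair that routes the selected
+; lane through the (v2sf) vector-to-scalar FIFO (inferred from the check
+; lines).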
+
+; CHECK-LABEL: svldi:
+; CHECK: v0 = vld [tilespmem:$0x8]
+; CHECK: (v2sf) = vpush v0, $0x3
+; CHECK: s0 = spop (v2sf)
+define i32 @svldi(<8 x i32> addrspace(201)* %a) {
+ %v = load <8 x i32>, <8 x i32> addrspace(201)* inttoptr (i32 8 to <8 x i32> addrspace(201)*)
+ %b = extractelement <8 x i32> %v, i32 3
+ ret i32 %b
+}
+
+; CHECK-LABEL: svlda:
+; CHECK: v0 = vld [tilespmem:s0+$0x0]
+; CHECK: (v2sf) = vpush v0, $0x1
+; CHECK: s0 = spop (v2sf)
+define i32 @svlda(<8 x i32> addrspace(201)* %a) {
+ %addr = getelementptr <8 x i32>, <8 x i32> addrspace(201)* %a, i32 0
+ %v = load <8 x i32>, <8 x i32> addrspace(201)* %addr
+ %b = extractelement <8 x i32> %v, i32 1
+ ret i32 %b
+}
+
+; CHECK-LABEL: svldfi:
+; CHECK: v0 = vld [tilespmem:$0x8]
+; CHECK: (v2sf) = vpush v0, $0x3
+; CHECK: s0 = spop (v2sf)
+define float @svldfi(<8 x float> addrspace(201)* %a) {
+ %v = load <8 x float>, <8 x float> addrspace(201)* inttoptr (i32 8 to <8 x float> addrspace(201)*)
+ %b = extractelement <8 x float> %v, i32 3
+ ret float %b
+}
+
+; CHECK-LABEL: svldfa:
+; CHECK: v0 = vld [tilespmem:s0+$0x0]
+; CHECK: (v2sf) = vpush v0, $0x1
+; CHECK: s0 = spop (v2sf)
+define float @svldfa(<8 x float> addrspace(201)* %a) {
+ %addr = getelementptr <8 x float>, <8 x float> addrspace(201)* %a, i32 0
+ %v = load <8 x float>, <8 x float> addrspace(201)* %addr
+ %b = extractelement <8 x float> %v, i32 1
+ ret float %b
+}
+
+; CHECK-LABEL: svlds:
+; CHECK: v0 = vld [tilespmem:s0+$0x8]
+; CHECK: (v2sf) = vpush v0, $0x1
+; CHECK: s0 = spop (v2sf)
+define i32 @svlds(%struct.V addrspace(201)* %s) {
+ %p = getelementptr inbounds %struct.V, %struct.V addrspace(201)* %s, i32 0, i32 1
+ %v = load <8 x i32>, <8 x i32> addrspace(201)* %p
+ %b = extractelement <8 x i32> %v, i32 1
+ ret i32 %b
+}
+
+; CHECK-LABEL: scalarized_gep:
+; CHECK: s0 = sadd.s32 s1, s0
+; CHECK: v0 = vld [tilespmem:s0+$0x0]
+; CHECK: (v2sf) = vpush v0, $0x0
+; CHECK: s0 = spop (v2sf)
+define i32 @scalarized_gep(i32 addrspace(201)* %a, i32 %b) {
+ %1 = getelementptr i32, i32 addrspace(201)* %a, i32 %b
+ %2 = load i32, i32 addrspace(201)* %1, align 32
+ ret i32 %2
+}
+
+; CHECK-LABEL: scalarized_gep2:
+; CHECK: v0 = vld [tilespmem:s0+$0x8]
+; CHECK: (v2sf) = vpush v0, $0x0
+; CHECK: s0 = spop (v2sf)
+define i32 @scalarized_gep2(<8 x i32> addrspace(201)* %a) {
+ %1 = getelementptr <8 x i32>, <8 x i32> addrspace(201)* %a, i32 0, i32 8
+ %2 = bitcast i32 addrspace(201)* %1 to <8 x i32> addrspace(201)*
+ %3 = load <8 x i32>, <8 x i32> addrspace(201)* %2, align 32
+ %r = extractelement <8 x i32> %3, i32 0
+ ret i32 %r
+}
+
+; CHECK-LABEL: scalarized_gep3:
+; CHECK: v0 = vld [tilespmem:s0+$0x8]
+; CHECK: (v2sf) = vpush v0, $0x0
+; CHECK: s0 = spop (v2sf)
+define i32 @scalarized_gep3(i32 addrspace(201)* %a) {
+ %1 = getelementptr i32, i32 addrspace(201)* %a, i32 8
+ %r = load i32, i32 addrspace(201)* %1, align 32
+ ret i32 %r
+}
+
+@a = addrspace(201) global <8 x i32> <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>, align 32
+
+; CHECK-LABEL: scalarized_gep4:
+; CHECK: v0 = vld [tilespmem:a+2]
+; CHECK: (v2sf) = vpush v0, $0x0
+; CHECK: s{{[0-9]+}} = spop (v2sf)
+define void @scalarized_gep4(i32 %b, i32 %c) {
+entry:
+ %vecext = load i32, i32 addrspace(201)* getelementptr (<8 x i32>, <8 x i32> addrspace(201)* @a, i32 0, i32 2), align 8
+ %0 = load i32, i32* inttoptr (i32 256 to i32*), align 256
+ %add = add nsw i32 %vecext, %0
+ store i32 %add, i32* inttoptr (i32 256 to i32*), align 256
+ ret void
+}
+
+; CHECK-LABEL: vldi_align:
+; CHECK: v0 = vld [tilespmem:s0+$0x0]
+define <8 x i32> @vldi_align(<8 x i32> addrspace(201)* %a) {
+ %b = load <8 x i32>, <8 x i32> addrspace(201)* %a, align 4
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: vsti_align:
+; CHECK: [tilespmem:s0+$0x0] = vst v0
+define void @vsti_align(<8 x i32> %a, <8 x i32> addrspace(201)* %b) {
+ store <8 x i32> %a, <8 x i32> addrspace(201)* %b, align 4
+ ret void
+}
+
+; CHECK-LABEL: vldmsk_strided:
+; CHECK: v0 = vld.msk [tilespmem:s0+$0x0 ss:s1], vm0;
+define <8 x i32> @vldmsk_strided(<8 x i32> addrspace(201)* %base, <8 x i1> %m, i32 %s) {
+entry:
+ %0 = tail call <8 x i32> @llvm.tpu.vld.msk.strided.v8i32.p201v8i32.v8i1(<8 x i1> %m, <8 x i32> addrspace(201)* %base, i32 %s)
+ ret <8 x i32> %0
+}
+
+; CHECK-LABEL: vstmsk_strided:
+; CHECK: [tilespmem:s0+$0x0 ss:s1] = vst.msk vm0, v0;
+define void @vstmsk_strided(<8 x i32> addrspace(201)* %base, <8 x i32> %val, <8 x i1> %m, i32 %s) {
+entry:
+ tail call void @llvm.tpu.vst.msk.strided.v8i1.p201v8i32.v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %base, i32 %s, <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstmskadd_stridedi:
+; CHECK: [tilespmem:s0+$0x0 ss:s1] = vst.add.s32.msk vm0, v0;
+define void @vstmskadd_stridedi(<8 x i32> addrspace(201)* %base, <8 x i32> %val, <8 x i1> %m, i32 %s) {
+entry:
+ tail call void @llvm.tpu.vst.msk.add.strided.v8i1.p201v8i32.v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %base, i32 %s, <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstmskadd_stridedf:
+; CHECK: [tilespmem:s0+$0x0 ss:s1] = vst.add.f32.msk vm0, v0;
+define void @vstmskadd_stridedf(<8 x float> addrspace(201)* %base, <8 x float> %val, <8 x i1> %m, i32 %s) {
+entry:
+ tail call void @llvm.tpu.vst.msk.add.strided.v8i1.p201v8f32.v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %base, i32 %s, <8 x float> %val)
+ ret void
+}
+
+; CHECK-LABEL: vldmskidx_stridedi:
+; CHECK: v0 = vld.idx.msk [tilespmem:v0+s0+$0x0 ss:s1], vm0;
+define <8 x i32> @vldmskidx_stridedi(<8 x i32> addrspace(201)* %base, <8 x i32> %off, <8 x i1> %m, i32 %s) {
+entry:
+ %0 = tail call <8 x i32> @llvm.tpu.vld.msk.idx.strided.v8i32.v8i1.p201v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %base, <8 x i32> %off, i32 %s)
+ ret <8 x i32> %0
+}
+
+; CHECK-LABEL: vldmskidx_stridedf:
+; CHECK: v0 = vld.idx.msk [tilespmem:v0+s0+$0x0 ss:s1], vm0;
+define <8 x float> @vldmskidx_stridedf(<8 x float> addrspace(201)* %base, <8 x i32> %off, <8 x i1> %m, i32 %s) {
+entry:
+ %0 = tail call <8 x float> @llvm.tpu.vld.msk.idx.strided.v8f32.v8i1.p201v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %base, <8 x i32> %off, i32 %s)
+ ret <8 x float> %0
+}
+
+; CHECK-LABEL: vstmskidx_stridedi:
+; CHECK: [tilespmem:v0+s0+$0x0 ss:s1] = vst.idx.msk vm0, v1;
+define void @vstmskidx_stridedi(<8 x i32> addrspace(201)* %base, <8 x i32> %off, <8 x i32> %val, <8 x i1> %m, i32 %s) {
+entry:
+ tail call void @llvm.tpu.vst.msk.idx.strided.v8i1.p201v8i32.v8i32(<8 x i1> %m, <8 x i32> addrspace(201)* %base, <8 x i32> %off, i32 %s, <8 x i32> %val)
+ ret void
+}
+
+; CHECK-LABEL: vstmskidx_stridedf:
+; CHECK: [tilespmem:v0+s0+$0x0 ss:s1] = vst.idx.msk vm0, v1;
+define void @vstmskidx_stridedf(<8 x float> addrspace(201)* %base, <8 x i32> %off, <8 x float> %val, <8 x i1> %m, i32 %s) {
+entry:
+ tail call void @llvm.tpu.vst.msk.idx.strided.v8i1.p201v8f32.v8f32(<8 x i1> %m, <8 x float> addrspace(201)* %base, <8 x i32> %off, i32 %s, <8 x float> %val)
+ ret void
+}
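+
+; A minimal usage sketch (hypothetical @sketch_scatter_add, assuming only the
+; intrinsic signature exercised in the tests above): scatter-add %v into @garr
+; at the lanes where %offs is below %bound.
+define void @sketch_scatter_add(<8 x i32> %offs, <8 x i32> %v, <8 x i32> %bound) {
+  ; Build the lane mask with a vector compare, then issue the masked
+  ; indexed add-store through the same intrinsic used by the tests above.
+  %m = icmp slt <8 x i32> %offs, %bound
+  tail call void @llvm.tpu.vst.msk.idx.add.v8i1.p201v8i32.v8i32(<8 x i1> %m,
+                                                                <8 x i32> addrspace(201)* @garr,
+                                                                <8 x i32> %offs,
+                                                                <8 x i32> %v)
+  ret void
+}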
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_tc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_tc.ll
new file mode 100644
index 0000000..9f715ab
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_ldst_tc.ll
@@ -0,0 +1,353 @@
+; RUN: llc < %s -mcpu=tensorcore-pf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <1024 x i32> @llvm.tpu.vld.shuffle.i32(<1024 x i32> addrspace(205)*, i32, i32)
+declare <1024 x float> @llvm.tpu.vld.shuffle.f32(<1024 x float> addrspace(205)*, i32, i32)
+declare <1024 x i32> @llvm.tpu.vld.strided.i32(<1024 x i32> addrspace(205)*, i32, i32)
+declare <1024 x float> @llvm.tpu.vld.strided.f32(<1024 x float> addrspace(205)*, i32, i32)
+declare void @llvm.tpu.vst.strided.i32(<1024 x i32>, <1024 x i32> addrspace(205)*, i32, i32, <1024 x i1>)
+declare void @llvm.tpu.vst.strided.f32(<1024 x float>, <1024 x float> addrspace(205)*, i32, i32, <1024 x i1>)
+declare <1024 x i32> @llvm.tpu.vld.indexed.i32(<1024 x i32> addrspace(205)*, i32, i32, i32, i32)
+declare <1024 x float> @llvm.tpu.vld.indexed.f32(<1024 x float> addrspace(205)*, i32, i32, i32, i32)
+declare void @llvm.tpu.vst.indexed.i32(<1024 x i32>, <1024 x i32> addrspace(205)*, i32, i32, <1024 x i1>, i32, i32)
+declare void @llvm.tpu.vst.indexed.f32(<1024 x float>, <1024 x float> addrspace(205)*, i32, i32, <1024 x i1>, i32, i32)
+declare void @llvm.tpu.vst.evenodd.sublanes.f32(<1024 x float>, <1024 x float> addrspace(205)*, i32, i32, <1024 x i1>, i32)
+declare <1024 x float> @llvm.tpu.vld.replicate.evenodd.sublanes.f32(<1024 x float> addrspace(205)*, i32, i32, i32)
+
+declare i32 @llvm.tpu.set.lane.indexed(<1024 x i32>, i32)
+declare i32 @llvm.tpu.set.sublane.indexed(<1024 x i32>, i32)
+declare i32 @llvm.tpu.set.iar.raw(<1024 x i32>, i32)
+declare <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32) nounwind
+
+
+; CHECK-LABEL: ld_shuffle_imm:
+; CHECK: v0 = vld.sshfl [vmem:$0x1 sm:$0x7b], $0x1234567
+define <1024 x i32> @ld_shuffle_imm() {
+entry:
+ %0 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 1)
+ %1 = call <1024 x i32> @llvm.tpu.vld.shuffle.i32(<1024 x i32> addrspace(205)* nonnull %0, i32 123, i32 19088743)
+ ret <1024 x i32> %1
+}
+
+; CHECK-LABEL: ld_shuffle_ptr:
+; CHECK: v0 = vld.sshfl [vmem:s0+$0x0 sm:$0x7b], $0x1234567
+define <1024 x i32> @ld_shuffle_ptr(<1024 x i32> addrspace(205)* readonly %ptr) {
+entry:
+ %0 = call <1024 x i32> @llvm.tpu.vld.shuffle.i32(<1024 x i32> addrspace(205)* %ptr, i32 123, i32 19088743)
+ ret <1024 x i32> %0
+}
+
+; CHECK-LABEL: ld_shuffle_ptr_imm:
+; CHECK: v0 = vld.sshfl [vmem:s0+$0x8 sm:$0x7b], $0x1234567
+define <1024 x i32> @ld_shuffle_ptr_imm(<1024 x i32> addrspace(205)* readonly %ptr) {
+entry:
+ %add.ptr = getelementptr inbounds <1024 x i32>, <1024 x i32> addrspace(205)* %ptr, i32 1
+ %0 = call <1024 x i32> @llvm.tpu.vld.shuffle.i32(<1024 x i32> addrspace(205)* %add.ptr, i32 123, i32 19088743)
+ ret <1024 x i32> %0
+}
+
+; CHECK-LABEL: ld_shuffle__reg:
+; CHECK: v0 = vld.sshfl [vmem:s0+$0x8 sm:s1], s2
+define <1024 x i32> @ld_shuffle__reg(<1024 x i32> addrspace(205)* readonly %ptr, i32 %mask, i32 %shuffle_){
+entry:
+ %add.ptr = getelementptr inbounds <1024 x i32>, <1024 x i32> addrspace(205)* %ptr, i32 1
+ %0 = tail call <1024 x i32> @llvm.tpu.vld.shuffle.i32(<1024 x i32> addrspace(205)* %add.ptr, i32 %mask, i32 %shuffle_)
+ ret <1024 x i32> %0
+}
+
+; CHECK-LABEL: ld_shuffle__reg_f:
+; CHECK: v0 = vld.sshfl [vmem:s0+$0x8 sm:s1], s2
+define <1024 x float> @ld_shuffle__reg_f(<1024 x float> addrspace(205)* readonly %ptr, i32 %mask, i32 %shuffle_){
+entry:
+ %add.ptr = getelementptr inbounds <1024 x float>, <1024 x float> addrspace(205)* %ptr, i32 1
+ %0 = tail call <1024 x float> @llvm.tpu.vld.shuffle.f32(<1024 x float> addrspace(205)* %add.ptr, i32 %mask, i32 %shuffle_)
+ ret <1024 x float> %0
+}
+
+; CHECK-LABEL: ld_shuffle__reg_f_nm:
+; CHECK: v0 = vld.sshfl [vmem:s0+$0x8], s1
+define <1024 x float> @ld_shuffle__reg_f_nm(<1024 x float> addrspace(205)* readonly %ptr, i32 %shuffle_){
+entry:
+ %add.ptr = getelementptr inbounds <1024 x float>, <1024 x float> addrspace(205)* %ptr, i32 1
+ %0 = tail call <1024 x float> @llvm.tpu.vld.shuffle.f32(<1024 x float> addrspace(205)* %add.ptr, i32 255, i32 %shuffle_)
+ ret <1024 x float> %0
+}
+
+; CHECK-LABEL: ld_strided_imm:
+; CHECK: s[[x:[0-9]+]] = simm.s32 $0x1234567
+; CHECK: v0 = vld [vmem:$0x1 ss:s[[x]] sm:$0x7b]
+define <1024 x i32> @ld_strided_imm() {
+entry:
+ %0 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 1)
+ %1 = call <1024 x i32> @llvm.tpu.vld.strided.i32(<1024 x i32> addrspace(205)* nonnull %0, i32 123, i32 19088743)
+ ret <1024 x i32> %1
+}
+
+; CHECK-LABEL: ld_strided_ptr_imm:
+; CHECK: s[[x:[0-9]+]] = simm.s32 $0x1234567
+; CHECK: v0 = vld [vmem:s0+$0x8 ss:s[[x]] sm:$0x7b]
+define <1024 x i32> @ld_strided_ptr_imm(<1024 x i32> addrspace(205)* readonly %ptr) {
+entry:
+ %add.ptr = getelementptr inbounds <1024 x i32>, <1024 x i32> addrspace(205)* %ptr, i32 1
+ %0 = call <1024 x i32> @llvm.tpu.vld.strided.i32(<1024 x i32> addrspace(205)* %add.ptr, i32 123, i32 19088743)
+ ret <1024 x i32> %0
+}
+
+; CHECK-LABEL: ld_strided_reg:
+; CHECK: v0 = vld [vmem:s0+$0x8 ss:s2 sm:s1]
+define <1024 x i32> @ld_strided_reg(<1024 x i32> addrspace(205)* readonly %ptr, i32 %mask, i32 %strided){
+entry:
+ %add.ptr = getelementptr inbounds <1024 x i32>, <1024 x i32> addrspace(205)* %ptr, i32 1
+ %0 = tail call <1024 x i32> @llvm.tpu.vld.strided.i32(<1024 x i32> addrspace(205)* %add.ptr, i32 %mask, i32 %strided)
+ ret <1024 x i32> %0
+}
+
+; CHECK-LABEL: ld_strided_reg_f:
+; CHECK: v0 = vld [vmem:s0+$0x8 ss:s2 sm:s1]
+define <1024 x float> @ld_strided_reg_f(<1024 x float> addrspace(205)* readonly %ptr, i32 %mask, i32 %strided){
+entry:
+ %add.ptr = getelementptr inbounds <1024 x float>, <1024 x float> addrspace(205)* %ptr, i32 1
+ %0 = tail call <1024 x float> @llvm.tpu.vld.strided.f32(<1024 x float> addrspace(205)* %add.ptr, i32 %mask, i32 %strided)
+ ret <1024 x float> %0
+}
+
+; CHECK-LABEL: ld_strided_reg_f_nm:
+; CHECK: v0 = vld [vmem:s0+$0x8 ss:s1]
+define <1024 x float> @ld_strided_reg_f_nm(<1024 x float> addrspace(205)* readonly %ptr, i32 %strided){
+entry:
+ %add.ptr = getelementptr inbounds <1024 x float>, <1024 x float> addrspace(205)* %ptr, i32 1
+ %0 = tail call <1024 x float> @llvm.tpu.vld.strided.f32(<1024 x float> addrspace(205)* %add.ptr, i32 255, i32 %strided)
+ ret <1024 x float> %0
+}
+
+; CHECK-LABEL: ld_mask_no_stride_reg_f:
+; CHECK: v0 = vld [vmem:s0+$0x8 sm:s1]
+define <1024 x float> @ld_mask_no_stride_reg_f(<1024 x float> addrspace(205)* readonly %ptr, i32 %mask, i32 %strided){
+entry:
+ %add.ptr = getelementptr inbounds <1024 x float>, <1024 x float> addrspace(205)* %ptr, i32 1
+ %0 = tail call <1024 x float> @llvm.tpu.vld.strided.f32(<1024 x float> addrspace(205)* %add.ptr, i32 %mask, i32 1)
+ ret <1024 x float> %0
+}
+
+; CHECK-LABEL: st_strided_reg:
+; CHECK: [vmem:s0+$0x8 ss:s2 sm:s1] = vst.msk vm0, v0
+define void @st_strided_reg(<1024 x i32> %data, <1024 x i1> %m, <1024 x i32> addrspace(205)* readonly %ptr, i32 %mask, i32 %strided){
+entry:
+ %add.ptr = getelementptr inbounds <1024 x i32>, <1024 x i32> addrspace(205)* %ptr, i32 1
+ call void @llvm.tpu.vst.strided.i32(<1024 x i32> %data, <1024 x i32> addrspace(205)* %add.ptr, i32 %mask, i32 %strided, <1024 x i1> %m)
+ ret void
+}
+
+; CHECK-LABEL: st_strided_imm:
+; CHECK: s[[x:[0-9]+]] = simm.s32 $0x1234567
+; CHECK: [vmem:$0x1 ss:s[[x]] sm:$0x7b] = vst.msk vm0, v0
+define void @st_strided_imm(<1024 x i32> %data, <1024 x i1> %m) {
+entry:
+ %0 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 1)
+ call void @llvm.tpu.vst.strided.i32(<1024 x i32> %data, <1024 x i32> addrspace(205)* nonnull %0, i32 123, i32 19088743, <1024 x i1> %m)
+ ret void
+}
+
+; CHECK-LABEL: st_strided_ptr_imm:
+; CHECK: s[[x:[0-9]+]] = simm.s32 $0x1234567
+; CHECK: [vmem:s0+$0x8 ss:s[[x]] sm:$0x7b] = vst.msk vm0, v0
+define void @st_strided_ptr_imm(<1024 x i32> %data, <1024 x i1> %m, <1024 x i32> addrspace(205)* readonly %ptr) {
+entry:
+ %add.ptr = getelementptr inbounds <1024 x i32>, <1024 x i32> addrspace(205)* %ptr, i32 1
+ call void @llvm.tpu.vst.strided.i32(<1024 x i32> %data, <1024 x i32> addrspace(205)* %add.ptr, i32 123, i32 19088743, <1024 x i1> %m)
+ ret void
+}
+
+; CHECK-LABEL: st_strided_reg_f:
+; CHECK: [vmem:s0+$0x8 ss:s2 sm:s1] = vst.msk vm0, v0
+define void @st_strided_reg_f(<1024 x float> %data, <1024 x i1> %m, <1024 x float> addrspace(205)* readonly %ptr, i32 %mask, i32 %strided){
+entry:
+ %add.ptr = getelementptr inbounds <1024 x float>, <1024 x float> addrspace(205)* %ptr, i32 1
+ call void @llvm.tpu.vst.strided.f32(<1024 x float> %data, <1024 x float> addrspace(205)* %add.ptr, i32 %mask, i32 %strided, <1024 x i1> %m)
+ ret void
+}
+
+; CHECK-LABEL: st_strided_reg_f_nm:
+; CHECK: [vmem:s0+$0x8 ss:s1] = vst.msk vm0, v0
+define void @st_strided_reg_f_nm(<1024 x float> %data, <1024 x i1> %m, <1024 x float> addrspace(205)* readonly %ptr, i32 %strided){
+entry:
+ %add.ptr = getelementptr inbounds <1024 x float>, <1024 x float> addrspace(205)* %ptr, i32 1
+ call void @llvm.tpu.vst.strided.f32(<1024 x float> %data, <1024 x float> addrspace(205)* %add.ptr, i32 255, i32 %strided, <1024 x i1> %m)
+ ret void
+}
+
+; CHECK-LABEL: st_mask_no_stride_reg_f:
+; CHECK: [vmem:s0+$0x8 sm:s1] = vst.msk vm0, v0
+define void @st_mask_no_stride_reg_f(<1024 x float> %data, <1024 x i1> %m, <1024 x float> addrspace(205)* readonly %ptr, i32 %mask, i32 %strided){
+entry:
+ %add.ptr = getelementptr inbounds <1024 x float>, <1024 x float> addrspace(205)* %ptr, i32 1
+ call void @llvm.tpu.vst.strided.f32(<1024 x float> %data, <1024 x float> addrspace(205)* %add.ptr, i32 %mask, i32 1, <1024 x i1> %m)
+ ret void
+}
+
+; CHECK-LABEL: st_strided_reg_novmask:
+; CHECK: [vmem:s0+$0x8 ss:s2 sm:s1] = vst v0
+define void @st_strided_reg_novmask(<1024 x i32> %data, <1024 x i32> addrspace(205)* readonly %ptr, i32 %mask, i32 %strided){
+entry:
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %vmask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ %add.ptr = getelementptr inbounds <1024 x i32>, <1024 x i32> addrspace(205)* %ptr, i32 1
+ call void @llvm.tpu.vst.strided.i32(<1024 x i32> %data, <1024 x i32> addrspace(205)* %add.ptr, i32 %mask, i32 %strided, <1024 x i1> %vmask)
+ ret void
+}
+
+; CHECK-LABEL: st_nostride_novmask:
+; CHECK: [vmem:s0+$0x8 sm:s1] = vst v0
+define void @st_nostride_novmask(<1024 x i32> %data, <1024 x i32> addrspace(205)* readonly %ptr, i32 %mask){
+entry:
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %vmask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ %add.ptr = getelementptr inbounds <1024 x i32>, <1024 x i32> addrspace(205)* %ptr, i32 1
+ call void @llvm.tpu.vst.strided.i32(<1024 x i32> %data, <1024 x i32> addrspace(205)* %add.ptr, i32 %mask, i32 1, <1024 x i1> %vmask)
+ ret void
+}
+
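+; The indexed load/store tests below all follow the same idiom (inferred from
+; the check lines): vsetiar.{lane,sublane,raw} loads an index address register
+; (iar0/iar1) from a vector of indices, the following vld.iarN / vst.iar
+; consumes it, and a vdelay $0x4 covers the IAR write-to-use latency.
+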
+; CHECK-LABEL: ld_indexed_imm:
+; CHECK: (iar0) = vsetiar.lane v0
+; CHECK: s[[x:[0-9]+]] = simm.s32 $0x1234567
+; CHECK: _ = vdelay $0x4
+; CHECK: v0 = vld.iar0 [vmem:$0x1 ss:s[[x]] sm:$0x7b]
+define <1024 x i32> @ld_indexed_imm(<1024 x i32> %index) {
+entry:
+ %iar = call i32 @llvm.tpu.set.lane.indexed(<1024 x i32> %index, i32 0)
+ %0 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 1)
+ %1 = call <1024 x i32> @llvm.tpu.vld.indexed.i32(<1024 x i32> addrspace(205)* nonnull %0, i32 123, i32 19088743, i32 %iar, i32 0)
+ ret <1024 x i32> %1
+}
+
+; CHECK-LABEL: ld_indexed_ptr_iar1:
+; CHECK: (iar1) = vsetiar.sublane v0
+; CHECK: s[[x:[0-9]+]] = simm.s32 $0x1234567
+; CHECK: _ = vdelay $0x4
+; CHECK: v0 = vld.iar1 [vmem:s0+$0x8 ss:s[[x]] sm:$0x7b]
+define <1024 x i32> @ld_indexed_ptr_iar1(<1024 x i32> addrspace(205)* readonly %ptr, <1024 x i32> %index) {
+entry:
+ %iar = call i32 @llvm.tpu.set.sublane.indexed(<1024 x i32> %index, i32 1)
+ %add.ptr = getelementptr inbounds <1024 x i32>, <1024 x i32> addrspace(205)* %ptr, i32 1
+ %0 = call <1024 x i32> @llvm.tpu.vld.indexed.i32(<1024 x i32> addrspace(205)* %add.ptr, i32 123, i32 19088743, i32 %iar, i32 1)
+ ret <1024 x i32> %0
+}
+
+; CHECK-LABEL: ld_indexed_nostride_ptr_iar1_f:
+; CHECK: (iar1) = vsetiar.raw v0
+; CHECK: _ = vdelay $0x4
+; CHECK: v0 = vld.iar1 [vmem:s0+$0x8 sm:s1]
+define <1024 x float> @ld_indexed_nostride_ptr_iar1_f(<1024 x float> addrspace(205)* readonly %ptr, i32 %mask, i32 %strided, <1024 x i32> %index){
+entry:
+ %iar = call i32 @llvm.tpu.set.iar.raw(<1024 x i32> %index, i32 1)
+ %add.ptr = getelementptr inbounds <1024 x float>, <1024 x float> addrspace(205)* %ptr, i32 1
+ %0 = tail call <1024 x float> @llvm.tpu.vld.indexed.f32(<1024 x float> addrspace(205)* %add.ptr, i32 %mask, i32 1, i32 %iar, i32 1)
+ ret <1024 x float> %0
+}
+
+
+; CHECK-LABEL: st_indexed_novmask:
+; CHECK: (iar0) = vsetiar.lane v0
+; CHECK: s[[x:[0-9]+]] = simm.s32 $0x1234567
+; CHECK: _ = vdelay $0x4
+; CHECK: [vmem:s0+$0x8 ss:$0x7b sm:s[[x]]] = vst.iar (iar0), v1
+define void @st_indexed_novmask(<1024 x i32> %index, <1024 x i32> %data, <1024 x i32> addrspace(205)* %ptr) {
+entry:
+ %iar = call i32 @llvm.tpu.set.lane.indexed(<1024 x i32> %index, i32 0)
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %vmask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ %add.ptr = getelementptr inbounds <1024 x i32>, <1024 x i32> addrspace(205)* %ptr, i32 1
+ call void @llvm.tpu.vst.indexed.i32(<1024 x i32> %data, <1024 x i32> addrspace(205)* %add.ptr, i32 19088743, i32 123, <1024 x i1> %vmask, i32 %iar, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: st_indexed_vmask_f_iar1:
+; CHECK: (iar1) = vsetiar.lane v0
+; CHECK: _ = vdelay $0x4
+; CHECK: [vmem:s0+$0x0] = vst.iar.msk (iar1), vm0, v1
+define void @st_indexed_vmask_f_iar1(<1024 x i32> %index, <1024 x float> %data, <1024 x float> addrspace(205)* %ptr, <1024 x i1> %vmask) {
+entry:
+ %iar = call i32 @llvm.tpu.set.lane.indexed(<1024 x i32> %index, i32 1)
+ call void @llvm.tpu.vst.indexed.f32(<1024 x float> %data, <1024 x float> addrspace(205)* %ptr, i32 255, i32 1, <1024 x i1> %vmask, i32 %iar, i32 1)
+ ret void
+}
+
+; Check that we insert a 4-cycle delay between a vmem store and an iar store.
+; CHECK-LABEL: st_indexed_hazard:
+; CHECK-DAG: (iar1) = vsetiar.lane v0
+; CHECK-DAG: [vmem:s1+$0x0] = vst v2
+; CHECK: _ = vdelay $0x4
+; CHECK: [vmem:s0+$0x0] = vst.iar.msk (iar1), vm0, v1
+define void @st_indexed_hazard(<1024 x i32> %index, <1024 x float> %data, <1024 x i32> %data1, <1024 x float> addrspace(205)* %ptr, <1024 x i32> addrspace(205)* %ptr1, <1024 x i1> %vmask) {
+entry:
+ store <1024 x i32> %data1, <1024 x i32> addrspace(205)* %ptr1
+ %iar = call i32 @llvm.tpu.set.lane.indexed(<1024 x i32> %index, i32 1)
+ call void @llvm.tpu.vst.indexed.f32(<1024 x float> %data, <1024 x float> addrspace(205)* %ptr, i32 255, i32 1, <1024 x i1> %vmask, i32 %iar, i32 1)
+ ret void
+}
+
+; Check that we insert a 4-cycle delay between a vmem store and an iar load.
+; CHECK-LABEL: ld_indexed_hazard:
+; CHECK-DAG: (iar1) = vsetiar.lane v0
+; CHECK-DAG: [vmem:s1+$0x0] = vst v2
+; CHECK: _ = vdelay $0x4
+; CHECK: v0 = vld.iar1 [vmem:s0+$0x0]
+define <1024 x float> @ld_indexed_hazard(<1024 x i32> %index, <1024 x float> %data, <1024 x i32> %data1, <1024 x float> addrspace(205)* %ptr, <1024 x i32> addrspace(205)* %ptr1, <1024 x i1> %vmask) {
+entry:
+ store <1024 x i32> %data1, <1024 x i32> addrspace(205)* %ptr1
+ %iar = call i32 @llvm.tpu.set.lane.indexed(<1024 x i32> %index, i32 1)
+ %0 = tail call <1024 x float> @llvm.tpu.vld.indexed.f32(<1024 x float> addrspace(205)* %ptr, i32 255, i32 1, i32 %iar, i32 1)
+ ret <1024 x float> %0
+}
+
+; CHECK-LABEL: st_evenodd:
+; CHECK: (iar0) = vsetiar.raw v0
+; CHECK: _ = vdelay $0x4
+; CHECK: [vmem:s0+$0x0 sm:$0xf] = vst.iar.msk (iar0), vm0, v1
+define void @st_evenodd(<1024 x i32> %index, <1024 x float> %data, <1024 x float> addrspace(205)* %ptr, <1024 x i1> %vmask) {
+entry:
+ %iar = call i32 @llvm.tpu.set.iar.raw(<1024 x i32> %index, i32 0)
+ call void @llvm.tpu.vst.evenodd.sublanes.f32(<1024 x float> %data, <1024 x float> addrspace(205)* %ptr, i32 15, i32 1, <1024 x i1> %vmask, i32 %iar)
+ ret void
+}
+
+; CHECK-LABEL: ld_replicate_evenodd:
+; CHECK: (iar1) = vsetiar.raw v0
+; CHECK: _ = vdelay $0x4
+; CHECK: v0 = vld.iar1 [vmem:s0+$0x8 sm:s1]
+define <1024 x float> @ld_replicate_evenodd(<1024 x float> addrspace(205)* readonly %ptr, i32 %mask, i32 %strided, <1024 x i32> %index){
+entry:
+ %iar = call i32 @llvm.tpu.set.iar.raw(<1024 x i32> %index, i32 1)
+ %add.ptr = getelementptr inbounds <1024 x float>, <1024 x float> addrspace(205)* %ptr, i32 1
+ %0 = tail call <1024 x float> @llvm.tpu.vld.replicate.evenodd.sublanes.f32(<1024 x float> addrspace(205)* %add.ptr, i32 %mask, i32 1, i32 %iar)
+ ret <1024 x float> %0
+}
+
+; We're providing these patterns right now, though we may decide to -disable-vector-combine
+; in the future.
+
+; CHECK-LABEL: svldi:
+; CHECK: v0 = vld [vmem:$0x8]
+; CHECK: (v2sf) = vpush v0
+; CHECK: s0 = spop (v2sf)
+define i32 @svldi(<1024 x i32> addrspace(205)* %a) {
+ %v = load <1024 x i32>, <1024 x i32> addrspace(205)* inttoptr (i32 8 to <1024 x i32> addrspace(205)*)
+ %b = extractelement <1024 x i32> %v, i32 0
+ ret i32 %b
+}
+
+; CHECK-LABEL: svlda:
+; CHECK: v0 = vld [vmem:s0+$0x0]
+; CHECK: (v2sf) = vpush v0
+; CHECK: s0 = spop (v2sf)
+define i32 @svlda(<1024 x i32> addrspace(205)* %a) {
+ %addr = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %a, i32 0
+ %v = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr
+ %b = extractelement <1024 x i32> %v, i32 0
+ ret i32 %b
+}
+
+; TODO(b/189276901): Looks like the extractelement > 0 is missing on PF.
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_manip_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_manip_sc.ll
new file mode 100644
index 0000000..b5d3b2f
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_manip_sc.ll
@@ -0,0 +1,98 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp -print-encoding-annotations | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; CHECK-LABEL: vecext_i
+; CHECK: { v0 = vld [tilespmem:s0+$0x0];
+; CHECK: { (v2sf) = vpush v0, $0x2
+; CHECK-NEXT: _ = sdelay $0x5 }
+; CHECK-NEXT: { s0 = spop (v2sf)
+; CHECK-NEXT: _ = shalt }
+define i32 @vecext_i(<8 x i32> addrspace(201)* %a) {
+entry:
+ %0 = load <8 x i32>, <8 x i32> addrspace(201)* %a, align 32
+ %vecext = extractelement <8 x i32> %0, i32 2
+ ret i32 %vecext
+}
+
+; CHECK-LABEL: vecstore_i
+; CHECK: (v2sf) = vpush v0, $0x3
+; CHECK: s[[x:[0-9]+]] = spop (v2sf)
+; CHECK: [smem:s{{[0-9]+}}] = sst s[[x]]
+; CHECK: _ = shalt
+define i32 @vecstore_i(i32* %a, <8 x i32> %b) {
+entry:
+ %vecext = extractelement <8 x i32> %b, i32 3
+ store i32 %vecext, i32* %a
+ ret i32 0
+}
+
+; CHECK-LABEL: vecext_r
+; CHECK: { v0 = vld [tilespmem:s0+$0x0];
+; CHECK: { (v2sf) = vpush v0, s{{[0-9]+}}
+; CHECK-NEXT: _ = sdelay $0x5 }
+; CHECK-NEXT: { s0 = spop (v2sf)
+; CHECK-NEXT: _ = shalt }
+define i32 @vecext_r(<8 x i32> addrspace(201)* %a, i32 %b) {
+entry:
+ %0 = load <8 x i32>, <8 x i32> addrspace(201)* %a, align 32
+ %vecext = extractelement <8 x i32> %0, i32 %b
+ ret i32 %vecext
+}
+
+; CHECK-LABEL: vecstore_r
+; CHECK: (v2sf) = vpush v0, s{{[0-9]+}}
+; CHECK: s[[x:[0-9]+]] = spop (v2sf)
+; CHECK: [smem:s{{[0-9]+}}] = sst s[[x]]
+; CHECK: _ = shalt
+define i32 @vecstore_r(i32* %a, <8 x i32> %b, i32 %c) {
+entry:
+ %vecext = extractelement <8 x i32> %b, i32 %c
+ store i32 %vecext, i32* %a
+ ret i32 0
+}
+
+; CHECK-LABEL: extractzero_i:
+; CHECK: { v0 = vld [tilespmem:s0+$0x0];
+; CHECK: { (v2sf) = vpush v0, $0x0
+; CHECK-NEXT: _ = sdelay $0x5 }
+; CHECK-NEXT: { s[[x:[0-9]+]] = spop (v2sf) }
+; CHECK-NEXT: { s0 = sadd.s32 s1, s[[x]];
+; CHECK-NEXT: _ = shalt }
+define i32 @extractzero_i(<8 x i32> addrspace(201)* %a, i32 %b) {
+entry:
+ %0 = load <8 x i32>, <8 x i32> addrspace(201)* %a, align 32
+ %1 = extractelement <8 x i32> %0, i32 0
+ %vecext = add i32 %1, %b
+ ret i32 %vecext
+}
+
+; CHECK-LABEL: shuffleveci:
+; CHECK: v0 = vbroadcast v0, $0x7
+define <8 x i32> @shuffleveci(<8 x i32> %a) {
+ %r = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ ret <8 x i32> %r
+}
+
+; CHECK-LABEL: shufflevecf:
+; CHECK: v0 = vbroadcast v0, $0x7
+define <8 x float> @shufflevecf(<8 x float> %a) {
+ %r = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ ret <8 x float> %r
+}
+
+declare <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32) nounwind
+
+; CHECK-LABEL: vldi_bcastr:
+; CHECK: v0 = vld [tilespmem:$0x2000]
+; CHECK: v0 = vbroadcast v0, s0
+define <8 x i32> @vldi_bcastr(i32 %b) {
+ %a = call <8 x i32> addrspace(201)* @llvm.tpu.inttoptr.p201v8i32(i32 8192)
+ %l = load <8 x i32>, <8 x i32> addrspace(201)* %a
+ %v1 = extractelement <8 x i32> %l, i32 %b
+ %v2 = insertelement <8 x i32> undef, i32 %v1, i64 0
+ %r = shufflevector <8 x i32> %v2, <8 x i32> undef, <8 x i32> zeroinitializer
+ ret <8 x i32> %r
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_mask.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_mask.ll
new file mode 100644
index 0000000..ad41ccd
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_mask.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -mcpu=tensorcore-jf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; Test that we support broadcasting an element from a vector mask even though
+; the code is inefficient.
+
+; CHECK-LABEL: splat_vi1:
+; CHECK: v{{[0-9]+}} = vimm.s32 $0x0
+; CHECK: v{{[0-9]+}} = vsel vm0, $0x1, v{{[0-9]+}}
+; CHECK: (v2sf) = vpush v{{[0-9]+}}
+; CHECK: s{{[0-9]+}} = spop (v2sf)
+; CHECK: v{{[0-9]+}} = vmov s{{[0-9]+}}
+; CHECK: vm{{[0-9]+}} = veq.s32 v{{[0-9]+}}, $0x1;
+define <1024 x i1> @splat_vi1(<1024 x i1> %vm) {
+ %splat = shufflevector <1024 x i1> %vm, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ ret <1024 x i1> %splat
+ }
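+
+; In the splat_vi1 checks above, the inefficiency amounts to roughly six
+; instructions for a single-bit splat (as read from the check lines): the mask
+; is first materialized as 0/1 integers (vimm + vsel), lane 0 is moved to a
+; scalar through vpush/spop, broadcast back with vmov, and re-compared with
+; veq.s32 to rebuild a mask.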
+
+; Test that we emit an optimized code sequence when the cmp and select can be
+; transformed into scalar instructions instead of doing a costly broadcast.
+
+; CHECK-LABEL: scalarize_cmp:
+; CHECK: p{{[0-9]+}} = seq.s32 s0, $0x0
+; CHECK-NEXT: v0 = vimm.f32 @p{{[0-9]+}} $0.0;
+; CHECK: shalt
+define <1024 x float> @scalarize_cmp(i32 %x, <1024 x float> %y) {
+ %splatx = insertelement <1024 x i32> undef, i32 %x, i32 0
+ %vm = icmp eq <1024 x i32> %splatx, zeroinitializer
+ %vmsplat = shufflevector <1024 x i1> %vm, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ %res = select <1024 x i1> %vmsplat, <1024 x float> zeroinitializer, <1024 x float> %y
+ ret <1024 x float> %res
+ }
+
+; Test that we optimize a shufflevector of an insertelement of mask values.
+
+; CHECK-LABEL: splat_insert_opt_vi1:
+; CHECK: vm{{[0-9]+}} = vmxor vm{{[0-9]+}}
+; CHECK: vm{{[0-9]+}} = vmneg @p{{[0-9]+}} vm{{[0-9]+}}
+define <1024 x i1> @splat_insert_opt_vi1(<1024 x i1> %vm, i1 %b) {
+ %insert = insertelement <1024 x i1> %vm, i1 %b, i32 0
+ %splat = shufflevector <1024 x i1> %insert, <1024 x i1> undef, <1024 x i32> zeroinitializer
+ ret <1024 x i1> %splat
+ }
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_misc_mask_tc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_misc_mask_tc.ll
new file mode 100644
index 0000000..a22c792
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_misc_mask_tc.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -mcpu=tensorcore-pf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare <1024 x i1> @llvm.tpu.weird.v1024f32(<1024 x float>) readnone
+declare <1024 x i1> @llvm.tpu.lane.mask(<1024 x i32>) readnone
+declare <1024 x i1> @llvm.tpu.sublane.mask(<1024 x i32>) readnone
+
+
+; CHECK-LABEL: weird:
+; CHECK: vm{{[0-9]+}} = vweird.f32 v{{[0-9]+}}
+define <1024 x i1> @weird(<1024 x float> %x, <1024 x float> %y) {
+ %a = call <1024 x i1> @llvm.tpu.weird.v1024f32(<1024 x float> %x)
+ ret <1024 x i1> %a
+}
+
+; CHECK-LABEL: lanemask:
+; CHECK: vm{{[0-9]+}} = vlmask v{{[0-9]+}}
+define <1024 x i1> @lanemask(<1024 x i32> %x) {
+ %a = call <1024 x i1> @llvm.tpu.lane.mask(<1024 x i32> %x)
+ ret <1024 x i1> %a
+}
+
+; CHECK-LABEL: sublanemask:
+; CHECK: vm{{[0-9]+}} = vsmask v{{[0-9]+}}
+define <1024 x i1> @sublanemask(<1024 x i32> %x) {
+ %a = call <1024 x i1> @llvm.tpu.sublane.mask(<1024 x i32> %x)
+ ret <1024 x i1> %a
+}
\ No newline at end of file
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_pf.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_pf.ll
new file mode 100644
index 0000000..a361298
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_pf.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -mcpu=tensorcore-jf -asm-verbose=false -disable-cgp | FileCheck %s
+; RUN: llc < %s -mcpu=tensorcore-pf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+declare <1024 x i32> @llvm.tpu.cvt.pr.fptosi.v1024i32.v1024f32(<1024 x float>, <1024 x i32>) readnone
+
+; CHECK-LABEL: vfptosi32rr:
+; CHECK: v{{[0-9]+}} = vcvt.f32.s32 v{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x i32> @vfptosi32rr(<1024 x float> %x, <1024 x i32> %cvt) {
+ %a = call <1024 x i32> @llvm.tpu.cvt.pr.fptosi.v1024i32.v1024f32(<1024 x float> %x, <1024 x i32> %cvt)
+ ret <1024 x i32> %a
+}
+
+; CHECK-LABEL: vfptosi32rs:
+; CHECK: v{{[0-9]+}} = vcvt.f32.s32 v{{[0-9]+}}, s{{[0-9]+}}
+define <1024 x i32> @vfptosi32rs(<1024 x float> %x, i32 %cvt) {
+ %v0 = insertelement <1024 x i32> undef, i32 %cvt, i32 0
+ %splatcvt = shufflevector <1024 x i32> %v0, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %a = call <1024 x i32> @llvm.tpu.cvt.pr.fptosi.v1024i32.v1024f32(<1024 x float> %x, <1024 x i32> %splatcvt)
+ ret <1024 x i32> %a
+}
+
+; CHECK-LABEL: vfptosi32ri:
+; CHECK: v{{[0-9]+}} = vcvt.f32.s32 v{{[0-9]+}}, $0x7fffffff
+define <1024 x i32> @vfptosi32ri(<1024 x float> %x) {
+ %v0 = insertelement <1024 x i32> undef, i32 2147483647, i32 0
+ %v1 = shufflevector <1024 x i32> %v0, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %a = call <1024 x i32> @llvm.tpu.cvt.pr.fptosi.v1024i32.v1024f32(<1024 x float> %x, <1024 x i32> %v1)
+ ret <1024 x i32> %a
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_sc.ll
new file mode 100644
index 0000000..3e26da5
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_sc.ll
@@ -0,0 +1,1211 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -instcombine-max-iterations=0 | FileCheck -check-prefixes CHECK,CHECK-VF %s
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -instcombine-max-iterations=0 -opaque-pointers | FileCheck -check-prefixes CHECK,CHECK-VF %s
+; RUN: llc < %s -mcpu=sparsecore-tec-gl -asm-verbose=false -disable-cgp \
+; RUN: -instcombine-max-iterations=0 | FileCheck -check-prefixes CHECK,CHECK-GL %s
+; REQUIRES: tpu
+
+; Test that basic 32-bit integer operations assemble as expected.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; CHECK-LABEL: vaddi:
+; CHECK: v{{[0-9]+}} = vadd.s32 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i32> @vaddi(<8 x i32> %x, <8 x i32> %y) {
+ %a = add <8 x i32> %x, %y
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: vaddf:
+; CHECK: v{{[0-9]+}} = vadd.f32 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x float> @vaddf(<8 x float> %x, <8 x float> %y) {
+ %a = fadd <8 x float> %x, %y
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: vaddi_splat:
+; CHECK: v{{[0-9]+}} = vadd.s32 $0x1, v{{[0-9]+}}
+define <8 x i32> @vaddi_splat(<8 x i32> %x) {
+ %a = add <8 x i32> %x, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: vaddf_splat:
+; CHECK: v{{[0-9]+}} = vadd.f32 $1.0, v{{[0-9]+}}
+define <8 x float> @vaddf_splat(<8 x float> %x) {
+ %a = fadd <8 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: vaddf_splat_sreg:
+; CHECK-NOT: v2f
+; CHECK: v{{[0-9]+}} = vadd.f32 s{{[0-9]+}}, v{{[0-9]+}}
+define <8 x float> @vaddf_splat_sreg(<8 x float> %x, float %y) {
+ %v0 = insertelement <8 x float> undef, float %y, i32 0
+ %v1 = shufflevector <8 x float> %v0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %a = fadd <8 x float> %x, %v1
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: vaddi_splat_sreg:
+; CHECK: v{{[0-9]+}} = vadd.s32 s{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i32> @vaddi_splat_sreg(<8 x i32> %x, i32 %y) {
+ %v0 = insertelement <8 x i32> undef, i32 %y, i32 0
+ %v1 = shufflevector <8 x i32> %v0, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %a = add <8 x i32> %x, %v1
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: vsubi:
+; CHECK: v{{[0-9]+}} = vsub.s32 v0, v1
+define <8 x i32> @vsubi(<8 x i32> %x, <8 x i32> %y) {
+ %a = sub <8 x i32> %x, %y
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: vsubf:
+; CHECK: v{{[0-9]+}} = vsub.f32 v0, v1
+define <8 x float> @vsubf(<8 x float> %x, <8 x float> %y) {
+ %a = fsub <8 x float> %x, %y
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: vsubi_splat:
+; CHECK: v{{[0-9]+}} = vsub.s32 $0x1, v{{[0-9]+}}
+define <8 x i32> @vsubi_splat(<8 x i32> %x) {
+ %a = sub <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, %x
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: vsubf_splat:
+; CHECK: v{{[0-9]+}} = vsub.f32 $1.0, v{{[0-9]+}}
+define <8 x float> @vsubf_splat(<8 x float> %x) {
+ %a = fsub <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: vsubi_splat_sreg:
+; CHECK: v{{[0-9]+}} = vsub.s32 s{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i32> @vsubi_splat_sreg(<8 x i32> %x, i32 %y) {
+ %v0 = insertelement <8 x i32> undef, i32 %y, i32 0
+ %v1 = shufflevector <8 x i32> %v0, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %a = sub <8 x i32> %v1, %x
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: vsubf_splat_sreg:
+; CHECK: v{{[0-9]+}} = vsub.f32 s{{[0-9]+}}, v{{[0-9]+}}
+define <8 x float> @vsubf_splat_sreg(<8 x float> %x, float %y) {
+ %v0 = insertelement <8 x float> undef, float %y, i32 0
+ %v1 = shufflevector <8 x float> %v0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %a = fsub <8 x float> %v1, %x
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: vmovf_splat_sreg:
+; CHECK: v{{[0-9]+}} = vmov s{{[0-9]+}}
+define <8 x float> @vmovf_splat_sreg(float %y) {
+ %v0 = insertelement <8 x float> undef, float %y, i32 0
+ %v1 = shufflevector <8 x float> %v0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x float> %v1
+}
+
+; CHECK-LABEL: vmovf_scalar_to_vector_sreg:
+; CHECK: s[[sreg:[0-9]+]] = sld
+; CHECK: v{{[0-9]+}} = vmov s[[sreg]]
+; CHECK: v{{[0-9]+}} = vbroadcast v{{[0-9]+}}
+define <8 x i32> @vmovf_scalar_to_vector_sreg(i32* %a) {
+ %x = load i32, i32* %a, align 4
+ %v0 = insertelement <2 x i32> undef, i32 %x, i32 0
+ %v1 = insertelement <2 x i32> %v0, i32 %x, i32 1
+ %y = shufflevector <2 x i32> %v1, <2 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i32> %y
+}
+
+; CHECK-LABEL: vmovi_splat_sreg:
+; CHECK: v{{[0-9]+}} = vmov s{{[0-9]+}}
+define <8 x i32> @vmovi_splat_sreg(i32 %y) {
+ %v0 = insertelement <8 x i32> undef, i32 %y, i32 0
+ %v1 = shufflevector <8 x i32> %v0, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i32> %v1
+}
+
+; CHECK-LABEL: vseli:
+; CHECK: v{{[0-9]+}} = vsel vm{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i32> @vseli(<8 x i1> %m, <8 x i32> %x, <8 x i32> %y) {
+ %a = select <8 x i1> %m, <8 x i32> %x, <8 x i32> %y
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: vseli_splat_sreg:
+; CHECK: v{{[0-9]+}} = vsel vm{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i32> @vseli_splat_sreg(<8 x i1> %m, i32 %y, <8 x i32> %x) {
+ %y1 = insertelement <8 x i32> undef, i32 %y, i32 0
+ %y2 = shufflevector <8 x i32> %y1, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %a = select <8 x i1> %m, <8 x i32> %y2, <8 x i32> %x
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: vseli_splat_imm:
+; CHECK: v{{[0-9]+}} = vsel vm{{[0-9]+}}, $0x5, v{{[0-9]+}}
+define <8 x i32> @vseli_splat_imm(<8 x i1> %m, <8 x i32> %x) {
+ %a = select <8 x i1> %m, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <8 x i32> %x
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: vself:
+; CHECK: v{{[0-9]+}} = vsel vm{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x float> @vself(<8 x i1> %m, <8 x float> %x, <8 x float> %y) {
+ %a = select <8 x i1> %m, <8 x float> %x, <8 x float> %y
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: vself_splat_sreg:
+; CHECK: v{{[0-9]+}} = vsel vm{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+define <8 x float> @vself_splat_sreg(<8 x i1> %m, float %y, <8 x float> %x) {
+ %y1 = insertelement <8 x float> undef, float %y, i32 0
+ %y2 = shufflevector <8 x float> %y1, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %a = select <8 x i1> %m, <8 x float> %y2, <8 x float> %x
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: vself_splat_imm:
+; CHECK: v{{[0-9]+}} = vsel vm{{[0-9]+}}, $0x40a00000, v{{[0-9]+}}
+define <8 x float> @vself_splat_imm(<8 x i1> %m, <8 x float> %x) {
+ %a = select <8 x i1> %m, <8 x float> <float 5.0, float 5.0, float 5.0, float 5.0, float 5.0, float 5.0, float 5.0, float 5.0>, <8 x float> %x
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: vnselri_i32:
+; CHECK: v{{[0-9]+}} = vnsel vm{{[0-9]+}}, $0x2a, v{{[0-9]+}}
+define <8 x i32> @vnselri_i32(<8 x i1> %mask, <8 x i32> %x) {
+ %y0 = insertelement <8 x i32> undef, i32 42, i32 0
+ %y = shufflevector <8 x i32> %y0, <8 x i32> undef, <8 x i32> zeroinitializer
+
+ %r = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
+ ret <8 x i32> %r
+}
+
+; CHECK-LABEL: vnselri_float:
+; CHECK: v{{[0-9]+}} = vnsel vm{{[0-9]+}}, $0x42280000, v{{[0-9]+}}
+define <8 x float> @vnselri_float(<8 x i1> %mask, <8 x float> %x) {
+ %y0 = insertelement <8 x float> undef, float 42.0, i32 0
+ %y = shufflevector <8 x float> %y0, <8 x float> undef, <8 x i32> zeroinitializer
+
+ %r = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
+ ret <8 x float> %r
+}
+
+; CHECK-LABEL: vnselrs_i32:
+; CHECK: v{{[0-9]+}} = vnsel vm{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i32> @vnselrs_i32(<8 x i1> %mask, <8 x i32> %x, i32 %s) {
+ %y0 = insertelement <8 x i32> undef, i32 %s, i32 0
+ %y = shufflevector <8 x i32> %y0, <8 x i32> undef, <8 x i32> zeroinitializer
+
+ %r = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
+ ret <8 x i32> %r
+}
+
+; CHECK-LABEL: vnselrs_float:
+; CHECK: v{{[0-9]+}} = vnsel vm{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+define <8 x float> @vnselrs_float(<8 x i1> %mask, <8 x float> %x, float %s) {
+ %y0 = insertelement <8 x float> undef, float %s, i32 0
+ %y = shufflevector <8 x float> %y0, <8 x float> undef, <8 x i32> zeroinitializer
+
+ %r = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
+ ret <8 x float> %r
+}
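+
+; In the four vnsel tests above, the splatted scalar sits on the false side of
+; the select, so the compiler appears to negate the select (vnsel rather than
+; vsel) so that the scalar can be carried as the immediate or sreg operand
+; (inferred from the check lines).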
+
+; CHECK-LABEL: vsitofp:
+; CHECK: v{{[0-9]+}} = vcvt.s32.f32 v{{[0-9]+}}
+define <8 x float> @vsitofp(<8 x i32> %x) {
+ %a = sitofp <8 x i32> %x to <8 x float>
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: vfptosi:
+; CHECK: v{{[0-9]+}} = vcvt.f32.s32 v{{[0-9]+}}
+define <8 x i32> @vfptosi(<8 x float> %x) {
+ %a = fptosi <8 x float> %x to <8 x i32>
+ ret <8 x i32> %a
+}
+
+declare <8 x i32> @llvm.tpu.cvt.fptosi.v8i32.v8f32(<8 x float>) readnone
+
+; CHECK-LABEL: vfptosi32r:
+; CHECK: v{{[0-9]+}} = vcvt.f32.s32 v{{[0-9]+}}
+define <8 x i32> @vfptosi32r(<8 x float> %x) {
+ %a = call <8 x i32> @llvm.tpu.cvt.fptosi.v8i32.v8f32(<8 x float> %x)
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: vfptosi32i:
+; CHECK: v[[x:[0-9]+]] = vimm.f32 $1.0
+; CHECK: v{{[0-9]+}} = vcvt.f32.s32 v[[x]]
+define <8 x i32> @vfptosi32i() {
+ %v0 = insertelement <8 x float> undef, float 1.0, i32 0
+ %v1 = shufflevector <8 x float> %v0, <8 x float> undef, <8 x i32> zeroinitializer
+ %a = call <8 x i32> @llvm.tpu.cvt.fptosi.v8i32.v8f32(<8 x float> %v1)
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: vfptosi32s:
+; CHECK: v[[x:[0-9]+]] = vmov s0
+; CHECK: v{{[0-9]+}} = vcvt.f32.s32 v[[x]]
+define <8 x i32> @vfptosi32s(float %y) {
+ %v0 = insertelement <8 x float> undef, float %y, i32 0
+ %v1 = shufflevector <8 x float> %v0, <8 x float> undef, <8 x i32> zeroinitializer
+ %a = call <8 x i32> @llvm.tpu.cvt.fptosi.v8i32.v8f32(<8 x float> %v1)
+ ret <8 x i32> %a
+}
+
+declare <8 x i32> @llvm.tpu.vlaneseq.v8i32() readnone
+
+; CHECK-LABEL: vlaneseq:
+; CHECK: v{{[0-9]+}} = vlaneseq.u32
+define <8 x i32> @vlaneseq(<8 x float> %x) {
+ %a = call <8 x i32> @llvm.tpu.vlaneseq.v8i32() readnone
+ ret <8 x i32> %a
+}
+
+declare <8 x i32> @llvm.tpu.vrshra(<8 x i32>, <8 x i32>) readnone
+
+; CHECK-LABEL: vrshra_v:
+; CHECK: v{{[0-9]+}} = vrshra.s32 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i32> @vrshra_v(<8 x i32> %x) {
+ %a = call <8 x i32> @llvm.tpu.vrshra(<8 x i32> %x, <8 x i32> %x) readnone
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: vrshra_s:
+; CHECK: v{{[0-9]+}} = vrshra.s32 v{{[0-9]+}}, s{{[0-9]+}}
+define <8 x i32> @vrshra_s(<8 x i32> %x, i32 %y) {
+ %y1 = insertelement <8 x i32> undef, i32 %y, i32 0
+ %y2 = shufflevector <8 x i32> %y1, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %a = call <8 x i32> @llvm.tpu.vrshra(<8 x i32> %x, <8 x i32> %y2) readnone
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: vrshra_i:
+; CHECK: v{{[0-9]+}} = vrshra.s32 v{{[0-9]+}}, $0x6
+define <8 x i32> @vrshra_i(<8 x i32> %x) {
+ %y1 = insertelement <8 x i32> undef, i32 6, i32 0
+ %y2 = shufflevector <8 x i32> %y1, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %a = call <8 x i32> @llvm.tpu.vrshra(<8 x i32> %x, <8 x i32> %y2) readnone
+ ret <8 x i32> %a
+}
+
+declare <8 x i32> @llvm.tpu.significand(<8 x float>) readnone
+declare <8 x i32> @llvm.tpu.exponent(<8 x float>) readnone
+declare <8 x float> @llvm.tpu.compose(<8 x float>, <8 x float>) readnone
+
+; CHECK-LABEL: vsignificand:
+; CHECK: v{{[0-9]+}} = vf32.s.s32 v{{[0-9]+}}
+define <8 x i32> @vsignificand(<8 x float> %x) {
+ %a = call <8 x i32> @llvm.tpu.significand(<8 x float> %x) readnone
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: vexponent:
+; CHECK: v{{[0-9]+}} = vf32.e.s32 v{{[0-9]+}}
+define <8 x i32> @vexponent(<8 x float> %x) {
+ %a = call <8 x i32> @llvm.tpu.exponent(<8 x float> %x) readnone
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: vcompose:
+; CHECK: v{{[0-9]+}} = vf32.f32 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x float> @vcompose(<8 x float> %x) {
+ %a = call <8 x float> @llvm.tpu.compose(<8 x float> %x, <8 x float> %x) readnone
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: vroti:
+; CHECK: v{{[0-9]+}} = vrot.slane.down v{{[0-9]+}}
+define <8 x float> @vroti(<8 x float> %x) {
+ %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0>
+ ret <8 x float> %a
+}
+
+declare <8 x float> @llvm.minimum.f32(<8 x float>, <8 x float>) readnone
+declare <8 x float> @llvm.maximum.f32(<8 x float>, <8 x float>) readnone
+declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>) readnone
+declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>) readnone
+declare <8 x i32> @llvm.umax.v8i32(<8 x i32>, <8 x i32>) readnone
+declare <8 x i32> @llvm.umin.v8i32(<8 x i32>, <8 x i32>) readnone
+declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1) readnone
+declare <8 x float> @llvm.fabs.v8f32(<8 x float>) readnone
+declare <8 x float> @llvm.copysign.v8f32(<8 x float>, <8 x float>) readnone
+
+; CHECK-LABEL: vrelu:
+; CHECK: v{{[0-9]+}} = vclamp.gez.f32 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x float> @vrelu(<8 x float> %x, <8 x float> %y) {
+ %a = call <8 x float> @llvm.minimum.f32(<8 x float> %x, <8 x float> %y) readnone
+ %b = call <8 x float> @llvm.maximum.f32(<8 x float> %a, <8 x float> <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>) readnone
+ ret <8 x float> %b
+}
+
+; CHECK-LABEL: vrelu_s:
+; CHECK: v{{[0-9]+}} = vclamp.gez.f32 v{{[0-9]+}}, s{{[0-9]+}}
+define <8 x float> @vrelu_s(<8 x float> %x, float %y) {
+ %y1 = insertelement <8 x float> undef, float %y, i32 0
+ %y2 = shufflevector <8 x float> %y1, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %a = call <8 x float> @llvm.minimum.f32(<8 x float> %x, <8 x float> %y2) readnone
+ %b = call <8 x float> @llvm.maximum.f32(<8 x float> %a, <8 x float> <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>) readnone
+ ret <8 x float> %b
+}
+
+; CHECK-LABEL: vrelu_i:
+; CHECK: v{{[0-9]+}} = vclamp.gez.f32 v{{[0-9]+}}, $4
+define <8 x float> @vrelu_i(<8 x float> %x) {
+ %y1 = insertelement <8 x float> undef, float 4.0, i32 0
+ %y2 = shufflevector <8 x float> %y1, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %a = call <8 x float> @llvm.minimum.f32(<8 x float> %x, <8 x float> %y2) readnone
+ %b = call <8 x float> @llvm.maximum.f32(<8 x float> %a, <8 x float> <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>) readnone
+ ret <8 x float> %b
+}
+
+declare <8 x i32> @llvm.ctpop.i32(<8 x i32> %x) readnone
+; CHECK-LABEL: vpopcnt:
+; CHECK: vpcnt
+define <8 x i32> @vpopcnt(<8 x i32> %x) {
+ %a = call <8 x i32> @llvm.ctpop.i32(<8 x i32> %x) readnone
+ ret <8 x i32> %a
+}
+
+declare <8 x i32> @llvm.ctlz.i32(<8 x i32> %x) readnone
+; CHECK-LABEL: vclz:
+; CHECK: vclz
+define <8 x i32> @vclz(<8 x i32> %x) {
+ %a = call <8 x i32> @llvm.ctlz.i32(<8 x i32> %x) readnone
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: shuffle_splat:
+; CHECK: v0 = vbroadcast v0, $0x1
+define <8 x i32> @shuffle_splat(<8 x i32> %x) {
+ %a = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: build_vector:
+; CHECK: { v[[v0:[0-9]+]] = vmov s0;
+; CHECK: vm[[vm0:[0-9]+]] = vcmask $0x300 }
+; CHECK: { vm[[vm1:[0-9]+]] = vcmask $0x704;
+; CHECK: v[[v1:[0-9]+]] = vnsel vm[[vm0]], $0x17, v[[v0]] }
+; CHECK: { vm[[vm2:[0-9]+]] = vcmask $0xb08;
+; CHECK: v[[v2:[0-9]+]] = vsel vm[[vm1]], s1, v[[v1]] }
+; CHECK: { vm[[vm3:[0-9]+]] = vcmask $0xf0c;
+; CHECK: v[[v3:[0-9]+]] = vsel vm[[vm2]], s2, v[[v2]] }
+; CHECK: { vm[[vm4:[0-9]+]] = vcmask $0x1310;
+; CHECK: v[[v4:[0-9]+]] = vsel vm[[vm3]], s3, v[[v3]] }
+; CHECK: { vm[[vm5:[0-9]+]] = vcmask $0x1714;
+; CHECK: v[[v5:[0-9]+]] = vsel vm[[vm4]], $0x43, v[[v4]] }
+; CHECK: { vm[[vm6:[0-9]+]] = vcmask $0x1b18;
+; CHECK: v[[v5:[0-9]+]] = vsel vm[[vm5]], $0x2f, v[[v5]] }
+; CHECK: { v{{[0-9]+}} = vsel vm[[vm6]], $0x25, v[[v5]]
+define <8 x i32> @build_vector(i32 %a, i32 %b, i32 %c, i32 %d) {
+ %v1 = insertelement <8 x i32> undef, i32 %a, i32 0
+ %v2 = insertelement <8 x i32> %v1, i32 %b, i32 1
+ %v3 = insertelement <8 x i32> %v2, i32 %c, i32 2
+ %v4 = insertelement <8 x i32> %v3, i32 %d, i32 3
+ %v5 = insertelement <8 x i32> %v4, i32 67, i32 4
+ %v6 = insertelement <8 x i32> %v5, i32 47, i32 5
+ %v7 = insertelement <8 x i32> %v6, i32 37, i32 6
+ %v8 = insertelement <8 x i32> %v7, i32 23, i32 7
+ ret <8 x i32> %v8
+}
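+
+; The build_vector checks above reflect the lowering used here (as read from
+; the check lines): each vcmask materializes a one-lane mask and a chain of
+; vsel/vnsel then fills in the lanes one at a time, with constant lanes folded
+; directly into the selects as immediates.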
+
+; CHECK-LABEL: build_vector2:
+; CHECK: vlaneseq
+define <8 x i32> @build_vector2() {
+ %v1 = insertelement <8 x i32> undef, i32 0, i32 0
+ %v2 = insertelement <8 x i32> %v1, i32 1, i32 1
+ %v3 = insertelement <8 x i32> %v2, i32 2, i32 2
+ %v4 = insertelement <8 x i32> %v3, i32 3, i32 3
+ %v5 = insertelement <8 x i32> %v4, i32 4, i32 4
+ %v6 = insertelement <8 x i32> %v5, i32 5, i32 5
+ %v7 = insertelement <8 x i32> %v6, i32 6, i32 6
+ %v8 = insertelement <8 x i32> %v7, i32 7, i32 7
+ ret <8 x i32> %v8
+}
+
+; CHECK-LABEL: build_vector3:
+; CHECK: vlaneseq
+define <8 x i32> @build_vector3() {
+ ret <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+}
+
+; CHECK-LABEL: build_vector4:
+; CHECK: { v0 = vimm.s32 $0x57;
+; CHECK: vm[[vm0:[0-9]+]] = vcmask $0x300 }
+; CHECK: { vm[[vm1:[0-9]+]] = vcmask $0x704;
+; CHECK: v0 = vsel vm[[vm0]], $0x50, v0 }
+; CHECK: { vm[[vm2:[0-9]+]] = vcmask $0xb08;
+; CHECK: v0 = vsel vm[[vm1]], $0x51, v0 }
+; CHECK: { vm[[vm3:[0-9]+]] = vcmask $0xf0c;
+; CHECK: v0 = vsel vm[[vm2]], $0x52, v0 }
+; CHECK: { vm[[vm4:[0-9]+]] = vcmask $0x1310;
+; CHECK: v0 = vsel vm[[vm3]], $0x53, v0 }
+; CHECK: { vm[[vm5:[0-9]+]] = vcmask $0x1714;
+; CHECK: v0 = vsel vm[[vm4]], $0x54, v0 }
+; CHECK: { vm[[vm6:[0-9]+]] = vcmask $0x1b18;
+; CHECK: v0 = vsel vm[[vm5]], $0x55, v0 }
+; CHECK: { v0 = vsel vm[[vm6]], $0x56, v0
+; CHECK: { [tilespmem:$0x50] = vst v0
+define void @build_vector4(<8 x float> %x, <8 x i32> %m) {
+ store <8 x i32> <i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87>, <8 x i32> addrspace(201)* inttoptr (i32 80 to <8 x i32> addrspace(201)*), align 32
+ ret void
+}
+
+; CHECK-LABEL: insert_element:
+; CHECK: vlaneseq
+; CHECK: veq.s32 v1, $0x3
+; CHECK: vsel vm0, s0, v0
+define <8 x i32> @insert_element(<8 x i32> %v, i32 %b) {
+ %v2 = insertelement <8 x i32> %v, i32 %b, i32 3
+ ret <8 x i32> %v2
+}
+
+; CHECK-LABEL: vmread:
+; CHECK: v{{[0-9]+}} = vimm.s32 $0x0
+; CHECK: vsel
+; CHECK: shalt
+define <8 x i32> @vmread(<8 x i1> %a) {
+entry:
+ %b = zext <8 x i1> %a to <8 x i32>
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: scalar_sel:
+; CHECK: v{{[0-9]+}} = vpsel p{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i32> @scalar_sel(i1 %m, <8 x i32> %x, <8 x i32> %y) {
+ %a = select i1 %m, <8 x i32> %x, <8 x i32> %y
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: scalar_sel_imm:
+; CHECK: v{{[0-9]+}} = vpsel !p{{[0-9]+}}, $0x1, v{{[0-9]+}}
+define <8 x i32> @scalar_sel_imm(i1 %m, <8 x i32> %x) {
+ %v0 = insertelement <8 x i32> undef, i32 1, i32 0
+ %v1 = shufflevector <8 x i32> %v0, <8 x i32> undef, <8 x i32> zeroinitializer ; create vector of all 1
+ %a = select i1 %m, <8 x i32> %x, <8 x i32> %v1
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: scalar_sel_f:
+; CHECK: v{{[0-9]+}} = vpsel p{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x float> @scalar_sel_f(i1 %m, <8 x float> %x, <8 x float> %y) {
+ %a = select i1 %m, <8 x float> %x, <8 x float> %y
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: scalar_sel_imm_f:
+; CHECK: v{{[0-9]+}} = vpsel !p{{[0-9]+}}, $0x3f800000, v{{[0-9]+}}
+define <8 x float> @scalar_sel_imm_f(i1 %m, <8 x float> %x) {
+ %v0 = insertelement <8 x float> undef, float 1.0, i32 0
+ %v1 = shufflevector <8 x float> %v0, <8 x float> undef, <8 x i32> zeroinitializer ; create vector of all 1
+ %a = select i1 %m, <8 x float> %x, <8 x float> %v1
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: scalar_sel_imm_f2:
+; CHECK: v{{[0-9]+}} = vpsel p{{[0-9]+}}, $0x3f800000, v{{[0-9]+}}
+define <8 x float> @scalar_sel_imm_f2(i1 %m, <8 x float> %x) {
+ %v0 = insertelement <8 x float> undef, float 1.0, i32 0
+ %v1 = shufflevector <8 x float> %v0, <8 x float> undef, <8 x i32> zeroinitializer ; create vector of all 1
+ %a = select i1 %m, <8 x float> %v1, <8 x float> %x
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: scalar_sel_vs_v_s:
+; CHECK: v{{[0-9]+}} = vpsel !p{{[0-9]+}}, s0, v{{[0-9]+}}
+define <8 x i32> @scalar_sel_vs_v_s(i1 %m, <8 x i32> %x, i32 %y) {
+ %v0 = insertelement <8 x i32> undef, i32 %y, i32 0
+ %v1 = shufflevector <8 x i32> %v0, <8 x i32> undef, <8 x i32> zeroinitializer
+ %a = select i1 %m, <8 x i32> %x, <8 x i32> %v1
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: scalar_sel_vs_s_v:
+; CHECK: v{{[0-9]+}} = vpsel p{{[0-9]+}}, s0, v{{[0-9]+}}
+define <8 x i32> @scalar_sel_vs_s_v(i1 %m, <8 x i32> %x, i32 %y) {
+ %v0 = insertelement <8 x i32> undef, i32 %y, i32 0
+ %v1 = shufflevector <8 x i32> %v0, <8 x i32> undef, <8 x i32> zeroinitializer ; create vector of all 1
+ %a = select i1 %m, <8 x i32> %v1, <8 x i32> %x
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: bitcast1:
+; CHECK: { v{{[0-9]+}} = vadd.f32
+; CHECK-NEXT: v{{[0-9]+}} = vsub.f32 v{{[0-9]+}}, v{{[0-9]+}};
+; CHECK-NEXT: _ = sdelay $0x1 }
+; CHECK-NEXT: v0 = vand.u32
+; CHECK-NEXT: shalt
+define <8 x float> @bitcast1(<8 x float> %x, <8 x float> %y) {
+ %a = fadd <8 x float> %x, %y
+ %b = fsub <8 x float> %x, %y
+ %c = bitcast <8 x float> %a to <8 x i32>
+ %d = bitcast <8 x float> %b to <8 x i32>
+ %e = and <8 x i32> %c, %d
+ %f = bitcast <8 x i32> %e to <8 x float>
+ ret <8 x float> %f
+}
+
+; CHECK-LABEL: bitcast2:
+; CHECK-DAG: vadd.s32
+; CHECK-DAG: vsub.s32
+; CHECK-NEXT: v0 = vadd.f32
+; CHECK: shalt
+define <8 x i32> @bitcast2(<8 x i32> %x, <8 x i32> %y) {
+ %a = add <8 x i32> %x, %y
+ %b = sub <8 x i32> %x, %y
+ %c = bitcast <8 x i32> %a to <8 x float>
+ %d = bitcast <8 x i32> %b to <8 x float>
+ %e = fadd <8 x float> %c, %d
+ %f = bitcast <8 x float> %e to <8 x i32>
+ ret <8 x i32> %f
+}
+
+declare <8 x i32> @llvm.tpu.vmpcnt.ones.v8i32(<8 x i1> %m) readnone
+
+; CHECK-LABEL: vmpcnt_ones:
+; CHECK: v{{[0-9]+}} = vmpcnt.ones.xlane
+define <8 x i32> @vmpcnt_ones(<8 x i1> %m) {
+ %a = call <8 x i32> @llvm.tpu.vmpcnt.ones.v8i32(<8 x i1> %m) readnone
+ ret <8 x i32> %a
+}
+
+declare <8 x i32> @llvm.tpu.vmctz.v8i32(<8 x i1> %m) readnone
+
+; CHECK-LABEL: vmctz:
+; CHECK: v{{[0-9]+}} = vmctz.xlane
+define <8 x i32> @vmctz(<8 x i1> %m) {
+ %a = call <8 x i32> @llvm.tpu.vmctz.v8i32(<8 x i1> %m) readnone
+ ret <8 x i32> %a
+}
+
+declare <8 x i32> @llvm.tpu.vshift.insert.v8i32(<8 x i32> %v0, <8 x i32> %v1, i32 %p) readnone
+
+; CHECK-LABEL: vshift_inserti:
+; CHECK: v{{[0-9]+}} = vshift.insert v0, v1, s0;
+define <8 x i32> @vshift_inserti(<8 x i32> %v0, <8 x i32> %v1, i32 %p) {
+ %a = call <8 x i32> @llvm.tpu.vshift.insert.v8i32(<8 x i32> %v0, <8 x i32> %v1, i32 %p) readnone
+ ret <8 x i32> %a
+}
+
+declare <8 x float> @llvm.tpu.vshift.insert.v8f32(<8 x float> %v0, <8 x float> %v1, i32 %p) readnone
+
+; CHECK-LABEL: vshift_insertf:
+; CHECK: v{{[0-9]+}} = vshift.insert v0, v1, s0;
+define <8 x float> @vshift_insertf(<8 x float> %v0, <8 x float> %v1, i32 %p) {
+ %a = call <8 x float> @llvm.tpu.vshift.insert.v8f32(<8 x float> %v0, <8 x float> %v1, i32 %p) readnone
+ ret <8 x float> %a
+}
+
+declare <8 x float> @llvm.tpu.clamp.symmetric(<8 x float>, <8 x float>) readnone
+
+; CHECK-LABEL: clamp_symm:
+; CHECK: v0 = vclamps.f32 v0, v1
+define <8 x float> @clamp_symm(<8 x float> %a, <8 x float> %b) {
+ %c = call <8 x float> @llvm.tpu.clamp.symmetric(<8 x float> %a, <8 x float> %b) readnone
+ ret <8 x float> %c
+}
+
+declare <8 x i32> @llvm.tpu.shll.v8i32(<8 x i32>, <8 x i32>) #1
+declare <8 x i32> @llvm.tpu.shrl.v8i32(<8 x i32>, <8 x i32>) #1
+declare <8 x i32> @llvm.tpu.shra.v8i32(<8 x i32>, <8 x i32>) #1
+
+; CHECK-LABEL: int_rr_vshl:
+; CHECK: v{{[0-9]+}} = vshll.u32 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i32> @int_rr_vshl(<8 x i32> %x, <8 x i32> %y) {
+ %a = call <8 x i32> @llvm.tpu.shll.v8i32(<8 x i32> %x, <8 x i32> %y)
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: int_ri_vshl:
+; CHECK: v{{[0-9]+}} = vshll.u32 v{{[0-9]+}}, $0x20
+define <8 x i32> @int_ri_vshl(<8 x i32> %x) {
+ %a = call <8 x i32> @llvm.tpu.shll.v8i32(<8 x i32> %x, <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>)
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: int_rs_vshl:
+; CHECK: v{{[0-9]+}} = vshll.u32 v{{[0-9]+}}, s{{[0-9]+}}
+define <8 x i32> @int_rs_vshl(<8 x i32> %x, i32 %y) {
+ %a = insertelement <8 x i32> undef, i32 %y, i32 0
+ %b = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %c = call <8 x i32> @llvm.tpu.shll.v8i32(<8 x i32> %x, <8 x i32> %b)
+ ret <8 x i32> %c
+}
+
+; CHECK-LABEL: int_rr_vshr:
+; CHECK: v{{[0-9]+}} = vshrl.u32 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i32> @int_rr_vshr(<8 x i32> %x, <8 x i32> %y) {
+ %a = call <8 x i32> @llvm.tpu.shrl.v8i32(<8 x i32> %x, <8 x i32> %y)
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: int_ri_vshr:
+; CHECK: v{{[0-9]+}} = vshrl.u32 v{{[0-9]+}}, $0x20
+define <8 x i32> @int_ri_vshr(<8 x i32> %x) {
+ %a = call <8 x i32> @llvm.tpu.shrl.v8i32(<8 x i32> %x, <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>)
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: int_rs_vshr:
+; CHECK: v{{[0-9]+}} = vshrl.u32 v{{[0-9]+}}, s{{[0-9]+}}
+define <8 x i32> @int_rs_vshr(<8 x i32> %x, i32 %y) {
+ %a = insertelement <8 x i32> undef, i32 %y, i32 0
+ %b = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %c = call <8 x i32> @llvm.tpu.shrl.v8i32(<8 x i32> %x, <8 x i32> %b)
+ ret <8 x i32> %c
+}
+
+; CHECK-LABEL: int_rr_vashr:
+; CHECK: v{{[0-9]+}} = vshra.s32 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i32> @int_rr_vashr(<8 x i32> %x, <8 x i32> %y) {
+ %a = call <8 x i32> @llvm.tpu.shra.v8i32(<8 x i32> %x, <8 x i32> %y)
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: int_ri_vashr:
+; CHECK: v{{[0-9]+}} = vshra.s32 v{{[0-9]+}}, $0x20
+define <8 x i32> @int_ri_vashr(<8 x i32> %x) {
+ %a = call <8 x i32> @llvm.tpu.shra.v8i32(<8 x i32> %x, <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>)
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: int_rs_vashr:
+; CHECK: v{{[0-9]+}} = vshra.s32 v{{[0-9]+}}, s{{[0-9]+}}
+define <8 x i32> @int_rs_vashr(<8 x i32> %x, i32 %y) {
+ %a = insertelement <8 x i32> undef, i32 %y, i32 0
+ %b = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %c = call <8 x i32> @llvm.tpu.shra.v8i32(<8 x i32> %x, <8 x i32> %b)
+ ret <8 x i32> %c
+}
+
+; CHECK-LABEL: int_pat_vshl:
+; CHECK: v{{[0-9]+}} = vshll.u32 v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK-NEXT: shalt
+define <8 x i32> @int_pat_vshl(<8 x i32> %x, <8 x i32> %y) {
+ %a = icmp ult <8 x i32> %y, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ %b = shl <8 x i32> %x, %y
+ %c = select <8 x i1> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i32> %c
+}
+
+; CHECK-LABEL: int_pat_vlshr:
+; CHECK: v{{[0-9]+}} = vshrl.u32 v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK-NEXT: shalt
+define <8 x i32> @int_pat_vlshr(<8 x i32> %x, <8 x i32> %y) {
+ %a = icmp ult <8 x i32> %y, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ %b = lshr <8 x i32> %x, %y
+ %c = select <8 x i1> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i32> %c
+}
+
+; CHECK-LABEL: int_pat_vashr:
+; CHECK: v{{[0-9]+}} = vshra.s32 v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK-NEXT: shalt
+define <8 x i32> @int_pat_vashr(<8 x i32> %x, <8 x i32> %y) {
+ %a = icmp ult <8 x i32> %y, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+ %b = select <8 x i1> %a, <8 x i32> %y, <8 x i32> <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+ %c = lshr <8 x i32> %x, %b
+ ret <8 x i32> %c
+}
+
+; CHECK-LABEL: vmaxf_v:
+; CHECK: v{{[0-9]+}} = vmax.f32 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x float> @vmaxf_v(<8 x float> %x, <8 x float> %y) {
+ %a = call <8 x float> @llvm.maximum.f32(<8 x float> %x, <8 x float> %y) readnone
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: vminf_v:
+; CHECK: v{{[0-9]+}} = vmin.f32 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x float> @vminf_v(<8 x float> %x, <8 x float> %y) {
+ %a = call <8 x float> @llvm.minimum.f32(<8 x float> %x, <8 x float> %y) readnone
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: vsmax_v:
+; CHECK: vm[[vm:[0-9]+]] = vgt.s32 v0, v1
+; CHECK: v0 = vsel vm[[vm]], v0, v1
+define <8 x i32> @vsmax_v(<8 x i32> %x, <8 x i32> %y) {
+ %a = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %x, <8 x i32> %y) readnone
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: vsmin_v:
+; CHECK: vm[[vm:[0-9]+]] = vlt.s32 v0, v1
+; CHECK: v0 = vsel vm[[vm]], v0, v1
+define <8 x i32> @vsmin_v(<8 x i32> %x, <8 x i32> %y) {
+ %a = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %x, <8 x i32> %y) readnone
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: vumax_v:
+; CHECK-VF: vm[[vm:[0-9]+]] = vgt.u32 v0, v1
+; CHECK-VF: v0 = vsel vm[[vm]], v0, v1
+; CHECK-GL: v0 = vmax.u32 v0, v1
+define <8 x i32> @vumax_v(<8 x i32> %x, <8 x i32> %y) {
+ %a = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %x, <8 x i32> %y) readnone
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: vumin_v:
+; CHECK-VF: vm[[vm:[0-9]+]] = vlt.u32 v0, v1
+; CHECK-VF: v0 = vsel vm[[vm]], v0, v1
+; CHECK-GL: v0 = vmin.u32 v0, v1
+define <8 x i32> @vumin_v(<8 x i32> %x, <8 x i32> %y) {
+ %a = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %x, <8 x i32> %y) readnone
+ ret <8 x i32> %a
+}
+
+; vcvt
+declare <8 x float> @llvm.tpu.vcvt.fptobf8(<8 x float>) nounwind
+declare <8 x float> @llvm.tpu.vcvt.fptoif8(<8 x float>) nounwind
+declare <8 x float> @llvm.tpu.vcvt.fptobf16(<8 x float>) nounwind
+declare <8 x float> @llvm.tpu.vcvt.fptohf16(<8 x float>) nounwind
+declare <8 x float> @llvm.tpu.vcvt.sr.fptobf8(<8 x i32>, <8 x float>) nounwind
+declare <8 x float> @llvm.tpu.vcvt.sr.fptoif8(<8 x i32>, <8 x float>) nounwind
+declare <8 x float> @llvm.tpu.vcvt.sr.fptobf16(<8 x i32>, <8 x float>) nounwind
+declare <8 x float> @llvm.tpu.vcvt.sr.fptohf16(<8 x i32>, <8 x float>) nounwind
+
+; CHECK-LABEL: fptobf8:
+; CHECK: vcvt.f32.bf8
+define <8 x float> @fptobf8(<8 x float> %x) {
+ %a = call <8 x float> @llvm.tpu.vcvt.fptobf8(<8 x float> %x)
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: fptoif8:
+; CHECK: vcvt.f32.if8
+define <8 x float> @fptoif8(<8 x float> %x) {
+ %a = call <8 x float> @llvm.tpu.vcvt.fptoif8(<8 x float> %x)
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: fptobf16:
+; CHECK: vcvt.f32.bf16
+define <8 x float> @fptobf16(<8 x float> %x) {
+ %a = call <8 x float> @llvm.tpu.vcvt.fptobf16(<8 x float> %x)
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: fptohf16:
+; CHECK: vcvt.f32.hf16
+define <8 x float> @fptohf16(<8 x float> %x) {
+ %a = call <8 x float> @llvm.tpu.vcvt.fptohf16(<8 x float> %x)
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: sr.fptobf8rr:
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.bf8 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x float> @sr.fptobf8rr(<8 x i32> %random, <8 x float> %x) {
+ %a = call <8 x float> @llvm.tpu.vcvt.sr.fptobf8(<8 x i32> %random, <8 x float> %x)
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: sr.fptobf8ir:
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.bf8 $0x1, v{{[0-9]+}}
+define <8 x float> @sr.fptobf8ir(<8 x float> %x) {
+ %v0 = insertelement <8 x i32> undef, i32 1, i32 0
+ %v1 = shufflevector <8 x i32> %v0, <8 x i32> undef, <8 x i32> zeroinitializer
+ %a = call <8 x float> @llvm.tpu.vcvt.sr.fptobf8(<8 x i32> %v1, <8 x float> %x)
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: sr.fptobf8sr:
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.bf8 s{{[0-9]+}}, v{{[0-9]+}}
+define <8 x float> @sr.fptobf8sr(i32 %y, <8 x float> %x) {
+ %v0 = insertelement <8 x i32> undef, i32 %y, i32 0
+ %v1 = shufflevector <8 x i32> %v0, <8 x i32> undef, <8 x i32> zeroinitializer
+ %a = call <8 x float> @llvm.tpu.vcvt.sr.fptobf8(<8 x i32> %v1, <8 x float> %x)
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: sr.fptoif8rr:
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.if8 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x float> @sr.fptoif8rr(<8 x i32> %random, <8 x float> %x) {
+ %a = call <8 x float> @llvm.tpu.vcvt.sr.fptoif8(<8 x i32> %random, <8 x float> %x)
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: sr.fptoif8ir:
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.if8 $0x1, v{{[0-9]+}}
+define <8 x float> @sr.fptoif8ir(<8 x float> %x) {
+ %v0 = insertelement <8 x i32> undef, i32 1, i32 0
+ %v1 = shufflevector <8 x i32> %v0, <8 x i32> undef, <8 x i32> zeroinitializer
+ %a = call <8 x float> @llvm.tpu.vcvt.sr.fptoif8(<8 x i32> %v1, <8 x float> %x)
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: sr.fptoif8sr:
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.if8 s{{[0-9]+}}, v{{[0-9]+}}
+define <8 x float> @sr.fptoif8sr(i32 %y, <8 x float> %x) {
+ %v0 = insertelement <8 x i32> undef, i32 %y, i32 0
+ %v1 = shufflevector <8 x i32> %v0, <8 x i32> undef, <8 x i32> zeroinitializer
+ %a = call <8 x float> @llvm.tpu.vcvt.sr.fptoif8(<8 x i32> %v1, <8 x float> %x)
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: sr.fptobf16rr:
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.bf16 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x float> @sr.fptobf16rr(<8 x i32> %random, <8 x float> %x) {
+ %a = call <8 x float> @llvm.tpu.vcvt.sr.fptobf16(<8 x i32> %random, <8 x float> %x)
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: sr.fptobf16ir:
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.bf16 $0x1, v{{[0-9]+}}
+define <8 x float> @sr.fptobf16ir(<8 x float> %x) {
+ %v0 = insertelement <8 x i32> undef, i32 1, i32 0
+ %v1 = shufflevector <8 x i32> %v0, <8 x i32> undef, <8 x i32> zeroinitializer
+ %a = call <8 x float> @llvm.tpu.vcvt.sr.fptobf16(<8 x i32> %v1, <8 x float> %x)
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: sr.fptobf16sr:
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.bf16 s{{[0-9]+}}, v{{[0-9]+}}
+define <8 x float> @sr.fptobf16sr(i32 %y, <8 x float> %x) {
+ %v0 = insertelement <8 x i32> undef, i32 %y, i32 0
+ %v1 = shufflevector <8 x i32> %v0, <8 x i32> undef, <8 x i32> zeroinitializer
+ %a = call <8 x float> @llvm.tpu.vcvt.sr.fptobf16(<8 x i32> %v1, <8 x float> %x)
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: sr.fptohf16rr:
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.hf16 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x float> @sr.fptohf16rr(<8 x i32> %random, <8 x float> %x) {
+ %a = call <8 x float> @llvm.tpu.vcvt.sr.fptohf16(<8 x i32> %random, <8 x float> %x)
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: sr.fptohf16ir:
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.hf16 $0x1, v{{[0-9]+}}
+define <8 x float> @sr.fptohf16ir(<8 x float> %x) {
+ %v0 = insertelement <8 x i32> undef, i32 1, i32 0
+ %v1 = shufflevector <8 x i32> %v0, <8 x i32> undef, <8 x i32> zeroinitializer
+ %a = call <8 x float> @llvm.tpu.vcvt.sr.fptohf16(<8 x i32> %v1, <8 x float> %x)
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: sr.fptohf16sr:
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.hf16 s{{[0-9]+}}, v{{[0-9]+}}
+define <8 x float> @sr.fptohf16sr(i32 %y, <8 x float> %x) {
+ %v0 = insertelement <8 x i32> undef, i32 %y, i32 0
+ %v1 = shufflevector <8 x i32> %v0, <8 x i32> undef, <8 x i32> zeroinitializer
+ %a = call <8 x float> @llvm.tpu.vcvt.sr.fptohf16(<8 x i32> %v1, <8 x float> %x)
+ ret <8 x float> %a
+}
+
+; Byte not zero
+declare <8 x i1> @llvm.tpu.byte.not.zero.v8i1.v8i32(<8 x i32>) readnone
+
+; CHECK-LABEL: bytenotzero:
+; CHECK: vm{{[0-9]+}} = vnez.u8 v{{[0-9]+}}
+define <8 x i1> @bytenotzero(<8 x i32> %x) {
+ %a = call <8 x i1> @llvm.tpu.byte.not.zero.v8i1.v8i32(<8 x i32> %x)
+ ret <8 x i1> %a
+}
+
+; CHECK-LABEL: bytenotzero_sreg_splat:
+; CHECK: vm{{[0-9]+}} = vnez.u8 s{{[0-9]+}}
+define <8 x i1> @bytenotzero_sreg_splat(i32 %y) {
+ %v0 = insertelement <8 x i32> undef, i32 %y, i32 0
+ %v1 = shufflevector <8 x i32> %v0, <8 x i32> undef, <8 x i32> zeroinitializer
+ %a = call <8 x i1> @llvm.tpu.byte.not.zero.v8i1.v8i32(<8 x i32> %v1)
+ ret <8 x i1> %a
+}
+
+; Total order and class compares
+declare <8 x i1> @llvm.tpu.vlt.to(<8 x float>, <8 x float>) readnone
+declare <8 x i1> @llvm.tpu.vle.to(<8 x float>, <8 x float>) readnone
+declare <8 x i1> @llvm.tpu.vclass(<8 x float>, <8 x float>) readnone
+
+; CHECK-LABEL: vltto:
+; CHECK: vm{{[0-9]+}} = vlt.to.f32 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i1> @vltto(<8 x float> %x, <8 x float> %y) {
+ %a = call <8 x i1> @llvm.tpu.vlt.to(<8 x float> %x, <8 x float> %y)
+ ret <8 x i1> %a
+}
+
+; CHECK-LABEL: vltto_splat:
+; CHECK: vm{{[0-9]+}} = vlt.to.f32 v{{[0-9]+}}, $1.0
+define <8 x i1> @vltto_splat(<8 x float> %x) {
+ %v0 = insertelement <8 x float> undef, float 1.0, i32 0
+ %v1 = shufflevector <8 x float> %v0, <8 x float> undef, <8 x i32> zeroinitializer
+ %a = call <8 x i1> @llvm.tpu.vlt.to(<8 x float> %x, <8 x float> %v1)
+ ret <8 x i1> %a
+}
+
+; CHECK-LABEL: vltto_sreg_splat:
+; CHECK: vm{{[0-9]+}} = vlt.to.f32 v{{[0-9]+}}, s{{[0-9]+}}
+define <8 x i1> @vltto_sreg_splat(<8 x float> %x, float %y) {
+ %v0 = insertelement <8 x float> undef, float %y, i32 0
+ %v1 = shufflevector <8 x float> %v0, <8 x float> undef, <8 x i32> zeroinitializer
+ %a = call <8 x i1> @llvm.tpu.vlt.to(<8 x float> %x, <8 x float> %v1)
+ ret <8 x i1> %a
+}
+
+; CHECK-LABEL: vleto:
+; CHECK: vm{{[0-9]+}} = vle.to.f32 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i1> @vleto(<8 x float> %x, <8 x float> %y) {
+ %a = call <8 x i1> @llvm.tpu.vle.to(<8 x float> %x, <8 x float> %y)
+ ret <8 x i1> %a
+}
+
+; CHECK-LABEL: vleto_splat:
+; CHECK: vm{{[0-9]+}} = vle.to.f32 v{{[0-9]+}}, $1.0
+define <8 x i1> @vleto_splat(<8 x float> %x) {
+ %v0 = insertelement <8 x float> undef, float 1.0, i32 0
+ %v1 = shufflevector <8 x float> %v0, <8 x float> undef, <8 x i32> zeroinitializer
+ %a = call <8 x i1> @llvm.tpu.vle.to(<8 x float> %x, <8 x float> %v1)
+ ret <8 x i1> %a
+}
+
+; CHECK-LABEL: vleto_sreg_splat:
+; CHECK: vm{{[0-9]+}} = vle.to.f32 v{{[0-9]+}}, s{{[0-9]+}}
+define <8 x i1> @vleto_sreg_splat(<8 x float> %x, float %y) {
+ %v0 = insertelement <8 x float> undef, float %y, i32 0
+ %v1 = shufflevector <8 x float> %v0, <8 x float> undef, <8 x i32> zeroinitializer
+ %a = call <8 x i1> @llvm.tpu.vle.to(<8 x float> %x, <8 x float> %v1)
+ ret <8 x i1> %a
+}
+
+; CHECK-LABEL: vclass:
+; CHECK: vm{{[0-9]+}} = vclass.f32 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x i1> @vclass(<8 x float> %x, <8 x float> %y) {
+ %a = call <8 x i1> @llvm.tpu.vclass(<8 x float> %x, <8 x float> %y)
+ ret <8 x i1> %a
+}
+
+; CHECK-LABEL: vclass_splat:
+; CHECK: vm{{[0-9]+}} = vclass.f32 v{{[0-9]+}}, $1.0
+define <8 x i1> @vclass_splat(<8 x float> %x) {
+ %v0 = insertelement <8 x float> undef, float 1.0, i32 0
+ %v1 = shufflevector <8 x float> %v0, <8 x float> undef, <8 x i32> zeroinitializer
+ %a = call <8 x i1> @llvm.tpu.vclass(<8 x float> %x, <8 x float> %v1)
+ ret <8 x i1> %a
+}
+
+; CHECK-LABEL: vclass_sreg_splat:
+; CHECK: vm{{[0-9]+}} = vclass.f32 v{{[0-9]+}}, s{{[0-9]+}}
+define <8 x i1> @vclass_sreg_splat(<8 x float> %x, float %y) {
+ %v0 = insertelement <8 x float> undef, float %y, i32 0
+ %v1 = shufflevector <8 x float> %v0, <8 x float> undef, <8 x i32> zeroinitializer
+ %a = call <8 x i1> @llvm.tpu.vclass(<8 x float> %x, <8 x float> %v1)
+ ret <8 x i1> %a
+}
+
+; vceil
+declare <8 x float> @llvm.ceil.v8f32(<8 x float> %Val)
+
+; CHECK-LABEL: vceilf:
+; CHECK: v{{[0-9]+}} = vceil.f32 v{{[0-9]+}}
+define <8 x float> @vceilf(<8 x float> %y) {
+ %res = call <8 x float> @llvm.ceil.v8f32(<8 x float> %y)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: vceilf_splat:
+; CHECK: v{{[0-9]+}} = vimm.f32 $13.0
+define <8 x float> @vceilf_splat() {
+
+ %v0 = insertelement <8 x float> undef, float 12.5, i32 0
+ %v1 = shufflevector <8 x float> %v0, <8 x float> undef, <8 x i32> zeroinitializer
+ %res = call <8 x float> @llvm.ceil.v8f32(<8 x float> %v1)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: vceilf_splat_sreg:
+; CHECK: v[[x:[0-9]+]] = vmov s0
+; CHECK: v{{[0-9]+}} = vceil.f32 v[[x]];
+define <8 x float> @vceilf_splat_sreg(float %y) {
+ %v0 = insertelement <8 x float> undef, float %y, i32 0
+ %v1 = shufflevector <8 x float> %v0, <8 x float> undef, <8 x i32> zeroinitializer
+ %res = call <8 x float> @llvm.ceil.v8f32(<8 x float> %v1)
+ ret <8 x float> %res
+}
+
+; vfloor
+declare <8 x float> @llvm.floor.v8f32(<8 x float> %Val)
+
+; CHECK-LABEL: vfloorf:
+; CHECK: v{{[0-9]+}} = vfloor.f32 v{{[0-9]+}}
+define <8 x float> @vfloorf(<8 x float> %y) {
+ %res = call <8 x float> @llvm.floor.v8f32(<8 x float> %y)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: vfloorf_splat:
+; CHECK: v{{[0-9]+}} = vimm.f32 $12.0
+define <8 x float> @vfloorf_splat() {
+ %v0 = insertelement <8 x float> undef, float 12.5, i32 0
+ %v1 = shufflevector <8 x float> %v0, <8 x float> undef, <8 x i32> zeroinitializer
+ %res = call <8 x float> @llvm.floor.v8f32(<8 x float> %v1)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: vfloorf_splat_sreg:
+; CHECK: v[[x:[0-9]+]] = vmov s0
+; CHECK: v{{[0-9]+}} = vfloor.f32 v[[x]];
+define <8 x float> @vfloorf_splat_sreg(float %y) {
+ %v0 = insertelement <8 x float> undef, float %y, i32 0
+ %v1 = shufflevector <8 x float> %v0, <8 x float> undef, <8 x i32> zeroinitializer
+ %res = call <8 x float> @llvm.floor.v8f32(<8 x float> %v1)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: vshift_inserti_0i:
+; CHECK: v{{[0-9]+}} = vshift.insert v0, v1, $0x0;
+define <8 x i32> @vshift_inserti_0i(<8 x i32> %v0, <8 x i32> %v1) {
+ %a = call <8 x i32> @llvm.tpu.vshift.insert.v8i32(<8 x i32> %v0, <8 x i32> %v1, i32 0) readnone
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: vshift_insertf_0i:
+; CHECK: v{{[0-9]+}} = vshift.insert v0, v1, $0x0;
+define <8 x float> @vshift_insertf_0i(<8 x float> %v0, <8 x float> %v1) {
+ %a = call <8 x float> @llvm.tpu.vshift.insert.v8f32(<8 x float> %v0, <8 x float> %v1, i32 0) readnone
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: vshift_inserti_8i:
+; CHECK: v{{[0-9]+}} = vshift.insert v0, v1, $0x8;
+define <8 x i32> @vshift_inserti_8i(<8 x i32> %v0, <8 x i32> %v1) {
+ %a = call <8 x i32> @llvm.tpu.vshift.insert.v8i32(<8 x i32> %v0, <8 x i32> %v1, i32 8) readnone
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: vshift_insertf_8i:
+; CHECK: v{{[0-9]+}} = vshift.insert v0, v1, $0x8;
+define <8 x float> @vshift_insertf_8i(<8 x float> %v0, <8 x float> %v1) {
+ %a = call <8 x float> @llvm.tpu.vshift.insert.v8f32(<8 x float> %v0, <8 x float> %v1, i32 8) readnone
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: vshift_inserti_9i:
+; CHECK: v{{[0-9]+}} = vshift.insert v0, v1, s0;
+define <8 x i32> @vshift_inserti_9i(<8 x i32> %v0, <8 x i32> %v1) {
+ %a = call <8 x i32> @llvm.tpu.vshift.insert.v8i32(<8 x i32> %v0, <8 x i32> %v1, i32 9) readnone
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: vshift_insertf_9i:
+; CHECK: v{{[0-9]+}} = vshift.insert v0, v1, s0;
+define <8 x float> @vshift_insertf_9i(<8 x float> %v0, <8 x float> %v1) {
+ %a = call <8 x float> @llvm.tpu.vshift.insert.v8f32(<8 x float> %v0, <8 x float> %v1, i32 9) readnone
+ ret <8 x float> %a
+}
+
+declare <8 x i32> @llvm.tpu.sc.permute.v8i32(<8 x i32> %x, <8 x i32> %y) readnone
+declare <8 x float> @llvm.tpu.sc.permute.v8f32(<8 x float> %x, <8 x i32> %y) readnone
+
+; CHECK-LABEL: vperm:
+; CHECK: v{{[0-9]+}} = vperm.xlane v0, v1
+define <8 x i32> @vperm(<8 x i32> %x, <8 x i32> %y) {
+ %r = call <8 x i32> @llvm.tpu.sc.permute.v8i32(<8 x i32> %x, <8 x i32> %y) readnone
+ ret <8 x i32> %r
+}
+
+; CHECK-LABEL: vpermf:
+; CHECK: v{{[0-9]+}} = vperm.xlane v0, v1
+define <8 x float> @vpermf(<8 x float> %x, <8 x i32> %y) {
+ %r = call <8 x float> @llvm.tpu.sc.permute.v8f32(<8 x float> %x, <8 x i32> %y) readnone
+ ret <8 x float> %r
+}
+
+; CHECK-LABEL: shuffle_permute:
+; CHECK: v{{[0-9]+}} = vperm.xlane v0, v1
+define <8 x i32> @shuffle_permute(<8 x i32> %x, <8 x i32> %m) {
+ %r = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 6, i32 5, i32 4, i32 3, i32 7>
+ ret <8 x i32> %r
+}
+
+; CHECK-LABEL: shuffle_permutef:
+; CHECK: v{{[0-9]+}} = vperm.xlane v0, v1
+define <8 x float> @shuffle_permutef(<8 x float> %x, <8 x i32> %m) {
+ %r = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 6, i32 5, i32 4, i32 3, i32 7>
+ ret <8 x float> %r
+}
+
+; CHECK-LABEL: vabsi:
+; CHECK-VF: v[[v0:[0-9]+]] = vshra.s32 v0, $0x1f
+; CHECK-VF: v[[v1:[0-9]+]] = vxor.u32 v[[v0]], v0
+; CHECK-VF: v{{[0-9]+}} = vsub.s32 v[[v1]], v1
+; CHECK-GL: v[[v0:[0-9]+]] = vsub.s32 $0x0, v0
+; CHECK-GL: v{{[0-9]+}} = vmin.u32 v0, v[[v0]]
+define <8 x i32> @vabsi(<8 x i32> %x) {
+ %a = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %x, i1 0) readnone
+ ret <8 x i32> %a
+}
+
+; CHECK-LABEL: vabsf:
+; CHECK: v{{[0-9]+}} = vand.u32 $0x7fffffff, v0
+define <8 x float> @vabsf(<8 x float> %x) {
+ %a = call <8 x float> @llvm.fabs.v8f32(<8 x float> %x) readnone
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: vcopysign_opt:
+; CHECK-NOT: vand
+; CHECK-NOT: vor
+; CHECK: shalt
+define <8 x float> @vcopysign_opt(<8 x float> %x) {
+ %a = call <8 x float> @llvm.copysign.v8f32(<8 x float> %x, <8 x float> %x) readnone
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: vcopysign:
+; CHECK: v[[v0:[0-9]+]] = vand.u32 $-0x80000000, v1
+; CHECK: v[[v1:[0-9]+]] = vand.u32 $0x7fffffff, v0
+; CHECK: v{{[0-9]+}} = vor.u32 v[[v0]], v[[v1]]
+define <8 x float> @vcopysign(<8 x float> %x, <8 x float> %y) {
+ %a = call <8 x float> @llvm.copysign.v8f32(<8 x float> %x, <8 x float> %y) readnone
+ ret <8 x float> %a
+}
+
+declare <8 x float> @llvm.tpu.clamp.v8f32(<8 x float>, <8 x float>, <8 x float>)
+
+; CHECK-LABEL: clampi
+; CHECK: v[[v:[0-9]+]] = vmin.f32 v2, v1
+; CHECK: v0 = vmax.f32 v[[v]], v0
+define <8 x float> @clampi(<8 x float> %min, <8 x float> %max, <8 x float> %x) {
+entry:
+ %0 = tail call <8 x float> @llvm.tpu.clamp.v8f32(<8 x float> %min, <8 x float> %x, <8 x float> %max)
+ ret <8 x float> %0
+}
+
+; CHECK: clamps_ii
+; CHECK: v0 = vclamps.f32 v0, $1.0
+define <8 x float> @clamps_ii(<8 x float> %x) {
+entry:
+ %0 = tail call <8 x float> @llvm.tpu.clamp.v8f32(<8 x float> <float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0>, <8 x float> %x, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
+ ret <8 x float> %0
+}
+
+; CHECK: clamps_rr
+; CHECK: v0 = vclamps.f32 v0, v1
+define <8 x float> @clamps_rr(<8 x float> %x, <8 x float> %max) {
+entry:
+ %min = fneg <8 x float> %max
+ %0 = tail call <8 x float> @llvm.tpu.clamp.v8f32(<8 x float> %min, <8 x float> %x, <8 x float> %max)
+ ret <8 x float> %0
+}
+
+; CHECK: relu_r
+; CHECK: v0 = vclamp.gez.f32 v0, v1
+define <8 x float> @relu_r(<8 x float> %x, <8 x float> %max) {
+entry:
+ %0 = tail call <8 x float> @llvm.tpu.clamp.v8f32(<8 x float> zeroinitializer, <8 x float> %x, <8 x float> %max)
+ ret <8 x float> %0
+}
+
+; CHECK: relu_i
+; CHECK: v0 = vclamp.gez.f32 v0, v1
+define <8 x float> @relu_i(<8 x float> %x, <8 x float> %max) {
+entry:
+ %0 = tail call <8 x float> @llvm.tpu.clamp.v8f32(<8 x float> zeroinitializer, <8 x float> %x, <8 x float> <float 0x36B0000000000000, float 0x36B0000000000000, float 0x36B0000000000000, float 0x36B0000000000000, float 0x36B0000000000000, float 0x38B0000000000000, float 0x36B0000000000000, float 0x37B0000000000000>)
+ ret <8 x float> %0
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_tc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_tc.ll
new file mode 100644
index 0000000..1bc19c7
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_tc.ll
@@ -0,0 +1,562 @@
+; RUN: llc < %s -mcpu=tensorcore-jf -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+; Test that basic 32-bit integer operations assemble as expected.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; CHECK-LABEL: vaddi:
+; CHECK: v{{[0-9]+}} = vadd.s32 v{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x i32> @vaddi(<1024 x i32> %x, <1024 x i32> %y) {
+ %a = add <1024 x i32> %x, %y
+ ret <1024 x i32> %a
+}
+
+; CHECK-LABEL: vaddf:
+; CHECK: v{{[0-9]+}} = vadd.f32 v{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x float> @vaddf(<1024 x float> %x, <1024 x float> %y) {
+ %a = fadd <1024 x float> %x, %y
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: vaddi_splat:
+; CHECK: v{{[0-9]+}} = vadd.s32 $0x1, v{{[0-9]+}}
+define <1024 x i32> @vaddi_splat(<1024 x i32> %x, <1024 x i32> %y) {
+ %v0 = insertelement <1024 x i32> undef, i32 1, i32 0
+ %v1 = shufflevector <1024 x i32> %v0, <1024 x i32> undef, <1024 x i32> zeroinitializer ; create vector of all 1
+ %a = add <1024 x i32> %x, %v1
+ ret <1024 x i32> %a
+}
+
+; CHECK-LABEL: vaddf_splat:
+; CHECK: v{{[0-9]+}} = vadd.f32 $1.0, v{{[0-9]+}}
+define <1024 x float> @vaddf_splat(<1024 x float> %x, <1024 x float> %y) {
+ %v0 = insertelement <1024 x float> undef, float 1.0, i32 0
+ %v1 = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer ; create vector of all 1.0
+ %a = fadd <1024 x float> %x, %v1
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: vaddf_splat_sreg:
+; CHECK-NOT: v2f
+; CHECK: v{{[0-9]+}} = vadd.f32 s{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x float> @vaddf_splat_sreg(<1024 x float> %x, float %y) {
+ %v0 = insertelement <1024 x float> undef, float %y, i32 0
+ %v1 = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer
+ %a = fadd <1024 x float> %x, %v1
+ ret <1024 x float> %a
+}
+
+
+; CHECK-LABEL: vaddi_splat_sreg:
+; CHECK: v{{[0-9]+}} = vadd.s32 s{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x i32> @vaddi_splat_sreg(<1024 x i32> %x, i32 %y) {
+ %v0 = insertelement <1024 x i32> undef, i32 %y, i32 0
+ %v1 = shufflevector <1024 x i32> %v0, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %a = add <1024 x i32> %x, %v1
+ ret <1024 x i32> %a
+}
+
+; CHECK-LABEL: vsubi_splat_sreg:
+; CHECK: v{{[0-9]+}} = vsub.s32 s{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x i32> @vsubi_splat_sreg(<1024 x i32> %x, i32 %y) {
+ %v0 = insertelement <1024 x i32> undef, i32 %y, i32 0
+ %v1 = shufflevector <1024 x i32> %v0, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %a = sub <1024 x i32> %v1, %x
+ ret <1024 x i32> %a
+}
+
+; CHECK-LABEL: vsubf_splat_sreg:
+; CHECK: v{{[0-9]+}} = vsub.f32 s{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x float> @vsubf_splat_sreg(<1024 x float> %x, float %y) {
+ %v0 = insertelement <1024 x float> undef, float %y, i32 0
+ %v1 = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer
+ %a = fsub <1024 x float> %v1, %x
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: vmovf_splat_sreg:
+; CHECK: v{{[0-9]+}} = vmov s{{[0-9]+}}
+define <1024 x float> @vmovf_splat_sreg(float %y) {
+ %v0 = insertelement <1024 x float> undef, float %y, i32 0
+ %v1 = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer
+ ret <1024 x float> %v1
+}
+
+; CHECK-LABEL: vmovi_splat_sreg:
+; CHECK: v{{[0-9]+}} = vmov s{{[0-9]+}}
+define <1024 x i32> @vmovi_splat_sreg(i32 %y) {
+ %v0 = insertelement <1024 x i32> undef, i32 %y, i32 0
+ %v1 = shufflevector <1024 x i32> %v0, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ ret <1024 x i32> %v1
+}
+
+; CHECK-LABEL: vseli:
+; CHECK: v{{[0-9]+}} = vsel vm{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x i32> @vseli(<1024 x i1> %m, <1024 x i32> %x, <1024 x i32> %y) {
+ %a = select <1024 x i1> %m, <1024 x i32> %x, <1024 x i32> %y
+ ret <1024 x i32> %a
+}
+
+; CHECK-LABEL: vseli_splat_sreg:
+; CHECK: v{{[0-9]+}} = vsel vm{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x i32> @vseli_splat_sreg(<1024 x i1> %m, i32 %y, <1024 x i32> %x) {
+ %y1 = insertelement <1024 x i32> undef, i32 %y, i32 0
+ %y2 = shufflevector <1024 x i32> %y1, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %a = select <1024 x i1> %m, <1024 x i32> %y2, <1024 x i32> %x
+ ret <1024 x i32> %a
+}
+
+; CHECK-LABEL: vseli_splat_imm:
+; CHECK: v{{[0-9]+}} = vsel vm{{[0-9]+}}, $0x5, v{{[0-9]+}}
+define <1024 x i32> @vseli_splat_imm(<1024 x i1> %m, <1024 x i32> %x) {
+ %v0 = insertelement <1024 x i32> undef, i32 5, i32 0
+ %v1 = shufflevector <1024 x i32> %v0, <1024 x i32> undef, <1024 x i32> zeroinitializer ; create vector of all 5
+ %a = select <1024 x i1> %m, <1024 x i32> %v1, <1024 x i32> %x
+ ret <1024 x i32> %a
+}
+
+; CHECK-LABEL: vself:
+; CHECK: v{{[0-9]+}} = vsel vm{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x float> @vself(<1024 x i1> %m, <1024 x float> %x, <1024 x float> %y) {
+ %a = select <1024 x i1> %m, <1024 x float> %x, <1024 x float> %y
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: vself_splat_sreg:
+; CHECK: v{{[0-9]+}} = vsel vm{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x float> @vself_splat_sreg(<1024 x i1> %m, float %y, <1024 x float> %x) {
+ %y1 = insertelement <1024 x float> undef, float %y, i32 0
+ %y2 = shufflevector <1024 x float> %y1, <1024 x float> undef, <1024 x i32> zeroinitializer
+ %a = select <1024 x i1> %m, <1024 x float> %y2, <1024 x float> %x
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: vself_splat_imm:
+; CHECK: v{{[0-9]+}} = vsel vm{{[0-9]+}}, $0x40a00000, v{{[0-9]+}}
+define <1024 x float> @vself_splat_imm(<1024 x i1> %m, <1024 x float> %x) {
+ %v0 = insertelement <1024 x float> undef, float 5.0, i32 0
+ %v1 = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer ; create vector of all 5.0
+ %a = select <1024 x i1> %m, <1024 x float> %v1, <1024 x float> %x
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: vsitofp:
+; CHECK: v{{[0-9]+}} = vcvt.s32.f32 v{{[0-9]+}}
+define <1024 x float> @vsitofp(<1024 x i32> %x) {
+ %a = sitofp <1024 x i32> %x to <1024 x float>
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: vfptosi:
+; CHECK: v{{[0-9]+}} = vcvt.f32.s32 v{{[0-9]+}}, $-0x1
+define <1024 x i32> @vfptosi(<1024 x float> %x) {
+ %a = fptosi <1024 x float> %x to <1024 x i32>
+ ret <1024 x i32> %a
+}
+
+declare <1024 x float> @llvm.minimum.f32(<1024 x float> %x, <1024 x float> %y) readnone
+declare <1024 x float> @llvm.maximum.f32(<1024 x float> %x, <1024 x float> %y) readnone
+
+; CHECK-LABEL: vrelu:
+; CHECK: v{{[0-9]+}} = vclamp.gez.f32 v{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x float> @vrelu(<1024 x float> %x, <1024 x float> %y) {
+ %a = call <1024 x float> @llvm.minimum.f32(<1024 x float> %x, <1024 x float> %y) readnone
+ %b = call <1024 x float> @llvm.maximum.f32(<1024 x float> %a, <1024 x float> zeroinitializer) readnone
+ ret <1024 x float> %b
+}
+
+; CHECK-LABEL: vrelu_s:
+; CHECK: v{{[0-9]+}} = vclamp.gez.f32 v{{[0-9]+}}, s{{[0-9]+}}
+define <1024 x float> @vrelu_s(<1024 x float> %x, float %y) {
+ %y1 = insertelement <1024 x float> undef, float %y, i32 0
+ %y2 = shufflevector <1024 x float> %y1, <1024 x float> undef, <1024 x i32> zeroinitializer
+ %a = call <1024 x float> @llvm.minimum.f32(<1024 x float> %x, <1024 x float> %y2) readnone
+ %b = call <1024 x float> @llvm.maximum.f32(<1024 x float> %a, <1024 x float> zeroinitializer) readnone
+ ret <1024 x float> %b
+}
+
+; CHECK-LABEL: vrelu_i:
+; CHECK: v{{[0-9]+}} = vclamp.gez.f32 v{{[0-9]+}}, $4
+define <1024 x float> @vrelu_i(<1024 x float> %x) {
+ %y1 = insertelement <1024 x float> undef, float 4.0, i32 0
+ %y2 = shufflevector <1024 x float> %y1, <1024 x float> undef, <1024 x i32> zeroinitializer
+ %a = call <1024 x float> @llvm.minimum.f32(<1024 x float> %x, <1024 x float> %y2) readnone
+ %b = call <1024 x float> @llvm.maximum.f32(<1024 x float> %a, <1024 x float> zeroinitializer) readnone
+ ret <1024 x float> %b
+}
+
+declare <1024 x i32> @llvm.ctpop.i32(<1024 x i32> %x) readnone
+; CHECK-LABEL: vpopcnt:
+; CHECK: vpcnt
+define <1024 x i32> @vpopcnt(<1024 x i32> %x) {
+ %a = call <1024 x i32> @llvm.ctpop.i32(<1024 x i32> %x) readnone
+ ret <1024 x i32> %a
+}
+
+declare <1024 x i32> @llvm.ctlz.i32(<1024 x i32> %x) readnone
+; CHECK-LABEL: vclz:
+; CHECK: vclz
+define <1024 x i32> @vclz(<1024 x i32> %x) {
+ %a = call <1024 x i32> @llvm.ctlz.i32(<1024 x i32> %x) readnone
+ ret <1024 x i32> %a
+}
+
+; FIXME(b/239605457): Re-enable.
+; CHECK-LABEL: build_vector:
+; CHECK-DAG-DISABLE: vlaneseq
+; CHECK-DAG-DISABLE: [[x:v[0-9]+]] = vmov s0
+; CHECK-DAG-DISABLE: veq.s32 v{{[0-9]+}}, $0x1
+; CHECK-DISABLE: vsel vm0, s1, [[x]]
+define <1024 x i32> @build_vector(i32 %a, i32 %b, i32 %c, i32 %d) {
+ %v1 = insertelement <1024 x i32> undef, i32 %a, i32 0
+ %v2 = insertelement <1024 x i32> %v1, i32 %b, i32 1
+ %v3 = insertelement <1024 x i32> %v2, i32 %c, i32 2
+ %v4 = insertelement <1024 x i32> %v3, i32 %d, i32 3
+ %v5 = insertelement <1024 x i32> %v4, i32 67, i32 4
+ %v6 = insertelement <1024 x i32> %v5, i32 47, i32 5
+ %v7 = insertelement <1024 x i32> %v6, i32 37, i32 6
+ %v8 = insertelement <1024 x i32> %v7, i32 23, i32 7
+ ret <1024 x i32> %v8
+}
+
+
+; CHECK-LABEL: insert_element:
+; CHECK: vlaneseq
+; CHECK: veq.s32 v1, $0x3
+; CHECK: vsel vm0, s0, v0
+define <1024 x i32> @insert_element(<1024 x i32> %v, i32 %b) {
+ %v2 = insertelement <1024 x i32> %v, i32 %b, i32 3
+ ret <1024 x i32> %v2
+}
+
+; CHECK-LABEL: vmread:
+; CHECK: v{{[0-9]+}} = vimm.s32 $0x0
+; CHECK: vsel
+; CHECK: shalt
+define <1024 x i32> @vmread(<1024 x i1> %a) {
+entry:
+ %b = zext <1024 x i1> %a to <1024 x i32>
+ ret <1024 x i32> %b
+}
+
+; CHECK-LABEL: loadimm:
+; CHECK: v{{[0-9]+}} = vld [vmem:$0x4]
+define <1024 x float> @loadimm() {
+entry:
+ %a = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 4)
+ %b = load <1024 x float>, <1024 x float> addrspace(205)* %a
+ ret <1024 x float> %b
+}
+
+; CHECK-LABEL: storeimm:
+; CHECK: [vmem:$0x4] = vst v{{[0-9]+}}
+define void @storeimm(<1024 x i32> %x) {
+entry:
+ %a = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 4)
+ store <1024 x i32> %x, <1024 x i32> addrspace(205)* %a
+ ret void
+}
+
+declare <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32) nounwind
+declare <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32) nounwind
+
+; CHECK-LABEL: bitcast1:
+; CHECK: v{{[0-9]+}} = vadd.f32 v{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x float> @bitcast1(<1024 x float> %x, <1024 x float> %y) {
+ %a = fadd <1024 x float> %x, %y
+ %b = fsub <1024 x float> %x, %y
+ %c = bitcast <1024 x float> %a to <1024 x i32>
+ %d = bitcast <1024 x float> %b to <1024 x i32>
+ %e = and <1024 x i32> %c, %d
+ %f = bitcast <1024 x i32> %e to <1024 x float>
+ ret <1024 x float> %f
+}
+
+; CHECK-LABEL: scalar_sel:
+; CHECK: v{{[0-9]+}} = vmov @!p{{[0-9]+}} v{{[0-9]+}}
+define <1024 x i32> @scalar_sel(i1 %m, <1024 x i32> %x, <1024 x i32> %y) {
+ %a = select i1 %m, <1024 x i32> %x, <1024 x i32> %y
+ ret <1024 x i32> %a
+}
+
+; CHECK-LABEL: scalar_sel_imm:
+; CHECK: v{{[0-9]+}} = vimm.s32 @!p{{[0-9]+}} $0x1
+define <1024 x i32> @scalar_sel_imm(i1 %m, <1024 x i32> %x) {
+ %v0 = insertelement <1024 x i32> undef, i32 1, i32 0
+ %v1 = shufflevector <1024 x i32> %v0, <1024 x i32> undef, <1024 x i32> zeroinitializer ; create vector of all 1
+ %a = select i1 %m, <1024 x i32> %x, <1024 x i32> %v1
+ ret <1024 x i32> %a
+}
+
+; CHECK-LABEL: scalar_sel_f:
+; CHECK: v{{[0-9]+}} = vmov @!p{{[0-9]+}} v{{[0-9]+}}
+define <1024 x float> @scalar_sel_f(i1 %m, <1024 x float> %x, <1024 x float> %y) {
+ %a = select i1 %m, <1024 x float> %x, <1024 x float> %y
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: scalar_sel_imm_f:
+; CHECK: v{{[0-9]+}} = vimm.f32 @!p{{[0-9]+}} $1
+define <1024 x float> @scalar_sel_imm_f(i1 %m, <1024 x float> %x, <1024 x float> %y) {
+ %v0 = insertelement <1024 x float> undef, float 1.0, i32 0
+ %v1 = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer ; create vector of all 1
+ %a = select i1 %m, <1024 x float> %x, <1024 x float> %v1
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: scalar_sel_imm_f2:
+; CHECK: v{{[0-9]+}} = vimm.f32 @p{{[0-9]+}} $1
+define <1024 x float> @scalar_sel_imm_f2(i1 %m, <1024 x float> %x, <1024 x float> %y) {
+ %v0 = insertelement <1024 x float> undef, float 1.0, i32 0
+ %v1 = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer ; create vector of all 1
+ %a = select i1 %m, <1024 x float> %v1, <1024 x float> %x
+ ret <1024 x float> %a
+}
+
+; Check that, in the case where we cannot use a single predicated move,
+; we generate two moves (see the note after this test).
+; CHECK-LABEL: scalar_sel_two_moves:
+; CHECK: v0 = vmov v1
+; CHECK: v0 = vimm.f32 @p{{[0-9]+}} $1
+define <1024 x float> @scalar_sel_two_moves(i1 %m, <1024 x float> %x, <1024 x float> %y) {
+ %v0 = insertelement <1024 x float> undef, float 1.0, i32 0
+ %v1 = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer ; create vector of all 1
+ %a = select i1 %m, <1024 x float> %v1, <1024 x float> %y
+ ret <1024 x float> %a
+}
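Note: a predicated instruction only merges into its own destination register, which is why a single predicated immediate move is not enough here. A tiny Python model of the scalar-predicated write, illustrative only and not part of the CL:

def predicated_write(old_dest, new_value, pred):
    # With a scalar @p predicate, a false predicate means the write simply
    # does not happen and the destination keeps its previous contents.
    return new_value if pred else old_dest

In scalar_sel_imm_f2 the else operand (%x) already lives in the result register v0, so one vimm.f32 @p suffices; in scalar_sel_two_moves the else operand (%y) arrives in v1, so the compiler first copies it into v0 unconditionally and then applies the predicated immediate.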
+
+declare <1024 x i32> @llvm.tpu.significand(<1024 x float>) nounwind
+declare <1024 x i32> @llvm.tpu.exponent(<1024 x float>) nounwind
+declare <1024 x float> @llvm.tpu.compose(<1024 x float>, <1024 x float>) nounwind
+declare <1024 x float> @llvm.tpu.pack(<1024 x float>, <1024 x float>)
+declare <1024 x float> @llvm.tpu.packc(<1024 x float>, <1024 x float>)
+declare <1024 x float> @llvm.tpu.unpacku(<1024 x float>) nounwind
+declare <1024 x float> @llvm.tpu.unpackl(<1024 x float>) nounwind
+
+; CHECK-LABEL: significand:
+; CHECK: v{{[0-9]+}} = vf32.s.s32 v{{[0-9]+}}
+define <1024 x i32> @significand(<1024 x float> %a) {
+ %r = call <1024 x i32> @llvm.tpu.significand(<1024 x float> %a)
+ ret <1024 x i32> %r
+}
+
+; CHECK-LABEL: vexponent:
+; CHECK: v{{[0-9]+}} = vf32.e.s32 v{{[0-9]+}}
+define <1024 x i32> @vexponent(<1024 x float> %x) {
+ %a = call <1024 x i32> @llvm.tpu.exponent(<1024 x float> %x) readnone
+ ret <1024 x i32> %a
+}
+
+; CHECK-LABEL: vcompose:
+; CHECK: v{{[0-9]+}} = vf32.f32 v0, v1
+define <1024 x float> @vcompose(<1024 x float> %x, <1024 x float> %y) {
+ %a = call <1024 x float> @llvm.tpu.compose(<1024 x float> %x, <1024 x float> %y) readnone
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: vpack:
+; CHECK: v{{[0-9]+}} = vpack.f32.f16 v0, v1
+define <1024 x float> @vpack(<1024 x float> %x, <1024 x float> %y) {
+ %a = call <1024 x float> @llvm.tpu.pack(<1024 x float> %x, <1024 x float> %y) readnone
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: vpack_scalar:
+; CHECK: v{{[0-9]+}} = vpack.f32.f16 s0, v0
+define <1024 x float> @vpack_scalar(<1024 x float> %x, float %y) {
+ %v0 = insertelement <1024 x float> undef, float %y, i32 0
+ %v1 = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer
+ %a = call <1024 x float> @llvm.tpu.pack(<1024 x float> %v1, <1024 x float> %x) readnone
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: vpackc:
+; CHECK: v{{[0-9]+}} = vpackc.f32.f16 v0, v1
+define <1024 x float> @vpackc(<1024 x float> %x, <1024 x float> %y) {
+ %a = call <1024 x float> @llvm.tpu.packc(<1024 x float> %x, <1024 x float> %y) readnone
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: unpacku:
+; CHECK: v{{[0-9]+}} = vunpacku v{{[0-9]+}}
+define <1024 x float> @unpacku(<1024 x float> %a) {
+ %r = call <1024 x float> @llvm.tpu.unpacku(<1024 x float> %a)
+ ret <1024 x float> %r
+}
+
+; CHECK-LABEL: unpackl:
+; CHECK: v{{[0-9]+}} = vunpackl v{{[0-9]+}}
+define <1024 x float> @unpackl(<1024 x float> %a) {
+ %r = call <1024 x float> @llvm.tpu.unpackl(<1024 x float> %a)
+ ret <1024 x float> %r
+}
+
+declare void @llvm.tpu.tc.vint(i32)
+; CHECK-LABEL: vint_r:
+; CHECK: _ = vint s0
+define void @vint_r(i32 %y) {
+ call void @llvm.tpu.tc.vint(i32 %y)
+ ret void
+}
+
+; CHECK-LABEL: vint_i:
+; CHECK: _ = vint $0x2a
+define void @vint_i() {
+ call void @llvm.tpu.tc.vint(i32 42)
+ ret void
+}
+
+; CHECK-LABEL: trunc_i1:
+; CHECK: [[x:v[0-9]+]] = vand.u32 $0x1, v0
+; CHECK: vm0 = veq.s32 [[x]], $0x1;
+define <1024 x i1> @trunc_i1(<1024 x i32> %x) {
+ %y = trunc <1024 x i32> %x to <1024 x i1>
+ ret <1024 x i1> %y
+}
+
+; Make sure we don't crash on vselect_cc
+; CHECK-LABEL: vselect_cc:
+; CHECK: sgt.s32 s0, $0x1
+define <1024 x i32> @vselect_cc(i32 %s, <1024 x i32> %x, <1024 x i32> %y) {
+ %c = icmp sgt i32 %s, 1
+ %r = select i1 %c, <1024 x i32> %x, <1024 x i32> %y
+ ret <1024 x i32> %r
+}
+
+; CHECK-LABEL: vselect_cc_f:
+; CHECK: sgt.s32 s0, $0x1
+define <1024 x float> @vselect_cc_f(i32 %s, <1024 x float> %x, <1024 x float> %y) {
+ %c = icmp sgt i32 %s, 1
+ %r = select i1 %c, <1024 x float> %x, <1024 x float> %y
+ ret <1024 x float> %r
+}
+
+; CHECK-LABEL: fneg:
+; CHECK: v{{[0-9]+}} = vxor.u32 $-0x80000000, v{{[0-9]+}}
+define <1024 x float> @fneg(<1024 x float> %x) {
+ %v0 = insertelement <1024 x float> undef, float -0.0, i32 0
+ %vzero = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer
+ %r = fsub <1024 x float> %vzero, %x
+ ret <1024 x float> %r
+}
+
+; Test that vrot gets expanded correctly.
+; CHECK-LABEL: vrotl:
+; CHECK-DAG: v{{[0-9]+}} = vsub.s32 $0x20, v{{[0-9]+}}
+; CHECK-DAG: v{{[0-9]+}} = vshll.u32 v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK: v{{[0-9]+}} = vshrl.u32 v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK: v{{[0-9]+}} = vor.u32 v{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x i32> @vrotl(<1024 x i32> %x, <1024 x i32> %y) {
+entry:
+ %v0 = insertelement <1024 x i32> undef, i32 32, i32 0
+ %vtwo = shufflevector <1024 x i32> %v0, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %0 = shl <1024 x i32> %x, %y
+ %1 = sub <1024 x i32> %vtwo, %y
+ %2 = lshr <1024 x i32> %x, %1
+ %3 = or <1024 x i32> %0, %2
+ ret <1024 x i32> %3
+}
+
+; CHECK-LABEL: uitofp_i1:
+; CHECK: vimm.s32 $0x0
+; CHECK: vsel vm0, $0x3f800000, v0;
+define <1024 x float> @uitofp_i1(<1024 x i1> %x) {
+ %a = uitofp <1024 x i1> %x to <1024 x float>
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: sitofp_i1:
+; CHECK: vimm.s32 $0x0
+; CHECK: vsel vm0, $-0x40800000, v0;
+define <1024 x float> @sitofp_i1(<1024 x i1> %x) {
+ %a = sitofp <1024 x i1> %x to <1024 x float>
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: vmul_constant:
+; CHECK: vshll.u32 v0, $0x2
+; CHECK: vsub.s32 v1, v{{[0-9]+}}
+define <1024 x i32> @vmul_constant(<1024 x i32> %x) {
+ %v0 = insertelement <1024 x i32> undef, i32 3, i32 0
+ %splat = shufflevector <1024 x i32> %v0, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %z = mul <1024 x i32> %x, %splat
+ ret <1024 x i32> %z
+}
+
+; Test that we emulate the multiplication with 6 fmuls (see the sketch after this test).
+; CHECK-LABEL: vmul_noconstant:
+; CHECK-DAG: v{{[0-9]+}} = vshrl.u32 v{{[0-9]+}}, $0xb
+; CHECK-DAG: v{{[0-9]+}} = vand.u32 $0x7ff, v{{[0-9]+}}
+; CHECK-DAG: v{{[0-9]+}} = vshrl.u32 v{{[0-9]+}}, $0xb
+; CHECK-DAG: v{{[0-9]+}} = vand.u32 $0x7ff, v{{[0-9]+}}
+; CHECK-DAG: v{{[0-9]+}} = vshrl.u32 v{{[0-9]+}}, $0x16
+; CHECK-DAG: v{{[0-9]+}} = vshrl.u32 v{{[0-9]+}}, $0x16
+; CHECK-DAG: v{{[0-9]+}} = vand.u32 $0x7ff, v{{[0-9]+}}
+; CHECK-DAG: v{{[0-9]+}} = vcvt.s32.f32 v{{[0-9]+}}
+; CHECK-DAG: v{{[0-9]+}} = vand.u32 $0x7ff, v{{[0-9]+}}
+; CHECK-DAG: v{{[0-9]+}} = vcvt.s32.f32 v{{[0-9]+}}
+; CHECK-DAG: v{{[0-9]+}} = vcvt.s32.f32 v{{[0-9]+}}
+; CHECK-DAG: v{{[0-9]+}} = vcvt.s32.f32 v{{[0-9]+}}
+; CHECK-DAG: v{{[0-9]+}} = vmul.f32 v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK-DAG: v{{[0-9]+}} = vcvt.s32.f32 v{{[0-9]+}}
+; CHECK-DAG: v{{[0-9]+}} = vmul.f32 v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK-DAG: v{{[0-9]+}} = vcvt.s32.f32 v{{[0-9]+}}
+; CHECK-DAG: v{{[0-9]+}} = vmul.f32 v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK-DAG: v{{[0-9]+}} = vmul.f32 v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK-DAG: v{{[0-9]+}} = vcvt.f32.s32 v{{[0-9]+}}, $-0x1
+; CHECK-DAG: v{{[0-9]+}} = vmul.f32 v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK-DAG: v{{[0-9]+}} = vmul.f32 v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK-DAG: v{{[0-9]+}} = vcvt.f32.s32 v{{[0-9]+}}, $-0x1
+; CHECK-DAG: v{{[0-9]+}} = vcvt.f32.s32 v{{[0-9]+}}, $-0x1
+; CHECK-DAG: v{{[0-9]+}} = vcvt.f32.s32 v{{[0-9]+}}, $-0x1
+; CHECK-DAG: v{{[0-9]+}} = vcvt.f32.s32 v{{[0-9]+}}, $-0x1
+; CHECK-DAG: v{{[0-9]+}} = vcvt.f32.s32 v{{[0-9]+}}, $-0x1
+; CHECK-DAG: v{{[0-9]+}} = vadd.s32 v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK-DAG: v{{[0-9]+}} = vadd.s32 v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK-DAG: v{{[0-9]+}} = vshll.u32 v{{[0-9]+}}, $0xb
+; CHECK-DAG: v{{[0-9]+}} = vadd.s32 v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK-DAG: v{{[0-9]+}} = vadd.s32 v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK-DAG: v{{[0-9]+}} = vshll.u32 v{{[0-9]+}}, $0x16
+; CHECK-DAG: v{{[0-9]+}} = vadd.s32 v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK: _ = shalt
+define <1024 x i32> @vmul_noconstant(<1024 x i32> %x, <1024 x i32> %y) {
+ %z = mul <1024 x i32> %x, %y
+ ret <1024 x i32> %z
+}
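Note: the CHECK-DAG lines above appear to encode the standard limb-splitting trick for emulating a 32-bit integer multiply on a unit whose multiplier is float-only. A minimal Python sketch of that scheme, with illustrative names that are not part of the CL:

def emulated_mul32(x, y):
    # Split each operand into limbs of at most 11 bits, so every limb product
    # (<= 22 bits) is exactly representable in a float32 mantissa (24 bits).
    def limbs(v):
        return v & 0x7FF, (v >> 11) & 0x7FF, v >> 22
    x0, x1, x2 = limbs(x & 0xFFFFFFFF)
    y0, y1, y2 = limbs(y & 0xFFFFFFFF)
    # Six partial products; limb pairs shifted by 33 or 44 bits vanish mod 2**32
    # and are dropped, which is why exactly six fmuls appear in the checks.
    lo = x0 * y0
    mid = x0 * y1 + x1 * y0
    hi = x0 * y2 + x1 * y1 + x2 * y0
    return (lo + (mid << 11) + (hi << 22)) & 0xFFFFFFFF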
+
+; Test that we are able to match the immediate index even if DAG combine replaces
+; the add with an or (see the note after this test).
+; CHECK-LABEL: loadorindex
+; CHECK: v{{[0-9]+}} = vld [vmem:s{{[0-9]+}}+$0x8]
+define <1024 x float> @loadorindex(i32 %addr) {
+entry:
+ %addr1 = and i32 %addr, 4294967280
+ %a = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %addr1)
+ %aoffset = getelementptr <1024 x float>, <1024 x float> addrspace(205)* %a, i32 1
+ %b = load <1024 x float>, <1024 x float> addrspace(205)* %aoffset
+ ret <1024 x float> %b
+}
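Note on why the rewrite in loadorindex is legal, as a small worked example with made-up values rather than anything from the CL: the base has its low four bits cleared, so adding a small offset cannot carry into those bits and the add is equivalent to an or, which DAG combine exploits; the backend must still fold the constant into the [vmem:sN+imm] addressing mode. The same known-bits reasoning covers loadorindex2 below, where multiplying by 48 (a multiple of 16) also forces the low four bits to zero.

base = 0x1234 & 0xFFFFFFF0      # mirrors "and i32 %addr, 4294967280"
assert base + 8 == base | 8     # add and or agree when the low bits are known zero
assert (7 * 48) % 16 == 0       # mul by 48 likewise keeps the low four bits zero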
+
+; Check that we are able to analyze known bits with JF mul24.
+; CHECK-LABEL: loadorindex2
+; CHECK: v{{[0-9]+}} = vld [vmem:s{{[0-9]+}}+$0x8]
+define <1024 x float> @loadorindex2(i32 %addr) {
+entry:
+ %addr1 = mul i32 %addr, 48
+ %a = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 %addr1)
+ %aoffset = getelementptr <1024 x float>, <1024 x float> addrspace(205)* %a, i32 1
+ %b = load <1024 x float>, <1024 x float> addrspace(205)* %aoffset
+ ret <1024 x float> %b
+}
+
+declare <1024 x i1> @llvm.tpu.addcarry.v1024i1(<1024 x i32>, <1024 x i32>) readnone
+
+; CHECK-LABEL: vaddcarry:
+; CHECK: vm{{[0-9]+}} = vc.u32 v{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x i1> @vaddcarry(<1024 x i32> %x, <1024 x i32> %y) {
+ %a = call <1024 x i1> @llvm.tpu.addcarry.v1024i1(<1024 x i32> %x, <1024 x i32> %y)
+ ret <1024 x i1> %a
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_vf.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_vf.ll
new file mode 100644
index 0000000..c34ddab
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_vf.ll
@@ -0,0 +1,534 @@
+; RUN: llc < %s -mcpu=tensorcore-vf -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; vceil
+declare <1024 x float> @llvm.ceil.v1024f32(<1024 x float> %Val)
+
+; CHECK-LABEL: vceilf:
+; CHECK: v{{[0-9]+}} = vceil.f32 v{{[0-9]+}}
+define <1024 x float> @vceilf(<1024 x float> %y) {
+ %res = call <1024 x float> @llvm.ceil.v1024f32(<1024 x float> %y)
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: vceilf_splat:
+; CHECK: v{{[0-9]+}} = vimm.f32 $13.0
+define <1024 x float> @vceilf_splat() {
+
+ %v0 = insertelement <1024 x float> undef, float 12.5, i32 0
+ %v1 = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer
+ %res = call <1024 x float> @llvm.ceil.v1024f32(<1024 x float> %v1)
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: vceilf_splat_sreg:
+; CHECK: v[[x:[0-9]+]] = vmov s0
+; CHECK: v{{[0-9]+}} = vceil.f32 v[[x]];
+define <1024 x float> @vceilf_splat_sreg(float %y) {
+ %v0 = insertelement <1024 x float> undef, float %y, i32 0
+ %v1 = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer
+ %res = call <1024 x float> @llvm.ceil.v1024f32(<1024 x float> %v1)
+ ret <1024 x float> %res
+}
+
+; vfloor
+declare <1024 x float> @llvm.floor.v1024f32(<1024 x float> %Val)
+
+; CHECK-LABEL: vfloorf:
+; CHECK: v{{[0-9]+}} = vfloor.f32 v{{[0-9]+}}
+define <1024 x float> @vfloorf(<1024 x float> %y) {
+ %res = call <1024 x float> @llvm.floor.v1024f32(<1024 x float> %y)
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: vfloorf_splat:
+; CHECK: v{{[0-9]+}} = vimm.f32 $12.0
+define <1024 x float> @vfloorf_splat() {
+ %v0 = insertelement <1024 x float> undef, float 12.5, i32 0
+ %v1 = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer
+ %res = call <1024 x float> @llvm.floor.v1024f32(<1024 x float> %v1)
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: vfloorf_splat_sreg:
+; CHECK: v[[x:[0-9]+]] = vmov s0
+; CHECK: v{{[0-9]+}} = vfloor.f32 v[[x]];
+define <1024 x float> @vfloorf_splat_sreg(float %y) {
+ %v0 = insertelement <1024 x float> undef, float %y, i32 0
+ %v1 = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer
+ %res = call <1024 x float> @llvm.floor.v1024f32(<1024 x float> %v1)
+ ret <1024 x float> %res
+}
+
+; rotate sublane
+declare <1024 x float> @llvm.tpu.vrot.sublane.v1024f32(<1024 x float>, <1024 x float>)
+declare <1024 x i32> @llvm.tpu.vrot.sublane.v1024i32(<1024 x i32>, <1024 x i32>)
+
+; CHECK-LABEL: vrotslanei:
+; CHECK: v{{[0-9]+}} = vrot.slane v{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x i32> @vrotslanei(<1024 x i32> %x, <1024 x i32> %y) {
+ %res = call <1024 x i32> @llvm.tpu.vrot.sublane.v1024i32(<1024 x i32> %x, <1024 x i32> %y)
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: vrotslanef:
+; CHECK: v{{[0-9]+}} = vrot.slane v{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x float> @vrotslanef(<1024 x float> %x, <1024 x float> %y) {
+ %res = call <1024 x float> @llvm.tpu.vrot.sublane.v1024f32(<1024 x float> %x, <1024 x float> %y)
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: vrotslanei_splat:
+; CHECK: v{{[0-9]+}} = vrot.slane v{{[0-9]+}}, $0x1
+define <1024 x i32> @vrotslanei_splat(<1024 x i32> %x) {
+ %v0 = insertelement <1024 x i32> undef, i32 1, i32 0
+ %v1 = shufflevector <1024 x i32> %v0, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %res = call <1024 x i32> @llvm.tpu.vrot.sublane.v1024i32(<1024 x i32> %x, <1024 x i32> %v1)
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: vrotslanef_splat:
+; CHECK: v{{[0-9]+}} = vrot.slane v{{[0-9]+}}, $0x3f800000;
+define <1024 x float> @vrotslanef_splat(<1024 x float> %x) {
+ %v0 = insertelement <1024 x float> undef, float 1.0, i32 0
+ %v1 = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer
+ %res = call <1024 x float> @llvm.tpu.vrot.sublane.v1024f32(<1024 x float> %x, <1024 x float> %v1)
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: vrotslanei_splat_sreg:
+; CHECK: v{{[0-9]+}} = vrot.slane v{{[0-9]+}}, s{{[0-9]+}}
+define <1024 x i32> @vrotslanei_splat_sreg(<1024 x i32> %x, i32 %y) {
+ %v0 = insertelement <1024 x i32> undef, i32 %y, i32 0
+ %v1 = shufflevector <1024 x i32> %v0, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %res = call <1024 x i32> @llvm.tpu.vrot.sublane.v1024i32(<1024 x i32> %x, <1024 x i32> %v1)
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: vrotslanef_splat_sreg:
+; CHECK: v{{[0-9]+}} = vrot.slane v{{[0-9]+}}, s{{[0-9]+}}
+define <1024 x float> @vrotslanef_splat_sreg(<1024 x float> %x, float %y) {
+ %v0 = insertelement <1024 x float> undef, float %y, i32 0
+ %v1 = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer
+ %res = call <1024 x float> @llvm.tpu.vrot.sublane.v1024f32(<1024 x float> %x, <1024 x float> %v1)
+ ret <1024 x float> %res
+}
+
+; permute sublane
+declare <1024 x i32> @llvm.tpu.vperm.sublane.v1024i32(<1024 x i32>, <1024 x i32>)
+declare <1024 x float> @llvm.tpu.vperm.sublane.v1024f32(<1024 x float>, <1024 x float>)
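+; As with vrot.slane above, the second operand of vperm.slane is exercised
+; below as a vector register, a folded immediate splat, and a scalar register.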
+
+; CHECK-LABEL: vpermslanei:
+; CHECK: v{{[0-9]+}} = vperm.slane v{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x i32> @vpermslanei(<1024 x i32> %x, <1024 x i32> %y) {
+ %res = call <1024 x i32> @llvm.tpu.vperm.sublane.v1024i32(<1024 x i32> %x, <1024 x i32> %y)
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: vpermslanef:
+; CHECK: v{{[0-9]+}} = vperm.slane v{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x float> @vpermslanef(<1024 x float> %x, <1024 x float> %y) {
+ %res = call <1024 x float> @llvm.tpu.vperm.sublane.v1024f32(<1024 x float> %x, <1024 x float> %y)
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: vpermslanei_splat:
+; CHECK: v{{[0-9]+}} = vperm.slane v{{[0-9]+}}, $0x1
+define <1024 x i32> @vpermslanei_splat(<1024 x i32> %x) {
+ %v0 = insertelement <1024 x i32> undef, i32 1, i32 0
+ %v1 = shufflevector <1024 x i32> %v0, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %res = call <1024 x i32> @llvm.tpu.vperm.sublane.v1024i32(<1024 x i32> %x, <1024 x i32> %v1)
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: vpermslanef_splat:
+; CHECK: v{{[0-9]+}} = vperm.slane v{{[0-9]+}}, $0x3f800000;
+define <1024 x float> @vpermslanef_splat(<1024 x float> %x) {
+ %v0 = insertelement <1024 x float> undef, float 1.0, i32 0
+ %v1 = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer
+ %res = call <1024 x float> @llvm.tpu.vperm.sublane.v1024f32(<1024 x float> %x, <1024 x float> %v1)
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: vpermslanei_splat_sreg:
+; CHECK: v{{[0-9]+}} = vperm.slane v{{[0-9]+}}, s{{[0-9]+}}
+define <1024 x i32> @vpermslanei_splat_sreg(<1024 x i32> %x, i32 %y) {
+ %v0 = insertelement <1024 x i32> undef, i32 %y, i32 0
+ %v1 = shufflevector <1024 x i32> %v0, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %res = call <1024 x i32> @llvm.tpu.vperm.sublane.v1024i32(<1024 x i32> %x, <1024 x i32> %v1)
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: vpermslanef_splat_sreg:
+; CHECK: v{{[0-9]+}} = vperm.slane v{{[0-9]+}}, s{{[0-9]+}}
+define <1024 x float> @vpermslanef_splat_sreg(<1024 x float> %x, float %y) {
+ %v0 = insertelement <1024 x float> undef, float %y, i32 0
+ %v1 = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer
+ %res = call <1024 x float> @llvm.tpu.vperm.sublane.v1024f32(<1024 x float> %x, <1024 x float> %v1)
+ ret <1024 x float> %res
+}
+
+; vpack
+declare <1024 x float> @llvm.tpu.deprecated.pack.i.bf16(<1024 x float>, <1024 x float>)
+declare <1024 x float> @llvm.tpu.deprecated.pack.c.bf16(<1024 x float>, <1024 x float>)
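+; The ".i" and ".c" suffixes denote the interleaved and compressed layouts
+; (see the unpack section below). When one input is a scalar splat, it is
+; folded into the s-register operand form of vpack.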
+
+; CHECK-LABEL: vpack_i_bf16:
+; CHECK: v{{[0-9]+}} = vpack.i.bf16 v{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x float> @vpack_i_bf16(<1024 x float> %y, <1024 x float> %x) {
+ %a = call <1024 x float> @llvm.tpu.deprecated.pack.i.bf16(<1024 x float> %y, <1024 x float> %x) readnone
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: vpack_i_bf16_scalar:
+; CHECK: v{{[0-9]+}} = vpack.i.bf16 s{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x float> @vpack_i_bf16_scalar(<1024 x float> %x, float %y) {
+ %v0 = insertelement <1024 x float> undef, float %y, i32 0
+ %v1 = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer
+ %a = call <1024 x float> @llvm.tpu.deprecated.pack.i.bf16(<1024 x float> %v1, <1024 x float> %x) readnone
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: vpack_c_bf16:
+; CHECK: v{{[0-9]+}} = vpack.c.bf16 v{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x float> @vpack_c_bf16(<1024 x float> %y, <1024 x float> %x) {
+ %a = call <1024 x float> @llvm.tpu.deprecated.pack.c.bf16(<1024 x float> %y, <1024 x float> %x) readnone
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: vpack_c_bf16_scalar:
+; CHECK: v{{[0-9]+}} = vpack.c.bf16 s{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x float> @vpack_c_bf16_scalar(<1024 x float> %x, float %y) {
+ %v0 = insertelement <1024 x float> undef, float %y, i32 0
+ %v1 = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer
+ %a = call <1024 x float> @llvm.tpu.deprecated.pack.c.bf16(<1024 x float> %v1, <1024 x float> %x) readnone
+ ret <1024 x float> %a
+}
+
+; Unpack Interleaved
+declare <1024 x float> @llvm.tpu.deprecated.unpack.i.l.bf16(<1024 x float>)
+declare <1024 x float> @llvm.tpu.deprecated.unpack.i.u.bf16(<1024 x float>)
+; Unpack Compressed
+declare <1024 x float> @llvm.tpu.deprecated.unpack.c.l.bf16(<1024 x float>)
+declare <1024 x float> @llvm.tpu.deprecated.unpack.c.u.bf16(<1024 x float>)
+
+; CHECK-LABEL: unpack_i_l_bf16
+; CHECK: vunpack.i.l.bf16
+define <1024 x float> @unpack_i_l_bf16(<1024 x float> %x) {
+ %res = call <1024 x float> @llvm.tpu.deprecated.unpack.i.l.bf16(<1024 x float> %x)
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: unpack_i_u_bf16
+; CHECK: vunpack.i.u.bf16
+define <1024 x float> @unpack_i_u_bf16(<1024 x float> %x) {
+ %res = call <1024 x float> @llvm.tpu.deprecated.unpack.i.u.bf16(<1024 x float> %x)
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: unpack_c_l_bf16
+; CHECK: vunpack.c.l.bf16
+define <1024 x float> @unpack_c_l_bf16(<1024 x float> %x) {
+ %res = call <1024 x float> @llvm.tpu.deprecated.unpack.c.l.bf16(<1024 x float> %x)
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: unpack_c_u_bf16
+; CHECK: vunpack.c.u.bf16
+define <1024 x float> @unpack_c_u_bf16(<1024 x float> %x) {
+ %res = call <1024 x float> @llvm.tpu.deprecated.unpack.c.u.bf16(<1024 x float> %x)
+ ret <1024 x float> %res
+}
+
+; vcvt
+declare <1024 x i32> @llvm.tpu.cvt.fptosi.v1024i32.v1024f32(<1024 x float>) readnone
+declare <1024 x float> @llvm.tpu.vcvt.fptobf8(<1024 x float>) nounwind
+declare <1024 x float> @llvm.tpu.vcvt.fptoif8(<1024 x float>) nounwind
+declare <1024 x float> @llvm.tpu.vcvt.fptobf16(<1024 x float>) nounwind
+declare <1024 x float> @llvm.tpu.vcvt.fptohf16(<1024 x float>) nounwind
+declare <1024 x float> @llvm.tpu.vcvt.sr.fptobf8(<1024 x i32>, <1024 x float>) nounwind
+declare <1024 x float> @llvm.tpu.vcvt.sr.fptoif8(<1024 x i32>, <1024 x float>) nounwind
+declare <1024 x float> @llvm.tpu.vcvt.sr.fptobf16(<1024 x i32>, <1024 x float>) nounwind
+declare <1024 x float> @llvm.tpu.vcvt.sr.fptohf16(<1024 x i32>, <1024 x float>) nounwind
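+; The ".sr" variants take a per-lane value (the %random argument below) as
+; their first operand. The rr/ir/sr test suffixes cover passing that operand
+; as a vector register, an immediate splat, and a scalar-register splat.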
+
+; CHECK-LABEL: vfptosi32r:
+; CHECK: v{{[0-9]+}} = vcvt.f32.s32 v{{[0-9]+}}
+define <1024 x i32> @vfptosi32r(<1024 x float> %x) {
+ %a = call <1024 x i32> @llvm.tpu.cvt.fptosi.v1024i32.v1024f32(<1024 x float> %x)
+ ret <1024 x i32> %a
+}
+
+; CHECK-LABEL: vfptosi32s:
+; CHECK: v[[x:[0-9]+]] = vmov s0
+; CHECK: v{{[0-9]+}} = vcvt.f32.s32 v[[x]]
+define <1024 x i32> @vfptosi32s(float %y) {
+ %v0 = insertelement <1024 x float> undef, float %y, i32 0
+ %v1 = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer
+ %a = call <1024 x i32> @llvm.tpu.cvt.fptosi.v1024i32.v1024f32(<1024 x float> %v1)
+ ret <1024 x i32> %a
+}
+
+; CHECK-LABEL: vfptosi32i:
+; CHECK: v[[x:[0-9]+]] = vimm.f32 $1.0
+; CHECK: v{{[0-9]+}} = vcvt.f32.s32 v[[x]]
+define <1024 x i32> @vfptosi32i() {
+ %v0 = insertelement <1024 x float> undef, float 1.0, i32 0
+ %v1 = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer
+ %a = call <1024 x i32> @llvm.tpu.cvt.fptosi.v1024i32.v1024f32(<1024 x float> %v1)
+ ret <1024 x i32> %a
+}
+
+; CHECK-LABEL: fptobf8:
+; CHECK: vcvt.f32.bf8
+define <1024 x float> @fptobf8(<1024 x float> %x) {
+ %a = call <1024 x float> @llvm.tpu.vcvt.fptobf8(<1024 x float> %x)
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: fptoif8:
+; CHECK: vcvt.f32.if8
+define <1024 x float> @fptoif8(<1024 x float> %x) {
+ %a = call <1024 x float> @llvm.tpu.vcvt.fptoif8(<1024 x float> %x)
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: fptobf16:
+; CHECK: vcvt.f32.bf16
+define <1024 x float> @fptobf16(<1024 x float> %x) {
+ %a = call <1024 x float> @llvm.tpu.vcvt.fptobf16(<1024 x float> %x)
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: fptohf16:
+; CHECK: vcvt.f32.hf16
+define <1024 x float> @fptohf16(<1024 x float> %x) {
+ %a = call <1024 x float> @llvm.tpu.vcvt.fptohf16(<1024 x float> %x)
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: sr.fptobf8rr:
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.bf8 v{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x float> @sr.fptobf8rr(<1024 x i32> %random, <1024 x float> %x) {
+ %a = call <1024 x float> @llvm.tpu.vcvt.sr.fptobf8(<1024 x i32> %random, <1024 x float> %x)
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: sr.fptobf8ir:
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.bf8 $0x1, v{{[0-9]+}}
+define <1024 x float> @sr.fptobf8ir(<1024 x float> %x) {
+ %v0 = insertelement <1024 x i32> undef, i32 1, i32 0
+ %v1 = shufflevector <1024 x i32> %v0, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %a = call <1024 x float> @llvm.tpu.vcvt.sr.fptobf8(<1024 x i32> %v1, <1024 x float> %x)
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: sr.fptobf8sr:
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.bf8 s{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x float> @sr.fptobf8sr(i32 %y, <1024 x float> %x) {
+ %v0 = insertelement <1024 x i32> undef, i32 %y, i32 0
+ %v1 = shufflevector <1024 x i32> %v0, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %a = call <1024 x float> @llvm.tpu.vcvt.sr.fptobf8(<1024 x i32> %v1, <1024 x float> %x)
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: sr.fptoif8rr:
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.if8 v{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x float> @sr.fptoif8rr(<1024 x i32> %random, <1024 x float> %x) {
+ %a = call <1024 x float> @llvm.tpu.vcvt.sr.fptoif8(<1024 x i32> %random, <1024 x float> %x)
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: sr.fptoif8ir:
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.if8 $0x1, v{{[0-9]+}}
+define <1024 x float> @sr.fptoif8ir(<1024 x float> %x) {
+ %v0 = insertelement <1024 x i32> undef, i32 1, i32 0
+ %v1 = shufflevector <1024 x i32> %v0, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %a = call <1024 x float> @llvm.tpu.vcvt.sr.fptoif8(<1024 x i32> %v1, <1024 x float> %x)
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: sr.fptoif8sr:
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.if8 s{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x float> @sr.fptoif8sr(i32 %y, <1024 x float> %x) {
+ %v0 = insertelement <1024 x i32> undef, i32 %y, i32 0
+ %v1 = shufflevector <1024 x i32> %v0, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %a = call <1024 x float> @llvm.tpu.vcvt.sr.fptoif8(<1024 x i32> %v1, <1024 x float> %x)
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: sr.fptobf16rr:
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.bf16 v{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x float> @sr.fptobf16rr(<1024 x i32> %random, <1024 x float> %x) {
+ %a = call <1024 x float> @llvm.tpu.vcvt.sr.fptobf16(<1024 x i32> %random, <1024 x float> %x)
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: sr.fptobf16ir:
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.bf16 $0x1, v{{[0-9]+}}
+define <1024 x float> @sr.fptobf16ir(<1024 x float> %x) {
+ %v0 = insertelement <1024 x i32> undef, i32 1, i32 0
+ %v1 = shufflevector <1024 x i32> %v0, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %a = call <1024 x float> @llvm.tpu.vcvt.sr.fptobf16(<1024 x i32> %v1, <1024 x float> %x)
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: sr.fptobf16sr:
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.bf16 s{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x float> @sr.fptobf16sr(i32 %y, <1024 x float> %x) {
+ %v0 = insertelement <1024 x i32> undef, i32 %y, i32 0
+ %v1 = shufflevector <1024 x i32> %v0, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %a = call <1024 x float> @llvm.tpu.vcvt.sr.fptobf16(<1024 x i32> %v1, <1024 x float> %x)
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: sr.fptohf16rr:
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.hf16 v{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x float> @sr.fptohf16rr(<1024 x i32> %random, <1024 x float> %x) {
+ %a = call <1024 x float> @llvm.tpu.vcvt.sr.fptohf16(<1024 x i32> %random, <1024 x float> %x)
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: sr.fptohf16ir:
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.hf16 $0x1, v{{[0-9]+}}
+define <1024 x float> @sr.fptohf16ir(<1024 x float> %x) {
+ %v0 = insertelement <1024 x i32> undef, i32 1, i32 0
+ %v1 = shufflevector <1024 x i32> %v0, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %a = call <1024 x float> @llvm.tpu.vcvt.sr.fptohf16(<1024 x i32> %v1, <1024 x float> %x)
+ ret <1024 x float> %a
+}
+
+; CHECK-LABEL: sr.fptohf16sr:
+; CHECK: v{{[0-9]+}} = vcvt.sr.f32.hf16 s{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x float> @sr.fptohf16sr(i32 %y, <1024 x float> %x) {
+ %v0 = insertelement <1024 x i32> undef, i32 %y, i32 0
+ %v1 = shufflevector <1024 x i32> %v0, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %a = call <1024 x float> @llvm.tpu.vcvt.sr.fptohf16(<1024 x i32> %v1, <1024 x float> %x)
+ ret <1024 x float> %a
+}
+
+; Byte not zero
+declare <1024 x i1> @llvm.tpu.byte.not.zero.v1024i1.v1024i32(<1024 x i32>) readnone
+
+; CHECK-LABEL: bytenotzero:
+; CHECK: vm{{[0-9]+}} = vnez.u8 v{{[0-9]+}}
+define <1024 x i1> @bytenotzero(<1024 x i32> %x) {
+ %a = call <1024 x i1> @llvm.tpu.byte.not.zero.v1024i1.v1024i32(<1024 x i32> %x)
+ ret <1024 x i1> %a
+}
+
+; CHECK-LABEL: bytenotzero_sreg_splat:
+; CHECK: vm{{[0-9]+}} = vnez.u8 s{{[0-9]+}}
+define <1024 x i1> @bytenotzero_sreg_splat(i32 %y) {
+ %v0 = insertelement <1024 x i32> undef, i32 %y, i32 0
+ %v1 = shufflevector <1024 x i32> %v0, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %a = call <1024 x i1> @llvm.tpu.byte.not.zero.v1024i1.v1024i32(<1024 x i32> %v1)
+ ret <1024 x i1> %a
+}
+
+; Total order and class compares
+declare <1024 x i1> @llvm.tpu.vlt.to(<1024 x float>, <1024 x float>) readnone
+declare <1024 x i1> @llvm.tpu.vle.to(<1024 x float>, <1024 x float>) readnone
+declare <1024 x i1> @llvm.tpu.vclass(<1024 x float>, <1024 x float>) readnone
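+; These compares produce <1024 x i1> results, which lower to vmask (vm)
+; registers; the second operand is again exercised in vector, immediate-splat,
+; and scalar-splat forms.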
+
+; CHECK-LABEL: vltto:
+; CHECK: vm{{[0-9]+}} = vlt.to.f32 v{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x i1> @vltto(<1024 x float> %x, <1024 x float> %y) {
+ %a = call <1024 x i1> @llvm.tpu.vlt.to(<1024 x float> %x, <1024 x float> %y)
+ ret <1024 x i1> %a
+}
+
+; CHECK-LABEL: vltto_splat:
+; CHECK: vm{{[0-9]+}} = vlt.to.f32 v{{[0-9]+}}, $1.0
+define <1024 x i1> @vltto_splat(<1024 x float> %x) {
+ %v0 = insertelement <1024 x float> undef, float 1.0, i32 0
+ %v1 = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer
+ %a = call <1024 x i1> @llvm.tpu.vlt.to(<1024 x float> %x, <1024 x float> %v1)
+ ret <1024 x i1> %a
+}
+
+; CHECK-LABEL: vltto_sreg_splat:
+; CHECK: vm{{[0-9]+}} = vlt.to.f32 v{{[0-9]+}}, s{{[0-9]+}}
+define <1024 x i1> @vltto_sreg_splat(<1024 x float> %x, float %y) {
+ %v0 = insertelement <1024 x float> undef, float %y, i32 0
+ %v1 = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer
+ %a = call <1024 x i1> @llvm.tpu.vlt.to(<1024 x float> %x, <1024 x float> %v1)
+ ret <1024 x i1> %a
+}
+
+; CHECK-LABEL: vleto:
+; CHECK: vm{{[0-9]+}} = vle.to.f32 v{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x i1> @vleto(<1024 x float> %x, <1024 x float> %y) {
+ %a = call <1024 x i1> @llvm.tpu.vle.to(<1024 x float> %x, <1024 x float> %y)
+ ret <1024 x i1> %a
+}
+
+; CHECK-LABEL: vleto_splat:
+; CHECK: vm{{[0-9]+}} = vle.to.f32 v{{[0-9]+}}, $1.0
+define <1024 x i1> @vleto_splat(<1024 x float> %x) {
+ %v0 = insertelement <1024 x float> undef, float 1.0, i32 0
+ %v1 = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer
+ %a = call <1024 x i1> @llvm.tpu.vle.to(<1024 x float> %x, <1024 x float> %v1)
+ ret <1024 x i1> %a
+}
+
+; CHECK-LABEL: vleto_sreg_splat:
+; CHECK: vm{{[0-9]+}} = vle.to.f32 v{{[0-9]+}}, s{{[0-9]+}}
+define <1024 x i1> @vleto_sreg_splat(<1024 x float> %x, float %y) {
+ %v0 = insertelement <1024 x float> undef, float %y, i32 0
+ %v1 = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer
+ %a = call <1024 x i1> @llvm.tpu.vle.to(<1024 x float> %x, <1024 x float> %v1)
+ ret <1024 x i1> %a
+}
+
+; CHECK-LABEL: vclass:
+; CHECK: vm{{[0-9]+}} = vclass.f32 v{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x i1> @vclass(<1024 x float> %x, <1024 x float> %y) {
+ %a = call <1024 x i1> @llvm.tpu.vclass(<1024 x float> %x, <1024 x float> %y)
+ ret <1024 x i1> %a
+}
+
+; CHECK-LABEL: vclass_splat:
+; CHECK: vm{{[0-9]+}} = vclass.f32 v{{[0-9]+}}, $1.0
+define <1024 x i1> @vclass_splat(<1024 x float> %x) {
+ %v0 = insertelement <1024 x float> undef, float 1.0, i32 0
+ %v1 = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer
+ %a = call <1024 x i1> @llvm.tpu.vclass(<1024 x float> %x, <1024 x float> %v1)
+ ret <1024 x i1> %a
+}
+
+; CHECK-LABEL: vclass_sreg_splat:
+; CHECK: vm{{[0-9]+}} = vclass.f32 v{{[0-9]+}}, s{{[0-9]+}}
+define <1024 x i1> @vclass_sreg_splat(<1024 x float> %x, float %y) {
+ %v0 = insertelement <1024 x float> undef, float %y, i32 0
+ %v1 = shufflevector <1024 x float> %v0, <1024 x float> undef, <1024 x i32> zeroinitializer
+ %a = call <1024 x i1> @llvm.tpu.vclass(<1024 x float> %x, <1024 x float> %v1)
+ ret <1024 x i1> %a
+}
+
+; CHECK-LABEL: vfptosi:
+; CHECK: v{{[0-9]+}} = vcvt.f32.s32 v{{[0-9]+}}
+define <1024 x i32> @vfptosi(<1024 x float> %x) {
+ %a = fptosi <1024 x float> %x to <1024 x i32>
+ ret <1024 x i32> %a
+}
+
+; CHECK-LABEL: vmul_constant:
+; CHECK: vmul.u32 $0x3, v{{[0-9]+}}
+define <1024 x i32> @vmul_constant(<1024 x i32> %x) {
+ %v0 = insertelement <1024 x i32> undef, i32 3, i32 0
+ %splat = shufflevector <1024 x i32> %v0, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %z = mul <1024 x i32> %x, %splat
+ ret <1024 x i32> %z
+}
+
+; Test that we emulate the multiplication with 6 fmul.
+; CHECK-LABEL: vmul_noconstant:
+; CHECK: v{{[0-9]+}} = vmul.u32 v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK: _ = shalt
+define <1024 x i32> @vmul_noconstant(<1024 x i32> %x, <1024 x i32> %y) {
+ %z = mul <1024 x i32> %x, %y
+ ret <1024 x i32> %z
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_vf_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_vf_sc.ll
new file mode 100644
index 0000000..02ccb01
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vector_vf_sc.ll
@@ -0,0 +1,82 @@
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: | FileCheck %s
+; RUN: llc < %s -mcpu=sparsecore-tec-vf -asm-verbose=false -disable-cgp \
+; RUN: -opaque-pointers | FileCheck %s
+; REQUIRES: tpu
+
+; Test that basic but deprecated Viperfish intrinsics assemble as expected.
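+; These mirror the 1024-element vector tests above, but use the SparseCore's
+; 8-element vectors (-mcpu=sparsecore-tec-vf).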
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; vpack
+declare <8 x float> @llvm.tpu.deprecated.pack.i.bf16(<8 x float>, <8 x float>)
+declare <8 x float> @llvm.tpu.deprecated.pack.c.bf16(<8 x float>, <8 x float>)
+
+; CHECK-LABEL: vpack_i_bf16:
+; CHECK: v{{[0-9]+}} = vpack.i.bf16 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x float> @vpack_i_bf16(<8 x float> %y, <8 x float> %x) {
+ %a = call <8 x float> @llvm.tpu.deprecated.pack.i.bf16(<8 x float> %y, <8 x float> %x) readnone
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: vpack_i_bf16_scalar:
+; CHECK: v{{[0-9]+}} = vpack.i.bf16 s{{[0-9]+}}, v{{[0-9]+}}
+define <8 x float> @vpack_i_bf16_scalar(<8 x float> %x, float %y) {
+ %v0 = insertelement <8 x float> undef, float %y, i32 0
+ %v1 = shufflevector <8 x float> %v0, <8 x float> undef, <8 x i32> zeroinitializer
+ %a = call <8 x float> @llvm.tpu.deprecated.pack.i.bf16(<8 x float> %v1, <8 x float> %x) readnone
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: vpack_c_bf16:
+; CHECK: v{{[0-9]+}} = vpack.c.bf16 v{{[0-9]+}}, v{{[0-9]+}}
+define <8 x float> @vpack_c_bf16(<8 x float> %y, <8 x float> %x) {
+ %a = call <8 x float> @llvm.tpu.deprecated.pack.c.bf16(<8 x float> %y, <8 x float> %x) readnone
+ ret <8 x float> %a
+}
+
+; CHECK-LABEL: vpack_c_bf16_scalar:
+; CHECK: v{{[0-9]+}} = vpack.c.bf16 s{{[0-9]+}}, v{{[0-9]+}}
+define <8 x float> @vpack_c_bf16_scalar(<8 x float> %x, float %y) {
+ %v0 = insertelement <8 x float> undef, float %y, i32 0
+ %v1 = shufflevector <8 x float> %v0, <8 x float> undef, <8 x i32> zeroinitializer
+ %a = call <8 x float> @llvm.tpu.deprecated.pack.c.bf16(<8 x float> %v1, <8 x float> %x) readnone
+ ret <8 x float> %a
+}
+
+; Unpack Interleaved
+declare <8 x float> @llvm.tpu.deprecated.unpack.i.l.bf16(<8 x float>)
+declare <8 x float> @llvm.tpu.deprecated.unpack.i.u.bf16(<8 x float>)
+
+; Unpack Compressed
+declare <8 x float> @llvm.tpu.deprecated.unpack.c.l.bf16(<8 x float>)
+declare <8 x float> @llvm.tpu.deprecated.unpack.c.u.bf16(<8 x float>)
+
+; CHECK-LABEL: unpack_i_l_bf16
+; CHECK: vunpack.i.l.bf16
+define <8 x float> @unpack_i_l_bf16(<8 x float> %x) {
+ %res = call <8 x float> @llvm.tpu.deprecated.unpack.i.l.bf16(<8 x float> %x)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: unpack_i_u_bf16
+; CHECK: vunpack.i.u.bf16
+define <8 x float> @unpack_i_u_bf16(<8 x float> %x) {
+ %res = call <8 x float> @llvm.tpu.deprecated.unpack.i.u.bf16(<8 x float> %x)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: unpack_c_l_bf16
+; CHECK: vunpack.c.l.bf16
+define <8 x float> @unpack_c_l_bf16(<8 x float> %x) {
+ %res = call <8 x float> @llvm.tpu.deprecated.unpack.c.l.bf16(<8 x float> %x)
+ ret <8 x float> %res
+}
+
+; CHECK-LABEL: unpack_c_u_bf16
+; CHECK: vunpack.c.u.bf16
+define <8 x float> @unpack_c_u_bf16(<8 x float> %x) {
+ %res = call <8 x float> @llvm.tpu.deprecated.unpack.c.u.bf16(<8 x float> %x)
+ ret <8 x float> %res
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/verifier_early_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/verifier_early_sc.ll
new file mode 100644
index 0000000..e4c54ea
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/verifier_early_sc.ll
@@ -0,0 +1,106 @@
+; RUN: opt < %s -S -O2 -mcpu=sparsecore-tec-vf -stop-after=tpu-verifier \
+; RUN: -tpu-fatal-verifier-error=false \
+; RUN: 2>&1 | FileCheck %s
+; REQUIRES: tpu
+
+; Tests that the early verifier reports errors correctly.
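+; Each function below deliberately violates a constraint; FileCheck matches
+; the diagnostic text on stderr. -tpu-fatal-verifier-error=false presumably
+; keeps the error from being fatal so that every diagnostic gets emitted.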
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+attributes #3 = { nounwind "target-cpu"="sparsecore-scs-vf" }
+attributes #4 = { nounwind "target-cpu"="sparsecore-tec-vf" }
+attributes #5 = { nounwind "target-cpu"="sparsecore-tac-vf" }
+
+declare i32 addrspace(212)* @llvm.tpu.addrspacecast.p212i32(i32*)
+declare i32* @llvm.tpu.alloca.smem(i32)
+declare i32 addrspace(203)* @llvm.tpu.alloca.hbm(i32)
+declare i32 @llvm.tpu.ptrtoint.pi32(i32*)
+declare i32* @llvm.tpu.inttoptr.pi32(i32)
+declare i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32)
+declare void @llvm.tpu.dma.hbm.to.hbm.sc.general(i32 addrspace(211)*, i32, i32 addrspace(203)*, i32 addrspace(213)*, i32, i32 addrspace(204)*, i32, i32 addrspace(208)*, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.dma.smem.to.smem.sc.general(i32 addrspace(211)*, i32, i32*, i32 addrspace(212)*, i32, i32 addrspace(204)*, i32, i32 addrspace(208)*, i32, i32, i32)
+declare <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32) nounwind
+declare i32 @llvm.tpu.rsqrt(<8 x float>)
+declare i32 @llvm.tpu.pow2(<8 x float>)
+declare i32 @llvm.tpu.log2(<8 x float>)
+declare i32 @llvm.tpu.tanh(<8 x float>)
+declare i32 @llvm.tpu.rcp(<8 x float>)
+declare i32 @llvm.tpu.sigshft(<8 x float>)
+declare <8 x float> @llvm.tpu.eup.pop(i32)
+
+@flag = addrspace(204) global i32 0, align 4
+@rflag = addrspace(211) global i32 0, align 4
+
+; CHECK: Early verifier expects unfolded dreg pointer value.
+define void @dma_general_early(i32 addrspace(203)* %src, i32 addrspace(213)* %dst, i32 %size, i32 addrspace(204)* %flag, i32 addrspace(211)* %rflag, i32 %dstcid, i32 %srccid) #4 {
+ call void @llvm.tpu.dma.hbm.to.hbm.sc.general(i32 addrspace(211)* %rflag, i32 %dstcid, i32 addrspace(203)* %src, i32 addrspace(213)* %dst, i32 %size, i32 addrspace(204)* %flag, i32 %srccid, i32 addrspace(208)* null, i32 4, i32 0, i32 0)
+ ret void
+}
+
+; CHECK: Use tpu_addrspacecast intrinsics.
+define void @smem_smemany_cast(i32 %a, i32* %src, i32 %dstcid, i32 %srccid) #4 {
+entry:
+ %0 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %1 = call i32* @llvm.tpu.alloca.smem(i32 4)
+ %2 = call i32* @llvm.tpu.alloca.smem(i32 5)
+ %arrayidx = getelementptr inbounds i32, i32* %0, i32 0
+ %3 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %3, 2
+ %arrayidx1 = getelementptr inbounds i32, i32* %1, i32 %a
+ store i32 %add, i32* %arrayidx1, align 4
+ %desc = tail call i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32 0)
+ %cast_dst = addrspacecast i32* %2 to i32 addrspace(212)*
+ call void @llvm.tpu.dma.smem.to.smem.sc.general(i32 addrspace(211)* @rflag, i32 %dstcid, i32* %src, i32 addrspace(212)* %cast_dst, i32 4, i32 addrspace(204)* @flag, i32 %srccid, i32 addrspace(208)* %desc, i32 4, i32 0, i32 0)
+ ret void
+}
+
+; CHECK: Unsupported address space on this processor.
+define <1024 x float> @loadimm() {
+entry:
+ %a = call <1024 x float> addrspace(205)* @llvm.tpu.inttoptr.p205v1024f32(i32 4)
+ %b = load <1024 x float>, <1024 x float> addrspace(205)* %a
+ ret <1024 x float> %b
+}
+
+; CHECK: Intrinsic not supported on this subtarget.
+define <8 x float> @vrsqrt(<8 x float> %in) {
+ %1 = call i32 @llvm.tpu.rsqrt(<8 x float> %in)
+ %r = call <8 x float> @llvm.tpu.eup.pop(i32 %1)
+ ret <8 x float> %r
+}
+
+; CHECK: Intrinsic not supported on this subtarget.
+define <8 x float> @vpow2(<8 x float> %in) {
+ %1 = call i32 @llvm.tpu.pow2(<8 x float> %in)
+ %r = call <8 x float> @llvm.tpu.eup.pop(i32 %1)
+ ret <8 x float> %r
+}
+
+; CHECK: Intrinsic not supported on this subtarget.
+define <8 x float> @vlog2(<8 x float> %in) {
+ %1 = call i32 @llvm.tpu.log2(<8 x float> %in)
+ %r = call <8 x float> @llvm.tpu.eup.pop(i32 %1)
+ ret <8 x float> %r
+}
+
+; CHECK: Intrinsic not supported on this subtarget.
+define <8 x float> @vtanh(<8 x float> %in) {
+ %1 = call i32 @llvm.tpu.tanh(<8 x float> %in)
+ %r = call <8 x float> @llvm.tpu.eup.pop(i32 %1)
+ ret <8 x float> %r
+}
+
+; CHECK: Intrinsic not supported on this subtarget.
+define <8 x float> @vrcp(<8 x float> %in) {
+ %1 = call i32 @llvm.tpu.rcp(<8 x float> %in)
+ %r = call <8 x float> @llvm.tpu.eup.pop(i32 %1)
+ ret <8 x float> %r
+}
+
+; CHECK: Intrinsic not supported on this subtarget.
+define <8 x float> @vsigshft(<8 x float> %in) {
+ %1 = call i32 @llvm.tpu.sigshft(<8 x float> %in)
+ %r = call <8 x float> @llvm.tpu.eup.pop(i32 %1)
+ ret <8 x float> %r
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/verifier_late_sc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/verifier_late_sc.ll
new file mode 100644
index 0000000..7b5f3aa
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/verifier_late_sc.ll
@@ -0,0 +1,204 @@
+; RUN: llc < %s -march=googletpu -mcpu=sparsecore-tec-vf -stop-after=tpu-verifier \
+; RUN: -tpu-fatal-verifier-error=false -asm-verbose=false -disable-cgp \
+; RUN: 2>&1 | FileCheck %s
+; REQUIRES: tpu
+
+; Test that the verifier reports errors correctly.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+attributes #0 = { nounwind "target-cpu"="tensorcore-pf" }
+attributes #1 = { nounwind "target-cpu"="tensorcore-jf" }
+attributes #2 = { nounwind "target-cpu"="tensorcore-vf" }
+attributes #3 = { nounwind "target-cpu"="sparsecore-scs-vf" }
+attributes #4 = { nounwind "target-cpu"="sparsecore-tec-vf" }
+attributes #5 = { nounwind "target-cpu"="sparsecore-tac-vf" }
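+; The per-function "target-cpu" attributes above select the subtarget each
+; diagnostic is verified against.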
+
+declare i32 addrspace(210)* @llvm.tpu.allocate.sflag.other(i32, i32)
+declare i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32>, i32, i32, i32, i32)
+declare i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32>, i32, i32, i32, i32)
+declare i32 @llvm.tpu.tc.transpose.segmented.v1024i32(<1024 x i32>, i32, i32, i32, i32)
+declare i32 @llvm.tpu.tc.transpose.packed.v1024i32(<1024 x i32>, <1024 x i32>, i32, i32, i32, i32)
+declare <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32, i32)
+declare void @llvm.tpu.syncadd.both(i32 addrspace(204)*, i32 addrspace(210)*, i32)
+declare void @llvm.tpu.syncadd.other(i32 addrspace(210)*, i32)
+declare void @llvm.tpu.waitle.yieldable(i32 addrspace(204)*, i32)
+declare void @llvm.tpu.waiteq.yieldable(i32 addrspace(204)*, i32)
+declare void @llvm.tpu.waiteqordone(i32 addrspace(204)*, i32)
+declare void @llvm.tpu.waiteqordone.yieldable(i32 addrspace(204)*, i32)
+declare void @llvm.tpu.dma.hbm.to.hbm.sc.simple(i32 addrspace(211)*, i32 addrspace(203)*, i32 addrspace(203)*, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.dma.hbm.to.iova.sc.simple(i32 addrspace(211)*, i32 addrspace(203)*, i32 addrspace(216)*, i32, i32, i32) argmemonly nounwind
+declare void @llvm.tpu.dma.hbm.to.hbm.sc.general(i32 addrspace(211)*, i32, i32 addrspace(203)*, i32 addrspace(213)*, i32, i32 addrspace(204)*, i32, i32 addrspace(208)*, i32, i32, i32)
+declare i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32)
+
+; CHECK: Invalid transpose height or width.
+define void @transpose_wrong_width(<1024 x i32> %v) #0 {
+ %xlu = call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 7, i32 8, i32 1, i32 undef)
+ ret void
+}
+
+; CHECK: Invalid transpose height or width.
+define void @transpose_packed_wrong_width(<1024 x i32> %v1, <1024 x i32> %v2) #0 {
+ %xlu = call i32 @llvm.tpu.tc.transpose.packed.v1024i32(<1024 x i32> %v1, <1024 x i32> %v2, i32 7, i32 8, i32 1, i32 undef)
+ ret void
+}
+
+; CHECK: Short or narrow transpose not supported.
+define void @transpose_short_jf(<1024 x i32> %v) #1 {
+ %xlu = call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 8, i32 0, i32 undef)
+ ret void
+}
+
+; CHECK: Short or narrow transpose not supported.
+define void @transpose_narrow_jf(<1024 x i32> %v) #1 {
+ %xlu = call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 8, i32 128, i32 0, i32 undef)
+ ret void
+}
+
+; CHECK: Invalid XLU Bus used.
+define void @transpose_none_zero_bus_jf(<1024 x i32> %v) #1 {
+ %xlu = call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 1, i32 undef)
+ ret void
+}
+
+; CHECK: Intrinsic not supported on this subtarget.
+; CHECK-NEXT: llvm.tpu.tc.transpose.segmented.v1024i32
+define void @transpose_segmented_jf(<1024 x i32> %v) #1 {
+ %xlu = call i32 @llvm.tpu.tc.transpose.segmented.v1024i32(<1024 x i32> %v, i32 128, i32 128, i32 0, i32 undef)
+ ret void
+}
+
+; CHECK: Intrinsic not supported on this subtarget.
+; CHECK-NEXT: llvm.tpu.waitle.yieldable
+define void @waitle_yieldable_no_scs(i32 addrspace(204)* %a) #3 {
+ call void @llvm.tpu.waitle.yieldable(i32 addrspace(204)* %a, i32 32)
+ ret void
+}
+
+; CHECK: Intrinsic not supported on this subtarget.
+; CHECK-NEXT: llvm.tpu.waiteq.yieldable
+define void @waiteq_yieldable_no_tec(i32 addrspace(204)* %a) #4 {
+ call void @llvm.tpu.waiteq.yieldable(i32 addrspace(204)* %a, i32 32)
+ ret void
+}
+
+; CHECK: Intrinsic not supported on this subtarget.
+; CHECK-NEXT: llvm.tpu.waiteq.yieldable
+define void @waiteq_yieldable_no_tac(i32 addrspace(204)* %a) #5 {
+ call void @llvm.tpu.waiteq.yieldable(i32 addrspace(204)* %a, i32 32)
+ ret void
+}
+
+; CHECK: Intrinsic not supported on this subtarget.
+; CHECK-NEXT: llvm.tpu.waiteq.yieldable
+define void @waiteq_yieldable_no_jf(i32 addrspace(204)* %a) #0 {
+ call void @llvm.tpu.waiteq.yieldable(i32 addrspace(204)* %a, i32 32)
+ ret void
+}
+
+; CHECK: Intrinsic not supported on this subtarget.
+; CHECK-NEXT: llvm.tpu.waiteqordone
+define void @waiteqordone_no_vf(i32 addrspace(204)* %a) #2 {
+ call void @llvm.tpu.waiteqordone(i32 addrspace(204)* %a, i32 32)
+ ret void
+}
+
+; CHECK: Intrinsic not supported on this subtarget.
+; CHECK-NEXT: llvm.tpu.waiteqordone.yieldable
+define void @waiteqordone_yieldable_no_vf(i32 addrspace(204)* %a) #2 {
+ call void @llvm.tpu.waiteqordone.yieldable(i32 addrspace(204)* %a, i32 32)
+ ret void
+}
+
+; CHECK: Intrinsic not supported on this subtarget.
+; CHECK-NEXT: llvm.tpu.waiteqordone.yieldable
+define void @waiteqordone_yieldable_no_tec(i32 addrspace(204)* %a) #4 {
+ call void @llvm.tpu.waiteqordone.yieldable(i32 addrspace(204)* %a, i32 32)
+ ret void
+}
+
+; CHECK: Intrinsic not supported on this subtarget.
+; CHECK-NEXT: llvm.tpu.waiteqordone.yieldable
+define void @waiteqordone_yieldable_no_tac(i32 addrspace(204)* %a) #5 {
+ call void @llvm.tpu.waiteqordone.yieldable(i32 addrspace(204)* %a, i32 32)
+ ret void
+}
+
+; CHECK: Intrinsic not supported on this subtarget.
+; CHECK-NEXT: llvm.tpu.allocate.sflag.other
+define void @allocate_sflag_other_no_scs() #3 {
+ %ptr = call i32 addrspace(210)* @llvm.tpu.allocate.sflag.other(i32 1, i32 2)
+ ret void
+}
+
+; CHECK: Intrinsic not supported on this subtarget.
+; CHECK-NEXT: llvm.tpu.syncadd.other
+define void @ssyncadd_other_no_scs(i32 addrspace(210)* %x) #3 {
+ call void @llvm.tpu.syncadd.other(i32 addrspace(210)* %x, i32 123)
+ ret void
+}
+
+; CHECK: Intrinsic not supported on this subtarget.
+; CHECK-NEXT: llvm.tpu.syncadd.both
+define void @ssyncadd_both_no_scs(i32 addrspace(204)* %x, i32 addrspace(210)* %y) #3 {
+ call void @llvm.tpu.syncadd.both(i32 addrspace(204)* %x, i32 addrspace(210)* %y, i32 123)
+ ret void
+}
+
+; CHECK: Vector load of vector != 8.
+define i32 @invalid_sc_length() #4 {
+entry:
+ %ptr = alloca <1024 x i32> addrspace(201)*, align 4
+ %0 = bitcast <1024 x i32> addrspace(201)** %ptr to i8*
+ store <1024 x i32> addrspace(201)* inttoptr (i32 8 to <1024 x i32> addrspace(201)*), <1024 x i32> addrspace(201)** %ptr, align 4
+ %1 = load <1024 x i32> addrspace(201)*, <1024 x i32> addrspace(201)** %ptr, align 4
+ %2 = load <1024 x i32>, <1024 x i32> addrspace(201)* %1, align 4096
+ %vecext = extractelement <1024 x i32> %2, i32 0
+ %3 = bitcast <1024 x i32> addrspace(201)** %ptr to i8*
+ ret i32 %vecext
+}
+
+; CHECK: Vector load of vector != 1024.
+define i32 @invalid_vf_length() #2 {
+entry:
+ %ptr = alloca <8 x i32> addrspace(201)*, align 4
+ %0 = bitcast <8 x i32> addrspace(201)** %ptr to i8*
+ store <8 x i32> addrspace(201)* inttoptr (i32 8 to <8 x i32> addrspace(201)*), <8 x i32> addrspace(201)** %ptr, align 4
+ %1 = load <8 x i32> addrspace(201)*, <8 x i32> addrspace(201)** %ptr, align 4
+ %2 = load <8 x i32>, <8 x i32> addrspace(201)* %1, align 32
+ %vecext = extractelement <8 x i32> %2, i32 0
+ %3 = bitcast <8 x i32> addrspace(201)** %ptr to i8*
+ ret i32 %vecext
+}
+
+; CHECK: trace_en field must be an immediate
+define void @dma_simple_trace_en_immediate(i32 addrspace(203)* %src, i32 addrspace(203)* %dst, i32 addrspace(211)* %rflag, i32 %size, i32 %trace_en) #4 {
+ call void @llvm.tpu.dma.hbm.to.hbm.sc.simple(i32 addrspace(211)* %rflag, i32 addrspace(203)* %src, i32 addrspace(203)* %dst, i32 %size, i32 %trace_en)
+ ret void
+}
+
+; CHECK: trace_en field must be zero or one, got 123
+define void @dma_simple_trace_en_range(i32 addrspace(203)* %src, i32 addrspace(203)* %dst, i32 addrspace(211)* %rflag, i32 %size) #4 {
+ call void @llvm.tpu.dma.hbm.to.hbm.sc.simple(i32 addrspace(211)* %rflag, i32 addrspace(203)* %src, i32 addrspace(203)* %dst, i32 %size, i32 123)
+ ret void
+}
+
+; CHECK: trace_en field must be an immediate
+define void @dma_iova_simple_trace_en_immediate(i32 addrspace(203)* %src, i32 addrspace(216)* %dst, i32 addrspace(211)* %rflag, i32 %size, i32 %offset, i32 %trace_en) #4 {
+ call void @llvm.tpu.dma.hbm.to.iova.sc.simple(i32 addrspace(211)* %rflag, i32 addrspace(203)* %src, i32 addrspace(216)* %dst, i32 %size, i32 %offset, i32 %trace_en)
+ ret void
+}
+
+; CHECK: trace_en field must be zero or one, got 123
+define void @dma_iova_simple_trace_en_range(i32 addrspace(203)* %src, i32 addrspace(216)* %dst, i32 addrspace(211)* %rflag, i32 %size, i32 %offset) #4 {
+ call void @llvm.tpu.dma.hbm.to.iova.sc.simple(i32 addrspace(211)* %rflag, i32 addrspace(203)* %src, i32 addrspace(216)* %dst, i32 %size, i32 %offset, i32 123)
+ ret void
+}
+
+; CHECK: Late verifier expects dreg pointer that points to null.
+define void @dma_general(i32 addrspace(203)* %src, i32 addrspace(213)* %dst, i32 %size, i32 addrspace(204)* %flag, i32 addrspace(211)* %rflag, i32 %dstcid, i32 %srccid) #4 {
+ %desc = tail call i32 addrspace(208)* @llvm.tpu.inttoptr.p208i32(i32 1)
+ call void @llvm.tpu.dma.hbm.to.hbm.sc.general(i32 addrspace(211)* %rflag, i32 %dstcid, i32 addrspace(203)* %src, i32 addrspace(213)* %dst, i32 %size, i32 addrspace(204)* %flag, i32 %srccid, i32 addrspace(208)* %desc, i32 4, i32 0, i32 0)
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vm_spill_tc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vm_spill_tc.ll
new file mode 100644
index 0000000..239b7a8
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vm_spill_tc.ll
@@ -0,0 +1,120 @@
+; RUN: llc -O2 < %s -mcpu=tensorcore-jf -asm-verbose=false -tpu-critical-path-sched | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+!vmem.funcs.spill = !{!0}
+!vmem.ranges.spill.start = !{!1}
+!vmem.ranges.spill.limit = !{!2}
+
+!0 = !{void ()* @spill_vm_to_vreg}
+!1 = !{i32 100}
+!2 = !{i32 200}
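+; The vmem.funcs.spill / .start / .limit metadata presumably marks
+; @spill_vm_to_vreg as allowed to spill to vmem and reserves words 100..200
+; for its spill slots.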
+
+; Function Attrs: nounwind readnone
+declare <1024 x i32> @llvm.tpu.vlaneseq() #0
+declare <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32) nounwind
+attributes #0 = { nounwind readnone }
+
+; CHECK-LABEL: spill_vm_to_vreg:
+; CHECK: v[[z:[0-9]+]] = vimm.s32 $0x0
+; CHECK: v[[s:[0-9]+]] = vsel vm{{[0-9]+}}, $0xffffffff, v[[z]]
+
+define void @spill_vm_to_vreg() {
+entry:
+ br label %llo-region-0
+
+llo-region-0: ; preds = %entry
+ %laneseq = call <1024 x i32> @llvm.tpu.vlaneseq()
+ %.splatinsert = insertelement <1024 x i32> undef, i32 127, i32 0
+ %.splat = shufflevector <1024 x i32> %.splatinsert, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %base_rec = and <1024 x i32> %laneseq, %.splat
+
+ %.splatinsert1 = insertelement <1024 x i32> undef, i32 10, i32 0
+ %.splat2 = shufflevector <1024 x i32> %.splatinsert1, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %.splatinsert3 = insertelement <1024 x i32> undef, i32 11, i32 0
+ %.splat4 = shufflevector <1024 x i32> %.splatinsert3, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %cmp1 = icmp eq <1024 x i32> %base_rec, %.splat4
+ %.splatinsert5 = insertelement <1024 x i32> undef, i32 1, i32 0
+ %.splat6 = shufflevector <1024 x i32> %.splatinsert5, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %cmp2 = icmp eq <1024 x i32> %base_rec, %.splat6
+ %.splatinsert7 = insertelement <1024 x i32> undef, i32 2, i32 0
+ %.splat8 = shufflevector <1024 x i32> %.splatinsert7, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %cmp3 = icmp eq <1024 x i32> %base_rec, %.splat8
+ %.splatinsert9 = insertelement <1024 x i32> undef, i32 3, i32 0
+ %.splat10 = shufflevector <1024 x i32> %.splatinsert9, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %cmp4 = icmp eq <1024 x i32> %base_rec, %.splat10
+ %.splatinsert11 = insertelement <1024 x i32> undef, i32 4, i32 0
+ %.splat12 = shufflevector <1024 x i32> %.splatinsert11, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %cmp5 = icmp eq <1024 x i32> %base_rec, %.splat12
+ %.splatinsert13 = insertelement <1024 x i32> undef, i32 5, i32 0
+ %.splat14 = shufflevector <1024 x i32> %.splatinsert13, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %cmp6 = icmp eq <1024 x i32> %base_rec, %.splat14
+ %.splatinsert15 = insertelement <1024 x i32> undef, i32 6, i32 0
+ %.splat16 = shufflevector <1024 x i32> %.splatinsert15, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %cmp7 = icmp eq <1024 x i32> %base_rec, %.splat16
+ %.splatinsert17 = insertelement <1024 x i32> undef, i32 7, i32 0
+ %.splat18 = shufflevector <1024 x i32> %.splatinsert17, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %cmp8 = icmp eq <1024 x i32> %base_rec, %.splat18
+ %.splatinsert19 = insertelement <1024 x i32> undef, i32 8, i32 0
+ %.splat20 = shufflevector <1024 x i32> %.splatinsert19, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %cmp9 = icmp eq <1024 x i32> %base_rec, %.splat20
+ %.splatinsert23 = insertelement <1024 x i32> undef, i32 9, i32 0
+ %.splat22 = shufflevector <1024 x i32> %.splatinsert23, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %cmp10 = icmp sgt <1024 x i32> %base_rec, %.splat22
+ %.splatinsert25 = insertelement <1024 x i32> undef, i32 10, i32 0
+ %.splat24 = shufflevector <1024 x i32> %.splatinsert25, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %cmp11 = icmp sgt <1024 x i32> %base_rec, %.splat24
+ %.splatinsert27 = insertelement <1024 x i32> undef, i32 11, i32 0
+ %.splat26 = shufflevector <1024 x i32> %.splatinsert27, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %cmp12 = icmp sgt <1024 x i32> %base_rec, %.splat26
+ %.splatinsert29 = insertelement <1024 x i32> undef, i32 12, i32 0
+ %.splat28 = shufflevector <1024 x i32> %.splatinsert29, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %cmp13 = icmp sgt <1024 x i32> %base_rec, %.splat28
+ %.splatinsert31 = insertelement <1024 x i32> undef, i32 13, i32 0
+ %.splat30 = shufflevector <1024 x i32> %.splatinsert31, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %cmp14 = icmp sgt <1024 x i32> %base_rec, %.splat30
+ %.splatinsert33 = insertelement <1024 x i32> undef, i32 14, i32 0
+ %.splat32 = shufflevector <1024 x i32> %.splatinsert33, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %cmp15 = icmp sgt <1024 x i32> %base_rec, %.splat32
+
+ %sel0 = select <1024 x i1> %cmp1, <1024 x i32> %base_rec, <1024 x i32> %.splat2
+ %sel1 = select <1024 x i1> %cmp10, <1024 x i32> %base_rec, <1024 x i32> %.splat2
+ %sel2 = select <1024 x i1> %cmp2, <1024 x i32> %base_rec, <1024 x i32> %.splat2
+ %add0 = add <1024 x i32> %sel1, %sel0
+ %sel3 = select <1024 x i1> %cmp3, <1024 x i32> %base_rec, <1024 x i32> %.splat2
+ %add1 = add <1024 x i32> %add0, %sel2
+ %sel4 = select <1024 x i1> %cmp4, <1024 x i32> %base_rec, <1024 x i32> %.splat2
+ %add2 = add <1024 x i32> %add1, %sel3
+ %sel5 = select <1024 x i1> %cmp5, <1024 x i32> %base_rec, <1024 x i32> %.splat2
+ %add3 = add <1024 x i32> %add2, %sel4
+ %sel6 = select <1024 x i1> %cmp6, <1024 x i32> %base_rec, <1024 x i32> %.splat2
+ %add4 = add <1024 x i32> %add3, %sel5
+ %sel7 = select <1024 x i1> %cmp7, <1024 x i32> %base_rec, <1024 x i32> %.splat2
+ %add5 = add <1024 x i32> %add4, %sel6
+ %sel8 = select <1024 x i1> %cmp8, <1024 x i32> %base_rec, <1024 x i32> %.splat2
+ %add6 = add <1024 x i32> %add5, %sel7
+ %sel9 = select <1024 x i1> %cmp9, <1024 x i32> %base_rec, <1024 x i32> %.splat2
+ %add7 = add <1024 x i32> %add6, %sel8
+ %sel10 = select <1024 x i1> %cmp10, <1024 x i32> %base_rec, <1024 x i32> %.splat2
+ %add8 = add <1024 x i32> %add7, %sel9
+ %sel11 = select <1024 x i1> %cmp11, <1024 x i32> %base_rec, <1024 x i32> %.splat2
+ %add9 = add <1024 x i32> %add8, %sel10
+ %sel12 = select <1024 x i1> %cmp12, <1024 x i32> %base_rec, <1024 x i32> %.splat2
+ %add10 = add <1024 x i32> %add9, %sel11
+ %sel13 = select <1024 x i1> %cmp13, <1024 x i32> %base_rec, <1024 x i32> %.splat2
+ %add11 = add <1024 x i32> %add10, %sel12
+
+ %sel14 = select <1024 x i1> %cmp14, <1024 x i32> %base_rec, <1024 x i32> %.splat2
+ %add12 = add <1024 x i32> %add11, %sel13
+
+ %add13 = add <1024 x i32> %add12, %sel14
+ %result = sub <1024 x i32> %add13, %base_rec
+ br label %llo-region-2
+
+llo-region-2: ; preds = %llo-region-0
+ %result_addr = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 0)
+ store <1024 x i32> %result, <1024 x i32> addrspace(205)* %result_addr
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vmem_scoreboard.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vmem_scoreboard.ll
new file mode 100644
index 0000000..66cf6a6
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vmem_scoreboard.ll
@@ -0,0 +1,123 @@
+; RUN: llc < %s -mcpu=tensorcore-jf -asm-verbose=false -disable-cgp \
+; RUN: -tpu-skip-fast-opt -instcombine-max-iterations=0 \
+; RUN: | FileCheck %s --check-prefixes CHECK-MUTATION,CHECK
+; RUN: llc < %s -mcpu=tensorcore-jf -asm-verbose=false -disable-cgp \
+; RUN: -tpu-no-rawhazard-mutation -tpu-skip-fast-opt -instcombine-max-iterations=0 \
+; RUN: | FileCheck %s --check-prefixes CHECK-NO_MUTATION,CHECK
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; Test vmem read after write hazard
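+; When a vld may alias a preceding vst, the scheduler must cover the store
+; latency: it emits a vdelay, and with RAW-hazard mutation enabled it shrinks
+; the vdelay by filling the shadow with independent work or vnops (compare the
+; CHECK-MUTATION and CHECK-NO_MUTATION patterns below).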
+
+declare void @llvm.tpu.dma.vmem.to.hbm(i32 addrspace(204)*, <1024 x i32> addrspace(205)*, i32 addrspace(203)*, i32) #0
+declare <1024 x i32> @llvm.tpu.vld.strided.i32(<1024 x i32> addrspace(205)*, i32, i32)
+declare i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32) nounwind
+attributes #0 = { argmemonly nounwind }
+
+; CHECK-LABEL: loadafterstore:
+; CHECK: { [vmem:s0+$0x0] = vst v{{[0-9]+}};
+; CHECK-NEXT: _ = vdelay $0x4 }
+; CHECK-NEXT: { v{{[0-9]+}} = vld [vmem:s1+$0x0]
+define <1024 x i32> @loadafterstore(<1024 x i32> addrspace(205)* %ptr0, <1024 x i32> addrspace(205)* %ptr1) {
+ store <1024 x i32> zeroinitializer, <1024 x i32> addrspace(205)* %ptr0
+ %res = load <1024 x i32>, <1024 x i32> addrspace(205)* %ptr1
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: loadafterstore_inserttnop:
+; CHECK: { [vmem:s0+$0x0] = vst v{{[0-9]+}};
+; CHECK-MUTATION-NEXT: _ = vdelay $0x2 }
+; CHECK-MUTATION-NEXT: { _ = vnop;
+; CHECK-MUTATION-NEXT: s{{[0-9]+}} = sand.u32 $0x7, s{{[0-9]+}} }
+; CHECK-MUTATION-NEXT: { v{{[0-9]+}} = vadd.s32 s{{[0-9]+}}, v{{[0-9]+}} }
+; CHECK-NO_MUTATION: _ = vdelay $0x4
+; CHECK: v{{[0-9]+}} = vld [vmem:s1+$0x0]
+define <1024 x i32> @loadafterstore_inserttnop(<1024 x i32> addrspace(205)* %ptr0, <1024 x i32> addrspace(205)* %ptr1, i32 %s0, <1024 x i32> %a0) {
+ store <1024 x i32> zeroinitializer, <1024 x i32> addrspace(205)* %ptr0
+ %s = and i32 %s0, 7
+ %v0 = insertelement <1024 x i32> undef, i32 %s, i32 0
+ %v1 = shufflevector <1024 x i32> %v0, <1024 x i32> undef, <1024 x i32> zeroinitializer
+ %a1 = add <1024 x i32> %a0, %v1
+ %a2 = add <1024 x i32> %a1, %v1
+ %res = load <1024 x i32>, <1024 x i32> addrspace(205)* %ptr1
+ %res1 = add <1024 x i32> %res, %a2
+ ret <1024 x i32> %res1
+}
+
+
+; Test a case where the compiler can verify that the load and store don't alias
+; CHECK-LABEL: loadafterstorenoalias:
+; CHECK: v{{[0-9]+}} = vld [vmem:s0+$0x6400]
+; CHECK-NEXT: [vmem:s0+$0x0] = vst v{{[0-9]+}}
+; CHECK-NOT: vdelay
+define <1024 x i32> @loadafterstorenoalias(<1024 x i32> addrspace(205)* %ptr) {
+ store <1024 x i32> zeroinitializer, <1024 x i32> addrspace(205)* %ptr
+ %addr = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %ptr, i32 3200
+ %res = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: dmaafterstore:
+; CHECK-DAG: [vmem:s0+$0x0] = vst v{{[0-9]+}}
+; CHECK-DAG: s{{[0-9]+}} = simm.s32 $0xf
+; CHECK: { [hbm:s2], [sflag:s{{[0-9]+}}] = dma.local [vmem:s1], $0x4 }
+define void @dmaafterstore(<1024 x i32> addrspace(205)* %ptr0, <1024 x i32> addrspace(205)* %ptr1, i32 addrspace(203)* %ptr2) {
+ store <1024 x i32> zeroinitializer, <1024 x i32> addrspace(205)* %ptr0
+ %flag = call i32 addrspace(204)* @llvm.tpu.inttoptr.p204i32(i32 15)
+ call void @llvm.tpu.dma.vmem.to.hbm(i32 addrspace(204)* %flag, <1024 x i32> addrspace(205)* %ptr1, i32 addrspace(203)* %ptr2, i32 4)
+ ret void
+}
+
+; Assume worst case at the beginning of a function
+; CHECK-LABEL: loadstartfunc:
+; CHECK: { _ = vdelay $0x3 }
+define <1024 x i32> @loadstartfunc(<1024 x i32> addrspace(205)* %ptr) {
+ %res = load <1024 x i32>, <1024 x i32> addrspace(205)* %ptr
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: load_intr_afterstore:
+; CHECK-MUTATION: [vmem:s0+$0x0] = vst v{{[0-9]+}}
+; CHECK-MUTATION-NEXT: _ = vdelay $0x3
+; CHECK-MUTATION-NEXT: _ = vnop
+; CHECK-MUTATION-NEXT: s[[x:[0-9]+]] = simm.s32 $0x1234567
+; CHECK-MUTATION-NEXT: v{{[0-9]+}} = vld [vmem:s1+$0x0 ss:s[[x]] sm:$0x7b]
+
+; CHECK-NO_MUTATION: s[[x:[0-9]+]] = simm.s32 $0x1234567
+; CHECK-NO_MUTATION: [vmem:s0+$0x0] = vst v{{[0-9]+}}
+; CHECK-NO_MUTATION-NEXT: _ = vdelay $0x4
+; CHECK-NO_MUTATION-NEXT: v{{[0-9]+}} = vld [vmem:s1+$0x0 ss:s[[x]] sm:$0x7b]
+define <1024 x i32> @load_intr_afterstore(<1024 x i32> addrspace(205)* %ptr0, <1024 x i32> addrspace(205)* %ptr1) {
+ store <1024 x i32> zeroinitializer, <1024 x i32> addrspace(205)* %ptr0
+ %res = call <1024 x i32> @llvm.tpu.vld.strided.i32(<1024 x i32> addrspace(205)* %ptr1, i32 123, i32 19088743)
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: back_to_back_load:
+; CHECK: v{{[0-9]+}} = vld [vmem:s0+$0x0 sm:$0x1]
+; CHECK-NEXT: v{{[0-9]+}} = vld [vmem:s1+$0x0 sm:$0x1]
+; CHECK-NEXT: v{{[0-9]+}} = vadd.s32 v{{[0-9]+}}, v{{[0-9]+}}
+define <1024 x i32> @back_to_back_load(<1024 x i32> addrspace(205)* %ptr0, <1024 x i32> addrspace(205)* %ptr1) {
+ %l0 = tail call <1024 x i32> @llvm.tpu.vld.strided.i32(<1024 x i32> addrspace(205)* nonnull %ptr0, i32 1, i32 1)
+ %l1 = tail call <1024 x i32> @llvm.tpu.vld.strided.i32(<1024 x i32> addrspace(205)* nonnull %ptr1, i32 1, i32 1)
+ %a0 = add <1024 x i32> %l0, %l1
+ ret <1024 x i32> %a0
+}
+
+; Test that, when existing bundles have no VIF instructions, we insert VNOPs
+; into them rather than creating new bundles.
+; CHECK-LABEL: loadafterstore_s:
+; CHECK: { [vmem:s0+$0x0] = vst v{{[0-9]+}};
+; CHECK-MUTATION: _ = vdelay $0x2 }
+; CHECK-MUTATION: _ = vnop
+; CHECK-MUTATION: _ = vnop
+; CHECK-NO_MUTATION: _ = vdelay $0x4 }
+; CHECK: v{{[0-9]+}} = vld
+define <1024 x i32> @loadafterstore_s(<1024 x i32> addrspace(205)* %ptr0, <1024 x i32> addrspace(205)* %ptr1, i32 %x) {
+ store <1024 x i32> zeroinitializer, <1024 x i32> addrspace(205)* %ptr0
+ %addr = getelementptr <1024 x i32>, <1024 x i32> addrspace(205)* %ptr1, i32 %x
+ %res = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr
+ ret <1024 x i32> %res
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vnsel.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vnsel.ll
new file mode 100644
index 0000000..1c94d47
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vnsel.ll
@@ -0,0 +1,131 @@
+; RUN: llc < %s -mcpu=tensorcore-vf -asm-verbose=false -disable-cgp | FileCheck %s --check-prefix=CHECK-VF
+; RUN: llc < %s -mcpu=tensorcore-jf -asm-verbose=false -disable-cgp | FileCheck %s
+; RUN: llc < %s -mcpu=tensorcore-pf -asm-verbose=false -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
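+
+; On tensorcore-vf a select whose false operand is a splat lowers to a single
+; vnsel taking the immediate or scalar operand directly; on jf/pf the splat is
+; materialized first (vimm/vmov) and a plain vsel is used.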
+
+; CHECK-VF: v{{[0-9]+}} = vsel vm{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}};
+; CHECK: v{{[0-9]+}} = vsel vm{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}};
+define <1024 x i32> @vselrr_i32(<1024 x i1> %mask, <1024 x i32> %x, <1024 x i32> %y) {
+ %r = select <1024 x i1> %mask, <1024 x i32> %x, <1024 x i32> %y
+ ret <1024 x i32> %r
+}
+
+; CHECK-VF: v{{[0-9]+}} = vsel vm{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}};
+; CHECK: v{{[0-9]+}} = vsel vm{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}};
+define <1024 x float> @vselrr_float(<1024 x i1> %mask, <1024 x float> %x, <1024 x float> %y) {
+ %r = select <1024 x i1> %mask, <1024 x float> %x, <1024 x float> %y
+ ret <1024 x float> %r
+}
+
+; CHECK-VF: v{{[0-9]+}} = vnsel vm{{[0-9]+}}, $0x2a, v{{[0-9]+}}
+; CHECK: v{{[0-9]+}} = vimm.s32 $0x2a
+; CHECK: v{{[0-9]+}} = vsel vm{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}};
+define <1024 x i32> @vnselri_i32(<1024 x i1> %mask, <1024 x i32> %x) {
+ %y0 = insertelement <1024 x i32> undef, i32 42, i32 0
+ %y = shufflevector <1024 x i32> %y0, <1024 x i32> undef, <1024 x i32> zeroinitializer
+
+ %r = select <1024 x i1> %mask, <1024 x i32> %x, <1024 x i32> %y
+ ret <1024 x i32> %r
+}
+
+; CHECK-VF: v{{[0-9]+}} = vnsel vm{{[0-9]+}}, $0x42280000, v{{[0-9]+}}
+; CHECK: v{{[0-9]+}} = vimm.f32 $42.0
+; CHECK: v{{[0-9]+}} = vsel vm{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}};
+define <1024 x float> @vnselri_float(<1024 x i1> %mask, <1024 x float> %x) {
+ %y0 = insertelement <1024 x float> undef, float 42.0, i32 0
+ %y = shufflevector <1024 x float> %y0, <1024 x float> undef, <1024 x i32> zeroinitializer
+
+ %r = select <1024 x i1> %mask, <1024 x float> %x, <1024 x float> %y
+ ret <1024 x float> %r
+}
+
+; CHECK-VF: v{{[0-9]+}} = vnsel vm{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; CHECK: v{{[0-9]+}} = vmov s{{[0-9]+}}
+; CHECK: v{{[0-9]+}} = vsel vm{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}};
+define <1024 x i32> @vnselrs_i32(<1024 x i1> %mask, <1024 x i32> %x, i32 %s) {
+ %y0 = insertelement <1024 x i32> undef, i32 %s, i32 0
+ %y = shufflevector <1024 x i32> %y0, <1024 x i32> undef, <1024 x i32> zeroinitializer
+
+ %r = select <1024 x i1> %mask, <1024 x i32> %x, <1024 x i32> %y
+ ret <1024 x i32> %r
+}
+
+; CHECK-VF: v{{[0-9]+}} = vnsel vm{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; CHECK: v{{[0-9]+}} = vmov s{{[0-9]+}}
+; CHECK: v{{[0-9]+}} = vsel vm{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}};
+define <1024 x float> @vnselrs_float(<1024 x i1> %mask, <1024 x float> %x, float %s) {
+ %y0 = insertelement <1024 x float> undef, float %s, i32 0
+ %y = shufflevector <1024 x float> %y0, <1024 x float> undef, <1024 x i32> zeroinitializer
+
+ %r = select <1024 x i1> %mask, <1024 x float> %x, <1024 x float> %y
+ ret <1024 x float> %r
+}
+
+; CHECK-VF: v{{[0-9]+}} = vnsel vm{{[0-9]+}}, $0x2a, v{{[0-9]+}}
+; CHECK: v{{[0-9]+}} = vimm.s32 $0x2a
+; CHECK: v{{[0-9]+}} = vsel vm{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}};
+define <1024 x i32> @vnselri_i32_1(<1024 x i1> %mask, <1024 x i32> %x) {
+
+ %r = select <1024 x i1> %mask, <1024 x i32> %x, <1024 x
+ i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42 , i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42 , i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42,
+ i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
+ ret <1024 x i32> %r
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vs_spill_tc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vs_spill_tc.ll
new file mode 100644
index 0000000..23df5fe
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vs_spill_tc.ll
@@ -0,0 +1,255 @@
+; RUN: llc < %s -mcpu=tensorcore-jf -asm-verbose=false -tpu-skip-fast-opt | FileCheck %s
+; REQUIRES: tpu
+
+target datalayout = "e-m:e-p:32:32-i32:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+; Test that we can use the same offsets for VMem and SMem spilling.
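+; To force spills, the kernel below keeps 35 vector and 35 scalar values live at once:
+; all loads are issued (against reversed addresses) before any of the stores.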
+!smem.funcs.spill = !{!0}
+!vmem.funcs.spill = !{!0}
+!vmem.ranges.spill.start = !{!1}
+!vmem.ranges.spill.limit = !{!2}
+!smem.ranges.spill.start = !{!1}
+!smem.ranges.spill.limit = !{!2}
+
+!0 = !{void (<1024 x i32> addrspace(205)*, i32*)* @spill_vs_to_vreg}
+!1 = !{i32 1000}
+!2 = !{i32 2000}
+
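+; The spill offsets in the CHECK lines below (0x7c0 = 1984, 0x7c8 = 1992, 0x7ce = 1998,
+; 0x7cf = 1999) all sit just below the shared limit of 2000 (!2), i.e. inside the common
+; [1000, 2000) spill window declared above for both VMem and SMem.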
+; CHECK-LABEL: spill_vs_to_vreg:
+; CHECK-DAG: [vmem:$0x7c8] = vst v{{[0-9]+}}
+; CHECK-DAG: [vmem:$0x7c0] = vst v{{[0-9]+}}
+; CHECK-DAG: [smem:$0x7cf] = sst s{{[0-9]+}}
+; CHECK-DAG: [smem:$0x7ce] = sst s{{[0-9]+}}
+; CHECK-DAG: v{{[0-9]+}} = vld [vmem:$0x7c0]
+; CHECK-DAG: v{{[0-9]+}} = vld [vmem:$0x7c8]
+; CHECK-DAG: s{{[0-9]+}} = sld [smem:$0x7ce]
+; CHECK-DAG: s{{[0-9]+}} = sld [smem:$0x7cf]
+
+declare <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32) nounwind
+declare i32* @llvm.tpu.inttoptr.pi32(i32) nounwind
+
+define void @spill_vs_to_vreg(<1024 x i32> addrspace(205)* %unknownptr, i32* %unknownptrs) {
+llo-region-0:
+ br label %llo-region-1
+
+llo-region-1: ; preds = %llo-region-0
+ %addr00 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 000)
+ %addr01 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 010)
+ %addr02 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 020)
+ %addr03 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 030)
+ %addr04 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 040)
+ %addr05 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 050)
+ %addr06 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 060)
+ %addr07 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 070)
+ %addr08 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 080)
+ %addr09 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 090)
+ %addr10 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 100)
+ %addr11 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 110)
+ %addr12 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 120)
+ %addr13 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 130)
+ %addr14 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 140)
+ %addr15 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 150)
+ %addr16 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 160)
+ %addr17 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 170)
+ %addr18 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 180)
+ %addr19 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 190)
+ %addr20 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 200)
+ %addr21 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 210)
+ %addr22 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 220)
+ %addr23 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 230)
+ %addr24 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 240)
+ %addr25 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 250)
+ %addr26 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 260)
+ %addr27 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 270)
+ %addr28 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 280)
+ %addr29 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 290)
+ %addr30 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 300)
+ %addr31 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 310)
+ %addr32 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 320)
+ %addr33 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 330)
+ %addr34 = call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 340)
+
+ %addrs00 = call i32* @llvm.tpu.inttoptr.pi32(i32 000)
+ %addrs01 = call i32* @llvm.tpu.inttoptr.pi32(i32 010)
+ %addrs02 = call i32* @llvm.tpu.inttoptr.pi32(i32 020)
+ %addrs03 = call i32* @llvm.tpu.inttoptr.pi32(i32 030)
+ %addrs04 = call i32* @llvm.tpu.inttoptr.pi32(i32 040)
+ %addrs05 = call i32* @llvm.tpu.inttoptr.pi32(i32 050)
+ %addrs06 = call i32* @llvm.tpu.inttoptr.pi32(i32 060)
+ %addrs07 = call i32* @llvm.tpu.inttoptr.pi32(i32 070)
+ %addrs08 = call i32* @llvm.tpu.inttoptr.pi32(i32 080)
+ %addrs09 = call i32* @llvm.tpu.inttoptr.pi32(i32 090)
+ %addrs10 = call i32* @llvm.tpu.inttoptr.pi32(i32 100)
+ %addrs11 = call i32* @llvm.tpu.inttoptr.pi32(i32 110)
+ %addrs12 = call i32* @llvm.tpu.inttoptr.pi32(i32 120)
+ %addrs13 = call i32* @llvm.tpu.inttoptr.pi32(i32 130)
+ %addrs14 = call i32* @llvm.tpu.inttoptr.pi32(i32 140)
+ %addrs15 = call i32* @llvm.tpu.inttoptr.pi32(i32 150)
+ %addrs16 = call i32* @llvm.tpu.inttoptr.pi32(i32 160)
+ %addrs17 = call i32* @llvm.tpu.inttoptr.pi32(i32 170)
+ %addrs18 = call i32* @llvm.tpu.inttoptr.pi32(i32 180)
+ %addrs19 = call i32* @llvm.tpu.inttoptr.pi32(i32 190)
+ %addrs20 = call i32* @llvm.tpu.inttoptr.pi32(i32 200)
+ %addrs21 = call i32* @llvm.tpu.inttoptr.pi32(i32 210)
+ %addrs22 = call i32* @llvm.tpu.inttoptr.pi32(i32 220)
+ %addrs23 = call i32* @llvm.tpu.inttoptr.pi32(i32 230)
+ %addrs24 = call i32* @llvm.tpu.inttoptr.pi32(i32 240)
+ %addrs25 = call i32* @llvm.tpu.inttoptr.pi32(i32 250)
+ %addrs26 = call i32* @llvm.tpu.inttoptr.pi32(i32 260)
+ %addrs27 = call i32* @llvm.tpu.inttoptr.pi32(i32 270)
+ %addrs28 = call i32* @llvm.tpu.inttoptr.pi32(i32 280)
+ %addrs29 = call i32* @llvm.tpu.inttoptr.pi32(i32 290)
+ %addrs30 = call i32* @llvm.tpu.inttoptr.pi32(i32 300)
+ %addrs31 = call i32* @llvm.tpu.inttoptr.pi32(i32 310)
+ %addrs32 = call i32* @llvm.tpu.inttoptr.pi32(i32 320)
+ %addrs33 = call i32* @llvm.tpu.inttoptr.pi32(i32 330)
+ %addrs34 = call i32* @llvm.tpu.inttoptr.pi32(i32 340)
+
+
+ %val00 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr34
+ %val01 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr33
+ %val02 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr32
+ %val03 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr31
+ %val04 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr30
+ %val05 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr29
+ %val06 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr28
+ %val07 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr27
+ %val08 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr26
+ %val09 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr25
+ %val10 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr24
+ %val11 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr23
+ %val12 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr22
+ %val13 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr21
+ %val14 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr20
+ %val15 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr19
+ %val16 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr18
+ %val17 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr17
+ %val18 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr16
+ %val19 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr15
+ %val20 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr14
+ %val21 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr13
+ %val22 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr12
+ %val23 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr11
+ %val24 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr10
+ %val25 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr09
+ %val26 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr08
+ %val27 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr07
+ %val28 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr06
+ %val29 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr05
+ %val30 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr04
+ %val31 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr03
+ %val32 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr02
+ %val33 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr01
+ %val34 = load <1024 x i32>, <1024 x i32> addrspace(205)* %addr00
+
+ %vals00 = load i32, i32* %addrs34
+ %vals01 = load i32, i32* %addrs33
+ %vals02 = load i32, i32* %addrs32
+ %vals03 = load i32, i32* %addrs31
+ %vals04 = load i32, i32* %addrs30
+ %vals05 = load i32, i32* %addrs29
+ %vals06 = load i32, i32* %addrs28
+ %vals07 = load i32, i32* %addrs27
+ %vals08 = load i32, i32* %addrs26
+ %vals09 = load i32, i32* %addrs25
+ %vals10 = load i32, i32* %addrs24
+ %vals11 = load i32, i32* %addrs23
+ %vals12 = load i32, i32* %addrs22
+ %vals13 = load i32, i32* %addrs21
+ %vals14 = load i32, i32* %addrs20
+ %vals15 = load i32, i32* %addrs19
+ %vals16 = load i32, i32* %addrs18
+ %vals17 = load i32, i32* %addrs17
+ %vals18 = load i32, i32* %addrs16
+ %vals19 = load i32, i32* %addrs15
+ %vals20 = load i32, i32* %addrs14
+ %vals21 = load i32, i32* %addrs13
+ %vals22 = load i32, i32* %addrs12
+ %vals23 = load i32, i32* %addrs11
+ %vals24 = load i32, i32* %addrs10
+ %vals25 = load i32, i32* %addrs09
+ %vals26 = load i32, i32* %addrs08
+ %vals27 = load i32, i32* %addrs07
+ %vals28 = load i32, i32* %addrs06
+ %vals29 = load i32, i32* %addrs05
+ %vals30 = load i32, i32* %addrs04
+ %vals31 = load i32, i32* %addrs03
+ %vals32 = load i32, i32* %addrs02
+ %vals33 = load i32, i32* %addrs01
+ %vals34 = load i32, i32* %addrs00
+
+ store <1024 x i32> %val34, <1024 x i32> addrspace(205)* %unknownptr
+ store <1024 x i32> %val33, <1024 x i32> addrspace(205)* %addr33
+ store <1024 x i32> %val32, <1024 x i32> addrspace(205)* %addr32
+ store <1024 x i32> %val31, <1024 x i32> addrspace(205)* %addr31
+ store <1024 x i32> %val30, <1024 x i32> addrspace(205)* %addr30
+ store <1024 x i32> %val29, <1024 x i32> addrspace(205)* %addr29
+ store <1024 x i32> %val28, <1024 x i32> addrspace(205)* %addr28
+ store <1024 x i32> %val27, <1024 x i32> addrspace(205)* %addr27
+ store <1024 x i32> %val26, <1024 x i32> addrspace(205)* %addr26
+ store <1024 x i32> %val25, <1024 x i32> addrspace(205)* %addr25
+ store <1024 x i32> %val24, <1024 x i32> addrspace(205)* %addr24
+ store <1024 x i32> %val23, <1024 x i32> addrspace(205)* %addr23
+ store <1024 x i32> %val22, <1024 x i32> addrspace(205)* %addr22
+ store <1024 x i32> %val21, <1024 x i32> addrspace(205)* %addr21
+ store <1024 x i32> %val20, <1024 x i32> addrspace(205)* %addr20
+ store <1024 x i32> %val19, <1024 x i32> addrspace(205)* %addr19
+ store <1024 x i32> %val18, <1024 x i32> addrspace(205)* %addr18
+ store <1024 x i32> %val17, <1024 x i32> addrspace(205)* %addr17
+ store <1024 x i32> %val16, <1024 x i32> addrspace(205)* %addr16
+ store <1024 x i32> %val15, <1024 x i32> addrspace(205)* %addr15
+ store <1024 x i32> %val14, <1024 x i32> addrspace(205)* %addr14
+ store <1024 x i32> %val13, <1024 x i32> addrspace(205)* %addr13
+ store <1024 x i32> %val12, <1024 x i32> addrspace(205)* %addr12
+ store <1024 x i32> %val11, <1024 x i32> addrspace(205)* %addr11
+ store <1024 x i32> %val00, <1024 x i32> addrspace(205)* %addr10
+ store <1024 x i32> %val09, <1024 x i32> addrspace(205)* %addr09
+ store <1024 x i32> %val08, <1024 x i32> addrspace(205)* %addr08
+ store <1024 x i32> %val07, <1024 x i32> addrspace(205)* %addr07
+ store <1024 x i32> %val06, <1024 x i32> addrspace(205)* %addr06
+ store <1024 x i32> %val05, <1024 x i32> addrspace(205)* %addr05
+ store <1024 x i32> %val04, <1024 x i32> addrspace(205)* %addr04
+ store <1024 x i32> %val03, <1024 x i32> addrspace(205)* %addr03
+ store <1024 x i32> %val02, <1024 x i32> addrspace(205)* %addr02
+ store <1024 x i32> %val01, <1024 x i32> addrspace(205)* %addr01
+ store <1024 x i32> %val00, <1024 x i32> addrspace(205)* %addr00
+
+ store i32 %vals34, i32* %unknownptrs
+ store i32 %vals33, i32* %addrs33
+ store i32 %vals32, i32* %addrs32
+ store i32 %vals31, i32* %addrs31
+ store i32 %vals30, i32* %addrs30
+ store i32 %vals29, i32* %addrs29
+ store i32 %vals28, i32* %addrs28
+ store i32 %vals27, i32* %addrs27
+ store i32 %vals26, i32* %addrs26
+ store i32 %vals25, i32* %addrs25
+ store i32 %vals24, i32* %addrs24
+ store i32 %vals23, i32* %addrs23
+ store i32 %vals22, i32* %addrs22
+ store i32 %vals21, i32* %addrs21
+ store i32 %vals20, i32* %addrs20
+ store i32 %vals19, i32* %addrs19
+ store i32 %vals18, i32* %addrs18
+ store i32 %vals17, i32* %addrs17
+ store i32 %vals16, i32* %addrs16
+ store i32 %vals15, i32* %addrs15
+ store i32 %vals14, i32* %addrs14
+ store i32 %vals13, i32* %addrs13
+ store i32 %vals12, i32* %addrs12
+ store i32 %vals11, i32* %addrs11
+ store i32 %vals00, i32* %addrs10
+ store i32 %vals09, i32* %addrs09
+ store i32 %vals08, i32* %addrs08
+ store i32 %vals07, i32* %addrs07
+ store i32 %vals06, i32* %addrs06
+ store i32 %vals05, i32* %addrs05
+ store i32 %vals04, i32* %addrs04
+ store i32 %vals03, i32* %addrs03
+ store i32 %vals02, i32* %addrs02
+ store i32 %vals01, i32* %addrs01
+ store i32 %vals00, i32* %addrs00
+
+ ret void
+}
\ No newline at end of file
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vstore_invalid_bundling_info_bug_tc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vstore_invalid_bundling_info_bug_tc.ll
new file mode 100644
index 0000000..add5f23
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/vstore_invalid_bundling_info_bug_tc.ll
@@ -0,0 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mcpu=tensorcore-pf %s -o - -asm-verbose=false | FileCheck %s
+; REQUIRES: tpu
+
+; Make sure that when we emit a VStore instruction with 3 immediates, the bundler
+; doesn't conclude that it cannot be placed into an empty bundle.
+; If it did, this test would hang/time out in the scheduler or error
+; out in some other way.
+source_filename = "LloModule"
+target datalayout = "e-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu--"
+; Function Attrs: noreturn nounwind
+define void @main() local_unnamed_addr #0 {
+; CHECK-LABEL: main:
+; CHECK: { v0 = vimm.s32 $0x0 }
+; CHECK-NEXT: { [vmem:$0x20 ss:$0x10 sm:$0xf] = vst v0;
+; CHECK-NEXT: _ = shalt }
+entry:
+ %tmp1 = tail call <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32 32)
+ %v0 = insertelement <1024 x i1> undef, i1 true, i32 0
+ %mask = shufflevector <1024 x i1> %v0, <1024 x i1> undef, <1024 x i32> zeroinitializer
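+ ; The three immediates of the vst bundle checked above come from the inttoptr base
+ ; (32 = $0x20) plus the i32 15 and i32 16 operands of the strided-store intrinsic
+ ; ($0xf and $0x10, presumably the sublane mask and stride).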
+ tail call void @llvm.tpu.vst.strided.v1024i32.p205v1024i32(<1024 x i32> zeroinitializer, <1024 x i32> addrspace(205)* %tmp1, i32 15, i32 16, <1024 x i1> %mask)
+ ret void
+}
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.tpu.vst.strided.v1024i32.p205v1024i32(<1024 x i32>, <1024 x i32> addrspace(205)*, i32, i32, <1024 x i1>) #7
+; Function Attrs: nounwind readnone
+declare <1024 x i32> addrspace(205)* @llvm.tpu.inttoptr.p205v1024i32(i32) #1
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/workloads_bc.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/workloads_bc.ll
new file mode 100644
index 0000000..1027002
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/workloads_bc.ll
@@ -0,0 +1,495 @@
+; RUN: llc < %s -march=googletpu -mcpu=barnacore-cc-pf -disable-cgp | FileCheck %s
+; REQUIRES: tpu
+
+declare void @llvm.tpu.bc.loop.start(i32)
+declare i1 @llvm.tpu.bc.loop.end()
+declare <8 x float> @llvm.tpu.bc.load.aliaddr(<8 x float> addrspace(207)*, i32) argmemonly
+declare <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)*, i32, i32) argmemonly
+declare void @llvm.tpu.bc.store.aliaddr(<8 x float>, <8 x float> addrspace(207)*, i32) argmemonly
+declare void @llvm.tpu.bc.store.aliaddr.flm(<8 x float>, <8 x float> addrspace(207)*, i32, i32) argmemonly
+declare void @llvm.tpu.bc.store.concat.aliaddr(<8 x float>, i32) inaccessiblememonly
+declare void @llvm.tpu.bc.shift.aliaddr(i32) inaccessiblememonly
+declare <8 x float> @llvm.tpu.bc.select.predicate(i32, <8 x float>, <8 x float>) readnone
+declare [16 x <8 x float>] @llvm.tpu.bc.insertvalue.loopindex.a16([16 x <8 x float>], <8 x float>) readnone
+declare <8 x float> @llvm.tpu.bc.extractvalue.loopindex.a16([16 x <8 x float>]) readnone
+declare [8 x <8 x float>] @llvm.tpu.bc.insertvalue.loopindex.a8([8 x <8 x float>], <8 x float>) readnone
+declare <8 x float> @llvm.tpu.bc.extractvalue.loopindex.a8([8 x <8 x float>]) readnone
+declare i32 @llvm.tpu.rsqrt(<8 x float>) inaccessiblememonly
+declare i32 @llvm.tpu.rcp(<8 x float>) inaccessiblememonly
+declare <8 x float> @llvm.tpu.eup.pop(i32) inaccessiblememonly
+declare <8 x float> @llvm.maximum.v8f32(<8 x float>, <8 x float>)
+declare <8 x float> @llvm.minimum.v8f32(<8 x float>, <8 x float>)
+declare <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float>)
+declare <8 x float> @llvm.tpu.clamp.symmetric.v8f32(<8 x float>, <8 x float>)
+declare <8 x float> @llvm.tpu.rcp.macro.v8f32(<8 x float>)
+declare <8 x float> @llvm.tpu.tanh.macro.v8f32(<8 x float>)
+declare <8 x i32> @llvm.tpu.vrot.sublane.down.v8i32(<8 x i32>)
+declare <8 x i32> @llvm.tpu.vlaneseq()
+
+; https://cs.corp.google.com/depot/google3/platforms/deepsea/logic/pfc/units/bc/dv/env/bc_vector_program_builder.sv?l=176
+; Generates a float32 forward-pass program with no reduction
+; CHECK-LABEL: fwd_pass_float32:
+; CHECK: loop_start $0x1, $0x1
+; CHECK: This Inner Loop Header: Depth=1
+; CHECK-NEXT: { _ = vnop }
+; CHECK-NEXT: { v0 = vld.f32 [bmem:s0] ali_addr:$0x1;
+; CHECK-NEXT: (concat_reg) = vst.f32 ps:$1 v0 ali_addr:$0x1;
+; CHECK-NEXT: (cdfifo_reg) = shift ps:$1 (concat_reg) aliaddr:$0x1 }
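+; The ali_addr/aliaddr $0x1 operands in the bundle above correspond to the i32 1 aliaddr
+; arguments passed to the load, concat-store and shift intrinsics below.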
+define void @fwd_pass_float32(<8 x float> addrspace(207)* %sram_off) {
+ ; Set up for the loop. Pipeline_depth is zero.
+ call void @llvm.tpu.bc.loop.start(i32 0)
+ br label %loop
+
+loop:
+ ; Load the weight. The load has loop_index added to the address. We use
+ ; Vldst_loop_offset_x as the aliaddr, which gives addr = sram_off + (loop_index).
+ %weight = call <8 x float> @llvm.tpu.bc.load.aliaddr(<8 x float> addrspace(207)* %sram_off, i32 1)
+
+ ; Store into (concat_reg). Use Vldst_loop_offset_x to write at location (loop_index).
+ call void @llvm.tpu.bc.store.concat.aliaddr(<8 x float> %weight, i32 1)
+
+ ; Shift from (concat_reg)[(loop_index)..(loop_index)+7] to (cdfifo_reg)[(loop_index)..(loop_index)+7].
+ call void @llvm.tpu.bc.shift.aliaddr(i32 1)
+
+ %loopend = call i1 @llvm.tpu.bc.loop.end()
+ br i1 %loopend, label %loop, label %out
+
+out:
+ ret void
+}
+
+; https://cs.corp.google.com/depot/google3/platforms/deepsea/logic/pfc/units/bc/dv/env/bc_vector_program_builder.sv?l=213
+; Generates a bfloat16 forward-pass program with no reduction
+; CHECK-LABEL: fwd_pass_bfloat16:
+; CHECK: loop_start $0x1, $0x1
+; CHECK: This Inner Loop Header: Depth=1
+; CHECK-NEXT: { v2 = vld.f32 [bmem:s0] ali_addr:$0x1;
+; CHECK-NEXT: (concat_reg) = vst.f32 ps:$1 v0 ali_addr:$0x2;
+; CHECK-NEXT: (cdfifo_reg) = shift ps:$1 (concat_reg) aliaddr:$0x2;
+; CHECK-NEXT: v0 = vand.u32 ps:$1 $0xffff, v2 }
+; CHECK-NEXT: { v0 = vshll.u32 v2, $0x10;
+; CHECK-NEXT: (concat_reg) = vst.f32 ps:$1 v0 ali_addr:$0x2;
+; CHECK-NEXT: (cdfifo_reg) = shift ps:$1 (concat_reg) aliaddr:$0x3 }
+define void @fwd_pass_bfloat16(<8 x float> addrspace(207)* %sram_off) {
+ call void @llvm.tpu.bc.loop.start(i32 0)
+ br label %loop
+
+loop:
+ ; Load two packed weights and reinterpret as i32.
+ %weights_f = call <8 x float> @llvm.tpu.bc.load.aliaddr(<8 x float> addrspace(207)* %sram_off, i32 1)
+ %weights_i = bitcast <8 x float> %weights_f to <8 x i32>
+
+ ; Create weight_lo as weights << 16
+ %weight_lo_i = shl <8 x i32> %weights_i, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>;
+
+ ; Store weight_lo to concat_reg. Use aliaddr:2 for 2*x multiplier.
+ %weight_lo_f = bitcast <8 x i32> %weight_lo_i to <8 x float>
+ call void @llvm.tpu.bc.store.concat.aliaddr(<8 x float> %weight_lo_f, i32 2)
+
+ ; Shift weight_lo using aliaddr:2.
+ call void @llvm.tpu.bc.shift.aliaddr(i32 2)
+
+ ; Create weight_hi as weights & 0xFFFF
+ %weight_hi_i = and <8 x i32> %weights_i, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+
+ ; Store and shift weight_hi. Use aliaddr:3 for 2*x+1 multiplier.
+ %weight_hi_f = bitcast <8 x i32> %weight_hi_i to <8 x float>
+ call void @llvm.tpu.bc.store.concat.aliaddr(<8 x float> %weight_hi_f, i32 2)
+
+ ; Shift weight_hi using aliaddr:3.
+ call void @llvm.tpu.bc.shift.aliaddr(i32 3)
+
+ %loopend = call i1 @llvm.tpu.bc.loop.end()
+ br i1 %loopend, label %loop, label %out
+
+out:
+ ret void
+}
+
+; https://cs.corp.google.com/depot/google3/platforms/deepsea/logic/pfc/units/bc/dv/env/bc_vector_program_builder.sv?l=283
+; Generates a float32 forward-pass reduction program. Currently we software-pipeline this into two stages.
+; The ideal schedule uses three stages.
+; CHECK-LABEL: fwd_pass_reduction_float32:
+; CHECK: loop_start $0x1, $0x1
+; CHECK: { v16 = vld.f32 [bmem:s0] ali_addr:$0x1;
+; CHECK-NEXT: v0.ali = vimm.f32 @first_id_in_feature $0.0 }
+; CHECK-NEXT: { (concat_reg) = vst.f32 ps:$1 v0.ali.ps1 ali_addr:$0x1;
+; CHECK-NEXT: (cdfifo_reg) = shift ps:$1 (concat_reg) aliaddr:$0x1;
+; CHECK-NEXT: v0.ali = vadd.f32 ps:$1 v0.ali.ps1, v16;
+; CHECK-NEXT: v16 = vmul.f32 s1, v16 }
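+; The [16 x <8 x float>] aggregate is carried across iterations as a single value; the
+; insertvalue/extractvalue.loopindex intrinsics appear to read and write the element
+; selected by the current loop index, giving 16 rotating accumulators.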
+define [16 x <8 x float>] @fwd_pass_reduction_float32([16 x <8 x float>] %accum_1, <8 x float> addrspace(207)* noalias %sram_off, float %gain) {
+entry:
+ call void @llvm.tpu.bc.loop.start(i32 0)
+ br label %loop
+
+loop:
+ %accum_2 = phi [16 x <8 x float>] [ %accum_1, %entry ], [ %accum_3, %loop ]
+
+ ; Load weight from bmem.
+ %weight = call <8 x float> @llvm.tpu.bc.load.aliaddr(<8 x float> addrspace(207)* %sram_off, i32 1)
+
+ ; Multiply weight by splatted gain.
+ %gain_vec1 = insertelement <8 x float> undef, float %gain, i32 0
+ %gain_vec = shufflevector <8 x float> %gain_vec1, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %gain_weight = fmul <8 x float> %weight, %gain_vec
+
+ ; Potentially zero out the accumulator if first_in_feature (=1).
+ %accum_extract1 = call <8 x float> @llvm.tpu.bc.extractvalue.loopindex.a16([16 x <8 x float>] %accum_2)
+ %accum_extract2 = call <8 x float> @llvm.tpu.bc.select.predicate(i32 1, <8 x float> <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, <8 x float> %accum_extract1)
+
+ ; Accumulate.
+ %accum_extract3 = fadd <8 x float> %gain_weight, %accum_extract2
+
+ ; Insert accumulated value into the accumulation aggregate.
+ %accum_3 = call [16 x <8 x float>] @llvm.tpu.bc.insertvalue.loopindex.a16([16 x <8 x float>] %accum_2, <8 x float> %accum_extract3)
+
+ ; Store to (concat_reg) and shift.
+ call void @llvm.tpu.bc.store.concat.aliaddr(<8 x float> %accum_extract3, i32 1)
+ call void @llvm.tpu.bc.shift.aliaddr(i32 1)
+
+ %loopend = call i1 @llvm.tpu.bc.loop.end()
+ br i1 %loopend, label %loop, label %out
+out:
+ ret [16 x <8 x float>] %accum_3
+}
+
+; https://cs.corp.google.com/depot/google3/platforms/deepsea/logic/pfc/units/bc/dv/env/bc_vector_program_builder.sv?l=459
+; Generates a bfloat16 forward-pass reduction program. Currently we software-pipeline this by 2x.
+; CHECK-LABEL: fwd_pass_reduction_bfloat16:
+; CHECK: loop_start $0x4, $0x2
+; CHECK: { v0.ali = vimm.f32 @first_id_in_feature ps:$1 $0.0 } // num: 1
+; CHECK-NEXT: { v18 = vld.f32 [bmem:s0] ali_addr:$0x1;
+; CHECK-NEXT: v0.ali = vadd.f32 ps:$1 v0.ali.ps1, v18 } // num: 2
+; CHECK-NEXT: { v21 = vand.u32 $0xffff, v18;
+; CHECK-NEXT: v18 = vshll.u32 v18, $0x10 } // num: 3
+; CHECK-NEXT: { (concat_reg) = vst.f32 ps:$2 v8.ali.ps2 ali_addr:$0x3;
+; CHECK-NEXT: (cdfifo_reg) = shift ps:$2 (concat_reg) aliaddr:$0x3;
+; CHECK-NEXT: v8.ali = vimm.f32 @first_id_in_feature $0.0;
+; CHECK-NEXT: v18 = vmul.f32 s1, v18 } // num: 4
+; CHECK-NEXT: { v8.ali = vadd.f32 ps:$1 v8.ali.ps1, v21;
+; CHECK-NEXT: v21 = vmul.f32 s1, v21;
+; CHECK-NEXT: (concat_reg) = vst.f32 ps:$1 v0.ali.ps1 ali_addr:$0x2;
+; CHECK-NEXT: (cdfifo_reg) = shift ps:$1 (concat_reg) aliaddr:$0x2 } // num: 5
+
+define {[8 x <8 x float>], [8 x <8 x float>]} @fwd_pass_reduction_bfloat16([8 x <8 x float>] %accum_lo_1, [8 x <8 x float>] %accum_hi_1,
+ <8 x float> addrspace(207)* noalias %sram_off,
+ float %gain) {
+entry:
+ call void @llvm.tpu.bc.loop.start(i32 0)
+ br label %loop
+
+loop:
+ %accum_lo_2 = phi [8 x <8 x float>] [ %accum_lo_1, %entry ], [ %accum_lo_3, %loop ]
+ %accum_hi_2 = phi [8 x <8 x float>] [ %accum_hi_1, %entry ], [ %accum_hi_3, %loop ]
+
+ ; Load weight from bmem. Note: the sample algorithm loads this *twice*, because the second
+ ; cycle has the VPU ALU slots oversubscribed and an empty load slot. That said, I don't think
+ ; that second load is needed; instead we can just nondestructively shift-left.
+ %weight = call <8 x float> @llvm.tpu.bc.load.aliaddr(<8 x float> addrspace(207)* %sram_off, i32 1)
+
+ ; Potentially zero out the accumulator if first_in_feature (=1).
+ %accum_extract_lo1 = call <8 x float> @llvm.tpu.bc.extractvalue.loopindex.a8([8 x <8 x float>] %accum_lo_2)
+ %accum_extract_lo2 = call <8 x float> @llvm.tpu.bc.select.predicate(i32 1, <8 x float> <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, <8 x float> %accum_extract_lo1)
+ %accum_extract_hi1 = call <8 x float> @llvm.tpu.bc.extractvalue.loopindex.a8([8 x <8 x float>] %accum_hi_2)
+ %accum_extract_hi2 = call <8 x float> @llvm.tpu.bc.select.predicate(i32 1, <8 x float> <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, <8 x float> %accum_extract_hi1)
+
+ ; Convert lo weight to float32.
+ %weight_i = bitcast <8 x float> %weight to <8 x i32>
+ %weight_lo_i = shl <8 x i32> %weight_i, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ %weight_lo = bitcast <8 x i32> %weight_lo_i to <8 x float>
+
+ ; Multiply weight_lo by splatted gain.
+ %gain_vec1 = insertelement <8 x float> undef, float %gain, i32 0
+ %gain_vec = shufflevector <8 x float> %gain_vec1, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %gain_weight_lo = fmul <8 x float> %weight_lo, %gain_vec
+
+ ; Convert hi weight to float32 and multiply by splatted gain.
+ %weight_hi_i = and <8 x i32> %weight_i, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+ %weight_hi = bitcast <8 x i32> %weight_hi_i to <8 x float>
+ %gain_weight_hi = fmul <8 x float> %weight_hi, %gain_vec
+
+ ; Accumulate hi and lo.
+ %accum_extract_lo3 = fadd <8 x float> %gain_weight_lo, %accum_extract_lo2
+ %accum_extract_hi3 = fadd <8 x float> %gain_weight_hi, %accum_extract_hi2
+
+ ; Insert accumulated value into the accumulation aggregate.
+ %accum_lo_3 = call [8 x <8 x float>] @llvm.tpu.bc.insertvalue.loopindex.a8([8 x <8 x float>] %accum_lo_2, <8 x float> %accum_extract_lo3)
+ %accum_hi_3 = call [8 x <8 x float>] @llvm.tpu.bc.insertvalue.loopindex.a8([8 x <8 x float>] %accum_hi_2, <8 x float> %accum_extract_hi3)
+
+ ; Store to (concat_reg) and shift. Use aliaddr:2x (=2) for lo and 2xp1 (=3) for hi.
+ call void @llvm.tpu.bc.store.concat.aliaddr(<8 x float> %accum_extract_lo3, i32 2)
+ call void @llvm.tpu.bc.shift.aliaddr(i32 2)
+ call void @llvm.tpu.bc.store.concat.aliaddr(<8 x float> %accum_extract_hi3, i32 3)
+ call void @llvm.tpu.bc.shift.aliaddr(i32 3)
+
+ %loopend = call i1 @llvm.tpu.bc.loop.end()
+ br i1 %loopend, label %loop, label %out
+
+out:
+ %ret1 = insertvalue {[8 x <8 x float>], [8 x <8 x float>]} undef, [8 x <8 x float>] %accum_lo_3, 0
+ %ret2 = insertvalue {[8 x <8 x float>], [8 x <8 x float>]} %ret1, [8 x <8 x float>] %accum_hi_3, 1
+ ret {[8 x <8 x float>], [8 x <8 x float>]} %ret2
+}
+
+; https://cs.corp.google.com/depot/google3/platforms/deepsea/logic/pfc/units/bc/dv/env/bc_vector_program_builder.sv?l=718
+; Generates float32 backward pass program: Adagrad
+; CHECK-LABEL: bwd_pass_adagrad_float32:
+; CHECK: loop_start $0x6, $0x3
+; CHECK: { (erf) = vrsqrt.f32 ps:$1 v0 } // push %0
+; CHECK-NEXT: { v4 = vld.f32 [bmem:s0] ali_addr:$0x1 }
+; CHECK-NEXT: { v0 = vpop ps:$2 (erf);
+; CHECK-NEXT: v0 = vmul.f32 v4, v4 } // pop %0
+; CHECK-NEXT: { v8 = vld.f32 [bmem:s1] ali_addr:$0x1 flm:$1;
+; CHECK-NEXT: v0 = vmul.f32 ps:$2 v0, v4 }
+; CHECK-NEXT: { v4 = vld.f32 ps:$2 [bmem:s1] ali_addr:$0x1;
+; CHECK-NEXT: v0 = vadd.f32 v0, v8 }
+; CHECK-NEXT: { [bmem:s1] = vst.f32 ps:$1 v0 ali_addr:$0x1 flm:$1;
+; CHECK-NEXT: v0 = vsub.f32 ps:$2 v4, v0 }
+; CHECK-NEXT: { [bmem:s1] = vst.f32 ps:$3 v0 ali_addr:$0x1 }
+
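+; In short, the loop below is a plain Adagrad step with no separate learning-rate
+; scaling: sumwg2 += wg * wg; w -= wg * rsqrt(sumwg2).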
+define void @bwd_pass_adagrad_float32(<8 x float> addrspace(207)* noalias %grad_off,
+ <8 x float> addrspace(207)* noalias %sram_off) {
+entry:
+ call void @llvm.tpu.bc.loop.start(i32 0)
+ br label %loop
+
+loop:
+ ; WG = load(bmem.wg)
+ %wg = call <8 x float> @llvm.tpu.bc.load.aliaddr(<8 x float> addrspace(207)* %grad_off, i32 1)
+
+ ; SWG2 = load(bmem.swg2)
+ %swg2 = call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* %sram_off, i32 1, i32 1)
+
+ ; WG = WG * WG
+ %wg2 = fmul <8 x float> %wg, %wg
+
+ ; W = load(bmem.w)
+ %w = call <8 x float> @llvm.tpu.bc.load.aliaddr(<8 x float> addrspace(207)* %sram_off, i32 1)
+
+ ; SUMWG2 += WG2
+ %sumwg2 = fadd <8 x float> %swg2, %wg2
+
+ ; eup(WG). Note: I think the sample algorithm is wrong here. (a) It performs a vpush/vpop,
+ ; whereas the spreadsheet algorithm uses rsqrt, which makes much more sense. (b) It uses %wg,
+ ; which makes no sense (why schedule it so late?). Instead I follow the obvious spreadsheet
+ ; algorithm and use %sumwg2.
+ %rsqrtf = call i32 @llvm.tpu.rsqrt(<8 x float> %sumwg2)
+ %rsqrt = call <8 x float> @llvm.tpu.eup.pop(i32 %rsqrtf)
+
+ ; bmem.swg2 = store SWG2
+ call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> %sumwg2, <8 x float> addrspace(207)* %sram_off, i32 1, i32 1)
+
+ ; delta = wg * eupwg
+ %delta = fmul <8 x float> %wg, %rsqrt
+
+ ; W = W - delta
+ %new_w = fsub <8 x float> %w, %delta
+
+ ; bmem.w = store(w)
+ call void @llvm.tpu.bc.store.aliaddr(<8 x float> %new_w, <8 x float> addrspace(207)* %sram_off, i32 1)
+
+ %loopend = call i1 @llvm.tpu.bc.loop.end()
+ br i1 %loopend, label %loop, label %out
+
+out:
+ ret void
+}
+
+; Test that the immediate move doesn't get hoisted out of the loop.
+; CHECK-LABEL: MdlAdagradLight:
+; CHECK: loop_start $0xc, $0x2
+; CHECK: vimm.f32 $1.4012984643248171E-45
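+; The splatted <float 0x36A0000000000000, ...> constant stored inside the loop is 2^-149,
+; i.e. 1.4012984643248171E-45, the vimm.f32 immediate that the CHECK line above expects to
+; find after loop_start.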
+define void @MdlAdagradLight() {
+entry:
+ tail call void @llvm.tpu.bc.loop.start(i32 0)
+ br label %loop-start
+
+loop-start: ; preds = %loop-start, %entry
+ %0 = tail call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* null, i32 1, i32 3)
+ %1 = bitcast <8 x float> %0 to <8 x i32>
+ %2 = icmp ne <8 x i32> %1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ tail call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> <float 0x36A0000000000000, float 0x36A0000000000000, float 0x36A0000000000000, float 0x36A0000000000000, float 0x36A0000000000000, float 0x36A0000000000000, float 0x36A0000000000000, float 0x36A0000000000000>, <8 x float> addrspace(207)* null, i32 1, i32 3)
+ %3 = tail call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* null, i32 1, i32 0)
+ %4 = tail call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* null, i32 1, i32 1)
+ %5 = tail call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* null, i32 1, i32 2)
+ %6 = fmul <8 x float> %0, %0
+ %7 = fsub <8 x float> %6, %5
+ %8 = fmul <8 x float> %7, <float 0x3FA9999A00000000, float 0x3FA9999A00000000, float 0x3FA9999A00000000, float 0x3FA9999A00000000, float 0x3FA9999A00000000, float 0x3FA9999A00000000, float 0x3FA9999A00000000, float 0x3FA9999A00000000>
+ %9 = select <8 x i1> %2, <8 x float> %8, <8 x float> zeroinitializer
+ %10 = fadd <8 x float> %5, %9
+ %11 = tail call <8 x float> @llvm.maximum.v8f32(<8 x float> %10, <8 x float> <float 0x38754484A0000000, float 0x38754484A0000000, float 0x38754484A0000000, float 0x38754484A0000000, float 0x38754484A0000000, float 0x38754484A0000000, float 0x38754484A0000000, float 0x38754484A0000000>)
+ %12 = tail call i32 @llvm.tpu.rsqrt(<8 x float> %11)
+ %13 = tail call <8 x float> @llvm.tpu.eup.pop(i32 %12)
+ %14 = fmul <8 x float> %13, %10
+ %15 = fadd <8 x float> %14, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
+ %16 = tail call i32 @llvm.tpu.rcp(<8 x float> %15)
+ %17 = tail call <8 x float> @llvm.tpu.eup.pop(i32 %16)
+ %18 = fsub <8 x float> %0, %4
+ %19 = fmul <8 x float> %18, <float 0x3FC3333300000000, float 0x3FC3333300000000, float 0x3FC3333300000000, float 0x3FC3333300000000, float 0x3FC3333300000000, float 0x3FC3333300000000, float 0x3FC3333300000000, float 0x3FC3333300000000>
+ %20 = select <8 x i1> %2, <8 x float> %19, <8 x float> zeroinitializer
+ %21 = fadd <8 x float> %4, %20
+ %22 = fmul <8 x float> %21, <float 0x3FF55551E0000000, float 0x3FF55551E0000000, float 0x3FF55551E0000000, float 0x3FF55551E0000000, float 0x3FF55551E0000000, float 0x3FF55551E0000000, float 0x3FF55551E0000000, float 0x3FF55551E0000000>
+ %23 = fmul <8 x float> %22, %17
+ %24 = fsub <8 x float> %3, %23
+ %.v = select <8 x i1> %2, <8 x float> %24, <8 x float> %3
+ tail call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> %21, <8 x float> addrspace(207)* null, i32 1, i32 1)
+ tail call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> %10, <8 x float> addrspace(207)* null, i32 1, i32 2)
+ tail call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> %.v, <8 x float> addrspace(207)* null, i32 1, i32 0)
+ %25 = tail call i1 @llvm.tpu.bc.loop.end()
+ br i1 %25, label %loop-start, label %loop-out
+
+loop-out: ; preds = %loop-start
+ ret void
+}
+
+; Test that we correctly support fine-grained sub-register tracking. Without it,
+; this kernel cannot be pipelined with a depth of 4 without spilling.
+; CHECK-LABEL: high_pressure:
+; CHECK: loop_start $0x8, $0x3
+define void @high_pressure(<8 x float> addrspace(207)* %0, <8 x float> addrspace(207)* %1, float %2, i32 %3) {
+entry:
+ tail call void @llvm.tpu.bc.loop.start(i32 0)
+ br label %loop-start
+
+loop-start: ; preds = %loop-start, %entry
+ %4 = tail call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* %1, i32 1, i32 0)
+ %5 = tail call <8 x float> @llvm.maximum.v8f32(<8 x float> %4, <8 x float> <float -1.000000e+02, float -1.000000e+02, float -1.000000e+02, float -1.000000e+02, float -1.000000e+02, float -1.000000e+02, float -1.000000e+02, float -1.000000e+02>)
+ %6 = tail call <8 x float> @llvm.minimum.v8f32(<8 x float> %5, <8 x float> <float 1.000000e+02, float 1.000000e+02, float 1.000000e+02, float 1.000000e+02, float 1.000000e+02, float 1.000000e+02, float 1.000000e+02, float 1.000000e+02>)
+ %7 = tail call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* %0, i32 1, i32 0)
+ %8 = tail call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* %0, i32 1, i32 1)
+ %9 = tail call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* %0, i32 1, i32 2)
+ %10 = tail call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* %0, i32 1, i32 3)
+ %11 = fsub <8 x float> %6, %10
+ %12 = fmul <8 x float> %11, <float 0x3FE570A3C0000000, float 0x3FE570A3C0000000, float 0x3FE570A3C0000000, float 0x3FE570A3C0000000, float 0x3FE570A3C0000000, float 0x3FE570A3C0000000, float 0x3FE570A3C0000000, float 0x3FE570A3C0000000>
+ %13 = fadd <8 x float> %10, %12
+ tail call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> %13, <8 x float> addrspace(207)* %0, i32 1, i32 3)
+ %14 = fmul <8 x float> %6, %6
+ %15 = fsub <8 x float> %14, %8
+ %16 = fmul <8 x float> %15, <float 0x3FE570A3C0000000, float 0x3FE570A3C0000000, float 0x3FE570A3C0000000, float 0x3FE570A3C0000000, float 0x3FE570A3C0000000, float 0x3FE570A3C0000000, float 0x3FE570A3C0000000, float 0x3FE570A3C0000000>
+ %17 = fadd <8 x float> %8, %16
+ tail call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> %17, <8 x float> addrspace(207)* %0, i32 1, i32 1)
+ %18 = fmul <8 x float> %13, %13
+ %19 = fsub <8 x float> %17, %18
+ %20 = fadd <8 x float> %19, <float 0x3FA99999A0000000, float 0x3FA99999A0000000, float 0x3FA99999A0000000, float 0x3FA99999A0000000, float 0x3FA99999A0000000, float 0x3FA99999A0000000, float 0x3FA99999A0000000, float 0x3FA99999A0000000>
+ %21 = tail call i32 @llvm.tpu.rsqrt(<8 x float> %20)
+ %22 = tail call <8 x float> @llvm.tpu.eup.pop(i32 %21)
+ %23 = fmul <8 x float> %9, <float 0x3FEB333340000000, float 0x3FEB333340000000, float 0x3FEB333340000000, float 0x3FEB333340000000, float 0x3FEB333340000000, float 0x3FEB333340000000, float 0x3FEB333340000000, float 0x3FEB333340000000>
+ %24 = fmul <8 x float> %6, <float 0x3FF55551E0000000, float 0x3FF55551E0000000, float 0x3FF55551E0000000, float 0x3FF55551E0000000, float 0x3FF55551E0000000, float 0x3FF55551E0000000, float 0x3FF55551E0000000, float 0x3FF55551E0000000>
+ %25 = fmul <8 x float> %24, %22
+ %26 = fadd <8 x float> %23, %25
+ tail call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> %26, <8 x float> addrspace(207)* %0, i32 1, i32 2)
+ %27 = fsub <8 x float> %7, %26
+ tail call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> %27, <8 x float> addrspace(207)* %0, i32 1, i32 0)
+ %28 = tail call i1 @llvm.tpu.bc.loop.end()
+ br i1 %28, label %loop-start, label %loop-out
+
+loop-out: ; preds = %loop-start
+ ret void
+}
+
+; Test a case where we have more than 8 loop-carried dependencies but can
+; merge several registers, as they use disjoint sets of sub-registers.
+; CHECK-LABEL: coalesce_phi:
+; CHECK: loop_start $0xb, $0x3
+define void @coalesce_phi(<8 x float> addrspace(207)* %0, <8 x float> addrspace(207)* nocapture readnone %1, float %2, i32 %3) local_unnamed_addr #0 {
+entry:
+ tail call void @llvm.tpu.bc.loop.start(i32 0)
+ br label %loop-start
+
+loop-start: ; preds = %loop-start, %entry
+ %4 = tail call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* %0, i32 1, i32 3)
+ %5 = bitcast <8 x float> %4 to <8 x i32>
+ %6 = icmp ne <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ tail call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> <float 0x36A0000000000000, float 0x36A0000000000000, float 0x36A0000000000000, float 0x36A0000000000000, float 0x36A0000000000000, float 0x36A0000000000000, float 0x36A0000000000000, float 0x36A0000000000000>, <8 x float> addrspace(207)* %0, i32 1, i32 3)
+ %7 = tail call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* %0, i32 1, i32 0)
+ %8 = tail call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* %0, i32 1, i32 1)
+ %9 = tail call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* %0, i32 1, i32 2)
+ %10 = fmul <8 x float> %4, %4
+ %11 = fsub <8 x float> %10, %8
+ %12 = fmul <8 x float> %11, <float 7.500000e-01, float 7.500000e-01, float 7.500000e-01, float 7.500000e-01, float 7.500000e-01, float 7.500000e-01, float 7.500000e-01, float 7.500000e-01>
+ %13 = select <8 x i1> %6, <8 x float> %12, <8 x float> zeroinitializer
+ %14 = fadd <8 x float> %8, %13
+ tail call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> %14, <8 x float> addrspace(207)* %0, i32 1, i32 1)
+ %15 = fadd <8 x float> %14, <float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000>
+ %16 = fadd <8 x float> %9, <float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000, float 0x3F2A36E2E0000000>
+ %17 = fmul <8 x float> %16, %15
+ %erf = call i32 @llvm.tpu.rsqrt(<8 x float> %17)
+ %18 = call <8 x float> @llvm.tpu.eup.pop(i32 %erf)
+ %19 = fmul <8 x float> %4, %16
+ %20 = fmul <8 x float> %19, %18
+ %21 = fmul <8 x float> %20, %20
+ %22 = fmul <8 x float> %20, <float 0x3FF1DF3B60000000, float 0x3FF1DF3B60000000, float 0x3FF1DF3B60000000, float 0x3FF1DF3B60000000, float 0x3FF1DF3B60000000, float 0x3FF1DF3B60000000, float 0x3FF1DF3B60000000, float 0x3FF1DF3B60000000>
+ %23 = fsub <8 x float> %21, %9
+ %24 = fmul <8 x float> %23, <float 7.500000e-01, float 7.500000e-01, float 7.500000e-01, float 7.500000e-01, float 7.500000e-01, float 7.500000e-01, float 7.500000e-01, float 7.500000e-01>
+ %25 = select <8 x i1> %6, <8 x float> %24, <8 x float> zeroinitializer
+ %26 = fadd <8 x float> %9, %25
+ tail call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> %26, <8 x float> addrspace(207)* %0, i32 1, i32 2)
+ %27 = fsub <8 x float> %7, %22
+ %.v = select <8 x i1> %6, <8 x float> %27, <8 x float> %7
+ tail call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> %.v, <8 x float> addrspace(207)* %0, i32 1, i32 0)
+ %28 = tail call i1 @llvm.tpu.bc.loop.end()
+ br i1 %28, label %loop-start, label %loop-out
+
+loop-out: ; preds = %loop-start
+ ret void
+}
+
+; CHECK-LABEL: push_pop_sw_schedule:
+; CHECK: loop_start $0x7, $0x3
+; CHECK: { _ = vnop }
+; CHECK-NOT: { _ = vnop }
+; CHECK: v{{[0-9]+}} = vpop ps:$2 (erf)
+; CHECK: { (erf) = vrsqrt.f32 ps:$1 v{{[0-9]+}}
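+; That is, in the pipelined kernel the vpop of the previous iteration's rsqrt (ps:$2) is
+; expected to issue before the next vrsqrt push (ps:$1), with only a single vnop bundle
+; in between.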
+define void @push_pop_sw_schedule(<8 x float> addrspace(207)* %0, <8 x float> addrspace(207)* %1, float %2, i32 %3) local_unnamed_addr #0 {
+entry:
+ tail call void @llvm.tpu.bc.loop.start(i32 0)
+ br label %loop-start
+
+loop-start:
+ %4 = tail call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* %1, i32 1, i32 0)
+ %5 = tail call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* %0, i32 1, i32 0)
+ %6 = tail call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* %0, i32 1, i32 1)
+ %7 = tail call <8 x float> @llvm.tpu.bc.load.aliaddr.flm(<8 x float> addrspace(207)* %0, i32 1, i32 2)
+ %8 = fmul <8 x float> %4, %4
+ %9 = fsub <8 x float> %8, %6
+ %10 = fmul <8 x float> %9, <float 0x3FE6666660000000, float 0x3FE6666660000000, float 0x3FE6666660000000, float 0x3FE6666660000000, float 0x3FE6666660000000, float 0x3FE6666660000000, float 0x3FE6666660000000, float 0x3FE6666660000000>
+ %11 = fadd <8 x float> %6, %10
+ tail call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> %11, <8 x float> addrspace(207)* %0, i32 1, i32 1)
+ %12 = fadd <8 x float> %11, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
+ %13 = tail call <8 x float> @llvm.tpu.rsqrt.macro.v8f32(<8 x float> %12)
+ %14 = fmul <8 x float> %7, <float 0x3FE99999A0000000, float 0x3FE99999A0000000, float 0x3FE99999A0000000, float 0x3FE99999A0000000, float 0x3FE99999A0000000, float 0x3FE99999A0000000, float 0x3FE99999A0000000, float 0x3FE99999A0000000>
+ %15 = fmul <8 x float> %4, <float 0x3FF55551E0000000, float 0x3FF55551E0000000, float 0x3FF55551E0000000, float 0x3FF55551E0000000, float 0x3FF55551E0000000, float 0x3FF55551E0000000, float 0x3FF55551E0000000, float 0x3FF55551E0000000>
+ %16 = fmul <8 x float> %15, %13
+ %17 = fadd <8 x float> %14, %16
+ tail call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> %17, <8 x float> addrspace(207)* %0, i32 1, i32 2)
+ %18 = fsub <8 x float> %5, %17
+ tail call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> %18, <8 x float> addrspace(207)* %0, i32 1, i32 0)
+ %19 = tail call i1 @llvm.tpu.bc.loop.end()
+ br i1 %19, label %loop-start, label %loop-out
+
+loop-out:
+ ret void
+}
+
+; Tests whether the llvm.tpu.vrot.sublane.down intrinsic works on Barnacore.
+; CHECK-LABEL: test_vrot_sublane_down:
+; CHECK: { v0 = vrot.slane.down v0 }
+; CHECK: { _ = vnop }
+; CHECK: { [bmem:s0] = vst.f32 v0 ali_addr:$0x0;
+define void @test_vrot_sublane_down(<8 x float> addrspace(207)* %a, <8 x i32> %t) {
+ %1 = tail call <8 x i32> @llvm.tpu.vrot.sublane.down.v8i32(<8 x i32> %t)
+ %2 = bitcast <8 x i32> %1 to <8 x float>
+ tail call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> %2, <8 x float> addrspace(207)* %a, i32 0, i32 0)
+ ret void
+}
+
+; Tests whether the llvm.tpu.vlaneseq intrinsic works on Barnacore.
+; CHECK-LABEL: test_vlaneseq:
+; CHECK: { v0 = vlaneseq.u32 }
+; CHECK: { [bmem:s0] = vst.f32 v0 ali_addr:$0x0;
+define void @test_vlaneseq(<8 x float> addrspace(207)* %a, <8 x i32> %t) {
+ %1 = tail call <8 x i32> @llvm.tpu.vlaneseq()
+ %2 = bitcast <8 x i32> %1 to <8 x float>
+ tail call void @llvm.tpu.bc.store.aliaddr.flm(<8 x float> %2, <8 x float> addrspace(207)* %a, i32 0, i32 0)
+ ret void
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/xlu.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/xlu.ll
new file mode 100644
index 0000000..6440d0e
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/xlu.ll
@@ -0,0 +1,550 @@
+; RUN: llc < %s -mcpu=tensorcore-pf -asm-verbose=false -disable-cgp -tpu-use-fifo-sched=false | FileCheck %s
+; REQUIRES: tpu
+
+; Test XLU instruction code generation and latency. Disable FIFO re-ordering,
+; as we want to test the latency between different sets of XLU operations
+; without re-ordering.
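+; In the CHECK lines, the '_ = vdelay $N' bundles are the scheduler-inserted gaps between
+; pushing into an XLU result FIFO (trf0/trf1) and popping it, so the vdelay immediates are
+; effectively the latencies being verified.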
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32>, i32, i32, i32, i32)
+declare i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32>, i32, i32, i32, i32)
+declare i32 @llvm.tpu.tc.transpose.v1024f32(<1024 x float>, i32, i32, i32, i32)
+declare i32 @llvm.tpu.tc.transpose.end.v1024f32(<1024 x float>, i32, i32, i32, i32)
+declare i32 @llvm.tpu.tc.transpose.segmented.v1024i32(<1024 x i32>, i32, i32, i32, i32)
+declare i32 @llvm.tpu.tc.transpose.end.segmented.v1024i32(<1024 x i32>, i32, i32, i32, i32)
+declare i32 @llvm.tpu.tc.transpose.packed.v1024i32(<1024 x i32>, <1024 x i32>, i32, i32, i32, i32)
+declare i32 @llvm.tpu.tc.transpose.end.packed.v1024i32(<1024 x i32>, <1024 x i32>, i32, i32, i32, i32)
+declare i32 @llvm.tpu.tc.transpose.segmented.packed.v1024i32(<1024 x i32>, <1024 x i32>, i32, i32, i32, i32)
+declare i32 @llvm.tpu.tc.transpose.end.segmented.packed.v1024i32(<1024 x i32>, <1024 x i32>, i32, i32, i32, i32)
+
+declare <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32, i32)
+declare <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32, i32)
+
+declare i32 @llvm.tpu.vrotate.v1024i32(<1024 x i32>, i32, i32)
+declare i32 @llvm.tpu.vrotate.v1024f32(<1024 x float>, i32, i32)
+declare i32 @llvm.tpu.vrotate.packed.v1024i32(<1024 x i32>, <1024 x i32>, i32, i32)
+declare i32 @llvm.tpu.vrotate.packed.v1024f32(<1024 x float>, <1024 x float>, i32, i32)
+
+declare i32 @llvm.tpu.xlane.add(<1024 x float>, i32)
+declare i32 @llvm.tpu.xlane.max(<1024 x float>, i32)
+declare i32 @llvm.tpu.xlane.min(<1024 x float>, i32)
+declare i32 @llvm.tpu.xlane.maxindex(<1024 x float>, i32)
+declare i32 @llvm.tpu.xlane.minindex(<1024 x float>, i32)
+declare i32 @llvm.tpu.xlane.segmented.add(<1024 x float>, i32, i32)
+
+declare i32 @llvm.tpu.set.permute(<1024 x i32> , i32 )
+declare i32 @llvm.tpu.set.spr(<1024 x i32> , i32 )
+declare i32 @llvm.tpu.set.permute.sublane(<1024 x i32> , i32 )
+declare i32 @llvm.tpu.set.permute.bytes(<1024 x i32> , i32 )
+declare i32 @llvm.tpu.permute.v1024i32(<1024 x i32>, i32, i32)
+declare i32 @llvm.tpu.permute.v1024f32(<1024 x float>, i32, i32)
+declare i32 @llvm.tpu.permute.packed.v1024i32(<1024 x i32>, <1024 x i32>, i32, i32)
+declare i32 @llvm.tpu.permute.packed.v1024f32(<1024 x float>, <1024 x float>, i32, i32)
+
+
+; CHECK-LABEL: transposerb0:
+; CHECK: (trf0) = vxpose.0 v0, $0x8
+ define void @transposerb0(<1024 x i32> %v) {
+ %xlu = call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 8, i32 8, i32 0, i32 undef)
+ ret void
+}
+
+; CHECK-LABEL: transposerb1:
+; CHECK: (trf1) = vxpose.1 v0, $0x8
+ define void @transposerb1(<1024 x i32> %v) {
+ %xlu = call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 8, i32 8, i32 1, i32 undef)
+ ret void
+}
+
+; CHECK-LABEL: transposerb2:
+; CHECK: (trf0) = vxpose.2 v0, $0x8
+ define void @transposerb2(<1024 x i32> %v, i32 %width) {
+ %xlu = call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 8, i32 8, i32 2, i32 undef)
+ ret void
+}
+
+; CHECK-LABEL: transposerb3:
+; CHECK: (trf1) = vxpose.3 v0, $0x8
+ define void @transposerb3(<1024 x i32> %v, i32 %width) {
+ %xlu = call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 8, i32 8, i32 3, i32 undef)
+ ret void
+}
+
+; CHECK-LABEL: transposer_end:
+; CHECK: (trf1) = vxpose.3.end v0, $0x8
+; CHECK: _ = vdelay $0x7c
+; CHECK: v0 = vpop (trf1)
+ define <1024 x i32> @transposer_end(<1024 x i32> %v) {
+ %xlu = call i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32> %v, i32 8, i32 8, i32 3, i32 undef)
+ %res = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu)
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: full_transpose_W1:
+; CHECK: (trf1) = vxpose.3 v0, $0x8
+; CHECK: _ = vdelay $0x7
+; CHECK: (trf1) = vxpose.3.end v0, $0x8
+; CHECK: _ = vdelay $0x74
+; CHECK: v0 = vpop (trf1)
+ define <1024 x i32> @full_transpose_W1(<1024 x i32> %v) {
+ %xlu = call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v, i32 8, i32 16, i32 3, i32 undef)
+ %xlu1 = call i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32> %v, i32 8, i32 16, i32 3, i32 %xlu)
+ %res = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: full_transpose_W1_f:
+; CHECK: (trf1) = vxpose.3 v0, $0x8
+; CHECK: _ = vdelay $0x7
+; CHECK: (trf1) = vxpose.3.end v0, $0x8
+; CHECK: _ = vdelay $0x74
+; CHECK: v0 = vpop (trf1)
+ define <1024 x float> @full_transpose_W1_f(<1024 x float> %v) {
+ %xlu = call i32 @llvm.tpu.tc.transpose.v1024f32(<1024 x float> %v, i32 8, i32 16, i32 3, i32 undef)
+ %xlu1 = call i32 @llvm.tpu.tc.transpose.end.v1024f32(<1024 x float> %v, i32 8, i32 16, i32 3, i32 %xlu)
+ %res = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 1, i32 %xlu1)
+ ret <1024 x float> %res
+}
+
+; Transpose with a width of 16
+; CHECK-LABEL: full_transpose_W2:
+; CHECK: (trf1) = vxpose.3 v0, $0x10
+; CHECK: _ = vdelay $0x7
+; CHECK: (trf1) = vxpose.3.end v1, $0x10
+; CHECK: _ = vdelay $0x74
+; CHECK: v{{[0-9]+}} = vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: v{{[0-9]+}} = vpop (trf1)
+ define <1024 x i32> @full_transpose_W2(<1024 x i32> %v0, <1024 x i32> %v1) {
+ %xlu = call i32 @llvm.tpu.tc.transpose.v1024i32(<1024 x i32> %v0, i32 16, i32 16, i32 3, i32 undef)
+ %xlu1 = call i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32> %v1, i32 16, i32 16, i32 3, i32 %xlu)
+ %res = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ %res1 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ %res2 = add <1024 x i32> %res, %res1
+ ret <1024 x i32> %res2
+}
+
+; CHECK-LABEL: rotate:
+; CHECK: (trf1) = vrot.3 v0, s0
+; CHECK: _ = vdelay $0x44
+; CHECK: v0 = vpop (trf1)
+ define <1024 x i32> @rotate(<1024 x i32> %v, i32 %amount) {
+ %xlu = call i32 @llvm.tpu.vrotate.v1024i32(<1024 x i32> %v, i32 %amount, i32 3)
+ %res = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu)
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: rotate_i:
+; CHECK: (trf1) = vrot.3 v0, $0x5
+; CHECK: _ = vdelay $0x44
+; CHECK: v0 = vpop (trf1)
+ define <1024 x i32> @rotate_i(<1024 x i32> %v) {
+ %xlu = call i32 @llvm.tpu.vrotate.v1024i32(<1024 x i32> %v, i32 5, i32 3)
+ %res = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu)
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: rotate_selecting_imm_vs_reg_versions:
+; CHECK: (trf0) = vrot.0 v0, $0x4444
+; CHECK: s[[x:[0-9]+]] = simm.s32 $0x44444
+; CHECK: (trf0) = vrot.0 v0, s[[x]]
+; CHECK: v{{[0-9]+}} = vpop (trf0)
+; CHECK: v{{[0-9]+}} = vpop (trf0)
+define <1024 x i32> @rotate_selecting_imm_vs_reg_versions(<1024 x i32> %v) {
+ ; constant 17476 fits in 16 bits; hence the immediate version of the rotate
+ ; instruction is selected.
+ %xlu0 = call i32 @llvm.tpu.vrotate.v1024i32(<1024 x i32> %v, i32 17476, i32 0)
+ %res0 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu0)
+
+ ; constant 279620 does not fit in 16 bits; hence the register version of the
+ ; rotate instruction is selected.
+ %xlu1 = call i32 @llvm.tpu.vrotate.v1024i32(<1024 x i32> %v, i32 279620, i32 0)
+ %res1 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu1)
+
+ %r = add <1024 x i32> %res0, %res1
+
+ ret <1024 x i32> %r
+}
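
For reference, a minimal standalone sketch of the fits-in-16-bits check described in the two comments above. fitsInSImm16 is a hypothetical helper (not the backend's actual predicate) and assumes a signed 16-bit immediate field; both example constants behave the same under an unsigned reading, so the sketch only illustrates why 17476 (0x4444) takes the immediate form while 279620 (0x44444) is materialized through simm.s32 and the register form.

// Hypothetical sketch of the immediate-vs-register selection described above;
// illustrative arithmetic only, not the real instruction-selection code.
#include <cassert>
#include <cstdint>

static bool fitsInSImm16(int64_t v) { return v >= -32768 && v <= 32767; }

int main() {
  assert(fitsInSImm16(17476));    // 0x4444  -> immediate form of vrot
  assert(!fitsInSImm16(279620));  // 0x44444 -> register form via simm.s32
  return 0;
}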
+
+; CHECK-LABEL: rotate_f:
+; CHECK: (trf1) = vrot.3 v0, s0
+; CHECK: _ = vdelay $0x44
+; CHECK: v0 = vpop (trf1)
+ define <1024 x float> @rotate_f(<1024 x float> %v, i32 %amount) {
+ %xlu = call i32 @llvm.tpu.vrotate.v1024f32(<1024 x float> %v, i32 %amount, i32 3)
+ %res = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 1, i32 %xlu)
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: rotate_f_i:
+; CHECK: (trf1) = vrot.3 v0, $0x5
+; CHECK: _ = vdelay $0x44
+; CHECK: v0 = vpop (trf1)
+ define <1024 x float> @rotate_f_i(<1024 x float> %v) {
+ %xlu = call i32 @llvm.tpu.vrotate.v1024f32(<1024 x float> %v, i32 5, i32 3)
+ %res = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 1, i32 %xlu)
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: xlaneadd:
+; CHECK: (trf1) = vadd.xlane.3 v0
+; CHECK: _ = vdelay $0x4e
+; CHECK: v0 = vpop (trf1)
+ define <1024 x i32> @xlaneadd(<1024 x float> %v) {
+ %xlu = call i32 @llvm.tpu.xlane.add(<1024 x float> %v, i32 3)
+ %res = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu)
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: xlanemax:
+; CHECK: (trf1) = vmax.xlane.3 v0
+; CHECK: _ = vdelay $0x4e
+; CHECK: v0 = vpop (trf1)
+ define <1024 x i32> @xlanemax(<1024 x float> %v) {
+ %xlu = call i32 @llvm.tpu.xlane.max(<1024 x float> %v, i32 3)
+ %res = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu)
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: xlanemin:
+; CHECK: (trf1) = vmin.xlane.3 v0
+; CHECK: _ = vdelay $0x4e
+; CHECK: v0 = vpop (trf1)
+ define <1024 x i32> @xlanemin(<1024 x float> %v) {
+ %xlu = call i32 @llvm.tpu.xlane.min(<1024 x float> %v, i32 3)
+ %res = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu)
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: xlanemaxindex:
+; CHECK: (trf1) = vmax.index.xlane.3 v0
+; CHECK: _ = vdelay $0x4e
+; CHECK: v0 = vpop (trf1)
+ define <1024 x i32> @xlanemaxindex(<1024 x float> %v) {
+ %xlu = call i32 @llvm.tpu.xlane.maxindex(<1024 x float> %v, i32 3)
+ %res = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu)
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: xlaneminindex:
+; CHECK: (trf1) = vmin.index.xlane.3 v0
+; CHECK: _ = vdelay $0x4e
+; CHECK: v0 = vpop (trf1)
+ define <1024 x i32> @xlaneminindex(<1024 x float> %v) {
+ %xlu = call i32 @llvm.tpu.xlane.minindex(<1024 x float> %v, i32 3)
+ %res = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu)
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: permute:
+; CHECK: (pcr1) = vsetperm.3.u8 v1
+; CHECK: (trf1) = vperm.3 v0
+; CHECK: _ = vdelay $0x44
+; CHECK: v0 = vpop (trf1)
+ define <1024 x i32> @permute(<1024 x i32> %v, <1024 x i32> %perm) {
+ %pcr = call i32 @llvm.tpu.set.permute(<1024 x i32> %perm, i32 3)
+ %xlu = call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %v, i32 %pcr, i32 3)
+ %res = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu)
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: permute_sublane:
+; CHECK: (pcr1) = vsetperm.3.all.u8 v1
+; CHECK: _ = vdelay $0x7
+; CHECK: (trf1) = vperm.3 v0
+; CHECK: _ = vdelay $0x44
+; CHECK: v0 = vpop (trf1)
+ define <1024 x i32> @permute_sublane(<1024 x i32> %v, <1024 x i32> %perm) {
+ %pcr = call i32 @llvm.tpu.set.permute.sublane(<1024 x i32> %perm, i32 3)
+ %xlu = call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %v, i32 %pcr, i32 3)
+ %res = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu)
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: permute_byte:
+; CHECK: (pcr1) = vsetperm.3.all.bytes.u32 v1
+; CHECK: _ = vdelay $0x7
+; CHECK: (trf1) = vperm.3 v0
+; CHECK: _ = vdelay $0x44
+; CHECK: v0 = vpop (trf1)
+ define <1024 x i32> @permute_byte(<1024 x i32> %v, <1024 x i32> %perm) {
+ %pcr = call i32 @llvm.tpu.set.permute.bytes(<1024 x i32> %perm, i32 3)
+ %xlu = call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %v, i32 %pcr, i32 3)
+ %res = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu)
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: permute_f:
+; CHECK: (pcr1) = vsetperm.3.u8 v1
+; CHECK: (trf1) = vperm.3 v0
+; CHECK: _ = vdelay $0x44
+; CHECK: v0 = vpop (trf1)
+ define <1024 x float> @permute_f(<1024 x float> %v, <1024 x i32> %perm) {
+ %pcr = call i32 @llvm.tpu.set.permute(<1024 x i32> %perm, i32 3)
+ %xlu = call i32 @llvm.tpu.permute.v1024f32(<1024 x float> %v, i32 %pcr, i32 3)
+ %res = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 1, i32 %xlu)
+ ret <1024 x float> %res
+}
+
+; Check that we have the right latency between transpose_end and the rotate.
+; CHECK-LABEL: transposer_to_rotate:
+; CHECK: (trf1) = vxpose.3.end v0, $0x8
+; CHECK: _ = vdelay $0x67
+; CHECK: (trf1) = vrot.3 v0, $0x5
+; CHECK: _ = vdelay $0x3c
+; CHECK: v{{[0-9]+}} = vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: v{{[0-9]+}} = vpop (trf1)
+ define <1024 x i32> @transposer_to_rotate(<1024 x i32> %v) {
+ %xlu = call i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32> %v, i32 8, i32 8, i32 3, i32 undef)
+ %res1 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu)
+ %xlu2 = call i32 @llvm.tpu.vrotate.v1024i32(<1024 x i32> %v, i32 5, i32 3)
+ %res2 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu2)
+ %res = add <1024 x i32> %res1, %res2
+ ret <1024 x i32> %res
+}
+
+; Check that we have the right latency between transpose_end and the reduce.
+; CHECK-LABEL: transposer_to_reduce:
+; CHECK: (trf1) = vxpose.3.end v0, $0x8
+; CHECK: _ = vdelay $0x5d
+; CHECK: (trf1) = vadd.xlane.3 v0
+; CHECK: _ = vdelay $0x46
+; CHECK: v{{[0-9]+}} = vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: v{{[0-9]+}} = vpop (trf1)
+ define <1024 x float> @transposer_to_reduce(<1024 x float> %v) {
+ %xlu = call i32 @llvm.tpu.tc.transpose.end.v1024f32(<1024 x float> %v, i32 8, i32 8, i32 3, i32 undef)
+ %res1 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 1, i32 %xlu)
+ %xlu2 = call i32 @llvm.tpu.xlane.add(<1024 x float> %v, i32 3)
+ %res2 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 1, i32 %xlu2)
+ %res = fadd <1024 x float> %res1, %res2
+ ret <1024 x float> %res
+}
+
+; Check that we have the right latency between rotate and transpose_end.
+; CHECK-LABEL: rotate_to_transpose:
+; CHECK: (trf1) = vrot.3 v0, $0x5
+; CHECK: _ = vdelay $0x2e
+; CHECK: (trf1) = vxpose.3.end v0, $0x8
+; CHECK: _ = vdelay $0x74
+; CHECK: v{{[0-9]+}} = vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: v{{[0-9]+}} = vpop (trf1)
+ define <1024 x i32> @rotate_to_transpose(<1024 x i32> %v) {
+ %xlu2 = call i32 @llvm.tpu.vrotate.v1024i32(<1024 x i32> %v, i32 5, i32 3)
+ %res2 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu2)
+ %xlu = call i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32> %v, i32 8, i32 8, i32 3, i32 undef)
+ %res1 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu)
+ %res = add <1024 x i32> %res1, %res2
+ ret <1024 x i32> %res
+}
+
+; Check that we have the right latency between reduce and transpose_end.
+; CHECK-LABEL: reduce_to_transpose:
+; CHECK: (trf1) = vadd.xlane.3 v0
+; CHECK: _ = vdelay $0x38
+; CHECK: (trf1) = vxpose.3.end v0, $0x8
+; CHECK: _ = vdelay $0x74
+; CHECK: v{{[0-9]+}} = vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: v{{[0-9]+}} = vpop (trf1)
+ define <1024 x float> @reduce_to_transpose(<1024 x float> %v) {
+ %xlu2 = call i32 @llvm.tpu.xlane.add(<1024 x float> %v, i32 3)
+ %res2 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 1, i32 %xlu2)
+ %xlu = call i32 @llvm.tpu.tc.transpose.end.v1024f32(<1024 x float> %v, i32 8, i32 8, i32 3, i32 undef)
+ %res1 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 1, i32 %xlu)
+ %res = fadd <1024 x float> %res1, %res2
+ ret <1024 x float> %res
+}
+
+; Check that we have the right latency between reduce and permute.
+; CHECK-LABEL: reduce_to_permute:
+; CHECK: (trf1) = vadd.xlane.3 v0
+; CHECK: _ = vdelay $0x11
+; CHECK: (trf1) = vrot.3 v0, $0x5
+; CHECK: _ = vdelay $0x3c
+; CHECK: v{{[0-9]+}} = vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: v{{[0-9]+}} = vpop (trf1)
+ define <1024 x float> @reduce_to_permute(<1024 x float> %v) {
+ %xlu = call i32 @llvm.tpu.xlane.add(<1024 x float> %v, i32 3)
+ %res1 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 1, i32 %xlu)
+ %xlu2 = call i32 @llvm.tpu.vrotate.v1024f32(<1024 x float> %v, i32 5, i32 3)
+ %res2 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 1, i32 %xlu2)
+ %res = fadd <1024 x float> %res1, %res2
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: xlaneadd_segmented:
+; CHECK: (trf1) = vadd.xlane.3.seg.perm v0
+; CHECK: _ = vdelay $0x4e
+; CHECK: v0 = vpop (trf1)
+ define <1024 x float> @xlaneadd_segmented(<1024 x float> %v, <1024 x i32> %s) {
+ %spr = call i32 @llvm.tpu.set.spr(<1024 x i32> %s, i32 3)
+ %xlu = call i32 @llvm.tpu.xlane.segmented.add(<1024 x float> %v, i32 %spr, i32 3)
+ %res = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 1, i32 %xlu)
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: permute_packed:
+; CHECK: (pcr1) = vsetperm.3.u8 v2
+; CHECK: (trf1) = vperm.3.packed v0
+; CHECK: _ = vsupp v1
+; CHECK: _ = vdelay $0x4c
+; CHECK: v0 = vpop (trf1)
+ define <1024 x i32> @permute_packed(<1024 x i32> %vlow, <1024 x i32> %vhigh, <1024 x i32> %perm) {
+ %pcr = call i32 @llvm.tpu.set.permute(<1024 x i32> %perm, i32 3)
+ %xlu = call i32 @llvm.tpu.permute.packed.v1024i32(<1024 x i32> %vlow, <1024 x i32> %vhigh, i32 %pcr, i32 3)
+ %res = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu)
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: permute_packed_f:
+; CHECK: (pcr1) = vsetperm.3.u8 v2
+; CHECK: (trf1) = vperm.3.packed v0
+; CHECK: _ = vsupp v1
+; CHECK: _ = vdelay $0x4c
+; CHECK: v0 = vpop (trf1)
+ define <1024 x float> @permute_packed_f(<1024 x float> %vlow, <1024 x float> %vhigh, <1024 x i32> %perm) {
+ %pcr = call i32 @llvm.tpu.set.permute(<1024 x i32> %perm, i32 3)
+ %xlu = call i32 @llvm.tpu.permute.packed.v1024f32(<1024 x float> %vlow, <1024 x float> %vhigh, i32 %pcr, i32 3)
+ %res = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 1, i32 %xlu)
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: rotate_packed:
+; CHECK: (trf1) = vrot.3.packed v0, s0
+; CHECK: _ = vsupp v1
+; CHECK: _ = vdelay $0x4c
+; CHECK: v0 = vpop (trf1)
+ define <1024 x i32> @rotate_packed(<1024 x i32> %vlow, <1024 x i32> %vhigh, i32 %amount) {
+ %xlu = call i32 @llvm.tpu.vrotate.packed.v1024i32(<1024 x i32> %vlow, <1024 x i32> %vhigh, i32 %amount, i32 3)
+ %res = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu)
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: rotate_packed_i:
+; CHECK: (trf1) = vrot.3.packed v0, $0x5
+; CHECK: _ = vsupp v1
+; CHECK: _ = vdelay $0x4c
+; CHECK: v0 = vpop (trf1)
+ define <1024 x i32> @rotate_packed_i(<1024 x i32> %vlow, <1024 x i32> %vhigh) {
+ %xlu = call i32 @llvm.tpu.vrotate.packed.v1024i32(<1024 x i32> %vlow, <1024 x i32> %vhigh, i32 5, i32 3)
+ %res = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu)
+ ret <1024 x i32> %res
+}
+
+
+; Segmented transpose always pushes 16 vectors into the FIFO.
+; CHECK-LABEL: full_transpose_segmented:
+; CHECK: (trf1) = vsxpose.3 v0, $0x10
+; CHECK: _ = vdelay $0x7
+; CHECK: (trf1) = vsxpose.3.end v0, $0x10
+; CHECK: _ = vdelay $0x74
+; CHECK: vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: vpop (trf1)
+ define <1024 x i32> @full_transpose_segmented(<1024 x i32> %v) {
+ %xlu = call i32 @llvm.tpu.tc.transpose.segmented.v1024i32(<1024 x i32> %v, i32 16, i32 16, i32 3, i32 undef)
+ %xlu1 = call i32 @llvm.tpu.tc.transpose.end.segmented.v1024i32(<1024 x i32> %v, i32 16, i32 16, i32 3, i32 %xlu)
+ %res1 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ %res2 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ %res3 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ %res4 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ %res5 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ %res6 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ %res7 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ %res8 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ %res9 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ %res10 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ %res11 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ %res12 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ %res13 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ %res14 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ %res15 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ %res16 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ ret <1024 x i32> %res16
+}
+
+; CHECK-LABEL: full_transpose_packed:
+; CHECK: (trf1) = vxpose.3.packed v0, $0x10
+; CHECK: _ = vsupp v1
+; CHECK: _ = vdelay $0xf
+; CHECK: (trf1) = vxpose.3.packed.end v0, $0x10
+; CHECK: _ = vsupp v1
+; CHECK: _ = vdelay $0x84
+; CHECK: v0 = vpop (trf1)
+ define <1024 x i32> @full_transpose_packed(<1024 x i32> %vlow, <1024 x i32> %vhigh) {
+ %xlu = call i32 @llvm.tpu.tc.transpose.packed.v1024i32(<1024 x i32> %vlow, <1024 x i32> %vhigh, i32 16, i32 16, i32 3, i32 undef)
+ %xlu1 = call i32 @llvm.tpu.tc.transpose.end.packed.v1024i32(<1024 x i32> %vlow, <1024 x i32> %vhigh, i32 16, i32 16, i32 3, i32 %xlu)
+ %res = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ ret <1024 x i32> %res
+}
+
+; Packed segmented transpose always pushes 8 vectors into the FIFO.
+; CHECK-LABEL: full_transpose_segmented_packed:
+; CHECK: (trf1) = vsxpose.3.packed v0, $0x10
+; CHECK: _ = vsupp v1
+; CHECK: _ = vdelay $0xf
+; CHECK: (trf1) = vsxpose.3.packed.end v0, $0x10
+; CHECK: _ = vsupp v1
+; CHECK: _ = vdelay $0x84
+; CHECK: vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: vpop (trf1)
+; CHECK: _ = vdelay $0x7
+; CHECK: vpop (trf1)
+ define <1024 x i32> @full_transpose_segmented_packed(<1024 x i32> %vlow, <1024 x i32> %vhigh) {
+ %xlu = call i32 @llvm.tpu.tc.transpose.segmented.packed.v1024i32(<1024 x i32> %vlow, <1024 x i32> %vhigh, i32 16, i32 16, i32 3, i32 undef)
+ %xlu1 = call i32 @llvm.tpu.tc.transpose.end.segmented.packed.v1024i32(<1024 x i32> %vlow, <1024 x i32> %vhigh, i32 16, i32 16, i32 3, i32 %xlu)
+ %res1 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ %res2 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ %res3 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ %res4 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ %res5 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ %res6 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ %res7 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ %res8 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ ret <1024 x i32> %res8
+}
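
A quick cross-check of the two tests above and the earlier full-transpose tests: the number of vtrfpop calls in each test matches the comments (segmented transposes always push 16 vectors, packed segmented transposes always push 8), while the plain and packed variants appear to push one vector per 8 columns of width (per 16 when packed). The helper below is purely a reading of these test bodies, not a real backend API, and the width-based cases are an inference from the W1/W2/packed tests rather than a documented rule.

// Hypothetical bookkeeping helper mirroring how many results the tests above
// pop from the XLU result FIFO. Inferred from the test bodies only.
#include <cassert>

enum class TransposeKind { Plain, Packed, Segmented, SegmentedPacked };

static int expectedPops(TransposeKind kind, int width) {
  switch (kind) {
    case TransposeKind::Segmented:       return 16;          // always 16 vectors
    case TransposeKind::SegmentedPacked: return 8;           // always 8 vectors
    case TransposeKind::Packed:          return width / 16;  // inferred
    case TransposeKind::Plain:           return width / 8;   // inferred
  }
  return 0;
}

int main() {
  assert(expectedPops(TransposeKind::Plain, 8) == 1);             // full_transpose_W1
  assert(expectedPops(TransposeKind::Plain, 16) == 2);            // full_transpose_W2
  assert(expectedPops(TransposeKind::Packed, 16) == 1);           // full_transpose_packed
  assert(expectedPops(TransposeKind::Segmented, 16) == 16);       // full_transpose_segmented
  assert(expectedPops(TransposeKind::SegmentedPacked, 16) == 8);  // full_transpose_segmented_packed
  return 0;
}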
diff --git a/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/xlu_vf.ll b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/xlu_vf.ll
new file mode 100644
index 0000000..4477b0d
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/CodeGen/GoogleTPU/xlu_vf.ll
@@ -0,0 +1,1187 @@
+; RUN: llc < %s -mcpu=tensorcore-vf -asm-verbose=false -disable-cgp -tpu-use-fifo-sched=false --enable-tpu-xlu-opt=false | FileCheck %s
+; REQUIRES: tpu
+
+; Test code generation and latency for the XLU instructions. FIFO re-ordering
+; is disabled because we want to test the latency between different sets of
+; XLU operations without re-ordering.
+
+target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+target triple = "googletpu"
+
+declare i32 @llvm.tpu.tc.transpose.start.v1024i32(<1024 x i32>, i32, i32, i32, i32)
+declare i32 @llvm.tpu.tc.transpose.start.v1024f32(<1024 x float>, i32, i32, i32, i32)
+declare i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32>, i32, i32, i32, i32)
+declare i32 @llvm.tpu.tc.transpose.end.v1024f32(<1024 x float>, i32, i32, i32, i32)
+declare i32 @llvm.tpu.tc.transpose.start.end.v1024i32(<1024 x i32>, i32, i32, i32, i32)
+declare i32 @llvm.tpu.tc.transpose.start.end.v1024f32(<1024 x float>, i32, i32, i32, i32)
+
+declare i32 @llvm.tpu.tc.transpose.segmented.v1024i32(<1024 x i32>, i32, i32, i32, i32)
+declare i32 @llvm.tpu.tc.transpose.end.segmented.v1024i32(<1024 x i32>, i32, i32, i32, i32)
+declare i32 @llvm.tpu.tc.transpose.start.packed.v1024i32(<1024 x i32>, <1024 x i32>, i32, i32, i32, i32)
+declare i32 @llvm.tpu.tc.transpose.end.packed.v1024i32(<1024 x i32>, <1024 x i32>, i32, i32, i32, i32)
+declare i32 @llvm.tpu.tc.transpose.segmented.packed.v1024i32(<1024 x i32>, <1024 x i32>, i32, i32, i32, i32)
+declare i32 @llvm.tpu.tc.transpose.end.segmented.packed.v1024i32(<1024 x i32>, <1024 x i32>, i32, i32, i32, i32)
+
+declare <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32, i32)
+declare <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32, i32)
+
+declare i32 @llvm.tpu.vrotate.v1024i32(<1024 x i32>, i32, i32)
+declare i32 @llvm.tpu.vrotate.v1024f32(<1024 x float>, i32, i32)
+declare i32 @llvm.tpu.vrotate.packed.v1024i32(<1024 x i32>, <1024 x i32>, i32, i32)
+declare i32 @llvm.tpu.vrotate.packed.v1024f32(<1024 x float>, <1024 x float>, i32, i32)
+
+declare i32 @llvm.tpu.xlane.add(<1024 x float>, i32)
+declare i32 @llvm.tpu.xlane.max(<1024 x float>, i32)
+declare i32 @llvm.tpu.xlane.min(<1024 x float>, i32)
+declare i32 @llvm.tpu.xlane.maxindex(<1024 x float>, i32)
+declare i32 @llvm.tpu.xlane.minindex(<1024 x float>, i32)
+
+declare i32 @llvm.tpu.set.permute(<1024 x i32>, i32)
+declare i32 @llvm.tpu.set.permute.sublane(<1024 x i32>, i32)
+declare i32 @llvm.tpu.set.permute.bytes(<1024 x i32>, i32)
+declare i32 @llvm.tpu.permute.v1024i32(<1024 x i32>, i32, i32)
+declare i32 @llvm.tpu.permute.v1024f32(<1024 x float>, i32, i32)
+declare i32 @llvm.tpu.permute.packed.v1024i32(<1024 x i32>, <1024 x i32>, i32, i32)
+declare i32 @llvm.tpu.permute.packed.v1024f32(<1024 x float>, <1024 x float>, i32, i32)
+
+
+; CHECK-LABEL: transposer_start:
+; CHECK: { (trf0) = vxpose.start v[[x:[0-9]+]], $0x8 }
+; CHECK: { (trf1) = vxpose.start v[[x]], $0x8
+; CHECK: (trf2) = vxpose.start v[[x]], $0x8
+; CHECK: _ = shalt }
+ define void @transposer_start(<1024 x i32> %vi) {
+ %xlu0i = call i32 @llvm.tpu.tc.transpose.start.v1024i32(<1024 x i32> %vi, i32 8, i32 8, i32 0, i32 undef)
+ %xlu1i = call i32 @llvm.tpu.tc.transpose.start.v1024i32(<1024 x i32> %vi, i32 8, i32 8, i32 1, i32 undef)
+ %xlu2i = call i32 @llvm.tpu.tc.transpose.start.v1024i32(<1024 x i32> %vi, i32 8, i32 8, i32 2, i32 undef)
+ ret void
+}
+
+; CHECK-LABEL: transposer_start_f:
+; CHECK: { (trf0) = vxpose.start v[[x:[0-9]+]], $0x8 }
+; CHECK: { (trf1) = vxpose.start v[[x]], $0x8
+; CHECK: (trf2) = vxpose.start v[[x]], $0x8
+; CHECK: _ = shalt }
+define void @transposer_start_f(<1024 x float> %vf) {
+ %xlu0f = call i32 @llvm.tpu.tc.transpose.start.v1024f32(<1024 x float> %vf, i32 8, i32 8, i32 0, i32 undef)
+ %xlu1f = call i32 @llvm.tpu.tc.transpose.start.v1024f32(<1024 x float> %vf, i32 8, i32 8, i32 1, i32 undef)
+ %xlu2f = call i32 @llvm.tpu.tc.transpose.start.v1024f32(<1024 x float> %vf, i32 8, i32 8, i32 2, i32 undef)
+ ret void
+}
+
+; CHECK-LABEL: transposer_end_0:
+; CHECK: { (trf0) = vxpose.end v{{[0-9]+}}, $0x8;
+; CHECK: _ = vdelay $0x99 }
+; CHECK: { v{{[0-9]+}} = vpop (trf0)
+; CHECK: _ = shalt }
+define void @transposer_end_0(<1024 x i32> %vi) {
+ %xlu0i = call i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32> %vi, i32 8, i32 8, i32 0, i32 undef)
+ %res0i = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu0i)
+
+ ret void
+}
+
+; CHECK-LABEL: transposer_end_f_0:
+; CHECK: { (trf0) = vxpose.end v{{[0-9]+}}, $0x8;
+; CHECK: _ = vdelay $0x99 }
+; CHECK: { v{{[0-9]+}} = vpop (trf0)
+; CHECK: _ = shalt }
+define void @transposer_end_f_0(<1024 x float> %vf) {
+ %xlu0f = call i32 @llvm.tpu.tc.transpose.end.v1024f32(<1024 x float> %vf, i32 8, i32 8, i32 0, i32 undef)
+ %res0f = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 0, i32 %xlu0f)
+
+ ret void
+}
+
+; CHECK-LABEL: transposer_end_1:
+; CHECK: { (trf1) = vxpose.end v{{[0-9]+}}, $0x8;
+; CHECK: _ = vdelay $0x99 }
+; CHECK: { v{{[0-9]+}} = vpop (trf1)
+; CHECK: _ = shalt }
+define void @transposer_end_1(<1024 x i32> %vi) {
+ %xlu1i = call i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32> %vi, i32 8, i32 8, i32 1, i32 undef)
+ %res1i = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1i)
+
+ ret void
+}
+
+; CHECK-LABEL: transposer_end_f_1:
+; CHECK: { (trf1) = vxpose.end v{{[0-9]+}}, $0x8;
+; CHECK: _ = vdelay $0x99 }
+; CHECK: { v{{[0-9]+}} = vpop (trf1)
+; CHECK: _ = shalt }
+define void @transposer_end_f_1(<1024 x float> %vf) {
+ %xlu1f = call i32 @llvm.tpu.tc.transpose.end.v1024f32(<1024 x float> %vf, i32 8, i32 8, i32 1, i32 undef)
+ %res1f = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 1, i32 %xlu1f)
+
+ ret void
+}
+
+; CHECK-LABEL: transposer_end_2:
+; CHECK: { (trf2) = vxpose.end v{{[0-9]+}}, $0x8;
+; CHECK: _ = vdelay $0x8d }
+; CHECK: { v{{[0-9]+}} = vpop (trf2)
+; CHECK: _ = shalt }
+define void @transposer_end_2(<1024 x i32> %vi) {
+ %xlu2i = call i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32> %vi, i32 8, i32 8, i32 2, i32 undef)
+ %res2i = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 2, i32 %xlu2i)
+
+ ret void
+}
+
+; CHECK-LABEL: transposer_end_f_2:
+; CHECK: { (trf2) = vxpose.end v{{[0-9]+}}, $0x8;
+; CHECK: _ = vdelay $0x8d }
+; CHECK: { v{{[0-9]+}} = vpop (trf2);
+; CHECK: _ = shalt }
+define void @transposer_end_f_2(<1024 x float> %vf) {
+ %xlu2f = call i32 @llvm.tpu.tc.transpose.end.v1024f32(<1024 x float> %vf, i32 8, i32 8, i32 2, i32 undef)
+ %res2f = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 2, i32 %xlu2f)
+
+ ret void
+}
+
+
+; CHECK-LABEL: transposer_end_xlu_combination:
+; CHECK: { (trf0) = vxpose.end v[[x:[0-9]+]], $0x8;
+; CHECK: (trf1) = vxpose.end v[[x]], $0x8;
+; CHECK: _ = vdelay $0xa }
+; CHECK: { (trf2) = vxpose.end v[[x]], $0x8;
+; CHECK: _ = vdelay $0x8d }
+; CHECK: { v{{[0-9]+}} = vpop (trf2) }
+; CHECK: { v{{[0-9]+}} = vpop (trf0);
+; CHECK: v{{[0-9]+}} = vpop (trf1);
+; CHECK: _ = shalt }
+define void @transposer_end_xlu_combination(<1024 x i32> %vi) {
+ %xlu0i = call i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32> %vi, i32 8, i32 8, i32 0, i32 undef)
+ %res0i = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu0i)
+ %xlu1i = call i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32> %vi, i32 8, i32 8, i32 1, i32 undef)
+ %res1i = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1i)
+ %xlu2i = call i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32> %vi, i32 8, i32 8, i32 2, i32 undef)
+ %res2i = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 2, i32 %xlu2i)
+
+ ret void
+}
+
+; CHECK-LABEL: transposer_end_xlu_combination_f:
+; CHECK: { (trf0) = vxpose.end v[[x:[0-9]+]], $0x8;
+; CHECK: (trf1) = vxpose.end v[[x]], $0x8;
+; CHECK: _ = vdelay $0xa }
+; CHECK: { (trf2) = vxpose.end v[[x]], $0x8;
+; CHECK: _ = vdelay $0x8d }
+; CHECK: { v{{[0-9]+}} = vpop (trf2) }
+; CHECK: { v{{[0-9]+}} = vpop (trf0);
+; CHECK: v{{[0-9]+}} = vpop (trf1);
+; CHECK: _ = shalt }
+define void @transposer_end_xlu_combination_f(<1024 x float> %vf) {
+ %xlu0f = call i32 @llvm.tpu.tc.transpose.end.v1024f32(<1024 x float> %vf, i32 8, i32 8, i32 0, i32 undef)
+ %res0f = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 0, i32 %xlu0f)
+ %xlu1f = call i32 @llvm.tpu.tc.transpose.end.v1024f32(<1024 x float> %vf, i32 8, i32 8, i32 1, i32 undef)
+ %res1f = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 1, i32 %xlu1f)
+ %xlu2f = call i32 @llvm.tpu.tc.transpose.end.v1024f32(<1024 x float> %vf, i32 8, i32 8, i32 2, i32 undef)
+ %res2f = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 2, i32 %xlu2f)
+
+ ret void
+}
+
+
+; CHECK-LABEL: transposer_start_end:
+; CHECK: { (trf0) = vxpose.start.end v[[x:[0-9]+]], $0x8;
+; CHECK: (trf1) = vxpose.start.end v[[x]], $0x8;
+; CHECK: _ = vdelay $0xa }
+; CHECK: { (trf2) = vxpose.start.end v[[x]], $0x8;
+; CHECK: _ = vdelay $0x8d }
+; CHECK: { v{{[0-9]+}} = vpop (trf2) }
+; CHECK: { v{{[0-9]+}} = vpop (trf0);
+; CHECK: v{{[0-9]+}} = vpop (trf1);
+; CHECK: _ = shalt }
+define void @transposer_start_end(<1024 x i32> %vi) {
+ %xlu0i = call i32 @llvm.tpu.tc.transpose.start.end.v1024i32(<1024 x i32> %vi, i32 8, i32 8, i32 0, i32 undef)
+ %res0i = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu0i)
+ %xlu1i = call i32 @llvm.tpu.tc.transpose.start.end.v1024i32(<1024 x i32> %vi, i32 8, i32 8, i32 1, i32 undef)
+ %res1i = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1i)
+ %xlu2i = call i32 @llvm.tpu.tc.transpose.start.end.v1024i32(<1024 x i32> %vi, i32 8, i32 8, i32 2, i32 undef)
+ %res2i = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 2, i32 %xlu2i)
+
+ ret void
+}
+
+; CHECK-LABEL: transposer_start_end_f:
+; CHECK: { (trf0) = vxpose.start.end v[[x:[0-9]+]], $0x8;
+; CHECK: (trf1) = vxpose.start.end v[[x]], $0x8;
+; CHECK: _ = vdelay $0xa }
+; CHECK: { (trf2) = vxpose.start.end v[[x]], $0x8;
+; CHECK: _ = vdelay $0x8d }
+; CHECK: { v{{[0-9]+}} = vpop (trf2) }
+; CHECK: { v{{[0-9]+}} = vpop (trf0);
+; CHECK: v{{[0-9]+}} = vpop (trf1);
+; CHECK: _ = shalt }
+define void @transposer_start_end_f(<1024 x float> %vf) {
+ %xlu0f = call i32 @llvm.tpu.tc.transpose.start.end.v1024f32(<1024 x float> %vf, i32 8, i32 8, i32 0, i32 undef)
+ %res0f = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 0, i32 %xlu0f)
+ %xlu1f = call i32 @llvm.tpu.tc.transpose.start.end.v1024f32(<1024 x float> %vf, i32 8, i32 8, i32 1, i32 undef)
+ %res1f = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 1, i32 %xlu1f)
+ %xlu2f = call i32 @llvm.tpu.tc.transpose.start.end.v1024f32(<1024 x float> %vf, i32 8, i32 8, i32 2, i32 undef)
+ %res2f = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 2, i32 %xlu2f)
+
+ ret void
+}
+
+; CHECK-LABEL: full_transpose_W1:
+; CHECK: { (trf1) = vxpose.start v[[x:[0-9]+]], $0x8
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { (trf1) = vxpose.end v[[x]], $0x8
+; CHECK: _ = vdelay $0x91 }
+; CHECK: { v{{[0-9]+}} = vpop (trf1)
+; CHECK: _ = shalt }
+define void @full_transpose_W1(<1024 x i32> %vi) {
+ %xlui = call i32 @llvm.tpu.tc.transpose.start.v1024i32(<1024 x i32> %vi, i32 8, i32 16, i32 1, i32 undef)
+ %xlu1i = call i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32> %vi, i32 8, i32 16, i32 1, i32 %xlui)
+ %resi = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1i)
+
+ ret void
+}
+
+; CHECK-LABEL: full_transpose_W1_f:
+; CHECK: { (trf1) = vxpose.start v[[x:[0-9]+]], $0x8
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { (trf1) = vxpose.end v[[x]], $0x8
+; CHECK: _ = vdelay $0x91 }
+; CHECK: { v{{[0-9]+}} = vpop (trf1)
+; CHECK: _ = shalt }
+define void @full_transpose_W1_f(<1024 x float> %vf) {
+ %xluf = call i32 @llvm.tpu.tc.transpose.start.v1024f32(<1024 x float> %vf, i32 8, i32 16, i32 1, i32 undef)
+ %xlu1f = call i32 @llvm.tpu.tc.transpose.end.v1024f32(<1024 x float> %vf, i32 8, i32 16, i32 1, i32 %xluf)
+ %resf = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 1, i32 %xlu1f)
+ ret void
+}
+
+; Transpose with a width of 16
+; CHECK-LABEL: full_transpose_W2:
+; CHECK: { (trf1) = vxpose.start v0, $0x10;
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { (trf1) = vxpose.end v1, $0x10;
+; CHECK: _ = vdelay $0x91 }
+; CHECK: { v[[x:[0-9]+]] = vpop (trf1);
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { v[[y:[0-9]+]] = vpop (trf1) }
+; CHECK: { v{{[0-9]+}} = vadd.s32 v[[x]], v[[y]];
+; CHECK: _ = shalt }
+define <1024 x i32> @full_transpose_W2(<1024 x i32> %v0, <1024 x i32> %v1) {
+ %xlu = call i32 @llvm.tpu.tc.transpose.start.v1024i32(<1024 x i32> %v0, i32 16, i32 16, i32 1, i32 undef)
+ %xlu1 = call i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32> %v1, i32 16, i32 16, i32 1, i32 %xlu)
+ %res = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ %res1 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+ %res2 = add <1024 x i32> %res, %res1
+ ret <1024 x i32> %res2
+}
+
+; CHECK-LABEL: rotate_0:
+; CHECK: { (trf0) = vrot.lane v0, s0
+; CHECK: _ = vdelay $0x71 }
+; CHECK: { v{{[0-9]+}} = vpop (trf0)
+; CHECK: _ = shalt }
+ define <1024 x i32> @rotate_0(<1024 x i32> %v, i32 %amount) {
+ %xlu0 = call i32 @llvm.tpu.vrotate.v1024i32(<1024 x i32> %v, i32 %amount, i32 0)
+ %res0 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu0)
+
+ ret <1024 x i32> %res0
+}
+
+; CHECK-LABEL: rotate_1:
+; CHECK: { (trf1) = vrot.lane v0, s0
+; CHECK: _ = vdelay $0x71 }
+; CHECK: { v{{[0-9]+}} = vpop (trf1)
+; CHECK: _ = shalt }
+define <1024 x i32> @rotate_1(<1024 x i32> %v, i32 %amount) {
+ %xlu0 = call i32 @llvm.tpu.vrotate.v1024i32(<1024 x i32> %v, i32 %amount, i32 1)
+ %res0 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu0)
+
+ ret <1024 x i32> %res0
+}
+
+; CHECK-LABEL: rotate_2:
+; CHECK: { (trf2) = vrot.lane v0, s0
+; CHECK: _ = vdelay $0x59 }
+; CHECK: { v{{[0-9]+}} = vpop (trf2)
+; CHECK: _ = shalt }
+ define <1024 x i32> @rotate_2(<1024 x i32> %v, i32 %amount) {
+ %xlu0 = call i32 @llvm.tpu.vrotate.v1024i32(<1024 x i32> %v, i32 %amount, i32 2)
+ %res0 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 2, i32 %xlu0)
+
+ ret <1024 x i32> %res0
+}
+
+; CHECK-LABEL: rotate:
+; CHECK: { (trf0) = vrot.lane v0, s0;
+; CHECK: (trf1) = vrot.lane v0, s0;
+; CHECK: _ = vdelay $0x18 }
+; CHECK: { (trf2) = vrot.lane v0, s0;
+; CHECK: _ = vdelay $0x58 }
+; CHECK: { v[[x:[0-9]+]] = vpop (trf0);
+; CHECK: v[[y:[0-9]+]] = vpop (trf1) }
+; CHECK: { v[[z:[0-9]+]] = vpop (trf2);
+; CHECK: v[[w:[0-9]+]] = vadd.s32 v[[x]], v[[y]] }
+; CHECK: { v{{[0-9]+}} = vadd.s32 v[[z]], v[[w]];
+; CHECK: _ = shalt }
+ define <1024 x i32> @rotate(<1024 x i32> %v, i32 %amount) {
+ %xlu0 = call i32 @llvm.tpu.vrotate.v1024i32(<1024 x i32> %v, i32 %amount, i32 0)
+ %res0 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu0)
+
+ %xlu1 = call i32 @llvm.tpu.vrotate.v1024i32(<1024 x i32> %v, i32 %amount, i32 1)
+ %res1 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+
+ %xlu2 = call i32 @llvm.tpu.vrotate.v1024i32(<1024 x i32> %v, i32 %amount, i32 2)
+ %res2 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 2, i32 %xlu2)
+
+ %r1 = add <1024 x i32> %res0, %res1
+ %r2 = add <1024 x i32> %r1, %res2
+
+ ret <1024 x i32> %r2
+}
+
+; CHECK-LABEL: rotate_i:
+; CHECK: { (trf0) = vrot.lane v0, $0x5;
+; CHECK: (trf1) = vrot.lane v0, $0x5;
+; CHECK: _ = vdelay $0x18 }
+; CHECK: { (trf2) = vrot.lane v0, $0x5;
+; CHECK: _ = vdelay $0x58 }
+; CHECK: { v[[x:[0-9]+]] = vpop (trf0);
+; CHECK: v[[y:[0-9]+]] = vpop (trf1) }
+; CHECK: { v[[z:[0-9]+]] = vpop (trf2);
+; CHECK: v[[w:[0-9]+]] = vadd.s32 v[[x]], v[[y]] }
+; CHECK: { v{{[0-9]+}} = vadd.s32 v[[z]], v[[w]];
+; CHECK: _ = shalt }
+define <1024 x i32> @rotate_i(<1024 x i32> %v) {
+ %xlu0 = call i32 @llvm.tpu.vrotate.v1024i32(<1024 x i32> %v, i32 5, i32 0)
+ %res0 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu0)
+
+ %xlu1 = call i32 @llvm.tpu.vrotate.v1024i32(<1024 x i32> %v, i32 5, i32 1)
+ %res1 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+
+ %xlu2 = call i32 @llvm.tpu.vrotate.v1024i32(<1024 x i32> %v, i32 5, i32 2)
+ %res2 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 2, i32 %xlu2)
+
+ %r1 = add <1024 x i32> %res0, %res1
+ %r2 = add <1024 x i32> %r1, %res2
+
+ ret <1024 x i32> %r2
+}
+
+; CHECK-LABEL: rotate_selecting_imm_vs_reg_versions:
+; CHECK: (trf0) = vrot.lane v0, $0x44444
+; CHECK: s[[x:[0-9]+]] = simm.s32 $0x444444
+; CHECK: (trf0) = vrot.lane v0, s[[x]]
+; CHECK: v{{[0-9]+}} = vpop (trf0)
+; CHECK: v{{[0-9]+}} = vpop (trf0)
+define <1024 x i32> @rotate_selecting_imm_vs_reg_versions(<1024 x i32> %v) {
+ ; constant 279620 fits in 20 bits; hence the immediate version of the rotate
+ ; instruction is selected.
+ %xlu0 = call i32 @llvm.tpu.vrotate.v1024i32(<1024 x i32> %v, i32 279620, i32 0)
+ %res0 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu0)
+
+ ; constant 4473924 does not fit in 20 bits; hence the register version of the
+ ; rotate instruction is selected.
+ %xlu1 = call i32 @llvm.tpu.vrotate.v1024i32(<1024 x i32> %v, i32 4473924, i32 0)
+ %res1 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu1)
+
+ %r = add <1024 x i32> %res0, %res1
+
+ ret <1024 x i32> %r
+}
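
The same selection logic, sketched for the wider immediate field this tensorcore-vf test assumes (20 bits per the comments above). fitsInImm20 is again a hypothetical, illustrative helper; whether the field is signed or unsigned does not matter for these two constants.

// Hypothetical sketch of the 20-bit immediate check; illustrative only.
#include <cassert>
#include <cstdint>

static bool fitsInImm20(int64_t v) {
  return v >= -(int64_t{1} << 19) && v < (int64_t{1} << 19);
}

int main() {
  assert(fitsInImm20(279620));    // 0x44444  -> immediate form
  assert(!fitsInImm20(4473924));  // 0x444444 -> register form via simm.s32
  return 0;
}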
+
+; CHECK-LABEL: rotate_f:
+; CHECK: { (trf0) = vrot.lane v0, s0;
+; CHECK: (trf1) = vrot.lane v0, s0;
+; CHECK: _ = vdelay $0x16 }
+; CHECK: { (trf2) = vrot.lane v0, s0;
+; CHECK: _ = vdelay $0x59 }
+; CHECK: { v{{[0-9]+}} = vpop (trf2) }
+; CHECK: { v{{[0-9]+}} = vpop (trf0);
+; CHECK: v{{[0-9]+}} = vpop (trf1);
+; CHECK: _ = shalt }
+define void @rotate_f(<1024 x float> %v, i32 %amount) {
+ %xlu0 = call i32 @llvm.tpu.vrotate.v1024f32(<1024 x float> %v, i32 %amount, i32 0)
+ %res0 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 0, i32 %xlu0)
+
+ %xlu1 = call i32 @llvm.tpu.vrotate.v1024f32(<1024 x float> %v, i32 %amount, i32 1)
+ %res1 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 1, i32 %xlu1)
+
+ %xlu2 = call i32 @llvm.tpu.vrotate.v1024f32(<1024 x float> %v, i32 %amount, i32 2)
+ %res2 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 2, i32 %xlu2)
+
+ ret void
+}
+
+; CHECK-LABEL: rotate_f_i:
+; CHECK: { (trf0) = vrot.lane v0, $0x5;
+; CHECK: (trf1) = vrot.lane v0, $0x5;
+; CHECK: _ = vdelay $0x16 }
+; CHECK: { (trf2) = vrot.lane v0, $0x5;
+; CHECK: _ = vdelay $0x59 }
+; CHECK: { v{{[0-9]+}} = vpop (trf2) }
+; CHECK: { v{{[0-9]+}} = vpop (trf0);
+; CHECK: v{{[0-9]+}} = vpop (trf1);
+; CHECK: _ = shalt }
+ define void @rotate_f_i(<1024 x float> %v) {
+ %xlu0 = call i32 @llvm.tpu.vrotate.v1024f32(<1024 x float> %v, i32 5, i32 0)
+ %res0 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 0, i32 %xlu0)
+
+ %xlu1 = call i32 @llvm.tpu.vrotate.v1024f32(<1024 x float> %v, i32 5, i32 1)
+ %res1 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 1, i32 %xlu1)
+
+ %xlu2 = call i32 @llvm.tpu.vrotate.v1024f32(<1024 x float> %v, i32 5, i32 2)
+ %res2 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 2, i32 %xlu2)
+
+ ret void
+}
+
+; CHECK-LABEL: xlaneadd_0:
+; CHECK: { (trf0) = vadd.xlane v0;
+; CHECK: _ = vdelay $0x72 }
+; CHECK: { v{{[0-9]+}} = vpop (trf0)
+; CHECK: _ = shalt }
+ define <1024 x i32> @xlaneadd_0(<1024 x float> %v) {
+ %xlu0 = call i32 @llvm.tpu.xlane.add(<1024 x float> %v, i32 0)
+ %res0 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu0)
+
+ ret <1024 x i32> %res0
+}
+
+; CHECK-LABEL: xlaneadd_1:
+; CHECK: { (trf1) = vadd.xlane v0;
+; CHECK: _ = vdelay $0x72 }
+; CHECK: { v{{[0-9]+}} = vpop (trf1)
+; CHECK: _ = shalt }
+ define <1024 x i32> @xlaneadd_1(<1024 x float> %v) {
+ %xlu0 = call i32 @llvm.tpu.xlane.add(<1024 x float> %v, i32 1)
+ %res0 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu0)
+
+ ret <1024 x i32> %res0
+}
+
+; CHECK-LABEL: xlaneadd_2:
+; CHECK: { (trf2) = vadd.xlane v0;
+; CHECK: _ = vdelay $0x5a }
+; CHECK: { v{{[0-9]+}} = vpop (trf2)
+; CHECK: _ = shalt }
+ define <1024 x i32> @xlaneadd_2(<1024 x float> %v) {
+ %xlu0 = call i32 @llvm.tpu.xlane.add(<1024 x float> %v, i32 2)
+ %res0 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 2, i32 %xlu0)
+
+ ret <1024 x i32> %res0
+}
+
+; CHECK-LABEL: xlaneadd:
+; CHECK: { (trf0) = vadd.xlane v0;
+; CHECK: (trf1) = vadd.xlane v0;
+; CHECK: _ = vdelay $0x18 }
+; CHECK: { (trf2) = vadd.xlane v0;
+; CHECK: _ = vdelay $0x59 }
+; CHECK: { v[[x:[0-9]+]] = vpop (trf0);
+; CHECK: v[[y:[0-9]+]] = vpop (trf1) }
+; CHECK: { v[[z:[0-9]+]] = vpop (trf2);
+; CHECK: v[[w:[0-9]+]] = vadd.s32 v[[x]], v[[y]] }
+; CHECK: { v{{[0-9]+}} = vadd.s32 v[[z]], v[[w]]
+; CHECK: _ = shalt }
+ define <1024 x i32> @xlaneadd(<1024 x float> %v) {
+ %xlu0 = call i32 @llvm.tpu.xlane.add(<1024 x float> %v, i32 0)
+ %res0 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu0)
+
+ %xlu1 = call i32 @llvm.tpu.xlane.add(<1024 x float> %v, i32 1)
+ %res1 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+
+ %xlu2 = call i32 @llvm.tpu.xlane.add(<1024 x float> %v, i32 2)
+ %res2 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 2, i32 %xlu2)
+
+ %r1 = add <1024 x i32> %res0, %res1
+ %r2 = add <1024 x i32> %r1, %res2
+
+ ret <1024 x i32> %r2
+}
+
+; CHECK-LABEL: xlanemax:
+; CHECK: { (trf0) = vmax.xlane v0;
+; CHECK: (trf1) = vmax.xlane v0;
+; CHECK: _ = vdelay $0x18 }
+; CHECK: { (trf2) = vmax.xlane v0;
+; CHECK: _ = vdelay $0x59 }
+; CHECK: { v[[x:[0-9]+]] = vpop (trf0);
+; CHECK: v[[y:[0-9]+]] = vpop (trf1) }
+; CHECK: { v[[z:[0-9]+]] = vpop (trf2);
+; CHECK: v[[w:[0-9]+]] = vadd.s32 v[[x]], v[[y]] }
+; CHECK: { v{{[0-9]+}} = vadd.s32 v[[z]], v[[w]]
+; CHECK: _ = shalt }
+ define <1024 x i32> @xlanemax(<1024 x float> %v) {
+ %xlu0 = call i32 @llvm.tpu.xlane.max(<1024 x float> %v, i32 0)
+ %res0 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu0)
+
+ %xlu1 = call i32 @llvm.tpu.xlane.max(<1024 x float> %v, i32 1)
+ %res1 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+
+ %xlu2 = call i32 @llvm.tpu.xlane.max(<1024 x float> %v, i32 2)
+ %res2 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 2, i32 %xlu2)
+
+ %r1 = add <1024 x i32> %res0, %res1
+ %r2 = add <1024 x i32> %r1, %res2
+
+ ret <1024 x i32> %r2
+}
+
+; CHECK-LABEL: xlanemin:
+; CHECK: { (trf0) = vmin.xlane v0;
+; CHECK: (trf1) = vmin.xlane v0;
+; CHECK: _ = vdelay $0x18 }
+; CHECK: { (trf2) = vmin.xlane v0;
+; CHECK: _ = vdelay $0x59 }
+; CHECK: { v[[x:[0-9]+]] = vpop (trf0);
+; CHECK: v[[y:[0-9]+]] = vpop (trf1) }
+; CHECK: { v[[z:[0-9]+]] = vpop (trf2);
+; CHECK: v[[w:[0-9]+]] = vadd.s32 v[[x]], v[[y]] }
+; CHECK: { v{{[0-9]+}} = vadd.s32 v[[z]], v[[w]]
+; CHECK: _ = shalt }
+ define <1024 x i32> @xlanemin(<1024 x float> %v) {
+ %xlu0 = call i32 @llvm.tpu.xlane.min(<1024 x float> %v, i32 0)
+ %res0 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu0)
+
+ %xlu1 = call i32 @llvm.tpu.xlane.min(<1024 x float> %v, i32 1)
+ %res1 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+
+ %xlu2 = call i32 @llvm.tpu.xlane.min(<1024 x float> %v, i32 2)
+ %res2 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 2, i32 %xlu2)
+
+ %r1 = add <1024 x i32> %res0, %res1
+ %r2 = add <1024 x i32> %r1, %res2
+
+ ret <1024 x i32> %r2
+}
+
+; CHECK-LABEL: xlanemaxindex:
+; CHECK: { (trf0) = vmax.index.xlane v0;
+; CHECK: (trf1) = vmax.index.xlane v0;
+; CHECK: _ = vdelay $0x18 }
+; CHECK: { (trf2) = vmax.index.xlane v0;
+; CHECK: _ = vdelay $0x59 }
+; CHECK: { v[[x:[0-9]+]] = vpop (trf0);
+; CHECK: v[[y:[0-9]+]] = vpop (trf1) }
+; CHECK: { v[[z:[0-9]+]] = vpop (trf2);
+; CHECK: v[[w:[0-9]+]] = vadd.s32 v[[x]], v[[y]] }
+; CHECK: { v{{[0-9]+}} = vadd.s32 v[[z]], v[[w]]
+; CHECK: _ = shalt }
+ define <1024 x i32> @xlanemaxindex(<1024 x float> %v) {
+ %xlu0 = call i32 @llvm.tpu.xlane.maxindex(<1024 x float> %v, i32 0)
+ %res0 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu0)
+
+ %xlu1 = call i32 @llvm.tpu.xlane.maxindex(<1024 x float> %v, i32 1)
+ %res1 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+
+ %xlu2 = call i32 @llvm.tpu.xlane.maxindex(<1024 x float> %v, i32 2)
+ %res2 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 2, i32 %xlu2)
+
+ %r1 = add <1024 x i32> %res0, %res1
+ %r2 = add <1024 x i32> %r1, %res2
+
+ ret <1024 x i32> %r2
+}
+
+; CHECK-LABEL: xlaneminindex:
+; CHECK: { (trf0) = vmin.index.xlane v0;
+; CHECK: (trf1) = vmin.index.xlane v0;
+; CHECK: _ = vdelay $0x18 }
+; CHECK: { (trf2) = vmin.index.xlane v0;
+; CHECK: _ = vdelay $0x59 }
+; CHECK: { v[[x:[0-9]+]] = vpop (trf0);
+; CHECK: v[[y:[0-9]+]] = vpop (trf1) }
+; CHECK: { v[[z:[0-9]+]] = vpop (trf2);
+; CHECK: v[[w:[0-9]+]] = vadd.s32 v[[x]], v[[y]] }
+; CHECK: { v{{[0-9]+}} = vadd.s32 v[[z]], v[[w]]
+; CHECK: _ = shalt }
+
+ define <1024 x i32> @xlaneminindex(<1024 x float> %v) {
+ %xlu0 = call i32 @llvm.tpu.xlane.minindex(<1024 x float> %v, i32 0)
+ %res0 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu0)
+
+ %xlu1 = call i32 @llvm.tpu.xlane.minindex(<1024 x float> %v, i32 1)
+ %res1 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+
+ %xlu2 = call i32 @llvm.tpu.xlane.minindex(<1024 x float> %v, i32 2)
+ %res2 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 2, i32 %xlu2)
+
+ %r1 = add <1024 x i32> %res0, %res1
+ %r2 = add <1024 x i32> %r1, %res2
+
+ ret <1024 x i32> %r2
+}
+
+; CHECK-LABEL: permute_0:
+; CHECK: { (pcr0) = vsetperm.u8 v1 }
+; CHECK: { (trf0) = vperm.lane v0;
+; CHECK: _ = vdelay $0x71 }
+; CHECK: { v{{[0-9]+}} = vpop (trf0)
+; CHECK: _ = shalt }
+ define <1024 x i32> @permute_0(<1024 x i32> %v, <1024 x i32> %perm) {
+ %pcr0 = call i32 @llvm.tpu.set.permute(<1024 x i32> %perm, i32 0)
+ %xlu0 = call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %v, i32 %pcr0, i32 0)
+ %res0 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu0)
+
+ ret <1024 x i32> %res0
+}
+
+; CHECK-LABEL: permute_1:
+; CHECK: { (pcr1) = vsetperm.u8 v1 }
+; CHECK: { (trf1) = vperm.lane v0;
+; CHECK: _ = vdelay $0x71 }
+; CHECK: { v{{[0-9]+}} = vpop (trf1)
+; CHECK: _ = shalt }
+ define <1024 x i32> @permute_1(<1024 x i32> %v, <1024 x i32> %perm) {
+ %pcr0 = call i32 @llvm.tpu.set.permute(<1024 x i32> %perm, i32 1)
+ %xlu0 = call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %v, i32 %pcr0, i32 1)
+ %res0 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu0)
+
+ ret <1024 x i32> %res0
+}
+
+; CHECK-LABEL: permute_2:
+; CHECK: { (pcr2) = vsetperm.u8 v1 }
+; CHECK: { (trf2) = vperm.lane v0;
+; CHECK: _ = vdelay $0x59 }
+; CHECK: { v{{[0-9]+}} = vpop (trf2)
+; CHECK: _ = shalt }
+ define <1024 x i32> @permute_2(<1024 x i32> %v, <1024 x i32> %perm) {
+ %pcr0 = call i32 @llvm.tpu.set.permute(<1024 x i32> %perm, i32 2)
+ %xlu0 = call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %v, i32 %pcr0, i32 2)
+ %res0 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 2, i32 %xlu0)
+
+ ret <1024 x i32> %res0
+}
+
+; CHECK-LABEL: permute_sublane_0:
+; CHECK: { (pcr0) = vsetperm.all.u8 v1;
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { (trf0) = vperm.lane v0;
+; CHECK: _ = vdelay $0x71 }
+; CHECK: { v{{[0-9]+}} = vpop (trf0)
+; CHECK: _ = shalt }
+ define <1024 x i32> @permute_sublane_0(<1024 x i32> %v, <1024 x i32> %perm) {
+ %pcr0 = call i32 @llvm.tpu.set.permute.sublane(<1024 x i32> %perm, i32 0)
+ %xlu0 = call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %v, i32 %pcr0, i32 0)
+ %res0 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu0)
+
+ ret <1024 x i32> %res0
+}
+
+; CHECK-LABEL: permute_sublane_1:
+; CHECK: { (pcr1) = vsetperm.all.u8 v1;
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { (trf1) = vperm.lane v0;
+; CHECK: _ = vdelay $0x71 }
+; CHECK: { v{{[0-9]+}} = vpop (trf1)
+; CHECK: _ = shalt }
+ define <1024 x i32> @permute_sublane_1(<1024 x i32> %v, <1024 x i32> %perm) {
+ %pcr0 = call i32 @llvm.tpu.set.permute.sublane(<1024 x i32> %perm, i32 1)
+ %xlu0 = call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %v, i32 %pcr0, i32 1)
+ %res0 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu0)
+
+ ret <1024 x i32> %res0
+}
+
+; CHECK-LABEL: permute_sublane_2:
+; CHECK: { (pcr2) = vsetperm.all.u8 v1;
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { (trf2) = vperm.lane v0;
+; CHECK: _ = vdelay $0x59 }
+; CHECK: { v{{[0-9]+}} = vpop (trf2)
+; CHECK: _ = shalt }
+ define <1024 x i32> @permute_sublane_2(<1024 x i32> %v, <1024 x i32> %perm) {
+ %pcr0 = call i32 @llvm.tpu.set.permute.sublane(<1024 x i32> %perm, i32 2)
+ %xlu0 = call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %v, i32 %pcr0, i32 2)
+ %res0 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 2, i32 %xlu0)
+
+ ret <1024 x i32> %res0
+}
+
+
+; CHECK-LABEL: permute_byte_0:
+; CHECK: { (pcr0) = vsetperm.all.bytes.u32 v1;
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { (trf0) = vperm.lane v0;
+; CHECK: _ = vdelay $0x71 }
+; CHECK: { v{{[0-9]+}} = vpop (trf0)
+; CHECK: _ = shalt }
+ define <1024 x i32> @permute_byte_0(<1024 x i32> %v, <1024 x i32> %perm) {
+ %pcr0 = call i32 @llvm.tpu.set.permute.bytes(<1024 x i32> %perm, i32 0)
+ %xlu0 = call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %v, i32 %pcr0, i32 0)
+ %res0 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu0)
+
+ ret <1024 x i32> %res0
+}
+
+; CHECK-LABEL: permute_byte_1:
+; CHECK: { (pcr1) = vsetperm.all.bytes.u32 v{{[0-9]+}};
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { (trf1) = vperm.lane v{{[0-9]+}};
+; CHECK: _ = vdelay $0x71 }
+; CHECK: { v{{[0-9]+}} = vpop (trf1)
+; CHECK: _ = shalt }
+ define <1024 x i32> @permute_byte_1(<1024 x i32> %v, <1024 x i32> %perm) {
+ %pcr0 = call i32 @llvm.tpu.set.permute.bytes(<1024 x i32> %perm, i32 1)
+ %xlu0 = call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %v, i32 %pcr0, i32 1)
+ %res0 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu0)
+
+ ret <1024 x i32> %res0
+}
+
+; CHECK-LABEL: permute_byte_2:
+; CHECK: { (pcr2) = vsetperm.all.bytes.u32 v1;
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { (trf2) = vperm.lane v0;
+; CHECK: _ = vdelay $0x59 }
+; CHECK: { v{{[0-9]+}} = vpop (trf2)
+; CHECK: _ = shalt }
+ define <1024 x i32> @permute_byte_2(<1024 x i32> %v, <1024 x i32> %perm) {
+ %pcr0 = call i32 @llvm.tpu.set.permute.bytes(<1024 x i32> %perm, i32 2)
+ %xlu0 = call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %v, i32 %pcr0, i32 2)
+ %res0 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 2, i32 %xlu0)
+
+ ret <1024 x i32> %res0
+}
+
+
+; CHECK-LABEL: permute:
+; CHECK: { (pcr0) = vsetperm.u8 v1;
+; CHECK: (pcr1) = vsetperm.u8 v1 }
+; CHECK: { (trf0) = vperm.lane v0;
+; CHECK: (trf1) = vperm.lane v0;
+; CHECK: _ = vdelay $0x17 }
+; CHECK: { (pcr2) = vsetperm.u8 v1 }
+; CHECK: { (trf2) = vperm.lane v0;
+; CHECK: _ = vdelay $0x58 }
+; CHECK: { v[[x:[0-9]+]] = vpop (trf0);
+; CHECK: v[[y:[0-9]+]] = vpop (trf1) }
+; CHECK: { v[[z:[0-9]+]] = vpop (trf2);
+; CHECK: v[[w:[0-9]+]] = vadd.s32 v[[x]], v[[y]] }
+; CHECK: { v{{[0-9]+}} = vadd.s32 v[[z]], v[[w]]
+; CHECK: _ = shalt }
+ define <1024 x i32> @permute(<1024 x i32> %v, <1024 x i32> %perm) {
+ %pcr0 = call i32 @llvm.tpu.set.permute(<1024 x i32> %perm, i32 0)
+ %xlu0 = call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %v, i32 %pcr0, i32 0)
+ %res0 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu0)
+
+ %pcr1 = call i32 @llvm.tpu.set.permute(<1024 x i32> %perm, i32 1)
+ %xlu1 = call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %v, i32 %pcr1, i32 1)
+ %res1 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+
+ %pcr2 = call i32 @llvm.tpu.set.permute(<1024 x i32> %perm, i32 2)
+ %xlu2 = call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %v, i32 %pcr2, i32 2)
+ %res2 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 2, i32 %xlu2)
+
+ %r1 = add <1024 x i32> %res0, %res1
+ %r2 = add <1024 x i32> %r1, %res2
+
+ ret <1024 x i32> %r2
+}
+
+; CHECK-LABEL: permute_sublane:
+; CHECK: { (pcr0) = vsetperm.all.u8 v1;
+; CHECK: (pcr1) = vsetperm.all.u8 v1;
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { (trf0) = vperm.lane v0;
+; CHECK: (trf1) = vperm.lane v0;
+; CHECK: _ = vdelay $0x10 }
+; CHECK: { (pcr2) = vsetperm.all.u8 v1;
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { (trf2) = vperm.lane v0;
+; CHECK: _ = vdelay $0x58 }
+; CHECK: { v[[x:[0-9]+]] = vpop (trf0);
+; CHECK: v[[y:[0-9]+]] = vpop (trf1) }
+; CHECK: { v[[z:[0-9]+]] = vpop (trf2);
+; CHECK: v[[w:[0-9]+]] = vadd.s32 v[[x]], v[[y]] }
+; CHECK: { v{{[0-9]+}} = vadd.s32 v[[z]], v[[w]]
+; CHECK: _ = shalt }
+ define <1024 x i32> @permute_sublane(<1024 x i32> %v, <1024 x i32> %perm) {
+ %pcr0 = call i32 @llvm.tpu.set.permute.sublane(<1024 x i32> %perm, i32 0)
+ %xlu0 = call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %v, i32 %pcr0, i32 0)
+ %res0 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu0)
+
+ %pcr1 = call i32 @llvm.tpu.set.permute.sublane(<1024 x i32> %perm, i32 1)
+ %xlu1 = call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %v, i32 %pcr1, i32 1)
+ %res1 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+
+ %pcr2 = call i32 @llvm.tpu.set.permute.sublane(<1024 x i32> %perm, i32 2)
+ %xlu2 = call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %v, i32 %pcr2, i32 2)
+ %res2 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 2, i32 %xlu2)
+
+ %r1 = add <1024 x i32> %res0, %res1
+ %r2 = add <1024 x i32> %r1, %res2
+
+ ret <1024 x i32> %r2
+}
+
+; CHECK-LABEL: permute_byte:
+; CHECK: { (pcr0) = vsetperm.all.bytes.u32 v1;
+; CHECK: (pcr1) = vsetperm.all.bytes.u32 v1;
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { (trf0) = vperm.lane v0;
+; CHECK: (trf1) = vperm.lane v0;
+; CHECK: _ = vdelay $0x10 }
+; CHECK: { (pcr2) = vsetperm.all.bytes.u32 v1;
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { (trf2) = vperm.lane v0;
+; CHECK: _ = vdelay $0x58 }
+; CHECK: { v[[x:[0-9]+]] = vpop (trf0);
+; CHECK: v[[y:[0-9]+]] = vpop (trf1) }
+; CHECK: { v[[z:[0-9]+]] = vpop (trf2);
+; CHECK: v[[w:[0-9]+]] = vadd.s32 v[[x]], v[[y]] }
+; CHECK: { v{{[0-9]+}} = vadd.s32 v[[z]], v[[w]]
+; CHECK: _ = shalt }
+ define <1024 x i32> @permute_byte(<1024 x i32> %v, <1024 x i32> %perm) {
+ %pcr0 = call i32 @llvm.tpu.set.permute.bytes(<1024 x i32> %perm, i32 0)
+ %xlu0 = call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %v, i32 %pcr0, i32 0)
+ %res0 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu0)
+
+ %pcr1 = call i32 @llvm.tpu.set.permute.bytes(<1024 x i32> %perm, i32 1)
+ %xlu1 = call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %v, i32 %pcr1, i32 1)
+ %res1 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu1)
+
+ %pcr2 = call i32 @llvm.tpu.set.permute.bytes(<1024 x i32> %perm, i32 2)
+ %xlu2 = call i32 @llvm.tpu.permute.v1024i32(<1024 x i32> %v, i32 %pcr2, i32 2)
+ %res2 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 2, i32 %xlu2)
+
+ %r1 = add <1024 x i32> %res0, %res1
+ %r2 = add <1024 x i32> %r1, %res2
+
+ ret <1024 x i32> %r2
+}
+
+; CHECK-LABEL: permute_f:
+; CHECK: { (pcr0) = vsetperm.u8 v1;
+; CHECK: (pcr1) = vsetperm.u8 v1 }
+; CHECK: { (trf0) = vperm.lane v0;
+; CHECK: (trf1) = vperm.lane v0;
+; CHECK: _ = vdelay $0x15 }
+; CHECK: { (pcr2) = vsetperm.u8 v1 }
+; CHECK: { (trf2) = vperm.lane v0;
+; CHECK: _ = vdelay $0x59 }
+; CHECK: { v{{[0-9]+}} = vpop (trf2) }
+; CHECK: { v{{[0-9]+}} = vpop (trf0);
+; CHECK: v{{[0-9]+}} = vpop (trf1)
+; CHECK: _ = shalt }
+ define <1024 x float> @permute_f(<1024 x float> %v, <1024 x i32> %perm) {
+ %pcr0 = call i32 @llvm.tpu.set.permute(<1024 x i32> %perm, i32 0)
+ %xlu0 = call i32 @llvm.tpu.permute.v1024f32(<1024 x float> %v, i32 %pcr0, i32 0)
+ %res0 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 0, i32 %xlu0)
+
+ %pcr1 = call i32 @llvm.tpu.set.permute(<1024 x i32> %perm, i32 1)
+ %xlu1 = call i32 @llvm.tpu.permute.v1024f32(<1024 x float> %v, i32 %pcr1, i32 1)
+ %res1 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 1, i32 %xlu1)
+
+ %pcr2 = call i32 @llvm.tpu.set.permute(<1024 x i32> %perm, i32 2)
+ %xlu2 = call i32 @llvm.tpu.permute.v1024f32(<1024 x float> %v, i32 %pcr2, i32 2)
+ %res2 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 2, i32 %xlu2)
+
+ ret <1024 x float> %res2
+}
+
+; Check that we have the right latency between transpose_end and the rotate.
+; CHECK-LABEL: transposer_to_rotate:
+; CHECK: { (trf0) = vxpose.end v0, $0x8;
+; CHECK: _ = vdelay $0x67 }
+; CHECK: { (trf0) = vrot.lane v0, $0x5;
+; CHECK: _ = vdelay $0x69 }
+; CHECK: { v[[x:[0-9]+]] = vpop (trf0);
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { v[[y:[0-9]+]] = vpop (trf0) }
+; CHECK: { v{{[0-9]+}} = vadd.s32 v[[x]], v[[y]]
+; CHECK: _ = shalt }
+ define <1024 x i32> @transposer_to_rotate(<1024 x i32> %v) {
+ %xlu = call i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32> %v, i32 8, i32 8, i32 0, i32 undef)
+ %res1 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu)
+ %xlu2 = call i32 @llvm.tpu.vrotate.v1024i32(<1024 x i32> %v, i32 5, i32 0)
+ %res2 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu2)
+ %res = add <1024 x i32> %res1, %res2
+ ret <1024 x i32> %res
+}
+
+; Check that we have the right latency between transpose_end and the reduce.
+; CHECK-LABEL: transposer_to_reduce:
+; CHECK: { (trf0) = vxpose.end v0, $0x8;
+; CHECK: _ = vdelay $0x5d }
+; CHECK: { (trf0) = vadd.xlane v0;
+; CHECK: _ = vdelay $0x6a }
+; CHECK: { v{{[0-9]+}} = vpop (trf0);
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { v{{[0-9]+}} = vpop (trf0)
+; CHECK: _ = shalt }
+ define <1024 x float> @transposer_to_reduce(<1024 x float> %v) {
+ %xlu = call i32 @llvm.tpu.tc.transpose.end.v1024f32(<1024 x float> %v, i32 8, i32 8, i32 0, i32 undef)
+ %res1 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 0, i32 %xlu)
+ %xlu2 = call i32 @llvm.tpu.xlane.add(<1024 x float> %v, i32 0)
+ %res2 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 0, i32 %xlu2)
+ ret <1024 x float> %res2
+}
+
+; Check that we have the right latency between rotate and transpose_end.
+; CHECK-LABEL: rotate_to_transpose_unit0:
+; CHECK: { (trf0) = vrot.lane v0, $0x5;
+; CHECK: _ = vdelay $0xb }
+; CHECK: { (trf0) = vxpose.end v0, $0x8;
+; CHECK: _ = vdelay $0x91 }
+; CHECK: { v[[x:[0-9]+]] = vpop (trf0);
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { v[[y:[0-9]+]] = vpop (trf0) }
+; CHECK: { v{{[0-9]+}} = vadd.s32 v[[x]], v[[y]]
+; CHECK: _ = shalt }
+ define <1024 x i32> @rotate_to_transpose_unit0(<1024 x i32> %v) {
+ %xlu2 = call i32 @llvm.tpu.vrotate.v1024i32(<1024 x i32> %v, i32 5, i32 0)
+ %res2 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu2)
+ %xlu = call i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32> %v, i32 8, i32 8, i32 0, i32 undef)
+ %res1 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu)
+ %res = add <1024 x i32> %res1, %res2
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: rotate_to_transpose_unit1:
+; CHECK: { (trf1) = vrot.lane v0, $0x5;
+; CHECK: _ = vdelay $0x1e }
+; CHECK: { (trf1) = vxpose.end v0, $0x8;
+; CHECK: _ = vdelay $0x91 }
+; CHECK: { v[[x:[0-9]+]] = vpop (trf1);
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { v[[y:[0-9]+]] = vpop (trf1) }
+; CHECK: { v{{[0-9]+}} = vadd.s32 v[[x]], v[[y]]
+; CHECK: _ = shalt }
+ define <1024 x i32> @rotate_to_transpose_unit1(<1024 x i32> %v) {
+ %xlu2 = call i32 @llvm.tpu.vrotate.v1024i32(<1024 x i32> %v, i32 5, i32 1)
+ %res2 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu2)
+ %xlu = call i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32> %v, i32 8, i32 8, i32 1, i32 undef)
+ %res1 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 1, i32 %xlu)
+ %res = add <1024 x i32> %res1, %res2
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: rotate_to_transpose_unit2:
+; CHECK: { (trf2) = vrot.lane v0, $0x5;
+; CHECK: _ = vdelay $0xf }
+; CHECK: { (trf2) = vxpose.end v0, $0x8;
+; CHECK: _ = vdelay $0x85 }
+; CHECK: { v[[x:[0-9]+]] = vpop (trf2);
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { v[[y:[0-9]+]] = vpop (trf2) }
+; CHECK: { v{{[0-9]+}} = vadd.s32 v[[x]], v[[y]]
+; CHECK: _ = shalt }
+ define <1024 x i32> @rotate_to_transpose_unit2(<1024 x i32> %v) {
+ %xlu2 = call i32 @llvm.tpu.vrotate.v1024i32(<1024 x i32> %v, i32 5, i32 2)
+ %res2 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 2, i32 %xlu2)
+ %xlu = call i32 @llvm.tpu.tc.transpose.end.v1024i32(<1024 x i32> %v, i32 8, i32 8, i32 2, i32 undef)
+ %res1 = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 2, i32 %xlu)
+ %res = add <1024 x i32> %res1, %res2
+ ret <1024 x i32> %res
+}
+
+; Check that we have the right latency between rotate and reduce.
+; CHECK-LABEL: rotate_to_reduce_unit0:
+; CHECK: { (trf0) = vrot.lane v0, $0x5;
+; CHECK: _ = vdelay $0x29 }
+; CHECK: { (trf0) = vadd.xlane v0;
+; CHECK: _ = vdelay $0x6a }
+; CHECK: { v{{[0-9]+}} = vpop (trf0);
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { v{{[0-9]+}} = vpop (trf0)
+; CHECK: _ = shalt }
+ define <1024 x float> @rotate_to_reduce_unit0(<1024 x float> %v) {
+ %xlu2 = call i32 @llvm.tpu.vrotate.v1024f32(<1024 x float> %v, i32 5, i32 0)
+ %res2 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 0, i32 %xlu2)
+ %xlu = call i32 @llvm.tpu.xlane.add(<1024 x float> %v, i32 0)
+ %res = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 0, i32 %xlu)
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: rotate_to_reduce_unit1:
+; CHECK: { (trf1) = vrot.lane v0, $0x5;
+; CHECK: _ = vdelay $0x29 }
+; CHECK: { (trf1) = vadd.xlane v0;
+; CHECK: _ = vdelay $0x6a }
+; CHECK: { v{{[0-9]+}} = vpop (trf1);
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { v{{[0-9]+}} = vpop (trf1)
+; CHECK: _ = shalt }
+ define <1024 x float> @rotate_to_reduce_unit1(<1024 x float> %v) {
+ %xlu2 = call i32 @llvm.tpu.vrotate.v1024f32(<1024 x float> %v, i32 5, i32 1)
+ %res2 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 1, i32 %xlu2)
+ %xlu = call i32 @llvm.tpu.xlane.add(<1024 x float> %v, i32 1)
+ %res = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 1, i32 %xlu)
+ ret <1024 x float> %res
+}
+
+; CHECK-LABEL: rotate_to_reduce_unit2:
+; CHECK: { (trf2) = vrot.lane v0, $0x5;
+; CHECK: _ = vdelay $0x28 }
+; CHECK: { (trf2) = vadd.xlane v0;
+; CHECK: _ = vdelay $0x52 }
+; CHECK: { v{{[0-9]+}} = vpop (trf2);
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { v{{[0-9]+}} = vpop (trf2)
+; CHECK: _ = shalt }
+ define <1024 x float> @rotate_to_reduce_unit2(<1024 x float> %v) {
+ %xlu2 = call i32 @llvm.tpu.vrotate.v1024f32(<1024 x float> %v, i32 5, i32 2)
+ %res2 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 2, i32 %xlu2)
+ %xlu = call i32 @llvm.tpu.xlane.add(<1024 x float> %v, i32 2)
+ %res = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 2, i32 %xlu)
+ ret <1024 x float> %res
+}
+
+; Check that we have the right latency between reduce and transpose_end.
+; CHECK-LABEL: reduce_to_transpose_unit0:
+; CHECK: { (trf0) = vadd.xlane v0;
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { (trf0) = vxpose.end v0, $0x8;
+; CHECK: _ = vdelay $0x91 }
+; CHECK: { v{{[0-9]+}} = vpop (trf0);
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { v{{[0-9]+}} = vpop (trf0)
+; CHECK: _ = shalt }
+ define <1024 x float> @reduce_to_transpose_unit0(<1024 x float> %v) {
+ %xlu2 = call i32 @llvm.tpu.xlane.add(<1024 x float> %v, i32 0)
+ %res2 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 0, i32 %xlu2)
+ %xlu = call i32 @llvm.tpu.tc.transpose.end.v1024f32(<1024 x float> %v, i32 8, i32 8, i32 0, i32 undef)
+ %res1 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 0, i32 %xlu)
+ ret <1024 x float> %res1
+}
+
+; CHECK-LABEL: reduce_to_transpose_unit1:
+; CHECK: { (trf1) = vadd.xlane v0;
+; CHECK: _ = vdelay $0xe }
+; CHECK: { (trf1) = vxpose.end v0, $0x8;
+; CHECK: _ = vdelay $0x91 }
+; CHECK: { v{{[0-9]+}} = vpop (trf1);
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { v{{[0-9]+}} = vpop (trf1)
+; CHECK: _ = shalt }
+ define <1024 x float> @reduce_to_transpose_unit1(<1024 x float> %v) {
+ %xlu2 = call i32 @llvm.tpu.xlane.add(<1024 x float> %v, i32 1)
+ %res2 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 1, i32 %xlu2)
+ %xlu = call i32 @llvm.tpu.tc.transpose.end.v1024f32(<1024 x float> %v, i32 8, i32 8, i32 1, i32 undef)
+ %res1 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 1, i32 %xlu)
+ ret <1024 x float> %res1
+}
+
+; CHECK-LABEL: reduce_to_transpose_unit2:
+; CHECK: { (trf2) = vadd.xlane v0;
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { (trf2) = vxpose.end v0, $0x8;
+; CHECK: _ = vdelay $0x85 }
+; CHECK: { v{{[0-9]+}} = vpop (trf2);
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { v{{[0-9]+}} = vpop (trf2)
+; CHECK: _ = shalt }
+ define <1024 x float> @reduce_to_transpose_unit2(<1024 x float> %v) {
+ %xlu2 = call i32 @llvm.tpu.xlane.add(<1024 x float> %v, i32 2)
+ %res2 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 2, i32 %xlu2)
+ %xlu = call i32 @llvm.tpu.tc.transpose.end.v1024f32(<1024 x float> %v, i32 8, i32 8, i32 2, i32 undef)
+ %res1 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 2, i32 %xlu)
+ ret <1024 x float> %res1
+}
+
+; Check that we have the right latency between reduce and permute.
+; CHECK-LABEL: reduce_to_permute_unit0:
+; CHECK: { (trf0) = vadd.xlane v0;
+; CHECK: _ = vdelay $0x13 }
+; CHECK: { (trf0) = vrot.lane v0, $0x5;
+; CHECK: _ = vdelay $0x69 }
+; CHECK: { v{{[0-9]+}} = vpop (trf0);
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { v{{[0-9]+}} = vpop (trf0)
+; CHECK: _ = shalt }
+ define <1024 x float> @reduce_to_permute_unit0(<1024 x float> %v) {
+ %xlu = call i32 @llvm.tpu.xlane.add(<1024 x float> %v, i32 0)
+ %res1 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 0, i32 %xlu)
+ %xlu2 = call i32 @llvm.tpu.vrotate.v1024f32(<1024 x float> %v, i32 5, i32 0)
+ %res2 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 0, i32 %xlu2)
+ ret <1024 x float> %res2
+}
+
+; CHECK-LABEL: reduce_to_permute_unit1:
+; CHECK: { (trf1) = vadd.xlane v0;
+; CHECK: _ = vdelay $0x18 }
+; CHECK: { (trf1) = vrot.lane v0, $0x5;
+; CHECK: _ = vdelay $0x69 }
+; CHECK: { v{{[0-9]+}} = vpop (trf1);
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { v{{[0-9]+}} = vpop (trf1)
+; CHECK: _ = shalt }
+ define <1024 x float> @reduce_to_permute_unit1(<1024 x float> %v) {
+ %xlu = call i32 @llvm.tpu.xlane.add(<1024 x float> %v, i32 1)
+ %res1 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 1, i32 %xlu)
+ %xlu2 = call i32 @llvm.tpu.vrotate.v1024f32(<1024 x float> %v, i32 5, i32 1)
+ %res2 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 1, i32 %xlu2)
+ ret <1024 x float> %res2
+}
+
+; CHECK-LABEL: reduce_to_permute_unit2:
+; CHECK: { (trf2) = vadd.xlane v0;
+; CHECK: _ = vdelay $0x17 }
+; CHECK: { (trf2) = vrot.lane v0, $0x5;
+; CHECK: _ = vdelay $0x51 }
+; CHECK: { v{{[0-9]+}} = vpop (trf2);
+; CHECK: _ = vdelay $0x7 }
+; CHECK: { v{{[0-9]+}} = vpop (trf2)
+; CHECK: _ = shalt }
+ define <1024 x float> @reduce_to_permute_unit2(<1024 x float> %v) {
+ %xlu = call i32 @llvm.tpu.xlane.add(<1024 x float> %v, i32 2)
+ %res1 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 2, i32 %xlu)
+ %xlu2 = call i32 @llvm.tpu.vrotate.v1024f32(<1024 x float> %v, i32 5, i32 2)
+ %res2 = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 2, i32 %xlu2)
+ ret <1024 x float> %res2
+}
+
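+; Check lowering of the packed permute: the PCR is set from a vector with
+; vsetperm.u8 and the high input is supplied through vsupp.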
+; CHECK-LABEL: permute_packed:
+; CHECK: { (pcr0) = vsetperm.u8 v2 }
+; CHECK: { (trf0) = vperm.lane.packed v0;
+; CHECK: _ = vsupp v1;
+; CHECK: _ = vdelay $0x79 }
+; CHECK: { v{{[0-9]+}} = vpop (trf0)
+; CHECK: _ = shalt }
+ define <1024 x i32> @permute_packed(<1024 x i32> %vlow, <1024 x i32> %vhigh, <1024 x i32> %perm) {
+ %pcr = call i32 @llvm.tpu.set.permute(<1024 x i32> %perm, i32 0)
+ %xlu = call i32 @llvm.tpu.permute.packed.v1024i32(<1024 x i32> %vlow, <1024 x i32> %vhigh, i32 %pcr, i32 0)
+ %res = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu)
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: permute_packed_f:
+; CHECK: { (pcr0) = vsetperm.u8 v2 }
+; CHECK: { (trf0) = vperm.lane.packed v0;
+; CHECK: _ = vsupp v1;
+; CHECK: _ = vdelay $0x79 }
+; CHECK: { v{{[0-9]+}} = vpop (trf0)
+; CHECK: _ = shalt }
+ define <1024 x float> @permute_packed_f(<1024 x float> %vlow, <1024 x float> %vhigh, <1024 x i32> %perm) {
+ %pcr = call i32 @llvm.tpu.set.permute(<1024 x i32> %perm, i32 0)
+ %xlu = call i32 @llvm.tpu.permute.packed.v1024f32(<1024 x float> %vlow, <1024 x float> %vhigh, i32 %pcr, i32 0)
+ %res = call <1024 x float> @llvm.tpu.tc.vtrfpop.v1024f32(i32 0, i32 %xlu)
+ ret <1024 x float> %res
+}
+
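+; Check lowering of the packed rotate with a register and an immediate amount.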
+; CHECK-LABEL: rotate_packed:
+; CHECK: { (trf0) = vrot.lane.packed v0, s0;
+; CHECK: _ = vsupp v1;
+; CHECK: _ = vdelay $0x79 }
+; CHECK: { v{{[0-9]+}} = vpop (trf0)
+; CHECK: _ = shalt }
+ define <1024 x i32> @rotate_packed(<1024 x i32> %vlow, <1024 x i32> %vhigh, i32 %amount) {
+ %xlu = call i32 @llvm.tpu.vrotate.packed.v1024i32(<1024 x i32> %vlow, <1024 x i32> %vhigh, i32 %amount, i32 0)
+ %res = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu)
+ ret <1024 x i32> %res
+}
+
+; CHECK-LABEL: rotate_packed_i:
+; CHECK: { (trf0) = vrot.lane.packed v0, $0x5;
+; CHECK: _ = vsupp v1;
+; CHECK: _ = vdelay $0x79 }
+; CHECK: { v{{[0-9]+}} = vpop (trf0)
+; CHECK: _ = shalt }
+ define <1024 x i32> @rotate_packed_i(<1024 x i32> %vlow, <1024 x i32> %vhigh) {
+ %xlu = call i32 @llvm.tpu.vrotate.packed.v1024i32(<1024 x i32> %vlow, <1024 x i32> %vhigh, i32 5, i32 0)
+ %res = call <1024 x i32> @llvm.tpu.tc.vtrfpop.v1024i32(i32 0, i32 %xlu)
+ ret <1024 x i32> %res
+}
diff --git a/tpu_recision/third_party/llvm/llvm/test/MC/GoogleTPU/encode_tc.s b/tpu_recision/third_party/llvm/llvm/test/MC/GoogleTPU/encode_tc.s
new file mode 100644
index 0000000..9a9b79e
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/MC/GoogleTPU/encode_tc.s
@@ -0,0 +1,35 @@
+// RUN: llvm-mc < %s -arch=googletpu -assemble -show-encoding \
+// RUN: -tpu-print-mcinst-encodings -mcpu=tensorcore-jf \
+// RUN: | FileCheck %s --check-prefix=JFC
+// RUN: llvm-mc < %s -arch=googletpu -assemble -show-encoding \
+// RUN: -tpu-print-mcinst-encodings -mcpu=tensorcore-pf \
+// RUN: | FileCheck %s --check-prefix=PXC
+// REQUIRES: tpu
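+// Check the raw bundle byte encodings for both tensorcore-jf and tensorcore-pf.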
+
+// JFC: { s6 = ssub.f32 @p7 $2.000000e+00 (zext imm1 encoding 1), s10 } // encoding: [0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x6f,0x02,0x65,0x9e,0xff,0xff,0xff,0x03]
+// PXC: { s6 = ssub.f32 @p7 $2.000000e+00 (zext imm1 encoding 1), s10 } // encoding: [0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x9b,0x40,0x99,0xe7,0xff,0xff,0xff]
+{ s6 = ssub.f32 @p7 $2.0 (zext imm1 encoding 1), s10 }
+
+// JFC: { s6 = ssub.f32 @p7 $2.000000e+00 (oneext imm2 encoding 1), s10 } // encoding: [0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x6f,0x02,0x65,0x9e,0xff,0xff,0xff,0x03]
+// PXC: { s6 = ssub.f32 @p7 $2.000000e+00 (oneext imm2 encoding 1), s10 } // encoding: [0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x9b,0x40,0x99,0xe7,0xff,0xff,0xff]
+{ s6 = ssub.f32 @p7 $2.0 (oneext imm2 encoding 1), s10 }
+
+// JFC: { (slot_s0) s6 = sadd.s32 s10, s10 } // encoding: [0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f,0xa3,0x28,0xf0,0x01]
+// PXC: { (slot_s0) s6 = sadd.s32 s10, s10 } // encoding: [0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xdf,0x28,0x0a,0x7c]
+{ (slot_s0) s6 = sadd.s32 s10, s10 }
+
+// JFC: { (slot_s1) s6 = sadd.s32 s10, s10 } // encoding: [0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x6f,0x14,0x05,0xbe,0xff,0xff,0xff,0x03]
+// PXC: { (slot_s1) s6 = sadd.s32 s10, s10 } // encoding: [0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x1b,0x45,0x81,0xef,0xff,0xff,0xff]
+{ (slot_s1) s6 = sadd.s32 s10, s10 }
+
+// JFC: { (slot_s0) s6 = sadd.s32 $0x0 (embed encoding 1), s10 } // encoding: [0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f,0x13,0x28,0xf0,0x01]
+// PXC: { (slot_s0) s6 = sadd.s32 $0x0 (embed encoding 1), s10 } // encoding: [0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xdf,0x04,0x0a,0x7c]
+{ (slot_s0) s6 = sadd.s32 $0 (embed encoding 1), s10 }
+
+// JFC: { (slot_s0) s6 = sadd.s32 $0x7B (i32 imm23 encoding 1), s10 } // encoding: [0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f,0x13,0x28,0xf0,0x01]
+// PXC: { (slot_s0) s6 = sadd.s32 $0x7B (i32 imm23 encoding 1), s10 } // encoding: [0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xdf,0x04,0x0a,0x7c]
+{ (slot_s0) s6 = sadd.s32 $123 (i32 imm23 encoding 1), s10 }
+
+// JFC: { (slot_v1) (erf) = vlog2.f32 v20 } // encoding: [0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x83,0xff,0xa9,0xfc,0xfe,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x03]
+// PXC: { (slot_v1) (erf) = vlog2.f32 v20 } // encoding: [0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f,0xf0,0x3f,0x95,0xdf,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff]
+{ (slot_v1) (erf) = vlog2.f32 v20 }
diff --git a/tpu_recision/third_party/llvm/llvm/test/MC/GoogleTPU/parser_sc.s b/tpu_recision/third_party/llvm/llvm/test/MC/GoogleTPU/parser_sc.s
new file mode 100644
index 0000000..107b44e
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/MC/GoogleTPU/parser_sc.s
@@ -0,0 +1,231 @@
+// RUN: llvm-mc < %s -triple googletpu-- -mcpu=sparsecore-tec-vf \
+// RUN: | FileCheck %s
+// RUN: llvm-mc < %s -triple googletpu-- -mcpu=sparsecore-tec-vf \
+// RUN: -tpu-print-cbr-regs \
+// RUN: | llvm-mc -triple googletpu-- -mcpu=sparsecore-tec-vf \
+// RUN: | FileCheck %s
+// REQUIRES: tpu
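+// Check that SparseCore bundles parse and print in canonical form; the second
+// RUN pipes the printed output back through llvm-mc to verify a round trip.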
+
+ .text
+ .file "test.c"
+ .globl vector_trivial
+ .p2align 2
+ .type vector_trivial,@function
+vector_trivial:
+{ s1 = simm.s32 $0x0;
+ s1 = simm.s32 $0x0 }
+{ s2 = simm.f32 $0.04 }
+{ s2 = sadd.f32 $1.000000e-01, s1 }
+{ v0 = vld @!p0 [tilespmem:s1+$0x0] }
+{ v0 = vld @!p0 [tilespmem:g] }
+{ (v2sf) = vpush v0, s0 }
+{ (v2sf) = vpush @p0 v0, s0 }
+{ (v2sf) = vpush @!p1 v0, s0 }
+{ (v2sf) = vpush v0, $0x1 }
+{ (v2sf) = vpush @p0 v0, $0x2 }
+{ (v2sf) = vpush @!p1 v0, $0x3 }
+{ s1 = spop (v2sf) }
+{ s1 = spop @p0 (v2sf) }
+{ s1 = spop @!p1 (v2sf) }
+{ _ = swait.done [sflag:s0] }
+{ s0 = ssyncread.done [sflag:g] }
+{ p0 = por $0x0, $0x0 }
+{ p1 = por !p0, !p0 }
+{ p2 = por !p0, p1 }
+{ p3 = por p0, !p1 }
+{ (trf0) = vxpose.0 v11, $0x80 }
+{ (iar0) = vsetiar.raw v0 }
+{ v3 = vmax.f32 v7, $Inf }
+{ v3 = vmax.f32 v7, $-Inf }
+{ v7 = vsub.f32 $1.0, v29 }
+{ v0 = vmul.f32 $-0.10000000149011612, v0 }
+{ [hib], [sflag:s31] = dma.local [hbm:s1], $0x1 }
+{ s0 = sadd.s32 s0, s1; _ = shalt }
+{ v0 = vpop @!p1 (trf0) }
+{ _ = vsupp @p0 v1 }
+{ (drf) = sdivrem.u32 s0, s1 }
+{ (drf) = sdiv.u32 s0, s1 }
+{ (drf) = srem.u32 s0, s1 }
+{ s0 = spop (drf) }
+{ s0 = sld.cb [smem:$0x101 cbreg:cb0] }
+{ s1 = sld.cb.upd [smem:$0xffff cbreg:cb1] }
+{ [smem:$0xfffff cbreg:cb2] = sst.cb s0 }
+{ [smem:$0xffffe cbreg:cb9] = sst.cb @!p0 s31 }
+{ s0 = rdcbreg [cbreg:cb1 metadata:$0x0] }
+{ s0 = rdcbreg [cbreg:cb1 metadata:$0x0] }
+{ s0 = rdcbreg [cbreg:cb1 metadata:$0x1] }
+{ s0 = rdcbreg [cbreg:cb1 metadata:$0x2] }
+{ [cbreg:cb1 metadata:$0x0] = wrcbreg s0 }
+{ [cbreg:cb1 metadata:$0x0] = wrcbreg s0 }
+{ [cbreg:cb1 metadata:$0x1] = wrcbreg s0 }
+{ [cbreg:cb1 metadata:$0x2] = wrcbreg s0 }
+{ [cbreg:cb1] = cbreg.add s0 }
+{ v0 = vld.idx.msk [tilespmem:v0+s0+$0x0], vm0 }
+{ v0 = vld.idx.msk [tilespmem:v0+s0+$0x0], vm0 }
+{ [tilespmem:s0+$0x0] = vst.msk vm0, v0 }
+{ [tilespmem:v0+s0+$0x0] = vst.idx.msk vm0, v1 }
+{ [tilespmem:v0+s0+$0x0] = vst.idx.msk vm0, v1 }
+{ [tilespmem:v0+s0+$0x0] = vst.idx.add.s32.msk vm0, v1 }
+{ [tilespmem:v0+s0+$0x0] = vst.idx.add.f32.msk vm0, v1 }
+{ [tilespmem:v0+s0+$0x0] = vst.idx.add.f32.msk vm0, v1 }
+{ [tilespmem:v0+s0+$0x0] = vst.idx.add.s32.msk vm0, v1 }
+{ v0 = vld.idx.msk [tilespmem:v0+s0+$0x0], vm0 }
+{ v0 = vld.idx.msk [tilespmem:v0+s0+$0x0], vm0 }
+{ v0 = vld.idx.cb.msk [tilespmem:v0+s0+$0x0 cbreg:cb9], vm0 }
+{ v0 = vld.idx.cb.msk [tilespmem:v0+s0+$0x0 cbreg:cb9], vm0 }
+{ v0 = vld.idx.cb.msk [tilespmem:v0+s0+$0x0 cbreg:cb9], vm0 }
+{ v0 = vld.idx.cb.msk [tilespmem:v0+s0+$0x0 cbreg:cb9], vm0 }
+{ [tilespmem:s0+$0x0 cbreg:cb3] = vst.cb.msk vm0, v0 }
+{ [tilespmem:v0+s0+$0x0 cbreg:cb3] = vst.idx.cb.msk vm0, v1 }
+{ [tilespmem:v0+s0+$0x0 cbreg:cb3] = vst.idx.cb.msk vm0, v1 }
+{ [tilespmem:v0+s0+$0x0 cbreg:cb3] = vst.idx.cb.add.s32.msk vm0, v1 }
+{ [tilespmem:v0+s0+$0x0 cbreg:cb3] = vst.idx.cb.add.f32.msk vm0, v1 }
+{ [tilespmem:v0+s0+$0x0 cbreg:cb3] = vst.idx.cb.add.s32.msk vm0, v1 }
+{ [tilespmem:v0+s0+$0x0 cbreg:cb3] = vst.idx.cb.add.f32.msk vm0, v1 }
+{ [tilespmem:s0+$0x0 cbreg:cb3] = vst.cb.upd.msk vm0, v0 }
+{ (xrf1) = vsort.ascd.msk.u32 vm0, v0, v1 }
+{ (xrf1) = vsort.ascd.msk.f32 vm0, v0, v1 }
+{ (xrf1) = vsort.dscd.msk.u32 vm0, v0, v1 }
+{ (xrf1) = vsort.dscd.msk.f32 vm0, v0, v1 }
+{ (xrf1) = vdupcnt.msk.u32 vm0, v0 }
+{ (xrf1) = vdupcnt.msk.f32 vm0, v0 }
+{ (xrf1) = vunique.msk.u32 vm0, v0 }
+{ (xrf1) = vunique.msk.f32 vm0, v0 }
+{ _ = sfence.scmf }
+{ _ = sfence.sel $0x9 }
+{ _ = sfence.stream.spmem }
+{ _ = sfence.stream.hbm }
+{ _ = snop }
+{ _ = sdelay $0x1 }
+{ v2 = vpsel p0, v0, v1 }
+{ s0 = smin.f32 s0, s1 }
+{ s0 = smax.f32 s0, s1 }
+{ s0 = smin.u32 s0, s1 }
+{ s0 = smax.u32 s0, s1 }
+{ s0 = smin.f32 s0, $0.1 }
+{ s0 = smax.f32 s0, $-0.3 }
+{ s0 = smin.u32 s0, $256 }
+func_end0:
+ .size vector_trivial, func_end0-vector_trivial
+
+ .type a,@object
+ .data
+ .globl a
+ .p2align 5
+a:
+ .long 4
+ .long 4
+ .long 4
+ .long 4
+ .long 4
+ .long 4
+ .long 4
+ .long 4
+ .size a, 32
+
+
+ .ident "clang version google3-trunk (trunk r360825)"
+ .section ".note.GNU-stack","",@progbits
+
+// CHECK: .text
+// CHECK: .file "test.c"
+// CHECK: .globl vector_trivial
+// CHECK: .p2align 2
+// CHECK: .type vector_trivial,@function
+// CHECK:vector_trivial:
+// CHECK:{ s1 = simm.s32 $0x0;
+// CHECK: s1 = simm.s32 $0x0 }
+// CHECK: { s2 = simm.f32 $0.039999999105930328 }
+// CHECK: { s2 = sadd.f32 $0.10000000149011612, s1 }
+// CHECK:{ v0 = vld @!p0 [tilespmem:s1+$0x0] }
+// CHECK:{ v0 = vld @!p0 [tilespmem:g] }
+// CHECK:{ (v2sf) = vpush v0, s0 }
+// CHECK:{ (v2sf) = vpush @p0 v0, s0 }
+// CHECK:{ (v2sf) = vpush @!p1 v0, s0 }
+// CHECK:{ (v2sf) = vpush v0, $0x1 }
+// CHECK:{ (v2sf) = vpush @p0 v0, $0x2 }
+// CHECK:{ (v2sf) = vpush @!p1 v0, $0x3 }
+// CHECK:{ s1 = spop (v2sf) }
+// CHECK: { _ = swait.done [sflag:s0] }
+// CHECK: { s0 = ssyncread.done [sflag:g] }
+// CHECK: { p0 = por $0x0, $0x0 }
+// CHECK: { p1 = por !p0, !p0 }
+// CHECK: { p2 = por !p0, p1 }
+// CHECK: { p3 = por p0, !p1 }
+// CHECK: { (trf0) = vxpose.0 v11, $0x80 }
+// CHECK: { (iar0) = vsetiar.raw v0 }
+// CHECK: { v3 = vmax.f32 v7, $Inf }
+// CHECK: { v3 = vmax.f32 v7, $-Inf }
+// CHECK: { v7 = vsub.f32 $1.0, v29 }
+// CHECK: { v0 = vmul.f32 $-0.10000000149011612, v0 }
+// CHECK:{ s0 = sadd.s32 s0, s1;
+// CHECK: _ = shalt }
+// CHECK: { v0 = vpop @!p1 (trf0) }
+// CHECK: { _ = vsupp @p0 v1 }
+// CHECK: { (drf) = sdivrem.u32 s0, s1 }
+// CHECK: { (drf) = sdiv.u32 s0, s1 }
+// CHECK: { (drf) = srem.u32 s0, s1 }
+// CHECK: { s0 = spop (drf) }
+// CHECK: { s0 = sld.cb [smem:$0x101 cbreg:$0x0] }
+// CHECK: { s1 = sld.cb.upd [smem:$0xffff cbreg:$0x1] }
+// CHECK: { [smem:$0xfffff cbreg:$0x2] = sst.cb s0 }
+// CHECK: { [smem:$0xffffe cbreg:$0x9] = sst.cb @!p0 s31 }
+// CHECK: { s0 = rdcbreg [cbreg:$0x1 metadata:$0x0] }
+// CHECK: { s0 = rdcbreg [cbreg:$0x1 metadata:$0x0] }
+// CHECK: { s0 = rdcbreg [cbreg:$0x1 metadata:$0x1] }
+// CHECK: { s0 = rdcbreg [cbreg:$0x1 metadata:$0x2] }
+// CHECK: { [cbreg:$0x1 metadata:$0x0] = wrcbreg s0 }
+// CHECK: { [cbreg:$0x1 metadata:$0x0] = wrcbreg s0 }
+// CHECK: { [cbreg:$0x1 metadata:$0x1] = wrcbreg s0 }
+// CHECK: { [cbreg:$0x1 metadata:$0x2] = wrcbreg s0 }
+// CHECK: { [cbreg:$0x1] = cbreg.add s0 }
+// CHECK: { v0 = vld.idx.msk [tilespmem:v0+s0+$0x0], vm0 }
+// CHECK: { v0 = vld.idx.msk [tilespmem:v0+s0+$0x0], vm0 }
+// CHECK: { [tilespmem:s0+$0x0] = vst.msk vm0, v0 }
+// CHECK: { [tilespmem:v0+s0+$0x0] = vst.idx.msk vm0, v1 }
+// CHECK: { [tilespmem:v0+s0+$0x0] = vst.idx.msk vm0, v1 }
+// CHECK: { [tilespmem:v0+s0+$0x0] = vst.idx.add.s32.msk vm0, v1 }
+// CHECK: { [tilespmem:v0+s0+$0x0] = vst.idx.add.f32.msk vm0, v1 }
+// CHECK: { [tilespmem:v0+s0+$0x0] = vst.idx.add.f32.msk vm0, v1 }
+// CHECK: { [tilespmem:v0+s0+$0x0] = vst.idx.add.s32.msk vm0, v1 }
+// CHECK: { v0 = vld.idx.msk [tilespmem:v0+s0+$0x0], vm0 }
+// CHECK: { v0 = vld.idx.msk [tilespmem:v0+s0+$0x0], vm0 }
+// CHECK: { [tilespmem:s0+$0x0 cbreg:$0x3] = vst.cb.msk vm0, v0 }
+// CHECK: { [tilespmem:v0+s0+$0x0 cbreg:$0x3] = vst.idx.cb.msk vm0, v1 }
+// CHECK: { [tilespmem:v0+s0+$0x0 cbreg:$0x3] = vst.idx.cb.msk vm0, v1 }
+// CHECK: { [tilespmem:v0+s0+$0x0 cbreg:$0x3] = vst.idx.cb.add.s32.msk vm0, v1 }
+// CHECK: { [tilespmem:v0+s0+$0x0 cbreg:$0x3] = vst.idx.cb.add.f32.msk vm0, v1 }
+// CHECK: { [tilespmem:v0+s0+$0x0 cbreg:$0x3] = vst.idx.cb.add.s32.msk vm0, v1 }
+// CHECK: { [tilespmem:v0+s0+$0x0 cbreg:$0x3] = vst.idx.cb.add.f32.msk vm0, v1 }
+// CHECK: { [tilespmem:s0+$0x0 cbreg:$0x3] = vst.cb.upd.msk vm0, v0 }
+// CHECK: { (xrf1) = vsort.ascd.msk.u32 vm0, v0, v1 }
+// CHECK: { (xrf1) = vsort.ascd.msk.f32 vm0, v0, v1 }
+// CHECK: { (xrf1) = vsort.dscd.msk.u32 vm0, v0, v1 }
+// CHECK: { (xrf1) = vsort.dscd.msk.f32 vm0, v0, v1 }
+// CHECK: { (xrf1) = vdupcnt.msk.u32 vm0, v0 }
+// CHECK: { (xrf1) = vdupcnt.msk.f32 vm0, v0 }
+// CHECK: { (xrf1) = vunique.msk.u32 vm0, v0 }
+// CHECK: { (xrf1) = vunique.msk.f32 vm0, v0 }
+// CHECK: { _ = sfence.scmf }
+// CHECK: { _ = sfence.sel $0x9 }
+// CHECK: { _ = sfence.stream.spmem }
+// CHECK: { _ = sfence.stream.hbm }
+// CHECK: { _ = snop }
+// CHECK: { _ = sdelay $0x1 }
+// CHECK: { v2 = vpsel p0, v0, v1 }
+// CHECK: { s0 = smin.f32 s0, s1 }
+// CHECK: { s0 = smax.f32 s0, s1 }
+// CHECK: { s0 = smin.u32 s0, s1 }
+// CHECK: { s0 = smax.u32 s0, s1 }
+// CHECK: { s0 = smin.f32 s0, $0.10000000149011612 }
+// CHECK: { s0 = smax.f32 s0, $-0.30000001192092896 }
+// CHECK: { s0 = smin.u32 s0, $0x100 }
+// CHECK:func_end0:
+// CHECK: .size vector_trivial, func_end0-vector_trivial
+
+// CHECK: .type a,@object
+// CHECK: .data
+// CHECK: .globl a
+// CHECK: .p2align 5
+// CHECK:a:
+// CHECK: .long 4
diff --git a/tpu_recision/third_party/llvm/llvm/test/MC/GoogleTPU/parser_tc.s b/tpu_recision/third_party/llvm/llvm/test/MC/GoogleTPU/parser_tc.s
new file mode 100644
index 0000000..54be944
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/MC/GoogleTPU/parser_tc.s
@@ -0,0 +1,104 @@
+// RUN: llvm-mc < %s -triple googletpu-- -mcpu=tensorcore-pf | FileCheck %s
+// REQUIRES: tpu
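+// Check that TensorCore bundles parse and print back in canonical form.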
+
+ .text
+ .file "test.c"
+ .globl vector_trivial
+ .p2align 2
+ .type vector_trivial,@function
+vector_trivial:
+{ s1 = simm.s32 $0x0 }
+{ s2 = simm.f32 $4.000000e-02 }
+{ p0 = sc.u32 s1, $0x1234 }
+{ p1 = sc.u32 s1, s2 }
+{ _ = vwait.ge [sflag:s2], $0x4 }
+{ _ = vwait.gt [sflag:s2], $0x4 }
+{ _ = vwait.le [sflag:s2], $0x4 }
+{ _ = vwait.lt [sflag:s2], $0x4 }
+{ _ = vwait.ne [sflag:s2], $0x4 }
+{ [hbm:s0], [sflag:s2] = dma.local [vmem:s1], $0x4 }
+{ _ = vwait.eq [sflag:s2], $0x4 }
+{ _ = vtrace s2 }
+{ _ = vtrace $0x53 }
+{ (tm) = vsettm @p2 s1 }
+{ (tm) = vsettm @!p2 $0x123 }
+{ s0 = simm.s32 $0xfb;
+ [sflag:s2] = vsyncadd.s32 $-0x4 }
+{ (gsfn1) = vmatpush.f32 v0 }
+{ (gmr0) = vdwg.f16 (gsfn0) }
+{ v1 = vmatres.8x128.f32 @!p0 (mrf0) }
+{ _ = vint @p0 $0x2 }
+{ _ = shalt }
+{ _ = setrngseed v1 }
+{ v2 = getrngseed }
+{ v3 = vrng.8x128.u32 }
+{ _ = snop }
+{ _ = vnop }
+{ _ = sdelay $0x1 }
+{ _ = vdelay $0x1 }
+func_end0:
+ .size vector_trivial, func_end0-vector_trivial
+
+ .type a,@object
+ .data
+ .globl a
+ .p2align 5
+a:
+ .long 4
+ .long 4
+ .long 4
+ .long 4
+ .long 4
+ .long 4
+ .long 4
+ .long 4
+ .size a, 32
+
+
+ .ident "clang version google3-trunk (trunk r360825)"
+ .section ".note.GNU-stack","",@progbits
+
+// CHECK: .text
+// CHECK: .file "test.c"
+// CHECK: .globl vector_trivial
+// CHECK: .p2align 2
+// CHECK: .type vector_trivial,@function
+// CHECK:vector_trivial:
+// CHECK:{ s1 = simm.s32 $0x0 }
+// CHECK:{ s2 = simm.f32 $0.039999999105930328 }
+// CHECK:{ p0 = sc.u32 s1, $0x1234 }
+// CHECK:{ p1 = sc.u32 s1, s2 }
+// CHECK:{ _ = vwait.ge [sflag:s2], $0x4 }
+// CHECK:{ _ = vwait.gt [sflag:s2], $0x4 }
+// CHECK:{ _ = vwait.le [sflag:s2], $0x4 }
+// CHECK:{ _ = vwait.lt [sflag:s2], $0x4 }
+// CHECK:{ _ = vwait.ne [sflag:s2], $0x4 }
+// CHECK:{ [hbm:s0], [sflag:s2] = dma.local [vmem:s1], $0x4 }
+// CHECK:{ _ = vwait.eq [sflag:s2], $0x4 }
+// CHECK:{ _ = vtrace s2 }
+// CHECK:{ _ = vtrace $0x53 }
+// CHECK:{ (tm) = vsettm @p2 s1 }
+// CHECK:{ (tm) = vsettm @!p2 $0x123 }
+// CHECK:{ s0 = simm.s32 $0xfb;
+// CHECK: [sflag:s2] = vsyncadd.s32 $-0x4 }
+// CHECK:{ (gsfn1) = vmatpush.f32 v0 }
+// CHECK:{ (gmr0) = vdwg.f16 (gsfn0) }
+// CHECK:{ v1 = vmatres.8x128.f32 @!p0 (mrf0) }
+// CHECK:{ _ = vint @p0 $0x2 }
+// CHECK:{ _ = shalt }
+// CHECK:{ _ = setrngseed v1 }
+// CHECK:{ v2 = getrngseed }
+// CHECK:{ v3 = vrng.8x128.u32 }
+// CHECK: { _ = snop }
+// CHECK: { _ = vnop }
+// CHECK: { _ = sdelay $0x1 }
+// CHECK: { _ = vdelay $0x1 }
+// CHECK:func_end0:
+// CHECK: .size vector_trivial, func_end0-vector_trivial
+
+// CHECK: .type a,@object
+// CHECK: .data
+// CHECK: .globl a
+// CHECK: .p2align 5
+// CHECK:a:
+// CHECK: .long 4
diff --git a/tpu_recision/third_party/llvm/llvm/test/MC/GoogleTPU/slotencoding_tc.s b/tpu_recision/third_party/llvm/llvm/test/MC/GoogleTPU/slotencoding_tc.s
new file mode 100644
index 0000000..0aa38ae
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/test/MC/GoogleTPU/slotencoding_tc.s
@@ -0,0 +1,47 @@
+// RUN: llvm-mc < %s -triple googletpu-tensorcore-- -mcpu=tensorcore-pf \
+// RUN: -tpu-print-mcinst-encodings \
+// RUN: | llvm-mc -triple googletpu-tensorcore-- -mcpu=tensorcore-pf \
+// RUN: -tpu-print-mcinst-encodings | FileCheck %s
+// REQUIRES: tpu
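+// Check that explicit slot and immediate-encoding annotations survive a
+// print/re-parse round trip with -tpu-print-mcinst-encodings.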
+
+ .text
+ .file "test.c"
+ .globl vector_trivial
+ .p2align 2
+ .type vector_trivial,@function
+vector_trivial:
+// CHECK: { s1 = simm.s32 $0x0 (embed encoding 3) }
+{ s1 = simm.s32 $0x0 (embed encoding 3) }
+// CHECK: { s2 = simm.f32 $4.000000e-02 (i32 imm23 encoding 1) }
+{ s2 = simm.f32 $4.000000e-02 (i32 imm23 encoding 1) }
+// CHECK: { p0 = sc.u32 s1, $0x1234 (oneext imm1 encoding 1) }
+{ p0 = sc.u32 s1, $0x1234 (oneext imm1 encoding 1) }
+{ p1 = sc.u32 s1, s2 }
+{ _ = vwait.ge [sflag:s2], $0x4 }
+// CHECK: { _ = vwait.ge [sflag:s2], $0x4 (zext imm3 encoding 1) }
+{ _ = vwait.ge [sflag:s2], $0x4 (zext imm3 encoding 1) }
+// CHECK: { _ = vwait.gt [sflag:s2 (vs1)], $0x4 }
+{ _ = vwait.gt [sflag:s2 (vs1)], $0x4 }
+// CHECK: { _ = vwait.le [sflag:s2 (vs0)], $0x4 }
+{ _ = vwait.le [sflag:s2 (vs0)], $0x4 }
+// CHECK: { _ = vwait.lt [sflag:s2 (vs2)], $0x4 }
+{ _ = vwait.lt [sflag:s2 (vs2)], $0x4 }
+{ _ = vwait.ne [sflag:s2], $0x4 }
+{ [hbm:s0], [sflag:s2] = dma.local [vmem:s1], $0x4 }
+{ _ = vwait.eq [sflag:s2], $0x4 }
+// CHECK: { _ = vtrace s2 (vs1) }
+{ _ = vtrace s2 (vs1) }
+{ _ = vtrace $0x53 }
+{ s0 = simm.s32 $0xfb;
+ [sflag:s2] = vsyncadd.s32 $-0x4 }
+{ _ = shalt }
+
+// CHECK: { s6 = sadd.s32 s10, s10 }
+{ s6 = sadd.s32 s10, s10 }
+// CHECK: { (slot_s0) s6 = sadd.s32 s10, s10 }
+{ (slot_s0) s6 = sadd.s32 s10, s10 }
+// CHECK: { (slot_s1) s6 = sadd.s32 s10, s10 }
+{ (slot_s1) s6 = sadd.s32 s10, s10 }
+
+func_end0:
+ .size vector_trivial, func_end0-vector_trivial
diff --git a/tpu_recision/third_party/llvm/llvm/tools/clang/lib/Basic/Targets/GoogleTPU.cpp b/tpu_recision/third_party/llvm/llvm/tools/clang/lib/Basic/Targets/GoogleTPU.cpp
new file mode 100644
index 0000000..9985d9f
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/tools/clang/lib/Basic/Targets/GoogleTPU.cpp
@@ -0,0 +1,82 @@
+//===--- GoogleTPU.cpp - Implement TPU target feature support -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements TPU TargetInfo objects.
+//
+//===----------------------------------------------------------------------===//
+
+#include "third_party/llvm/llvm/tools/clang/lib/Basic/Targets/GoogleTPU.h"
+
+#include <optional>
+
+#include "clang/Basic/MacroBuilder.h"
+#include "clang/Basic/TargetBuiltins.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+
+using namespace clang;
+using namespace clang::targets;
+
+static constexpr std::array<llvm::StringLiteral, 13> TargetNames = {
+ "sparsecore-tec-vf", "sparsecore-scs-vf", "sparsecore-tac-vf",
+ "sparsecore-tec-gl", "sparsecore-scs-gl", "sparsecore-tac-gl",
+ "sparsecore-tec-gf", "sparsecore-scs-gf", "sparsecore-tac-gf",
+ "tensorcore-jf", "tensorcore-df", "tensorcore-pf",
+ "tensorcore-vf",
+};
+
+const Builtin::Info TPUTargetInfo::BuiltinInfo[] = {
+#define BUILTIN(ID, TYPE, ATTRS) \
+ {#ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr},
+#define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) \
+ {#ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, FEATURE},
+#include "third_party/llvm/llvm/tools/clang/include/clang/Basic/BuiltinsTPU.def"
+};
+
+ArrayRef<const char *> TPUTargetInfo::getGCCRegNames() const {
+ return std::nullopt;
+}
+
+ArrayRef<TargetInfo::GCCRegAlias> TPUTargetInfo::getGCCRegAliases() const {
+ return std::nullopt;
+}
+
+bool TPUTargetInfo::isValidCPUName(StringRef Name) const {
+ for (const auto C : TargetNames) {
+ if (Name == C)
+ return true;
+ }
+ return false;
+}
+
+void TPUTargetInfo::fillValidCPUList(SmallVectorImpl<StringRef> &Values) const {
+ for (const auto C : TargetNames) {
+ Values.push_back(C);
+ }
+}
+
+bool TPUTargetInfo::setCPU(const std::string &Name) {
+ if (!isValidCPUName(Name))
+ return false;
+ return true;
+}
+
+bool TPUTargetInfo::hasFeature(StringRef Feature) const {
+ return llvm::StringSwitch<bool>(Feature).Case("TPU", true).Default(false);
+}
+
+void TPUTargetInfo::getTargetDefines(const LangOptions &Opts,
+ MacroBuilder &Builder) const {
+ // Define __TPU__ when building for target TPU.
+ Builder.defineMacro("__TPU__");
+}
+
+ArrayRef<Builtin::Info> TPUTargetInfo::getTargetBuiltins() const {
+ return llvm::ArrayRef(BuiltinInfo, clang::TPU::LastTSBuiltin -
+ Builtin::FirstTSBuiltin);
+}
diff --git a/tpu_recision/third_party/llvm/llvm/tools/clang/lib/Basic/Targets/GoogleTPU.h b/tpu_recision/third_party/llvm/llvm/tools/clang/lib/Basic/Targets/GoogleTPU.h
new file mode 100644
index 0000000..a51801b
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/tools/clang/lib/Basic/Targets/GoogleTPU.h
@@ -0,0 +1,88 @@
+//===--- GoogleTPU.h - Declare TPU support --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares TPU TargetInfo objects. TPU is
+// Google-internal.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_BASIC_TARGETS_GOOGLETPU_H
+#define LLVM_CLANG_LIB_BASIC_TARGETS_GOOGLETPU_H
+
+#include "clang/Basic/TargetInfo.h"
+#include "clang/Basic/TargetOptions.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/Compiler.h"
+
+namespace clang {
+namespace targets {
+
+class LLVM_LIBRARY_VISIBILITY TPUTargetInfo : public TargetInfo {
+ static const Builtin::Info BuiltinInfo[];
+
+ static const TargetInfo::GCCRegAlias GCCRegAliases[];
+ static const char *const GCCRegNames[];
+
+public:
+ TPUTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
+ : TargetInfo(Triple) {
+ // Description string has to be kept in sync with the backend.
+ resetDataLayout("e" // Little endian
+ "-m:e" // ELF name mangling
+ "-p:32:32" // 32 bit pointers, 32 bit aligned
+ "-i64:64" // 64 bit integers, 64 bit aligned
+ "-a:0:32" // 32 bit alignment of objects of aggregate type
+ "-n32" // 32 bit native integer width
+ "-S64" // 64 bit natural stack alignment
+ );
+
+ // Setting RegParmMax equal to what mregparm was set to in the old
+ // toolchain.
+ RegParmMax = 4;
+
+ // We actually want i1 for bool, but clang does not work well w/ POD < i8.
+ // To reproduce:
+ // echo 'class C { bool b; } c;' |
+ // ./blaze-bin/third_party/llvm/llvm-project/clang/clang --target=googletpu
+ // -mcpu=sparsecore-tec-vf -x c++ - -Xclang -fdump-record-layouts
+ // Instead, we're making bool i8 in clang.
+ BoolWidth = 8;
+ }
+
+ void getTargetDefines(const LangOptions &Opts,
+ MacroBuilder &Builder) const override;
+
+ bool isValidCPUName(StringRef Name) const override;
+
+ void fillValidCPUList(SmallVectorImpl<StringRef> &Values) const override;
+
+ bool setCPU(const std::string &Name) override;
+
+ bool hasFeature(StringRef Feature) const override;
+
+ ArrayRef<const char *> getGCCRegNames() const override;
+
+ ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override;
+
+ BuiltinVaListKind getBuiltinVaListKind() const override {
+ return TargetInfo::VoidPtrBuiltinVaList;
+ }
+
+ ArrayRef<Builtin::Info> getTargetBuiltins() const override;
+
+ bool validateAsmConstraint(const char *&Name,
+ TargetInfo::ConstraintInfo &info) const override {
+ return false;
+ }
+
+ const char *getClobbers() const override { return ""; }
+};
+} // namespace targets
+} // namespace clang
+
+#endif // LLVM_CLANG_LIB_BASIC_TARGETS_GOOGLETPU_H
diff --git a/tpu_recision/third_party/llvm/llvm/unittests/Target/GoogleTPU/TPUAliasSetTrackerTest.cpp b/tpu_recision/third_party/llvm/llvm/unittests/Target/GoogleTPU/TPUAliasSetTrackerTest.cpp
new file mode 100644
index 0000000..b005f22
--- /dev/null
+++ b/tpu_recision/third_party/llvm/llvm/unittests/Target/GoogleTPU/TPUAliasSetTrackerTest.cpp
@@ -0,0 +1,512 @@
+//===-- TPUAliasSetTrackerTest.cpp - Tests for TPUAliasSetTracker -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Tests for TPUAliasSetTracker.
+//
+//===----------------------------------------------------------------------===//
+#include <optional>
+
+#include "TPUAliasSetTracker.h"
+#include "TPUSubtarget.h"
+#include "TPUTargetMachine.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/CodeGen/MIRParser/MIRParser.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetSelect.h"
+
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::TPU;
+
+namespace {
+std::unique_ptr<LLVMTargetMachine> createTargetMachine() {
+ auto TT(Triple::normalize("googletpu--"));
+ std::string CPU("tensorcore-jf");
+ std::string FS("");
+
+ LLVMInitializeTPUTargetInfo();
+ LLVMInitializeTPUTarget();
+ LLVMInitializeTPUTargetMC();
+
+ std::string Error;
+ const Target *TheTarget = TargetRegistry::lookupTarget(TT, Error);
+
+ return std::unique_ptr<LLVMTargetMachine>(static_cast<LLVMTargetMachine *>(
+ TheTarget->createTargetMachine(TT, CPU, FS, TargetOptions(), std::nullopt,
+ std::nullopt, CodeGenOpt::Default)));
+}
+
+std::unique_ptr<TPUInstrInfo> createInstrInfo(TargetMachine *TM) {
+ TPUSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()),
+ std::string(TM->getTargetFeatureString()), *TM,
+ TargetOptions(), CodeModel::Small, /*TPUABIOvrd=*/false,
+ CodeGenOpt::Default);
+ return std::make_unique<TPUInstrInfo>(&ST, ST.getHwMode());
+}
+
+static void writeModuleToBuffer(std::unique_ptr<Module> Mod,
+ SmallVectorImpl<char> &Buffer) {
+ raw_svector_ostream OS(Buffer);
+ WriteBitcodeToFile(*Mod, OS);
+}
+
+static std::unique_ptr<Module> getLazyModuleFromAssembly(LLVMContext &Context,
+ SmallString<1024> &Mem,
+ StringRef Assembly) {
+ SMDiagnostic Err;
+ auto Mod = parseAssemblyString(Assembly, Err, Context);
+ if (!Mod) {
+ report_fatal_error(Err.getMessage());
+ }
+ writeModuleToBuffer(std::move(Mod), Mem);
+ Expected<std::unique_ptr<Module>> ModuleOrErr =
+ getLazyBitcodeModule(MemoryBufferRef(Mem.str(), "test"), Context);
+ if (!ModuleOrErr)
+ report_fatal_error("Could not parse bitcode module");
+ return std::move(ModuleOrErr.get());
+}
+
+} // namespace
+
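+// A store exactly one <1024 x i32> (8 x i4096) past the load never overlaps
+// it, so every instruction should report NoAlias.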
+TEST(TPUAliasSetTracker, TestNoAlias) {
+ std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
+ ASSERT_TRUE(TM);
+ SmallString<1024> Mem;
+ LLVMContext Context;
+ std::unique_ptr<Module> M = getLazyModuleFromAssembly(
+ Context, Mem,
+ "define void @f() {\n"
+ "entry:\n"
+ " %tmp0 = call <1024 x i32> addrspace(205)* "
+ "@llvm.tpu.inttoptr.p205v1024i32(i32 0)\n"
+ " %tmp1 = load <1024 x i32>, <1024 x i32> addrspace(205)* %tmp0\n"
+ " %tmp2 = bitcast <1024 x i32> addrspace(205)* %tmp0 to i4096 "
+ "addrspace(205)*\n"
+ " %tmp3 = getelementptr i4096, i4096 addrspace(205)* %tmp2, i32 8\n"
+ " %tmp4 = bitcast i4096 addrspace(205)* %tmp3 to <1024 x i32> "
+ "addrspace(205)*\n"
+ " store <1024 x i32> %tmp1, <1024 x i32> addrspace(205)* %tmp4\n"
+ " ret void\n"
+ "}\n"
+ "declare <1024 x i32> addrspace(205)* "
+ "@llvm.tpu.inttoptr.p205v1024i32(i32) #1\n"
+ "attributes #1 = { nounwind readnone }\n");
+ TPUAliasSetTracker<> AS(M->getDataLayout());
+ Function *F = M->getFunction("f");
+ ASSERT_FALSE(F->materialize());
+ for (auto &BB : *F) {
+ for (auto &I : BB) {
+ ASSERT_EQ(AS.aliasQuery(&I, true), AliasResult::NoAlias);
+ }
+ }
+}
+
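+// A store 4 x i4096 (half a vector) past the load overlaps it, so the store
+// should report MayAlias.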
+TEST(TPUAliasSetTracker, TestPartialAlias) {
+ std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
+ ASSERT_TRUE(TM);
+ SmallString<1024> Mem;
+ LLVMContext Context;
+ std::unique_ptr<Module> M = getLazyModuleFromAssembly(
+ Context, Mem,
+ "define void @f() {\n"
+ "entry:\n"
+ " %tmp0 = call <1024 x i32> addrspace(205)* "
+ "@llvm.tpu.inttoptr.p205v1024i32(i32 0)\n"
+ " %tmp1 = load <1024 x i32>, <1024 x i32> addrspace(205)* %tmp0\n"
+ " %tmp2 = bitcast <1024 x i32> addrspace(205)* %tmp0 to i4096 "
+ "addrspace(205)*\n"
+ " %tmp3 = getelementptr i4096, i4096 addrspace(205)* %tmp2, i32 4\n"
+ " %tmp4 = bitcast i4096 addrspace(205)* %tmp3 to <1024 x i32> "
+ "addrspace(205)*\n"
+ " store <1024 x i32> %tmp1, <1024 x i32> addrspace(205)* %tmp4\n"
+ " ret void\n"
+ "}\n"
+ "declare <1024 x i32> addrspace(205)* "
+ "@llvm.tpu.inttoptr.p205v1024i32(i32) #1\n"
+ "attributes #1 = { nounwind readnone }\n");
+ TPUAliasSetTracker<> AS(M->getDataLayout());
+ Function *F = M->getFunction("f");
+ ASSERT_FALSE(F->materialize());
+ for (auto &BB : *F) {
+ for (auto &I : BB) {
+ auto Result = AS.aliasQuery(&I, true);
+ if (isa<StoreInst>(&I)) {
+ ASSERT_EQ(Result, AliasResult::MayAlias);
+ } else {
+ ASSERT_EQ(Result, AliasResult::NoAlias);
+ }
+ }
+ }
+}
+
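+// Each access's alias.scope is listed in the other's noalias metadata, so the
+// load and store should not alias.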
+TEST(TPUAliasSetTracker, TestScopeAliasNoAlias) {
+ std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
+ ASSERT_TRUE(TM);
+ SmallString<1024> Mem;
+ LLVMContext Context;
+ std::unique_ptr<Module> M = getLazyModuleFromAssembly(
+ Context, Mem,
+ "define void @f(<1024 x i32> addrspace(205)* %a, <1024 x i32> "
+ "addrspace(205)* %b) {\n"
+ "entry:\n"
+ " %tmp0 = load <1024 x i32>, <1024 x i32> addrspace(205)* %a, "
+ "!alias.scope !6, !noalias !7\n"
+ " store <1024 x i32> %tmp0, <1024 x i32> addrspace(205)* %b, "
+ "!alias.scope !8, !noalias !9\n"
+ " ret void\n"
+ "}\n"
+ "declare <1024 x i32> addrspace(205)* "
+ "@llvm.tpu.inttoptr.p205v1024i32(i32) #1\n"
+ "attributes #1 = { nounwind readnone }\n"
+ "!0 = distinct !{!0, !\"fusion.25\"}\n"
+ "!1 = !{}\n"
+ "!2 = distinct !{!2, !0, !\"alloc\"}\n"
+ "!3 = distinct !{!3, !0, !\"alloc\"}\n"
+ "!4 = distinct !{!4, !0, !\"alloc\"}\n"
+ "!5 = distinct !{!5, !0, !\"alloc\"}\n"
+ "!6 = !{!3}\n"
+ "!7 = !{!2, !4, !5}\n"
+ "!8 = !{!4}\n"
+ "!9 = !{!2, !3, !5}\n");
+ TPUAliasSetTracker<> AS(M->getDataLayout());
+ Function *F = M->getFunction("f");
+ ASSERT_FALSE(F->materialize());
+ for (auto &BB : *F) {
+ for (auto &I : BB) {
+ ASSERT_EQ(AS.aliasQuery(&I, true), AliasResult::NoAlias);
+ }
+ }
+}
+
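+// Here the noalias lists do not exclude the other access's scope, so the
+// store should report MayAlias.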
+TEST(TPUAliasSetTracker, TestScopeAlias) {
+ std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
+ ASSERT_TRUE(TM);
+ SmallString<1024> Mem;
+ LLVMContext Context;
+ std::unique_ptr<Module> M = getLazyModuleFromAssembly(
+ Context, Mem,
+ "define void @f(<1024 x i32> addrspace(205)* %a, <1024 x i32> "
+ "addrspace(205)* %b) {\n"
+ "entry:\n"
+ " %tmp0 = load <1024 x i32>, <1024 x i32> addrspace(205)* %a, "
+ "!alias.scope !6, !noalias !7\n"
+ " store <1024 x i32> %tmp0, <1024 x i32> addrspace(205)* %b, "
+ "!alias.scope !8, !noalias !9\n"
+ " ret void\n"
+ "}\n"
+ "declare <1024 x i32> addrspace(205)* "
+ "@llvm.tpu.inttoptr.p205v1024i32(i32) #1\n"
+ "attributes #1 = { nounwind readnone }\n"
+ "!0 = distinct !{!0, !\"fusion.25\"}\n"
+ "!1 = !{}\n"
+ "!2 = distinct !{!2, !0, !\"alloc\"}\n"
+ "!3 = distinct !{!3, !0, !\"alloc\"}\n"
+ "!4 = distinct !{!4, !0, !\"alloc\"}\n"
+ "!5 = distinct !{!5, !0, !\"alloc\"}\n"
+ "!6 = !{!3}\n"
+ "!7 = !{!2, !5}\n"
+ "!8 = !{!4}\n"
+ "!9 = !{!2, !5}\n");
+ TPUAliasSetTracker<> AS(M->getDataLayout());
+ Function *F = M->getFunction("f");
+ ASSERT_FALSE(F->materialize());
+ for (auto &BB : *F) {
+ for (auto &I : BB) {
+ auto Result = AS.aliasQuery(&I, true);
+ if (isa<StoreInst>(&I)) {
+ ASSERT_EQ(Result, AliasResult::MayAlias);
+ } else {
+ ASSERT_EQ(Result, AliasResult::NoAlias);
+ }
+ }
+ }
+}
+
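+// An opaque call may access any memory, so both the call and the store after
+// it should report MayAlias.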
+TEST(TPUAliasSetTracker, TestUnknownAccess) {
+ std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
+ ASSERT_TRUE(TM);
+ SmallString<1024> Mem;
+ LLVMContext Context;
+ std::unique_ptr<Module> M = getLazyModuleFromAssembly(
+ Context, Mem,
+ "define void @f() {\n"
+ "entry:\n"
+ " %tmp0 = call <1024 x i32> addrspace(205)* "
+ "@llvm.tpu.inttoptr.p205v1024i32(i32 0)\n"
+ " %tmp1 = load <1024 x i32>, <1024 x i32> addrspace(205)* %tmp0\n"
+ " %tmp2 = bitcast <1024 x i32> addrspace(205)* %tmp0 to i4096 "
+ "addrspace(205)*\n"
+ " call void @unkn()\n"
+ " %tmp4 = getelementptr i4096, i4096 addrspace(205)* %tmp2, i32 8\n"
+ " %tmp5 = bitcast i4096 addrspace(205)* %tmp4 to <1024 x i32> "
+ "addrspace(205)*\n"
+ " store <1024 x i32> %tmp1, <1024 x i32> addrspace(205)* %tmp5\n"
+ " ret void\n"
+ "}\n"
+ "declare void @unkn()\n"
+ "declare <1024 x i32> addrspace(205)* "
+ "@llvm.tpu.inttoptr.p205v1024i32(i32) #1\n"
+ "attributes #1 = { nounwind readnone }\n");
+ TPUAliasSetTracker<> AS(M->getDataLayout());
+ Function *F = M->getFunction("f");
+ ASSERT_FALSE(F->materialize());
+ unsigned Count = 0;
+ for (auto &BB : *F) {
+ for (auto &I : BB) {
+ auto Result = AS.aliasQuery(&I, true);
+ switch (Count) {
+ case 1:
+ ASSERT_EQ(Result, AliasResult::NoAlias);
+ break;
+ case 3:
+ ASSERT_EQ(Result, AliasResult::MayAlias);
+ break;
+ case 6:
+ ASSERT_EQ(Result, AliasResult::MayAlias);
+ break;
+ default:
+ ASSERT_EQ(Result, AliasResult::NoAlias);
+ break;
+ }
+ ++Count;
+ }
+ }
+ ASSERT_EQ(AS.getNumActiveSets(), 2);
+}
+
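+// A vmatres call only touches inaccessible memory, so it should not alias any
+// of the vmem accesses.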
+TEST(TPUAliasSetTracker, TestInaccessibleAccess) {
+ std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
+ ASSERT_TRUE(TM);
+ SmallString<1024> Mem;
+ LLVMContext Context;
+ std::unique_ptr<Module> M = getLazyModuleFromAssembly(
+ Context, Mem,
+ "define void @f(<1024 x i32> addrspace(205)* %a, <1024 x i32> "
+ "addrspace(205)* %b) {\n"
+ "entry:\n"
+ " %tmp0 = call <1024 x i32> addrspace(205)* "
+ "@llvm.tpu.inttoptr.p205v1024i32(i32 0)\n"
+ " %tmp1 = load <1024 x i32>, <1024 x i32> addrspace(205)* %tmp0\n"
+ " %tmp2 = bitcast <1024 x i32> addrspace(205)* %tmp0 to i4096 "
+ "addrspace(205)*\n"
+ " %tmp3 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 undef)\n"
+ " %tmp4 = getelementptr i4096, i4096 addrspace(205)* %tmp2, i32 8\n"
+ " %tmp5 = bitcast i4096 addrspace(205)* %tmp4 to <1024 x i32> "
+ "addrspace(205)*\n"
+ " store <1024 x i32> %tmp1, <1024 x i32> addrspace(205)* %tmp5\n"
+ " ret void\n"
+ "}\n"
+ "declare <1024 x float> @llvm.tpu.vmatres.f32(i32, i32)\n"
+ "declare <1024 x i32> addrspace(205)* "
+ "@llvm.tpu.inttoptr.p205v1024i32(i32) #1\n"
+ "attributes #1 = { nounwind readnone }\n");
+ TPUAliasSetTracker<> AS(M->getDataLayout());
+ Function *F = M->getFunction("f");
+ ASSERT_FALSE(F->materialize());
+ for (auto &BB : *F) {
+ for (auto &I : BB) {
+ ASSERT_EQ(AS.aliasQuery(&I, true), AliasResult::NoAlias);
+ }
+ }
+}
+
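+// Two inaccessible-memory accesses alias each other: the second vmatres call
+// should report MayAlias while everything else stays NoAlias.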
+TEST(TPUAliasSetTracker, TestInaccessibleAliasAccess) {
+ std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
+ ASSERT_TRUE(TM);
+ SmallString<1024> Mem;
+ LLVMContext Context;
+ std::unique_ptr<Module> M = getLazyModuleFromAssembly(
+ Context, Mem,
+ "define void @f(<1024 x i32> addrspace(205)* %a, <1024 x i32> "
+ "addrspace(205)* %b) {\n"
+ "entry:\n"
+ " %tmp0 = call <1024 x i32> addrspace(205)* "
+ "@llvm.tpu.inttoptr.p205v1024i32(i32 0)\n"
+ " %tmp1 = load <1024 x i32>, <1024 x i32> addrspace(205)* %tmp0\n"
+ " %tmp2 = bitcast <1024 x i32> addrspace(205)* %tmp0 to i4096 "
+ "addrspace(205)*\n"
+ " %tmp3 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 undef)\n"
+ " %tmp4 = getelementptr i4096, i4096 addrspace(205)* %tmp2, i32 8\n"
+ " %tmp5 = bitcast i4096 addrspace(205)* %tmp4 to <1024 x i32> "
+ "addrspace(205)*\n"
+ " store <1024 x i32> %tmp1, <1024 x i32> addrspace(205)* %tmp5\n"
+ " %tmp7 = call <1024 x float> @llvm.tpu.vmatres.f32(i32 0, i32 undef)\n"
+ " ret void\n"
+ "}\n"
+ "declare <1024 x float> @llvm.tpu.vmatres.f32(i32, i32)\n"
+ "declare <1024 x i32> addrspace(205)* "
+ "@llvm.tpu.inttoptr.p205v1024i32(i32) #1\n"
+ "attributes #1 = { nounwind readnone }\n");
+ TPUAliasSetTracker<> AS(M->getDataLayout());
+ Function *F = M->getFunction("f");
+ ASSERT_FALSE(F->materialize());
+ unsigned Count = 0;
+ for (auto &BB : *F) {
+ for (auto &I : BB) {
+ auto Result = AS.aliasQuery(&I, true);
+ switch (Count) {
+ case 1:
+ ASSERT_EQ(Result, AliasResult::NoAlias);
+ break;
+ case 3:
+ ASSERT_EQ(Result, AliasResult::NoAlias);
+ break;
+ case 6:
+ ASSERT_EQ(Result, AliasResult::NoAlias);
+ break;
+ case 7:
+ ASSERT_EQ(Result, AliasResult::MayAlias);
+ break;
+ default:
+ ASSERT_EQ(Result, AliasResult::NoAlias);
+ break;
+ }
+ ++Count;
+ }
+ }
+}
+
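+// Check the callback counts: the add callback fires three times and the alias
+// callback once, when the final load overlaps an existing set.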
+TEST(TPUAliasSetTracker, TestAliasCallbackLoad) {
+ std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
+ ASSERT_TRUE(TM);
+ SmallString<1024> Mem;
+ LLVMContext Context;
+ std::unique_ptr<Module> M = getLazyModuleFromAssembly(
+ Context, Mem,
+ "define void @f() {\n"
+ "entry:\n"
+ " %tmp0 = call <1024 x i32> addrspace(205)* "
+ "@llvm.tpu.inttoptr.p205v1024i32(i32 0)\n"
+ " %tmp1 = load <1024 x i32>, <1024 x i32> addrspace(205)* %tmp0\n"
+ " %tmp2 = bitcast <1024 x i32> addrspace(205)* %tmp0 to i4096 "
+ "addrspace(205)*\n"
+ " %tmp3 = getelementptr i4096, i4096 addrspace(205)* %tmp2, i32 8\n"
+ " %tmp4 = bitcast i4096 addrspace(205)* %tmp3 to <1024 x i32> "
+ "addrspace(205)*\n"
+ " store <1024 x i32> %tmp1, <1024 x i32> addrspace(205)* %tmp4\n"
+ " %tmp6 = bitcast <1024 x i32> addrspace(205)* %tmp0 to i4096 "
+ "addrspace(205)*\n"
+ " %tmp7 = getelementptr i4096, i4096 addrspace(205)* %tmp6, i32 4\n"
+ " %tmp8 = bitcast i4096 addrspace(205)* %tmp7 to <1024 x i32> "
+ "addrspace(205)*\n"
+ " %tmp9 = load <1024 x i32>, <1024 x i32> addrspace(205)* %tmp8\n"
+ " ret void\n"
+ "}\n"
+ "declare <1024 x i32> addrspace(205)* "
+ "@llvm.tpu.inttoptr.p205v1024i32(i32) #1\n"
+ "attributes #1 = { nounwind readnone }\n");
+ TPUAliasSetTracker<> AS(M->getDataLayout());
+ Function *F = M->getFunction("f");
+ unsigned AliasCallbackCount = 0;
+ unsigned AddCallbackCount = 0;
+ auto AliasCallback = [&AliasCallbackCount](TPUAliasSet *AS) {
+ ++AliasCallbackCount;
+ };
+ auto AddCallback = [&AddCallbackCount](TPUAliasSet *AS) {
+ ++AddCallbackCount;
+ };
+ unsigned Count = 0;
+ ASSERT_FALSE(F->materialize());
+ for (auto &BB : *F) {
+ for (auto &I : BB) {
+ auto Result = AS.aliasQuery(&I, true, AliasCallback, AddCallback);
+ switch (Count) {
+ case 1:
+ ASSERT_EQ(Result, AliasResult::NoAlias);
+ break;
+ case 5:
+ ASSERT_EQ(Result, AliasResult::NoAlias);
+ break;
+ case 9:
+ ASSERT_EQ(Result, AliasResult::MayAlias);
+ break;
+ default:
+ ASSERT_EQ(Result, AliasResult::NoAlias);
+ break;
+ }
+ ++Count;
+ }
+ }
+ ASSERT_EQ(AliasCallbackCount, 1);
+ ASSERT_EQ(AddCallbackCount, 3);
+ ASSERT_EQ(AS.getNumActiveSets(), 2);
+}
+
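+// Same as above with a final aliasing store, which leaves a single active set.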
+TEST(TPUAliasSetTracker, TestAliasCallbackStore) {
+ std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
+ ASSERT_TRUE(TM);
+ SmallString<1024> Mem;
+ LLVMContext Context;
+ std::unique_ptr<Module> M = getLazyModuleFromAssembly(
+ Context, Mem,
+ "define void @f() {\n"
+ "entry:\n"
+ " %tmp0 = call <1024 x i32> addrspace(205)* "
+ "@llvm.tpu.inttoptr.p205v1024i32(i32 0)\n"
+ " %tmp1 = load <1024 x i32>, <1024 x i32> addrspace(205)* %tmp0\n"
+ " %tmp2 = bitcast <1024 x i32> addrspace(205)* %tmp0 to i4096 "
+ "addrspace(205)*\n"
+ " %tmp3 = getelementptr i4096, i4096 addrspace(205)* %tmp2, i32 8\n"
+ " %tmp4 = bitcast i4096 addrspace(205)* %tmp3 to <1024 x i32> "
+ "addrspace(205)*\n"
+ " store <1024 x i32> %tmp1, <1024 x i32> addrspace(205)* %tmp4\n"
+ " %tmp6 = bitcast <1024 x i32> addrspace(205)* %tmp0 to i4096 "
+ "addrspace(205)*\n"
+ " %tmp7 = getelementptr i4096, i4096 addrspace(205)* %tmp6, i32 4\n"
+ " %tmp8 = bitcast i4096 addrspace(205)* %tmp7 to <1024 x i32> "
+ "addrspace(205)*\n"
+ " store <1024 x i32> %tmp1, <1024 x i32> addrspace(205)* %tmp8\n"
+ " ret void\n"
+ "}\n"
+ "declare <1024 x i32> addrspace(205)* "
+ "@llvm.tpu.inttoptr.p205v1024i32(i32) #1\n"
+ "attributes #1 = { nounwind readnone }\n");
+ TPUAliasSetTracker<> AS(M->getDataLayout());
+ Function *F = M->getFunction("f");
+ unsigned AliasCallbackCount = 0;
+ unsigned AddCallbackCount = 0;
+ auto AliasCallback = [&AliasCallbackCount](TPUAliasSet *AS) {
+ ++AliasCallbackCount;
+ };
+ auto AddCallback = [&AddCallbackCount](TPUAliasSet *AS) {
+ ++AddCallbackCount;
+ };
+ unsigned Count = 0;
+ ASSERT_FALSE(F->materialize());
+ for (auto &BB : *F) {
+ for (auto &I : BB) {
+ auto Result = AS.aliasQuery(&I, true, AliasCallback, AddCallback);
+ switch (Count) {
+ case 1:
+ ASSERT_EQ(Result, AliasResult::NoAlias);
+ break;
+ case 5:
+ ASSERT_EQ(Result, AliasResult::NoAlias);
+ break;
+ case 9:
+ ASSERT_EQ(Result, AliasResult::MayAlias);
+ break;
+ default:
+ ASSERT_EQ(Result, AliasResult::NoAlias);
+ break;
+ }
+ ++Count;
+ }
+ }
+ ASSERT_EQ(AliasCallbackCount, 1);
+ ASSERT_EQ(AddCallbackCount, 3);
+ ASSERT_EQ(AS.getNumActiveSets(), 1);
+}