// RUN: mlir-tpu-opt -- %s \
// RUN: --tpu-infer-memref-layout="hardware-generation=4" \
// RUN: --tpu-infer-vector-layout \
// RUN: --tpu-apply-vector-layout="hardware-generation=4" \
// RUN: --canonicalize \
// RUN: --lower-to-llo="mock-target=4" \
// RUN: --cse | FileCheck %s
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
//
// Loads a vector from a singly tiled 512x512 i32 buffer (at
// the kernel level), adds one to the vector, and then stores
// the result to another singly tiled 512x512 i32 buffer (at
// the kernel level). The lowering decomposes the vector
// operations within the kernel along the x-axis and y-axis
// into fully unrolled code over the basic 8x128 vector
// operations. This implies that the fully unrolled version
// yields
//   (512 / 8) * (512 / 128) = 64 * 4 = 256
// vector load, add, and store operations.
//
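// For illustration only (not part of the checked output): each of the
// 256 unrolled steps operates on one basic 8x128 tile. A rough sketch,
// written with the vector/arith spelling used in this file rather than
// the llo ops actually emitted, where %i, %j, and %ones are hypothetical
// tile offsets and a broadcasted-ones tile:
//
//   %tile = vector.load %INP[%i, %j] : memref<512x512xi32>, vector<8x128xi32>
//   %sum = arith.addi %tile, %ones : vector<8x128xi32>
//   vector.store %sum, %OUT[%i, %j] : memref<512x512xi32>, vector<8x128xi32>
//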
// Even though unrolling is generally good compared to a
// fully rolled loop, since it can hide latency and exploit
// multiple functional units (e.g. tmu_x for matrix
// multiplication), it would still be nice to have some level
// of control over partially unrolling the loop to reduce
// code size.
//
// CHECK-LABEL: func.func @vecaddone
// CHECK-COUNT-256: llo.vector_load
// CHECK-COUNT-256: llo.vadd
// CHECK-COUNT-256: llo.vector_store
// CHECK: }
// CHECK: }
//
func.func @vecaddone(%t0: i32,
                     %t1: i32,
                     %INP: memref<512x512xi32>,
                     %OUT: memref<512x512xi32>)
    attributes {
      dimension_semantics =
          [#tpu.dimension_semantics<parallel>,
           #tpu.dimension_semantics<parallel>],
      iteration_bounds = array<i64: 1, 1>,
      window_params = [{transform_indices = #map},
                       {transform_indices = #map}]} {
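  // Kernel body: broadcast the constant 1 across the whole 512x512 vector,
  // then load, add, and store the full buffer; the apply-vector-layout pass
  // unrolls each of these whole-buffer ops into 8x128 tiles.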
  %c0 = arith.constant 0 : index
  %one = arith.constant 1 : i32
  %vone = vector.broadcast %one : i32 to vector<512x512xi32>
  %0 = vector.load %INP[%c0, %c0] : memref<512x512xi32>, vector<512x512xi32>
  %1 = arith.addi %0, %vone : vector<512x512xi32>
  vector.store %1, %OUT[%c0, %c0] : memref<512x512xi32>, vector<512x512xi32>
  return
}
}