// RUN: mlir-tpu-opt %s \
// RUN: --tpu-infer-memref-layout="hardware-generation=4" \
// RUN: --tpu-infer-vector-layout \
// RUN: --tpu-apply-vector-layout="hardware-generation=4" \
// RUN: --canonicalize \
// RUN: --lower-to-llo="mock-target=4" \
// RUN: --cse | FileCheck %s

#map = affine_map<(d0, d1) -> (d0, d1)>

module {
  //
  // Loads a vector from a singly tiled 512x512 i32 buffer (at
  // kernel level), adds one to each element, and then stores
  // the result to another singly tiled 512x512 i32 buffer (at
  // kernel level). The lowering decomposes the kernel's vector
  // operations along both the x-axis and the y-axis into fully
  // unrolled code over the basic 8x128 vector operations. The
  // fully unrolled version therefore yields
  //   (512 / 8) * (512 / 128) = 64 * 4 = 256
  // each of the vector load, add, and store operations.
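  //
  // As a sketch only (the op names match the CHECK-COUNT lines
  // below, but the operand and attribute details of the llo
  // dialect are elided here and are hypothetical), each 8x128
  // tile lowers to a load/add/store triple roughly like:
  //
  //   %v = llo.vector_load ...   // one 8x128 tile of %INP
  //   %s = llo.vadd %v, ...      // add the broadcast ones tile
  //   llo.vector_store %s, ...   // one 8x128 tile of %OUT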
  //
  // Even though full unrolling is generally better than a
  // fully rolled loop, since it can hide latency and exploit
  // multiple functional units (e.g. tmu_x for matrix
  // multiplication), it would still be nice to have some
  // control over partially unrolling the loop to reduce
  // code size.
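  //
  // For example (a hypothetical shape, not what this pipeline
  // emits today), unrolling only along the lane axis while
  // keeping a loop over the 64 sublane tiles would emit just
  // 4 load/add/store triples per iteration:
  //
  //   scf.for %i = %c0 to %c512 step %c8 {
  //     // 4 x (llo.vector_load ; llo.vadd ; llo.vector_store)
  //   }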
  //
  // CHECK-LABEL: func.func @vecaddone
  // CHECK-COUNT-256: llo.vector_load
  // CHECK-COUNT-256: llo.vadd
  // CHECK-COUNT-256: llo.vector_store
  // CHECK: }
  // CHECK: }
  //
  func.func @vecaddone(%t0: i32,
                       %t1: i32,
                       %INP: memref<512x512xi32>,
                       %OUT: memref<512x512xi32>)
      attributes {
        dimension_semantics =
            [#tpu.dimension_semantics<parallel>,
             #tpu.dimension_semantics<parallel>],
        iteration_bounds = array<i64: 1, 1>,
        window_params = [{transform_indices = #map},
                         {transform_indices = #map}]} {
    %c0 = arith.constant 0 : index
    %one = arith.constant 1 : i32
    %vone = vector.broadcast %one : i32 to vector<512x512xi32>
    %0 = vector.load %INP[%c0, %c0] : memref<512x512xi32>, vector<512x512xi32>
    %1 = arith.addi %0, %vone : vector<512x512xi32>
    vector.store %1, %OUT[%c0, %c0] : memref<512x512xi32>, vector<512x512xi32>
    return
  }
}