// RUN: mlir-tpu-opt -- %s \
// RUN: --tpu-infer-memref-layout="hardware-generation=4" \
// RUN: --tpu-infer-vector-layout \
// RUN: --tpu-apply-vector-layout="hardware-generation=4" \
// RUN: --canonicalize \
// RUN: --lower-to-llo="mock-target=4" \
// RUN: --cse | FileCheck %s
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
//
// Loads a vector from a singly tiled 512x512 i32 buffer (at
// the kernel level), adds one to the vector, and then stores
// the result to another singly tiled 512x512 i32 buffer (at
// the kernel level). The lowering decomposes the vector
// operations within the kernel along the x-axis and y-axis
// into fully unrolled code over the basic 8x128 vector
// operations. This implies that the fully unrolled version
// yields
//   (512 / 8) * (512 / 128) = 64 * 4 = 256
// vector load, add, and store operations.
//
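// For illustration only (not part of the checked output): each of the
// 256 unrolled steps operates on one basic 8x128 tile. A rough sketch,
// written with the vector/arith spelling used in this file rather than
// the llo ops actually emitted, where %i, %j, and %ones are hypothetical
// tile offsets and a broadcasted-ones tile:
//
//   %tile = vector.load %INP[%i, %j] : memref<512x512xi32>, vector<8x128xi32>
//   %sum = arith.addi %tile, %ones : vector<8x128xi32>
//   vector.store %sum, %OUT[%i, %j] : memref<512x512xi32>, vector<8x128xi32>
//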
// Even though unrolling is generally good compared to a
// fully rolled loop, since it can hide latency and exploit
// multiple functional units (e.g. tmu_x for matrix
// multiplication), it would still be nice to have some level
// of control over partially unrolling the loop to reduce
// code size.
//
// CHECK-LABEL: func.func @vecaddone
// CHECK-COUNT-256: llo.vector_load
// CHECK-COUNT-256: llo.vadd
// CHECK-COUNT-256: llo.vector_store
// CHECK: }
// CHECK: }
//
func.func @vecaddone(%t0: i32,
                     %t1: i32,
                     %INP: memref<512x512xi32>,
                     %OUT: memref<512x512xi32>)
    attributes {
      dimension_semantics =
          [#tpu.dimension_semantics<parallel>,
           #tpu.dimension_semantics<parallel>],
      iteration_bounds = array<i64: 1, 1>,
      window_params = [{transform_indices = #map},
                       {transform_indices = #map}]} {
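  // Kernel body: broadcast the constant 1 across the whole 512x512 vector,
  // then load, add, and store the full buffer; the apply-vector-layout pass
  // unrolls each of these whole-buffer ops into 8x128 tiles.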
  %c0 = arith.constant 0 : index
  %one = arith.constant 1 : i32
  %vone = vector.broadcast %one : i32 to vector<512x512xi32>
  %0 = vector.load %INP[%c0, %c0] : memref<512x512xi32>, vector<512x512xi32>
  %1 = arith.addi %0, %vone : vector<512x512xi32>
  vector.store %1, %OUT[%c0, %c0] : memref<512x512xi32>, vector<512x512xi32>
  return
}
}