[mpact][benchmark] add more benchmarks (#27)

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 4256292..e473f9e 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -18,8 +18,26 @@
     utils/*.py
 )
 
-add_mlir_python_modules(MPACTBenchmarkPythonPythonModules
+add_mlir_python_modules(MPACTBenchmarkPythonModules
   ROOT_PREFIX "${MPACT_PYTHON_PACKAGES_DIR}/mpact/mpact_benchmark"
   INSTALL_PREFIX "python_packages/mpact/mpact_benchmark"
   DECLARED_SOURCES MPACTBenchmarkPythonSources
 )
+
+add_custom_target(build-benchmark-mpact)
+add_dependencies(build-benchmark-mpact MPACTPythonModules MPACTBenchmarkPythonModules)
+
+add_custom_target(benchmark-mpact)
+add_dependencies(benchmark-mpact build-benchmark-mpact)
+file(GLOB PYTHON_FILES "${CMAKE_CURRENT_SOURCE_DIR}/python/benchmarks/*.py")
+
+# Loop over each matched .py file and create a custom command to run it.
+foreach(PY_FILE IN LISTS PYTHON_FILES)
+    add_custom_command(
+        TARGET benchmark-mpact
+        COMMAND cmake -E echo "Running ${PY_FILE}"
+        COMMAND python ${PY_FILE}
+        DEPENDS ${PY_FILE}
+        USES_TERMINAL
+    )
+endforeach()
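Note: the loop above invokes whatever `python` is first on PATH at build time. If the benchmarks should run under the interpreter the build already located, the `COMMAND` could use `${Python3_EXECUTABLE}` instead (this assumes `find_package(Python3 COMPONENTS Interpreter)` is performed elsewhere in the build).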
diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100644
index 0000000..4749547
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,26 @@
+### Run benchmarks
+
+To run all benchmarks:
+
+```shell
+cmake --build build --target benchmark-mpact
+```
+
+To run selected benchmarks, build the benchmark modules first:
+
+```shell
+cmake --build build --target build-benchmark-mpact
+```
+
+Then run the benchmark file directly:
+
+```shell
+python path/to/the/_benchmark.py
+```
+
+To run only selected kernels in kernels_benchmark.py,
+use the `--benchmark-filter` flag, for example:
+
+```shell
+python path/to/the/kernels_benchmark.py --benchmark-filter=add
+```
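Since the filter is split on commas (see kernels_benchmark.py below), several kernels can be selected at once, e.g. `--benchmark-filter=add,matmul`.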
diff --git a/benchmark/python/benchmarks/gcn_benchmark.py b/benchmark/python/benchmarks/gcn_benchmark.py
old mode 100644
new mode 100755
index e69de29..ef7a3bd
--- a/benchmark/python/benchmarks/gcn_benchmark.py
+++ b/benchmark/python/benchmarks/gcn_benchmark.py
@@ -0,0 +1,33 @@
+import torch
+import numpy as np
+from mpact.models.gcn import GraphConv
+from mpact_benchmark.utils.benchmark_utils import benchmark, Backends
+
+
+@benchmark(
+    [
+        {
+            "name": f"{fmt}_{shape}_{dtype.__name__}",
+            "shape": shape,
+            "formats": fmt,
+            "dtype": dtype,
+            "drange": (1, 100),
+            "sparsity": [0, 0.5, 0.9, 0.99],
+            "backends": [b for b in Backends],
+        }
+        for shape in [
+            [[128, 128], [128, 128]],
+            [[512, 512], [512, 512]],
+            [[1024, 1024], [1024, 1024]],
+        ]
+        for fmt in [["dense", "csr"]]
+        for dtype in [np.float32]
+    ]
+)
+def GCN() -> torch.nn.Module:
+    """Graph Convolution Network."""
+    return GraphConv
+
+
+if __name__ == "__main__":
+    GCN()
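Note that `GCN()` returns the `GraphConv` class rather than an instance; the harness instantiates it from the first entry of each test case's shape (see `torch_net = net(*test_case["shape"][0])` in benchmark_utils.py below). A minimal standalone sketch of that instantiation, assuming the module takes a feature matrix and an adjacency matrix of the listed shapes:

```python
# Illustrative sketch only; the real input generation (formats, sparsity,
# dtype) is handled by mpact_benchmark.utils.benchmark_utils.
import torch
from mpact.models.gcn import GraphConv

net = GraphConv(128, 128)                      # from shape [[128, 128], [128, 128]]
features = torch.rand(128, 128)
adjacency = (torch.rand(128, 128) > 0.9).to(torch.float32)  # roughly 0.9 sparsity
with torch.no_grad():
    out = net(features, adjacency)
print(out.shape)
```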
diff --git a/benchmark/python/benchmarks/kernels_benchmark.py b/benchmark/python/benchmarks/kernels_benchmark.py
index e69de29..3ad9cce 100644
--- a/benchmark/python/benchmarks/kernels_benchmark.py
+++ b/benchmark/python/benchmarks/kernels_benchmark.py
@@ -0,0 +1,210 @@
+import torch
+import argparse
+import numpy as np
+from mpact_benchmark.utils.benchmark_utils import benchmark, Backends
+
+
+@benchmark(
+    [
+        {
+            "name": f"{lhs_fmt}_{rhs_fmt}_{shape}_{dtype.__name__}",
+            "shape": shape,
+            "formats": (lhs_fmt, rhs_fmt),
+            "dtype": dtype,
+            "backends": [b for b in Backends],
+            "drange": (1, 100),
+            "sparsity": [0, 0.5, 0.9, 0.99],
+        }
+        for shape in [([2**i, 2**i], [2**i, 2**i]) for i in range(5, 8)]
+        for lhs_fmt in ["dense", "csr"]
+        for rhs_fmt in ["dense", "csr"]
+        for dtype in [np.float64]
+    ]
+)
+def matmul() -> torch.nn.Module:
+    """Matrix multiplication."""
+
+    class Net(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+
+        def forward(self, x, y):
+            return torch.matmul(x, y)
+
+    return Net()
+
+
+@benchmark(
+    [
+        {
+            "name": f"{lhs_fmt}_{rhs_fmt}_{shape}_{dtype.__name__}",
+            "shape": shape,
+            "formats": (lhs_fmt, rhs_fmt),
+            "dtype": dtype,
+            "backends": [b for b in Backends],
+            "drange": (1, 100),
+            "sparsity": [0, 0.5, 0.9, 0.99],
+        }
+        for shape in [([2**i, 2**i], [2**i]) for i in range(5, 8)]
+        for lhs_fmt in ["dense", "csr"]
+        for rhs_fmt in ["dense"]  # torch.mv only supports dense vector for now.
+        for dtype in [np.float64]
+    ]
+)
+def matvec() -> torch.nn.Module:
+    """Matrix-vector multiplication."""
+
+    class Net(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+
+        def forward(self, x, y):
+            return torch.mv(x, y)
+
+    return Net()
+
+
+@benchmark(
+    [
+        {
+            "name": f"{lhs_fmt}_{rhs_fmt}_{shape}_{dtype.__name__}",
+            "shape": shape,
+            "formats": (lhs_fmt, rhs_fmt),
+            "dtype": dtype,
+            "backends": [b for b in Backends],
+            "drange": (1, 100),
+            "sparsity": [0, 0.5, 0.9, 0.99],
+        }
+        for shape in [
+            ([2**i, 2**i], [2**i, 2**i]) for i in range(5, 8)
+        ]  # Capped at 128x128; 512x512 crashes the runtime.
+        for lhs_fmt in ["dense", "csr"]
+        for rhs_fmt in ["dense", "csr"]
+        for dtype in [np.float64]
+    ]
+)
+def add() -> torch.nn.Module:
+    """Element-wise addition."""
+
+    class Net(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+
+        def forward(self, x, y):
+            return torch.add(x, y)
+
+    return Net()
+
+
+@benchmark(
+    [
+        {
+            "name": f"{lhs_fmt}_{rhs_fmt}_{shape}_{dtype.__name__}",
+            "shape": shape,
+            "formats": (lhs_fmt, rhs_fmt),
+            "dtype": dtype,
+            "backends": [b for b in Backends],
+            "drange": (1, 100),
+            "sparsity": [0, 0.5, 0.9, 0.99],
+        }
+        for shape in [([2**i, 2**i], [2**i, 2**i]) for i in range(5, 8)]
+        for lhs_fmt in ["dense", "csr"]
+        for rhs_fmt in ["dense", "csr"]
+        for dtype in [np.float64]
+    ]
+)
+def elt_mul() -> torch.nn.Module:
+    """Element-wise addition."""
+
+    class Net(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+
+        def forward(self, x, y):
+            return torch.mul(x, y)
+
+    return Net()
+
+
+@benchmark(
+    [
+        {
+            "name": f"{fmt}_{shape}_{dtype.__name__}",
+            "shape": shape,
+            "formats": (fmt,),
+            "dtype": dtype,
+            "backends": [b for b in Backends],
+            "drange": (1, 100),
+            "sparsity": [0, 0.5, 0.9, 0.99],
+        }
+        for shape in [([2**i, 2**i],) for i in range(2, 3)]
+        for fmt in ["dense", "csr"]
+        for dtype in [np.float64]
+    ]
+)
+def nop() -> torch.nn.Module:
+    """Returns matrix unmodified (speed of light)."""
+
+    class Net(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+
+        def forward(self, x):
+            return x
+
+    return Net()
+
+
+@benchmark(
+    [
+        {
+            "name": f"{sample_fmt}_sample_{shape}_{dtype.__name__}",
+            "shape": shape,
+            "formats": (sample_fmt, "dense", "dense"),
+            "dtype": dtype,
+            "backends": [b for b in Backends],
+            "drange": (1, 100),
+            "sparsity": [0, 0.5, 0.9, 0.99],
+        }
+        for shape in [
+            ([2**i, 2**i], [2**i, 2**i], [2**i, 2**i]) for i in range(5, 8)
+        ]
+        for sample_fmt in ["dense", "csr"]
+        for dtype in [np.float64]
+    ]
+)
+def sddmm() -> torch.nn.Module:
+    """SDDMM: C = S ○ (A X B) Sampled dense-dense matrix-matrix multiplication."""
+
+    class Net(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+
+        def forward(self, x, y, z):
+            return torch.mul(x, torch.matmul(y, z))
+
+    return Net()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog="pytorch_kernel_benchmarks",
+        description="Run a set of given PyTorch kernel benchmarks",
+    )
+    parser.add_argument("--benchmark-filter", type=str, default="", required=False)
+    arguments = parser.parse_args()
+
+    benchmark_list = [
+        "nop",
+        "add",
+        "matmul",
+        "matvec",
+        "elt_mul",
+        "sddmm",
+    ]
+    if arguments.benchmark_filter:
+        benchmark_list = arguments.benchmark_filter.split(",")
+
+    # Run selected benchmarks.
+    for benchmark_name in benchmark_list:
+        globals()[benchmark_name]()
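As a plain-PyTorch reference for the sddmm kernel above: the expression computes C = S ∘ (A·B), an element-wise sampling mask applied to a dense matmul. A small self-contained check, independent of the benchmark harness:

```python
import torch

torch.manual_seed(0)
a = torch.rand(32, 32)
b = torch.rand(32, 32)
s = (torch.rand(32, 32) > 0.9).float()  # sampling mask, roughly 0.9 sparsity

# Same expression as the sddmm Net's forward() above.
c = torch.mul(s, torch.matmul(a, b))
assert torch.equal(c, s * (a @ b))
print(c.count_nonzero().item(), "nonzero entries out of", c.numel())
```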
diff --git a/benchmark/python/benchmarks/lif_benchmark.py b/benchmark/python/benchmarks/lif_benchmark.py
index 5c789ea..bd5f0c1 100644
--- a/benchmark/python/benchmarks/lif_benchmark.py
+++ b/benchmark/python/benchmarks/lif_benchmark.py
@@ -9,7 +9,7 @@
         {
             "name": f"{fmt}_{shape}_{dtype.__name__}",
             "shape": shape,
-            "formats": [fmt],
+            "formats": fmt,
             "dtype": dtype,
             # Simulate batch normalization.
             "drange": (-1, 1),
@@ -31,13 +31,13 @@
             [[32, 3, 64, 64, 1]],
             [[16, 3, 224, 224, 1]],
         ]
-        for fmt in ["dense"]
+        for fmt in [["dense"]]
         for dtype in [np.float64]
     ]
 )
 def SNN() -> torch.nn.Module:
     """Spiking Neural Network."""
-    return Block
+    return Block()
 
 
 if __name__ == "__main__":
diff --git a/benchmark/python/benchmarks/resnet_benchmark.py b/benchmark/python/benchmarks/resnet_benchmark.py
index e69de29..b455ef0 100644
--- a/benchmark/python/benchmarks/resnet_benchmark.py
+++ b/benchmark/python/benchmarks/resnet_benchmark.py
@@ -0,0 +1,36 @@
+import torch
+import numpy as np
+from mpact.models.resnet import resnet20
+from mpact_benchmark.utils.benchmark_utils import benchmark, Backends
+
+
+@benchmark(
+    [
+        {
+            "name": f"{fmt}_{shape}_{dtype.__name__}",
+            "shape": shape,
+            "formats": fmt,
+            "dtype": dtype,
+            "drange": (1, 100),
+            "sparsity": [0.5, 0.9],
+            # TODO: Torch inductor requires lower precision for larger input sizes,
+            # such as [8, 3, 32, 32].
+            "precision": 1e-3,
+            "backends": [b for b in Backends],
+        }
+        for shape in [
+            [[1, 3, 16, 16]],
+        ]
+        for fmt in [["dense"]]
+        for dtype in [np.float32]
+    ]
+)
+def resnet() -> torch.nn.Module:
+    """Restnet20 model."""
+    resnet_model = resnet20()
+    resnet_model.train(False)
+    return resnet_model
+
+
+if __name__ == "__main__":
+    resnet()
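Note that `resnet_model.train(False)` is equivalent to `resnet_model.eval()`: the model runs in inference mode, so layers such as batch norm or dropout (if present) use their evaluation behavior during the benchmark.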
diff --git a/benchmark/python/utils/benchmark_utils.py b/benchmark/python/utils/benchmark_utils.py
index 275853b..ec29309 100644
--- a/benchmark/python/utils/benchmark_utils.py
+++ b/benchmark/python/utils/benchmark_utils.py
@@ -123,9 +123,18 @@
                         )
                     )
                 case Backends.MPACT_SPARSE:
-                    output.append(
-                        torch.from_numpy(mpact_jit(torch_net, *sparse_inputs))
-                    )
+                    sp_out = mpact_jit(torch_net, *sparse_inputs)
+                    # Construct a sparse CSR tensor if the output is in CSR form.
+                    # TODO: return a sparse tensor directly instead of a tuple of arrays.
+                    if type(sp_out) is tuple:
+                        # torch.sparse_csr_tensor may deduce the size incorrectly,
+                        # so pass dense_out's shape explicitly.
+                        dense_out = mpact_jit(torch_net, *dense_inputs)
+                        output.append(
+                            torch.sparse_csr_tensor(*sp_out, size=dense_out.shape)
+                        )
+                    else:
+                        output.append(torch.from_numpy(sp_out))
                     invoker, f = mpact_jit_compile(torch_net, *sparse_inputs)
                     compile_time_results.append(
                         timer(
@@ -167,7 +176,11 @@
 
     # Sanity check.
     if output:
-        assert all(output[0].to_dense().allclose(out.to_dense()) for out in output)
+        rtol = variables["precision"] if "precision" in variables else 1e-5
+        assert all(
+            torch.allclose(output[0].to_dense(), out.to_dense(), rtol=rtol)
+            for out in output
+        )
 
 
 def benchmark(*args: Any) -> Callable:
@@ -176,9 +189,9 @@
     def decorator(func):
         @functools.wraps(func)
         def wrapper(test_cases=args[0]):
-            net = func()
             runtime_results = []
             compile_time_results = []
+            torch_net = net = func()
             for test_case in test_cases:
                 label = func.__name__
                 for sparsity in test_case["sparsity"]:
@@ -190,10 +203,11 @@
                         test_case["dtype"],
                         test_case["drange"],
                     )
+
                     if "GCN" in label:
                         torch_net = net(*test_case["shape"][0])
-                    else:
-                        torch_net = net()
+                    if "precision" in test_case:
+                        precision = test_case["precision"]
 
                     run_benchmark(
                         sparse_inputs,
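For context on the MPACT_SPARSE branch above: the tuple returned by mpact_jit is splatted into torch.sparse_csr_tensor, whose positional arguments are crow_indices, col_indices, and values, with the size taken from the dense result. A small illustration of that reconstruction (the component values here are hypothetical):

```python
import torch

# Components of a 2x3 CSR matrix: row pointer, column indices, values.
crow_indices = torch.tensor([0, 1, 3])
col_indices = torch.tensor([2, 0, 1])
values = torch.tensor([4.0, 1.0, 2.0])

csr = torch.sparse_csr_tensor(crow_indices, col_indices, values, size=(2, 3))
print(csr.to_dense())
# tensor([[0., 0., 4.],
#         [1., 2., 0.]])
```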
diff --git a/setup.py b/setup.py
index 4208772..929ab41 100644
--- a/setup.py
+++ b/setup.py
@@ -96,7 +96,7 @@
             f".",
             f"--target",
             f"MPACTPythonModules",
-            f"MPACTBenchmarkPythonPythonModules",
+            f"MPACTBenchmarkPythonModules",
         ]
 
         try: