[mpact][benchmark] manual sum of squares benchmark (#65)

* [mpact][benchmark] manual sum of squares benchmark

This introduces a "manual" benchmark directory where we can
put benchmarking code that is run by hand, so that it does not
add to the run time of the regular benchmark suite.

* use 4K instead of 1K

* lint

* undo edits
diff --git a/benchmark/python/manual/README.md b/benchmark/python/manual/README.md
new file mode 100644
index 0000000..7064502
--- /dev/null
+++ b/benchmark/python/manual/README.md
@@ -0,0 +1,9 @@
+### Benchmarks run by hand
+
+These benchmarks are not run as part of MPACT's regular testing or benchmarking.
+To run an individual benchmark, build the MPACT compiler, cd into this
+directory, and then run the benchmark as follows:
+
+```shell
+python <benchmark-name>.py
+```
diff --git a/benchmark/python/manual/sum_of_sq.py b/benchmark/python/manual/sum_of_sq.py
new file mode 100644
index 0000000..717259e
--- /dev/null
+++ b/benchmark/python/manual/sum_of_sq.py
@@ -0,0 +1,85 @@
+import torch
+import numpy as np
+import time
+
+from mpact.mpactbackend import mpact_jit_compile, mpact_jit_run
+from mpact_benchmark.utils.tensor_generator import generate_tensor
+
+
+def runbench_eager(tag, sp, net, x, num_iters=1000):
+    """Time `net(x)` under eager PyTorch and print the average latency.
+
+    Args:
+        tag: Label printed at the start of the result line.
+        sp: Sparsity value echoed in the result line (printed with %.2f).
+        net: Callable module; `net(x)` must return a value with `.item()`.
+        x: Input passed to `net` on every iteration.
+        num_iters: Number of timed iterations (one untimed warmup precedes them).
+    """
+    net(x)  # warmup
+    checksum = 0
+    # perf_counter is monotonic and high resolution, unlike wall-clock time.time.
+    start = time.perf_counter()
+    for _ in range(num_iters):
+        res = net(x).item()
+        checksum = checksum + res
+    end = time.perf_counter()
+    time_ms = (end - start) * 1000 / num_iters
+    print("%s : %.2f : %8.4f ms. : checksum=%d" % (tag, sp, time_ms, checksum))
+
+
+def runbench_mpact(tag, sp, net, x, num_iters=1000):
+    """Time the MPACT-compiled `net` on `x` and print the average latency.
+
+    The `mpact_jit_compile` call and one warmup run happen before the timer
+    starts, so only the `mpact_jit_run` invocations are timed.
+
+    Args:
+        tag: Label printed at the start of the result line.
+        sp: Sparsity value echoed in the result line (printed with %.2f).
+        net: Module handed to `mpact_jit_compile` together with `x`.
+        x: Input passed to `mpact_jit_run` on every iteration.
+        num_iters: Number of timed iterations.
+    """
+    invoker, fn = mpact_jit_compile(net, x)
+    mpact_jit_run(invoker, fn, x)  # warmup
+    checksum = 0
+    # perf_counter is monotonic and high resolution, unlike wall-clock time.time.
+    start = time.perf_counter()
+    for _ in range(num_iters):
+        res = mpact_jit_run(invoker, fn, x)
+        checksum = checksum + res
+    end = time.perf_counter()
+    time_ms = (end - start) * 1000 / num_iters
+    print("%s : %.2f : %8.4f ms. : checksum=%d" % (tag, sp, time_ms, checksum))
+
+
+class SqSumNet(torch.nn.Module):
+    """Module whose forward pass returns the sum of squares of its input."""
+
+    def forward(self, x):
+        # TODO: make this work too: return (x ** 2).sum()
+        return (x * x).sum()
+
+
+net = SqSumNet()
+h = 1024 * 4
+w = 1024 * 4
+
+# Sweep d from 0 to 100 in steps of 10, i.e. sparsity from 1.0 down to 0.0.
+for d in range(0, 101, 10):
+    sparsity = 1.0 - (d / 100.0)
+    x = generate_tensor(
+        seed=0, shape=(h, w), sparsity=sparsity, dtype=np.float32, drange=(1.0, 1.0)
+    )
+
+    # Note, we don't have binary-valued sparse tensors in PyTorch
+    # so we are using csr. For now, we have to hack the
+    #    "explicitVal=1.0:f32"
+    # into the MLIR sparse tensor type to make it optimize fully.
+    s = x.to_sparse_csr()
+
+    runbench_eager("PyTorch (dense) ", sparsity, net, x)
+    runbench_mpact("MPACT   (dense) ", sparsity, net, x)
+    runbench_eager("PyTorch (sparse)", sparsity, net, s)
+    runbench_mpact("MPACT   (sparse)", sparsity, net, s)