From 927d89c8ec956149aba9ae74e4f1853756748223 Mon Sep 17 00:00:00 2001 From: niklas Date: Sat, 19 Oct 2024 15:07:53 +0000 Subject: [PATCH] init --- _chain.daph | 15 + run_horz.py | 133 +++ shared.py | 110 +++ sketch/bench/abs_t_exp.daph | 11 + sketch/bench/kmeans.daphne | 29 + sketch/bench/outerAdd_exp.daph | 12 + sketch/bench/outerAdd_sumCol_exp.daph | 19 + sketch/bench/outerAdd_t.daph | 12 + sketch/bench/outerAdd_t_exp.daph | 18 + sketch/bench/sqrt_sum.daph | 9 + sketch/bench/transpose_sum.daph | 9 + src/api/cli/DaphneUserConfig.h | 8 + src/api/internal/daphne_internal.cpp | 34 +- src/compiler/execution/DaphneIrExecutor.cpp | 24 +- src/compiler/lowering/CMakeLists.txt | 6 +- .../DaphneVectorizeComputationsPass.cpp} | 31 +- .../Greedy1VectorizeComputationsPass.cpp | 261 ++++++ .../vectorize/HorizontalFusionPass.cpp | 251 +++++ src/compiler/lowering/vectorize/VectorUtils.h | 886 ++++++++++++++++++ .../lowering/vectorize/VectorizeDefs.h | 22 + src/ir/daphneir/DaphneOps.td | 65 +- .../DaphneVectorizableOpInterface.cpp | 380 ++++++-- .../daphneir/DaphneVectorizableOpInterface.td | 13 +- src/ir/daphneir/Passes.h | 7 +- src/parser/daphnedsl/DaphneDSLBuiltins.cpp | 11 + src/runtime/local/vectorized/MTWrapper.h | 3 + .../local/vectorized/MTWrapper_dense.cpp | 16 +- src/runtime/local/vectorized/Tasks.cpp | 171 ++-- src/runtime/local/vectorized/Tasks.h | 28 +- 29 files changed, 2384 insertions(+), 210 deletions(-) create mode 100644 _chain.daph create mode 100644 run_horz.py create mode 100644 shared.py create mode 100644 sketch/bench/abs_t_exp.daph create mode 100644 sketch/bench/kmeans.daphne create mode 100644 sketch/bench/outerAdd_exp.daph create mode 100644 sketch/bench/outerAdd_sumCol_exp.daph create mode 100644 sketch/bench/outerAdd_t.daph create mode 100644 sketch/bench/outerAdd_t_exp.daph create mode 100644 sketch/bench/sqrt_sum.daph create mode 100644 sketch/bench/transpose_sum.daph rename src/compiler/lowering/{VectorizeComputationsPass.cpp => 
vectorize/DaphneVectorizeComputationsPass.cpp} (94%) create mode 100644 src/compiler/lowering/vectorize/Greedy1VectorizeComputationsPass.cpp create mode 100644 src/compiler/lowering/vectorize/HorizontalFusionPass.cpp create mode 100644 src/compiler/lowering/vectorize/VectorUtils.h create mode 100644 src/compiler/lowering/vectorize/VectorizeDefs.h diff --git a/_chain.daph b/_chain.daph new file mode 100644 index 000000000..72b8a29ee --- /dev/null +++ b/_chain.daph @@ -0,0 +1,15 @@ +X = fill(1.0, 30000, 30000); +startProfiling(); +v0 = t(X); +v1 = t(v0); +v2 = t(v1); +v3 = t(v2); +v4 = t(v3); +v5 = t(v4); +v6 = t(v5); +v7 = t(v6); +v8 = t(v7); +v9 = t(v8); +stopProfiling(); +print(v9[0,0]); + diff --git a/run_horz.py b/run_horz.py new file mode 100644 index 000000000..1e0faca88 --- /dev/null +++ b/run_horz.py @@ -0,0 +1,133 @@ +import sys +import numpy as np +import json +import datetime +import argparse +from tabulate import tabulate +import pandas as pd +import shared as sh + +#------------------------------------------------------------------------------ +# GLOBAL +#------------------------------------------------------------------------------ + +GENERATE_FUNCS = { + "ADD": lambda i, arg: [f"v{i} = {arg} + {i * 0.1};"], + "ADD_SUM": lambda i, arg: [f"i{i} = {arg} + {i * 0.1};", f"v{i} = sum(i{i});"] +} + +GENERATE_PRINT_FUNCS = { + "ADD": lambda i: [f"print(v{i}[0,0]);"], + "ADD_SUM": lambda i: [f"print(v{i});"] +} + +BASE_CWD = "./" +GLOBAL_ARGS = [] +BASE_COMMAND = lambda th, bs, no_hf: [ + "./run-daphne.sh", + "--timing", + "--vec", + "--vec-type=GREEDY_1", + f"--num-threads={th}", + f"--batchSize={bs}", +] + (["--no-hf"] if no_hf else []) + GLOBAL_ARGS + ["./_horz.daph"] + +#------------------------------------------------------------------------------ +# HELPER +#------------------------------------------------------------------------------ + +def generate_script(num_ops, tool, func, rows, cols): + + script = [] + + script.append(f"X = fill(1.0, {rows}, 
{cols});") + script.append(sh.TOOLS[tool]["START_OP"]) + + for j in range(0, num_ops): + script += GENERATE_FUNCS[func](j, "X") + script.append(sh.TOOLS[tool]["STOP_OP"]) + + for j in range(0, num_ops): + script += GENERATE_PRINT_FUNCS[func](j) + + script.append(sh.TOOLS[tool]["END_OP"]) + + return script + +#------------------------------------------------------------------------------ +# ARGS +#------------------------------------------------------------------------------ + +parser = argparse.ArgumentParser(description="Arguments") +parser.add_argument("--tool", type=str, choices=sh.TOOLS.keys(), help="", required=True) +parser.add_argument("--script", type=str, choices=GENERATE_FUNCS.keys(), help="", required=True) +parser.add_argument("--rows", type=int, default=10000, help="rows") +parser.add_argument("--cols", type=int, default=10000, help="rows") +parser.add_argument("--samples", type=int, default=3, help="") +parser.add_argument("--num-ops", type=int, default=12, help="") +parser.add_argument("--threads", type=int, default=1, help="") +parser.add_argument("--batchSize", type=int, default=0, help="") +parser.add_argument("--verbose-output", action="store_true") +parser.add_argument("--explain", action="store_true") + +#------------------------------------------------------------------------------ +# MAIN +#------------------------------------------------------------------------------ + +if __name__ == "__main__": + + args = parser.parse_args() + exp_start = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S") + + if args.explain: + GLOBAL_ARGS += ["--explain=vectorized"] + + output = [] + for no_hf in [False, True]: + + cmd = BASE_COMMAND(args.threads, args.batchSize, no_hf) + + command_output = {} + for ops in range(args.num_ops, args.num_ops+1): + + script = generate_script(ops, args.tool, args.script, args.rows, args.cols) + with open("_horz.daph", "w") as f: + for line in script: + f.write(line + '\n') + + timings = sh.runner(args, cmd, BASE_CWD) + + 
#command_output[ops] = timings + command_output = timings + + print() + + output.append({ + "cmd": cmd, + "timings": command_output, + + }) + + with open(exp_start + "-horz_timings.json", "w+") as f: + _output = { + "settings": { + "num-ops": args.num_ops, + "rows": args.rows, + "cols": args.cols, + "type": args.script, + "tool": args.tool, + "threads": args.threads, + "samples": args.samples, + "batchSize": args.batchSize + }, + "execs": output + } + json.dump(_output, f, indent=4) + f.close() + + for i in output: + print(" ".join(i["cmd"])) + df = pd.json_normalize(i["timings"], sep=".") + tools_cols = [col for col in df.columns if col.startswith("tool")] + df[tools_cols] = df[tools_cols].astype(int) + print(tabulate(df.describe(), headers="keys", tablefmt="psql", showindex=True)) \ No newline at end of file diff --git a/shared.py b/shared.py new file mode 100644 index 000000000..eb12fdcd5 --- /dev/null +++ b/shared.py @@ -0,0 +1,110 @@ +import os +import subprocess +import json +import pandas as pd +from tabulate import tabulate + +#------------------------------------------------------------------------------ +# RUN COMMAND +#------------------------------------------------------------------------------ + +def run_command(cmd, cwd, env): + + process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=cwd, env={**env, **os.environ}) + stdout, stderr = process.communicate() + + return stdout.decode(), stderr.decode() + +def runner(args, cmd, cwd): + + tool_env = TOOLS[args.tool]["ENV"] + env_str = " ".join(f"{k}=\"{v}\"" for k, v in tool_env.items()) + cmd_str = " ".join(cmd) + print(f"Run: {env_str} {cmd_str} {cwd}") + + timings = [] + for i in range(0, args.samples): + + stdout, stderr = run_command(cmd, cwd, tool_env) + + if args.verbose_output: + print(stdout) + print(stderr) + + timing = json.loads(stderr.split("\n")[-2]) + timing["tool"] = TOOLS[args.tool]["GET_INFO"](stdout) + + df = pd.json_normalize(timing, sep=".") + 
print(tabulate(df, headers="keys", tablefmt="psql", showindex=False)) + timings.append(timing) + + return timings + +#------------------------------------------------------------------------------ +# TOOLS +#------------------------------------------------------------------------------ + +def extract_f1xm3(stdout): + lines = stdout.split('\n') + + for line in reversed(lines): + if "F1XM3" in line: + number = line.split("F1XM3:")[1] + return int(number) + return None + +def extract_papi(stdout): + lines = stdout.split('\n') + + offset = 0 + for i, line in enumerate(lines): + if line.startswith("PAPI-HL Output:"): + offset = i + break + t = "".join(lines[offset+1:]) + j = json.loads(t) + out = j["threads"]["0"]["regions"]["0"] + del out["name"] + del out["parent_region_id"] + return out + +TOOLS = { + "PAPI_STD": { + "ENV": { + "PAPI_EVENTS": "perf::CYCLES,perf::INSTRUCTIONS,perf::CACHE-REFERENCES,perf::CACHE-MISSES,perf::BRANCHES,perf::BRANCH-MISSES", + "PAPI_REPORT": "1" + }, + "START_OP": "startProfiling();", + "STOP_OP": "stopProfiling();", + "END_OP": "", + "GET_INFO": extract_papi + }, + "PAPI_L1": { + "ENV": { + "PAPI_EVENTS": "perf::L1-dcache-load-misses,perf::L1-dcache-loads,perf::L1-dcache-prefetches,perf::L1-icache-load-misses,perf::L1-icache-loads", + "PAPI_REPORT": "1", + }, + "START_OP": "startProfiling();", + "STOP_OP": "stopProfiling();", + "END_OP": "", + "GET_INFO": extract_papi + }, + "PAPI_MPLX": { + "ENV": { + "PAPI_EVENTS": "perf::CYCLES,perf::INSTRUCTIONS,perf::CACHE-REFERENCES,perf::CACHE-MISSES,perf::BRANCHES,perf::BRANCH-MISSES,perf::L1-dcache-load-misses,perf::L1-dcache-loads,perf::L1-dcache-prefetches,perf::L1-icache-load-misses,perf::L1-icache-loads", + "PAPI_REPORT": "1", + "PAPI_MULTIPLEX": "1", + }, + "START_OP": "startProfiling();", + "STOP_OP": "stopProfiling();", + "END_OP": "", + "GET_INFO": extract_papi + }, + "NOW": { + "ENV": {}, + "START_OP": "start = now();", + "STOP_OP": "end = now();", + "END_OP": "print(\"F1XM3:\"+ (end - 
start));", + "GET_INFO": extract_f1xm3 + } +} \ No newline at end of file diff --git a/sketch/bench/abs_t_exp.daph b/sketch/bench/abs_t_exp.daph new file mode 100644 index 000000000..0358198fa --- /dev/null +++ b/sketch/bench/abs_t_exp.daph @@ -0,0 +1,11 @@ +X = rand($r, $c, 0.0, 1.0, 1, 12345); + + +i1 = abs(X); +i2 = t(i1); +i3 = exp(i2); +i4 = i3 + 2; + + +print(i4[0,0]); + \ No newline at end of file diff --git a/sketch/bench/kmeans.daphne b/sketch/bench/kmeans.daphne new file mode 100644 index 000000000..d722d21b4 --- /dev/null +++ b/sketch/bench/kmeans.daphne @@ -0,0 +1,29 @@ +// K-means clustering. + +// Arguments: +// - r ... number of records +// - c ... number of centroids +// - f ... number of features +// - i ... number of iterations + +// Data generation. +X = rand($r, $f, 0.0, 1.0, 1, 12345); +C = rand($c, $f, 0.0, 1.0, 1, 67890); + +// K-means clustering (decisive part). + +for(i in 1:$i) { + D = (X @ t(C)) * -2 + t(sum(C ^ 2, 0)); + minD = aggMin(D, 0); + P = D <= minD; + P = P / sum(P, 0); + P_denom = sum(P, 1); + C = (t(P) @ X) / t(P_denom); +} + + +// Result output. 
+print(C[0,0]); +print(C[1,1]); +print(C[2,2]); + \ No newline at end of file diff --git a/sketch/bench/outerAdd_exp.daph b/sketch/bench/outerAdd_exp.daph new file mode 100644 index 000000000..d52b39cca --- /dev/null +++ b/sketch/bench/outerAdd_exp.daph @@ -0,0 +1,12 @@ +X = rand($r, 1, 0.0, 1.0, 1, 12345); +Y = rand(1, $c, 0.0, 1.0, 1, 67890); + + +i1 = outerAdd(X,Y); +i2 = exp(i1); +i3 = i2 + 2; + + + +print(i3[0,0]); + \ No newline at end of file diff --git a/sketch/bench/outerAdd_sumCol_exp.daph b/sketch/bench/outerAdd_sumCol_exp.daph new file mode 100644 index 000000000..a3ae07afb --- /dev/null +++ b/sketch/bench/outerAdd_sumCol_exp.daph @@ -0,0 +1,19 @@ + +//r=40000, c=40000 +//NoVec: Killed +//GR1: Killed +//GR2: ~33s +X = rand($r, 1, 0.0, 1.0, 1, 12345); +Y = rand(1, $c, 0.0, 1.0, 1, 67890); + + +i1 = outerAdd(X,Y); +i2 = sum(i1, 1); +i3 = outerAdd(X,i2); +i4 = sqrt(i3); +i5 = i4 + 2; + + + +print(i5[0,0]); + \ No newline at end of file diff --git a/sketch/bench/outerAdd_t.daph b/sketch/bench/outerAdd_t.daph new file mode 100644 index 000000000..fd45b4ac9 --- /dev/null +++ b/sketch/bench/outerAdd_t.daph @@ -0,0 +1,12 @@ +X = rand($r, 1, 0.0, 1.0, 1, 12345); +Y = rand(1, $c, 0.0, 1.0, 1, 67890); + + +i1 = outerAdd(X,Y); +i2 = t(i1); +i3 = i2 + 2; + + + +print(i3[0,0]); + \ No newline at end of file diff --git a/sketch/bench/outerAdd_t_exp.daph b/sketch/bench/outerAdd_t_exp.daph new file mode 100644 index 000000000..052abe92e --- /dev/null +++ b/sketch/bench/outerAdd_t_exp.daph @@ -0,0 +1,18 @@ + +//r=40000, c=40000 +//NoVec: Killed +//GR1: Killed +//GR2: ~33s +X = rand($r, 1, 0.0, 1.0, 1, 12345); +Y = rand(1, $c, 0.0, 1.0, 1, 67890); + + +i1 = outerAdd(X,Y); +i2 = t(i1); +i3 = exp(i2); +i4 = i3 + 2; + + + +print(i4[0,0]); + \ No newline at end of file diff --git a/sketch/bench/sqrt_sum.daph b/sketch/bench/sqrt_sum.daph new file mode 100644 index 000000000..0a5d95e69 --- /dev/null +++ b/sketch/bench/sqrt_sum.daph @@ -0,0 +1,9 @@ +X = rand($r, $c, 0.0, 1.0, 1, 
12345); + + +i1 = sqrt(X); +i2 = sum(i1); + + +print(i2); + \ No newline at end of file diff --git a/sketch/bench/transpose_sum.daph b/sketch/bench/transpose_sum.daph new file mode 100644 index 000000000..d087eb29d --- /dev/null +++ b/sketch/bench/transpose_sum.daph @@ -0,0 +1,9 @@ +X = rand($r, $c, 0.0, 1.0, 1, 12345); + + +t = t(X); +s = sum(t); + + +print(s); + \ No newline at end of file diff --git a/src/api/cli/DaphneUserConfig.h b/src/api/cli/DaphneUserConfig.h index d33a18e30..e2ee07ea8 100644 --- a/src/api/cli/DaphneUserConfig.h +++ b/src/api/cli/DaphneUserConfig.h @@ -18,6 +18,9 @@ #include #include +#include +#include +#include #include #include #include @@ -38,6 +41,11 @@ struct DaphneUserConfig { // Remember to update UserConfig.json accordingly! bool use_cuda = false; bool use_vectorized_exec = false; + + bool no_horizontal_fusion = false; + VectorizationType vectorizationType = GREEDY_1; + size_t batchSize = 0; + bool use_distributed = false; bool use_obj_ref_mgnt = true; bool use_ipa_const_propa = true; diff --git a/src/api/internal/daphne_internal.cpp b/src/api/internal/daphne_internal.cpp index 5b533601b..384c991b7 100644 --- a/src/api/internal/daphne_internal.cpp +++ b/src/api/internal/daphne_internal.cpp @@ -24,7 +24,10 @@ #include #include #include -#include +#include +#include "compiler/execution/DaphneIrExecutor.h" +#include "compiler/lowering/vectorize/VectorizeDefs.h" +#include #include #include #include @@ -303,7 +306,27 @@ int startDAPHNE(int argc, const char **argv, DaphneLibResult *daphneLibRes, int clEnumVal(llvm, "Show DaphneIR after llvm lowering"), clEnumVal(mlir_codegen, "Show DaphneIR after MLIR codegen")), CommaSeparated); - + + static opt vectorizeTypeList( + "vec-type", cat(daphneOptions), + llvm::cl::desc("Apply specific Vectorization pass"), + llvm::cl::values( + clEnumVal(DAPHNE, "Use original DAPHNE Vectorization pass"), + clEnumVal(GREEDY_1, "Use first Greedy Vectorization pass")), + init(GREEDY_1) + ); + + static opt 
batchSize( + "batchSize", cat(daphneOptions), + desc( + "batchSize" + ) + ); + + static opt noHorizontalFusion( + "no-hf", cat(daphneOptions), + desc("No horizontal fusion")); + static llvm::cl::list scriptArgs1("args", cat(daphneOptions), desc("Alternative way of specifying arguments to the DaphneDSL " "script; must be a comma-separated list of name-value-pairs, " @@ -367,6 +390,9 @@ int startDAPHNE(int argc, const char **argv, DaphneLibResult *daphneLibRes, int logger = std::make_unique(user_config); user_config.use_vectorized_exec = useVectorizedPipelines; + user_config.vectorizationType = vectorizeTypeList; + user_config.batchSize = batchSize; + user_config.use_distributed = useDistributedRuntime; user_config.use_obj_ref_mgnt = !noObjRefMgnt; user_config.use_ipa_const_propa = !noIPAConstPropa; @@ -514,6 +540,10 @@ int startDAPHNE(int argc, const char **argv, DaphneLibResult *daphneLibRes, int user_config.use_fpgaopencl = true; } + if (noHorizontalFusion) { + user_config.no_horizontal_fusion = true; + } + if (enableProfiling) { #ifndef USE_PAPI throw std::runtime_error("you are trying to use profiling, but daphne " diff --git a/src/compiler/execution/DaphneIrExecutor.cpp b/src/compiler/execution/DaphneIrExecutor.cpp index 67fdac21d..e1e00ddc7 100644 --- a/src/compiler/execution/DaphneIrExecutor.cpp +++ b/src/compiler/execution/DaphneIrExecutor.cpp @@ -25,6 +25,7 @@ #include +#include "compiler/lowering/vectorize/VectorizeDefs.h" #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" #include "mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h" #include "mlir/Conversion/MathToLLVM/MathToLLVM.h" @@ -140,8 +141,27 @@ bool DaphneIrExecutor::runPasses(mlir::ModuleOp module) { if (userConfig_.use_vectorized_exec || userConfig_.use_distributed) { // TODO: add inference here if we have rewrites that could apply to // vectorized pipelines due to smaller sizes - pm.addNestedPass(mlir::daphne::createVectorizeComputationsPass()); + switch (userConfig_.vectorizationType) { 
+ + case DAPHNE: + pm.addNestedPass( + mlir::daphne::createDaphneVectorizeComputationsPass()); + break; + case GREEDY_1: + pm.addNestedPass( + mlir::daphne::createGreedy1VectorizeComputationsPass(userConfig_)); + break; + default: + pm.addNestedPass( + mlir::daphne::createGreedy1VectorizeComputationsPass(userConfig_)); + break; + } pm.addPass(mlir::createCanonicalizerPass()); + if (!userConfig_.no_horizontal_fusion) { + pm.addNestedPass + (mlir::daphne::createHorizontalFusionPass()); + pm.addPass(mlir::createCanonicalizerPass()); + } } if (userConfig_.explain_vectorized) pm.addPass(mlir::daphne::createPrintIRPass("IR after vectorization:")); @@ -193,7 +213,7 @@ bool DaphneIrExecutor::runPasses(mlir::ModuleOp module) { // Initialize the use of each distinct kernels library to false. usedLibPaths = userConfig_.kernelCatalog.getLibPaths(); - + try { if (failed(pm.run(module))) { module->dump(); diff --git a/src/compiler/lowering/CMakeLists.txt b/src/compiler/lowering/CMakeLists.txt index af7f0eb88..c69f2368f 100644 --- a/src/compiler/lowering/CMakeLists.txt +++ b/src/compiler/lowering/CMakeLists.txt @@ -25,7 +25,11 @@ add_mlir_dialect_library(MLIRDaphneTransforms PhyOperatorSelectionPass.cpp RewriteToCallKernelOpPass.cpp SpecializeGenericFunctionsPass.cpp - VectorizeComputationsPass.cpp + + vectorize/DaphneVectorizeComputationsPass.cpp + vectorize/Greedy1VectorizeComputationsPass.cpp + vectorize/HorizontalFusionPass.cpp + DaphneOptPass.cpp EwOpsLowering.cpp ModOpLowering.cpp diff --git a/src/compiler/lowering/VectorizeComputationsPass.cpp b/src/compiler/lowering/vectorize/DaphneVectorizeComputationsPass.cpp similarity index 94% rename from src/compiler/lowering/VectorizeComputationsPass.cpp rename to src/compiler/lowering/vectorize/DaphneVectorizeComputationsPass.cpp index 985c6442e..18d9c4075 100644 --- a/src/compiler/lowering/VectorizeComputationsPass.cpp +++ b/src/compiler/lowering/vectorize/DaphneVectorizeComputationsPass.cpp @@ -19,12 +19,11 @@ #include 
"ir/daphneir/Passes.h" #include -#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Transforms/DialectConversion.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/raw_ostream.h" -#include #include -#include using namespace mlir; @@ -179,12 +178,15 @@ void movePipelineInterleavedOperations(Block::iterator pipelinePosition, } } -struct VectorizeComputationsPass : public PassWrapper> { +struct DaphneVectorizeComputationsPass : public PassWrapper> { void runOnOperation() final; }; } // namespace -void VectorizeComputationsPass::runOnOperation() { +void DaphneVectorizeComputationsPass::runOnOperation() { + + llvm::outs() << "DaphneVectorizeComputationsPass" << "\n"; + auto func = getOperation(); // TODO: fuse pipelines that have the matching inputs, even if no output of // the one pipeline is used by the other. @@ -194,13 +196,15 @@ void VectorizeComputationsPass::runOnOperation() { // Find vectorizable operations and their inputs of vectorizable operations std::vector vectOps; func->walk([&](daphne::Vectorizable op) { - if (CompilerUtils::isMatrixComputation(op)) + if (CompilerUtils::isMatrixComputation(op) && !llvm::isa(op)) vectOps.emplace_back(op); }); std::vector vectorizables(vectOps.begin(), vectOps.end()); std::multimap possibleMerges; for (auto v : vectorizables) { - for (auto e : llvm::zip(v->getOperands(), v.getVectorSplits())) { + auto splits = v.getVectorSplits()[0]; + for (auto e : llvm::zip(v->getOperands(), splits)) { + auto operand = std::get<0>(e); auto defOp = operand.getDefiningOp(); if (defOp && v->getBlock() == defOp->getBlock() && CompilerUtils::isMatrixComputation(defOp)) { @@ -232,7 +236,7 @@ void VectorizeComputationsPass::runOnOperation() { auto split = std::get<1>(e); // find the corresponding `OpResult` to figure out combine auto opResult = *llvm::find(defOp->getResults(), operand); - auto combine = defOp.getVectorCombines()[opResult.getResultNumber()]; + auto combine = defOp.getVectorCombines()[0][opResult.getResultNumber()]; 
if (split == daphne::VectorSplit::ROWS) { if (combine == daphne::VectorCombine::ROWS) @@ -300,8 +304,9 @@ void VectorizeComputationsPass::runOnOperation() { movePipelineInterleavedOperations(builder.getInsertionPoint(), pipeline); for (auto vIt = pipeline.rbegin(); vIt != pipeline.rend(); ++vIt) { auto v = *vIt; - auto vSplits = v.getVectorSplits(); - auto vCombines = v.getVectorCombines(); + auto vSplits = v.getVectorSplits()[0]; + auto vCombines = v.getVectorCombines()[0]; + auto vOutSizes = v.createOpsOutputSizes(builder)[0]; // TODO: although we do create enum attributes, it might make // sense/make it easier to // just directly use an I64ArrayAttribute @@ -319,7 +324,7 @@ void VectorizeComputationsPass::runOnOperation() { for (auto result : v->getResults()) { results.push_back(result); } - for (auto outSize : v.createOpsOutputSizes(builder)) { + for (auto outSize : vOutSizes) { outRows.push_back(outSize.first); outCols.push_back(outSize.second); } @@ -404,6 +409,6 @@ void VectorizeComputationsPass::runOnOperation() { } } -std::unique_ptr daphne::createVectorizeComputationsPass() { - return std::make_unique(); +std::unique_ptr daphne::createDaphneVectorizeComputationsPass() { + return std::make_unique(); } diff --git a/src/compiler/lowering/vectorize/Greedy1VectorizeComputationsPass.cpp b/src/compiler/lowering/vectorize/Greedy1VectorizeComputationsPass.cpp new file mode 100644 index 000000000..84941f8a0 --- /dev/null +++ b/src/compiler/lowering/vectorize/Greedy1VectorizeComputationsPass.cpp @@ -0,0 +1,261 @@ +/* + * Copyright 2021 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "api/cli/DaphneUserConfig.h" +#include "compiler/lowering/vectorize/VectorUtils.h" +#include "ir/daphneir/Daphne.h" +#include "ir/daphneir/DaphneVectorizableOpInterface.h" +#include "ir/daphneir/Passes.h" + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Value.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "llvm/Support/Casting.h" +#include "llvm/Support/raw_ostream.h" +#include + +#include +#include + +using namespace mlir; + +namespace { + +//----------------------------------------------------------------- +// CONST +//----------------------------------------------------------------- + + +//----------------------------------------------------------------- +// Class functions +//----------------------------------------------------------------- + +struct Greedy1VectorizeComputationsPass + : public PassWrapper> { + void runOnOperation() final; + + const DaphneUserConfig& userConfig; + + explicit Greedy1VectorizeComputationsPass(const DaphneUserConfig& cfg) : userConfig(cfg) {} +}; + +void printStack(std::stack> s) { + llvm::outs() << "["; + while (!s.empty()) { + auto op = s.top(); + llvm::outs() << "(" << std::get<0>(op)->getName().getStringRef().str() << ", " << std::get<1>(op) << "), "; + s.pop(); + } + llvm::outs() << "]\n"; +} + +void printGraph(std::vector leafOps, std::string filename) { + 
std::stack stack; + std::ofstream dot(filename); + if (!dot.is_open()) { + throw std::runtime_error("test"); + } + + dot << "digraph G {\n"; + for (auto leaf : leafOps) { + stack.push(leaf); + } + + std::vector visited; + + while (!stack.empty()) { + auto op = stack.top(); + stack.pop(); + if (std::find(visited.begin(), visited.end(), op) != visited.end()) { + continue; + } + visited.push_back(op); + + auto v = llvm::dyn_cast(op); + for (unsigned i = 0; i < v->getNumOperands(); ++i) { + mlir::Value e = v->getOperand(i); + auto defOp = e.getDefiningOp(); + if (llvm::isa(e.getType()) && llvm::isa(defOp)) { + dot << "\"" << defOp->getName().getStringRef().str() << "+" << std::hex + << reinterpret_cast(defOp) << "\" -> \"" << op->getName().getStringRef().str() << "+" + << std::hex << reinterpret_cast(op) << "\" [label=\"" << i << "\"];\n"; + stack.push(defOp); + } + } + } + dot << "}"; + dot.close(); +} +} // namespace + +void Greedy1VectorizeComputationsPass::runOnOperation() { + + auto func = getOperation(); + + VectorIndex ZeroDecision = 0; + /*if (userConfig.colFirst) { + ZeroDecision = 1; + }*/ + + std::vector ops; + func->walk([&](daphne::Vectorizable op) { + for (auto opType : op->getOperandTypes()) { + if (!opType.isIntOrIndexOrFloat() && !llvm::isa(opType)) { + ops.emplace_back(op); + break; + } + } + }); + std::reverse(ops.begin(), ops.end()); + + // result + std::vector pipelines; + std::vector leafOps; + std::stack> stack; + + for (const auto &op : ops) { + auto users = op->getUsers(); + bool found = false; + for (auto u : users) { + if (std::find(ops.begin(), ops.end(), u) != ops.end()) { + found = true; + break; + } + } + if (!found) { + leafOps.push_back(op); + stack.push({op, nullptr, DisconnectReason::INVALID}); + } + } + + std::multimap mmProducerConsumerRelationships; + std::map operationToPipeline; + + // std::vector boundingOperations; + + while (!stack.empty()) { + auto t = stack.top(); + stack.pop(); + auto op = std::get<0>(t); + auto 
currPipeline = std::get<1>(t); + auto disReason = std::get<2>(t); + + if (operationToPipeline.find(op) != operationToPipeline.end()) { + auto producerPipeline = operationToPipeline.at(op); + mmProducerConsumerRelationships.insert({{currPipeline, producerPipeline}, disReason}); + continue; + } + + if (disReason != DisconnectReason::NONE) { + auto _pipeline = new Pipeline(); + pipelines.push_back(_pipeline); + + // check needed for empty init + if (currPipeline != nullptr) + mmProducerConsumerRelationships.insert({{currPipeline, _pipeline}, disReason}); + + currPipeline = _pipeline; + } + + operationToPipeline.insert({op, currPipeline}); + currPipeline->push_back(op); + + auto vectOp = llvm::dyn_cast(op); + + for (size_t i = 0; i < vectOp->getNumOperands(); ++i) { + auto operand = vectOp->getOperand(i); + + // llvm::outs() << op->getName().getStringRef().str() << " "; + + if (!llvm::isa(operand.getType())) + continue; + + if (llvm::isa(operand)) { + continue; + } + + // could it help to check if we check if operand.getDefiningOp is inside (global) ops vector? + if (auto vectDefOp = llvm::dyn_cast(operand.getDefiningOp())) { + // llvm::outs() << vectDefOp->getName().getStringRef().str() << "\n"; + + auto split = vectOp.getVectorSplits()[ZeroDecision][i]; + auto combine = vectDefOp.getVectorCombines()[ZeroDecision][0]; + + // same block missing + if (VectorUtils::matchingVectorSplitCombine(split, combine) && + vectDefOp->getBlock() == vectOp->getBlock()) { + if (vectDefOp->hasOneUse()) { + stack.push({vectDefOp, currPipeline, DisconnectReason::NONE}); + } else { + stack.push({vectDefOp, currPipeline, DisconnectReason::MULTIPLE_CONSUMERS}); + } + } else { + stack.push({vectDefOp, currPipeline, DisconnectReason::INVALID}); + } + } else { + // defOp is outside of consideration, top horz. 
fusion possible + // boundingOperations.push_back(op); + // llvm::outs() << "\n"; + } + } + } + + // Needed as Greedy1 is only considering the first possiblity + std::map decisionIxs; + for (const auto &op : ops) { + decisionIxs.insert({op, ZeroDecision}); + } + + // mmPCR to PCR + std::map producerConsumerRelationships = + VectorUtils::consolidateProducerConsumerRelationship(mmProducerConsumerRelationships); + + VectorUtils::greedyMergePipelinesProducerConsumer(pipelines, operationToPipeline, producerConsumerRelationships); + + // VectorUtils::DEBUG::printPipelines(pipelines); + + // Post Processing + + std::vector _pipelines; + _pipelines.resize(pipelines.size()); + + std::transform(pipelines.begin(), pipelines.end(), _pipelines.begin(), [](const auto &ptr) { return *ptr; }); + + // will crash if for some reason the pipelines itself are not topologically sorted + VectorUtils::createVectorizedPipelineOps(func, _pipelines, decisionIxs); + + return; + +} + +std::unique_ptr daphne::createGreedy1VectorizeComputationsPass(const DaphneUserConfig& cfg) { + return std::make_unique(cfg); +} \ No newline at end of file diff --git a/src/compiler/lowering/vectorize/HorizontalFusionPass.cpp b/src/compiler/lowering/vectorize/HorizontalFusionPass.cpp new file mode 100644 index 000000000..984b2cc9c --- /dev/null +++ b/src/compiler/lowering/vectorize/HorizontalFusionPass.cpp @@ -0,0 +1,251 @@ +/* + * Copyright 2021 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compiler/lowering/vectorize/VectorUtils.h" +#include "ir/daphneir/Daphne.h" +#include "ir/daphneir/Passes.h" + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include +#include +#include +#include +#include +#include +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Value.h" +#include "mlir/Transforms/DialectConversion.h" + +#include +#include +#include + +#include +#include "llvm/Support/Casting.h" + +#include +#include + +using namespace mlir; + +namespace +{ + + //----------------------------------------------------------------- + // Class + //----------------------------------------------------------------- + + struct HorizontalFusionPass : public PassWrapper> { + void runOnOperation() final; + }; + + //----------------------------------------------------------------- + // Helper function + //----------------------------------------------------------------- + + void moveOperationToBlock(mlir::Builder &builder, mlir::Block *src, mlir::Block *dest, std::vector &newResults) { + + // Iterate over all operations in src block and move them to dest block. + // Rewrite block arguments of operations to the dest block arguments + // and store values for the results for overriding of the old values. 
+ while(!src->empty()) { + auto op = src->begin(); + + for(size_t i = 0; i < op->getNumOperands(); ++i) { + auto operand = op->getOperand(i); + if (llvm::isa(operand)) { + auto blockArgument = dest->addArgument(operand.getType(), builder.getUnknownLoc()); + op->setOperand(i, blockArgument); + } + } + if (!llvm::isa(op)) { + op->moveBefore(dest, dest->end()); + } + else { + newResults.insert(newResults.end(), op->operand_begin(), op->operand_end()); + op->erase(); + return; + } + } + } + +} + +//----------------------------------------------------------------- +// Horizontal Fusion / Sibling Fusion (Scan-sharing of inputs) +//----------------------------------------------------------------- +// +// Two operations share a single operand from the same producer. +// +// producer +// / \ +// consumer1 consumer2 +// +// => (consumer1, consumer2) +// +// consumer1 and consumer2 cannot have a producer-consumer relationship directly or transitively, +// as if a merge of these operations where possible it would happen in Greedy1/Greedy2. + +void HorizontalFusionPass::runOnOperation() +{ + auto func = getOperation(); + + // After merging of pipelines, we need to rerun the pass + // to check for additional (changed) fusion possiblities. + bool changed = true; + while(changed) { + changed = false; + + std::vector pipelineOps; + func->walk([&](daphne::VectorizedPipelineOp op) { + pipelineOps.emplace_back(op); + }); + std::reverse(pipelineOps.begin(), pipelineOps.end()); + + //----------------------------------------------------------------- + // Identify horizontal fusion possibilities + //----------------------------------------------------------------- + + // Check for overlapping/intersection of operands between pipeline arguments. + // They need to be compatible according to the corresponding split of an argument. 
+ std::vector horizontalRelationships; + for (auto it1 = pipelineOps.begin(); it1 != pipelineOps.end(); ++it1) { + auto pipeOp1 = *it1; + + // Store defOps for the corresponding arguments of pipeOp1. + llvm::SmallVector defOpsArgs; + // Running over the split size for consideration of relevant args (excl. OutCols, OutRows). + for(size_t operandIx1 = 0; operandIx1 < pipeOp1.getSplits().size(); ++operandIx1) { + auto operand1 = pipeOp1->getOperand(operandIx1); + if (auto defOp = operand1.getDefiningOp()) { + defOpsArgs.push_back(defOp); + } + } + + for (auto it2 = next(it1); it2 != pipelineOps.end(); ++it2) { + auto pipeOp2 = *it2; + + // PipelineOps need to be in the same block. + if (pipeOp1->getBlock() != pipeOp2->getBlock()) + continue; + + // PipelineOps cannot (transitively) depend on each other. + if (VectorUtils::arePipelineOpsDependent(pipeOp1, pipeOp2)) + continue; + + // Checking for overlapping arguments. + for(size_t operandIx2 = 0; operandIx2 < pipeOp2.getSplits().size(); ++operandIx2) { + auto operand2 = pipeOp2->getOperand(operandIx2); + + if (auto defOp = operand2.getDefiningOp()) { + + // Check if defOp is also in the defOps for the pipeOp1 arguments. + auto fIt = std::find(defOpsArgs.begin(), defOpsArgs.end(), defOp); + if (fIt != defOpsArgs.end()) { + + size_t operandIx1 = std::distance(defOpsArgs.begin(), fIt); + + if (pipeOp1.getSplits()[operandIx1] == pipeOp2.getSplits()[operandIx2] && + pipeOp1.getSplits()[operandIx1].cast().getValue() != daphne::VectorSplit::NONE) { + horizontalRelationships.push_back({pipeOp1, pipeOp2}); + break; // We only need one case of arguments matching. 
+ } + } + } + } + } + } + + //----------------------------------------------------------------- + // Merge VectorizedPipelineOps + //----------------------------------------------------------------- + + for(auto pipeOpPair : horizontalRelationships) { + + auto [pipeOp1, pipeOp2] = pipeOpPair; + + mlir::Block* b1 = &pipeOp1.getBody().getBlocks().front(); + mlir::Block* b2 = &pipeOp2.getBody().getBlocks().front(); + + // Merge attributes and values + auto vSplitAttrs = std::vector(pipeOp1.getSplits().begin(), pipeOp1.getSplits().end()); + vSplitAttrs.insert(vSplitAttrs.end(), pipeOp2.getSplits().begin(), pipeOp2.getSplits().end()); + + auto vCombineAttrs = std::vector(pipeOp1.getCombines().begin(), pipeOp1.getCombines().end()); + vCombineAttrs.insert(vCombineAttrs.end(), pipeOp2.getCombines().begin(), pipeOp2.getCombines().end()); + + auto oldResults = std::vector(pipeOp1->getResults().begin(), pipeOp1->getResults().end()); + oldResults.insert(oldResults.end(), pipeOp2->getResults().begin(), pipeOp2->getResults().end()); + + auto operands = std::vector(pipeOp1->getOperands().begin(), pipeOp1->getOperands().begin() + pipeOp1.getSplits().size()); + operands.insert(operands.end(), pipeOp2->getOperands().begin(), pipeOp2->getOperands().begin() + pipeOp2.getSplits().size()); + + auto outRows = std::vector(pipeOp1.getOutRows().begin(), pipeOp1.getOutRows().end()); + outRows.insert(outRows.end(), pipeOp2.getOutRows().begin(), pipeOp2.getOutRows().end()); + + auto outCols = std::vector(pipeOp1.getOutCols().begin(), pipeOp1.getOutCols().end()); + outCols.insert(outCols.end(), pipeOp2.getOutCols().begin(), pipeOp2.getOutCols().end()); + + // Create new PipelineOp + mlir::OpBuilder builder(func); + auto loc = builder.getFusedLoc({pipeOp1.getLoc(), pipeOp2->getLoc()}); + auto pipelineOp = builder.create(loc, + mlir::ValueRange(oldResults).getTypes(), + operands, + outRows, + outCols, + builder.getArrayAttr(vSplitAttrs), + builder.getArrayAttr(vCombineAttrs), + nullptr); + 
mlir::Block *bodyBlock = builder.createBlock(&pipelineOp.getBody()); + + //Move operations to new PipelineOp block. + auto newResults = std::vector(); + moveOperationToBlock(builder, b1, bodyBlock, newResults); + moveOperationToBlock(builder, b2, bodyBlock, newResults); + + // Create new ReturnOp. + builder.setInsertionPointToEnd(bodyBlock); + builder.create(loc, newResults); + + // Rewrite all uses to new ReturnOp. + for (size_t i = 0; i < oldResults.size(); ++i) { + oldResults.at(i).replaceAllUsesWith(pipelineOp.getResult(i)); + } + + // Place to the location after the last PipelineOp of this pair. + // Is this sufficient? + pipelineOp->moveAfter(pipeOp1); + + // Clean up + pipeOp1->erase(); + pipeOp2->erase(); + + //suboptimal + changed = true; + break; + } + } + + return; +} + + +std::unique_ptr daphne::createHorizontalFusionPass() { + return std::make_unique(); +} \ No newline at end of file diff --git a/src/compiler/lowering/vectorize/VectorUtils.h b/src/compiler/lowering/vectorize/VectorUtils.h new file mode 100644 index 000000000..c8571d3ae --- /dev/null +++ b/src/compiler/lowering/vectorize/VectorUtils.h @@ -0,0 +1,886 @@ +/* + * Copyright 2021 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+#include "ir/daphneir/Daphne.h"
+#include "ir/daphneir/DaphneVectorizableOpInterface.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Transforms/TopologicalSortUtils.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "mlir/IR/Operation.h"
+#include "mlir/Support/LLVM.h"
+#include "llvm/IR/PassManagerInternal.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ManagedStatic.h"
+
+using VectorIndex = std::size_t;
+using Pipeline = std::vector;
+using PipelinePair = std::pair;
+
+using PipelineOpPair = std::pair;
+
+namespace std {
+template <> struct hash {
+ size_t operator()(const PipelinePair &p) const {
+ return std::hash{}(p.first) ^ std::hash{}(p.second);
+ }
+};
+} // namespace std
+
+enum class DisconnectReason { NONE, MULTIPLE_CONSUMERS, INVALID };
+
+enum class EdgeStatus { INVALID, ACTIVE, INACTIVE };
+
+struct VectorUtils {
+
+ /**
+ * @brief Checks if a VectorSplit and a VectorCombine are compatible.
+ *
+ * This function compares the provided VectorSplit and VectorCombine to
+ * determine if they match by remapping the split to a matching combine.
+ * Compatible pairs are ROWS-ROWS and COLS-COLS.
+ *
+ * @param split VectorSplit value representing the split of an operation.
+ * @param combine VectorCombine value representing the combine of an operation.
+ * @return true, if VectorSplit and VectorCombine are compatible
+ * @return false, otherwise
+ */
+
+ static bool matchingVectorSplitCombine(mlir::daphne::VectorSplit split, mlir::daphne::VectorCombine combine) {
+ // llvm::outs() << split << " " << combine << " ";
+ mlir::daphne::VectorCombine _operandCombine;
+ switch (split) {
+ case mlir::daphne::VectorSplit::ROWS:
+ _operandCombine = mlir::daphne::VectorCombine::ROWS;
+ break;
+ case mlir::daphne::VectorSplit::COLS:
+ _operandCombine = mlir::daphne::VectorCombine::COLS;
+ break;
+ default:
+ // No matching split/combine; basically resulting in separate pipelines
+ return false;
+ }
+ if (combine == _operandCombine) {
+ return true;
+ }
+ return false;
+ }
+
+ // Greedy merge along (valid) MULTIPLE_CONSUMER relationships
+ // by checking if resulting pipelines can be sorted topologically.
+ static void
+ greedyMergePipelinesProducerConsumer(std::vector &pipelines,
+ std::map &operationToPipeline,
+ std::map &producerConsumerRelationships) {
+ bool change = true;
+ while (change) {
+ change = false;
+
+ std::multimap mmPCR;
+ for (const auto &[pipePair, disReason] : producerConsumerRelationships) {
+
+ if (disReason == DisconnectReason::INVALID)
+ continue;
+
+ if (VectorUtils::tryTopologicalSortMerged(pipelines, producerConsumerRelationships, pipePair.first,
+ pipePair.second)) {
+ auto mergedPipeline =
+ VectorUtils::mergePipelines(pipelines, operationToPipeline, pipePair.first, pipePair.second);
+
+ for (const auto &[_pipePair, _disReason] : producerConsumerRelationships) {
+
+ // Ignore in case the current pair is pipePair
+ if (_pipePair.first == pipePair.first && _pipePair.second == pipePair.second)
+ continue;
+
+ // Rewrite Relationships
+ if (_pipePair.first == pipePair.first || _pipePair.first == pipePair.second) {
+ auto newPipePair = std::make_pair(mergedPipeline, _pipePair.second);
+ mmPCR.insert({newPipePair, _disReason});
+ } else if (_pipePair.second == pipePair.first || _pipePair.second
== pipePair.second) { + auto newPipePair = std::make_pair(_pipePair.first, mergedPipeline); + mmPCR.insert({newPipePair, _disReason}); + } else { + mmPCR.insert({_pipePair, _disReason}); + } + } + + change = true; + break; + } + } + + // In case of no change the mmPCR is not filled, ignore + if (change) + producerConsumerRelationships = VectorUtils::consolidateProducerConsumerRelationship(mmPCR); + + // VectorUtils::DEBUG::printPCR(producerConsumerRelationships); + // VectorUtils::DEBUG::printPipelines(pipelines); + } + } + + //------------------------------------------------------------------------------ + + // Function merges two pipelines into one by appending all operations from one pipeline to another + // Order is not really considered, as it is embodied in IR + static void mergePipelines(std::vector &pipelines, + std::map &operationToPipelineIx, size_t pipeIx1, + size_t pipeIx2) { + // llvm::outs() << mergeFromIx << " " << mergeIntoIx << "\n"; + if (pipeIx1 == pipeIx2) + return; + if (pipeIx2 > pipeIx1) + std::swap(pipeIx1, pipeIx2); + + std::vector *mergedPipeline(pipelines.at(pipeIx2)); + for (auto op : *pipelines.at(pipeIx1)) { + if (std::find(mergedPipeline->begin(), mergedPipeline->end(), op) == mergedPipeline->end()) { + mergedPipeline->push_back(op); + operationToPipelineIx[op] = pipeIx2; + } + } + pipelines.at(pipeIx2) = std::move(mergedPipeline); + pipelines.erase(pipelines.begin() + pipeIx1); + } + + static Pipeline *mergePipelines(std::vector &pipelines, + std::map &operationToPipeline, Pipeline *pipe1, + Pipeline *pipe2) { + if (pipe1 == pipe2) + return nullptr; + + for (auto op : *pipe2) { + if (std::find(pipe1->begin(), pipe1->end(), op) == pipe1->end()) { + pipe1->push_back(op); + operationToPipeline[op] = pipe1; + } + } + + auto pipeIx2 = std::find(pipelines.begin(), pipelines.end(), pipe2); + pipelines.erase(pipeIx2); + return pipe1; + } + + // only works if pipeline ops are topologically sorted in reverse + static bool 
arePipelineOpsDependent(mlir::daphne::VectorizedPipelineOp pipeOp1,
+ mlir::daphne::VectorizedPipelineOp pipeOp2) {
+
+ if (pipeOp1 == pipeOp2)
+ return true;
+
+ std::stack s;
+ std::unordered_set visited;
+
+ s.push(pipeOp1);
+ while (!s.empty()) {
+ mlir::Operation *currOp = s.top();
+ s.pop();
+
+ // Connection found
+ if (currOp == pipeOp2)
+ return true;
+
+ if (visited.insert(currOp).second) {
+ for (const auto &operand : currOp->getOperands()) {
+ if (auto defOp = operand.getDefiningOp()) {
+ s.push(defOp);
+ }
+ }
+ }
+ }
+
+ return false;
+ }
+
+ static bool tryTopologicalSortMerged(std::vector &pipelines,
+ std::map &rel, Pipeline *pipe1,
+ Pipeline *pipe2) {
+
+ // if (pipe2 > pipe1)
+ // std::swap(pipe1, pipe2);
+
+ // prealloc
+ std::map> pipeline_graph;
+ for (auto pipe : pipelines) {
+ if (pipe == pipe1)
+ pipe = pipe2;
+ pipeline_graph.insert({pipe, {}});
+ }
+
+ for (auto &[key, _] : rel) {
+ auto consumer = key.second;
+ auto producer = key.first;
+
+ if (consumer == pipe1) {
+ consumer = pipe2;
+ } else if (producer == pipe1) {
+ producer = pipe2;
+ }
+
+ if (producer == consumer)
+ continue;
+
+ if (pipeline_graph.find(consumer) == pipeline_graph.end()) {
+ pipeline_graph.insert({consumer, {producer}});
+ } else {
+ pipeline_graph.at(consumer).insert(producer);
+ }
+ }
+
+ /*for (auto node : pipeline_graph) {
+ llvm::outs() << "Key: " << node.first << ", Values: ";
+ for (auto dependency : node.second) {
+ llvm::outs() << dependency << " ";
+ }
+ llvm::outs() << "\n";
+ }
+ llvm::outs() << "\n";*/
+
+ return tryTopologicalSort(pipeline_graph);
+ }
+
+ static std::map
+ consolidateProducerConsumerRelationship(std::multimap mmPCR) {
+ std::map pcr;
+ for (const auto &[pipePair, disReason] : mmPCR) {
+ if (pcr.find(pipePair) == pcr.end()) {
+ pcr.insert({pipePair, disReason});
+ } else {
+ // Overwrite if INVALID as it dominates MULTI_CONSUMER relationship
+ if (disReason == DisconnectReason::INVALID) {
+ pcr.insert_or_assign(pipePair, disReason);
+ } + } + } + return pcr; + } + + //------------------------------------------------------------------------------ + + private: + // kahn: https://dev.to/leopfeiffer/topological-sort-with-kahns-algorithm-3dl1 + // https://leetcode.com/problems/course-schedule/solutions/483330/c-kahns-algorithm-topological-sort-with-easy-detailed-explanation-16-ms-beats-98/ + static bool tryTopologicalSort(std::map> pipeline_graph) { + + std::unordered_map inDegrees; + for (auto node : pipeline_graph) { + for (auto dependency : node.second) { + ++inDegrees[dependency]; + } + } + + std::queue queue; + for (auto node : pipeline_graph) { + if (inDegrees[node.first] == 0) { + queue.push(node.first); + } + } + + std::vector result; + while (!queue.empty()) { + size_t node = queue.front(); + queue.pop(); + result.push_back(node); + for (auto dependency : pipeline_graph.at(node)) { + if (--inDegrees[dependency] == 0) { + queue.push(dependency); + } + } + } + + return result.size() == pipeline_graph.size(); + } + + static bool tryTopologicalSort(std::map> pipeline_graph) { + + std::unordered_map inDegrees; + for (auto node : pipeline_graph) { + for (auto dependency : node.second) { + ++inDegrees[dependency]; + } + } + + std::queue queue; + for (auto node : pipeline_graph) { + if (inDegrees[node.first] == 0) { + queue.push(node.first); + } + } + + std::vector result; + while (!queue.empty()) { + Pipeline *node = queue.front(); + queue.pop(); + result.push_back(node); + for (auto dependency : pipeline_graph.at(node)) { + if (--inDegrees[dependency] == 0) { + queue.push(dependency); + } + } + } + + return result.size() == pipeline_graph.size(); + } + + public: + /** + * @brief Recursive function checking if the given value is transitively dependant on the operation `op`. 
+ * @param value The value to check + * @param op The operation to check + * @return true if there is a dependency, false otherwise + */ + static bool valueDependsOnResultOf(mlir::Value value, mlir::Operation *op) { + if (auto defOp = value.getDefiningOp()) { + if (defOp == op) + return true; +#if 1 + // TODO This crashes if defOp and op are not in the same block. + // At the same time, it does not seem to be strictly required. + // if (defOp->isBeforeInBlock(op)) + // Nevertheless, this modified line seems to be a good soft-filter; + // without that, the vectorization pass may take very long on + // programs with 100s of operations. + if (defOp->getBlock() == op->getBlock() && defOp->isBeforeInBlock(op)) + // can't have results of `op` as inputs, as it is defined before + return false; +#endif + for (auto operand : defOp->getOperands()) { + if (valueDependsOnResultOf(operand, op)) + return true; + } + } + return false; + } + + /** + * @brief Moves operation which are between the operations, which should be fused into a single pipeline, before + * or after the position where the pipeline will be placed. 
+ * @param pipelinePosition The position where the pipeline will be + * @param pipeline The pipeline for which this function should be executed + */ + static void movePipelineInterleavedOperations(mlir::Block::iterator pipelinePosition, + const std::vector pipeline) { + // first operation in pipeline vector is last in IR, and the last is the first + auto startPos = pipeline.back()->getIterator(); + auto endPos = pipeline.front()->getIterator(); + auto currSkip = pipeline.rbegin(); + + std::vector moveBeforeOps; + std::vector moveAfterOps; + + for (auto it = startPos; it != endPos; ++it) { + if (it == (*currSkip)->getIterator()) { + ++currSkip; + continue; + } + + bool dependsOnPipeline = false; + auto pipelineOpsBeforeIt = currSkip; + while (--pipelineOpsBeforeIt != pipeline.rbegin()) { + for (auto operand : it->getOperands()) { + if (valueDependsOnResultOf(operand, *pipelineOpsBeforeIt)) { + dependsOnPipeline = true; + break; + } + } + if (dependsOnPipeline) { + break; + } + } + + for (auto operand : it->getOperands()) { + if (valueDependsOnResultOf(operand, *pipelineOpsBeforeIt)) { + dependsOnPipeline = true; + break; + } + } + if (dependsOnPipeline) { + moveAfterOps.push_back(&(*it)); + } else { + moveBeforeOps.push_back(&(*it)); + } + } + + for (auto moveBeforeOp : moveBeforeOps) { + moveBeforeOp->moveBefore(pipelinePosition->getBlock(), pipelinePosition); + } + for (auto moveAfterOp : moveAfterOps) { + moveAfterOp->moveAfter(pipelinePosition->getBlock(), pipelinePosition); + pipelinePosition = moveAfterOp->getIterator(); + } + } + + static void createVectorizedPipelineOps(mlir::func::FuncOp func, std::vector pipelines, + std::map decisionIxs) { + mlir::OpBuilder builder(func); + + // Create the `VectorizedPipelineOp`s + for (auto _pipeline : pipelines) { + if (_pipeline.empty()) + continue; + + auto valueIsPartOfPipeline = [&](mlir::Value operand) { + return llvm::any_of(_pipeline, [&](mlir::Operation *lv) { return lv == operand.getDefiningOp(); }); + }; + 
std::vector vSplitAttrs; + std::vector vCombineAttrs; + std::vector locations; + std::vector results; + std::vector operands; + std::vector outRows; + std::vector outCols; + + // first op in pipeline is last in IR + builder.setInsertionPoint(_pipeline.front()); + // move all operations, between the operations that will be part of the pipeline, before or after the + // completed pipeline + VectorUtils::movePipelineInterleavedOperations(builder.getInsertionPoint(), _pipeline); + + // potential addition for + std::vector pipeline; + for (auto vIt = _pipeline.rbegin(); vIt != _pipeline.rend(); ++vIt) { + auto v = *vIt; + + auto vSplits = std::vector(); + auto vCombines = std::vector(); + auto opsOutputSizes = std::vector>(); + if (auto vec = llvm::dyn_cast(v)) { + size_t d = decisionIxs[v]; + vSplits = vec.getVectorSplits()[d]; + vCombines = vec.getVectorCombines()[d]; + opsOutputSizes = vec.createOpsOutputSizes(builder)[d]; + } else { + throw std::runtime_error("Vectorizable op not found"); + } + + pipeline.push_back(v); + + // TODO: although we do create enum attributes, it might make sense/make it easier to + // just directly use an I64ArrayAttribute + // Determination of operands of VectorizedPipelineOps! + for (auto i = 0u; i < v->getNumOperands(); ++i) { + auto operand = v->getOperand(i); + if (!valueIsPartOfPipeline(operand)) { + vSplitAttrs.push_back(mlir::daphne::VectorSplitAttr::get(func.getContext(), vSplits[i])); + operands.push_back(operand); + } + } + + // Determination of results of VectorizedPipelineOps! 
+ for (auto vCombine : vCombines) { + vCombineAttrs.push_back(mlir::daphne::VectorCombineAttr::get(func.getContext(), vCombine)); + } + locations.push_back(v->getLoc()); + for (auto result : v->getResults()) { + results.push_back(result); + } + for (auto outSize : opsOutputSizes) { + outRows.push_back(outSize.first); + outCols.push_back(outSize.second); + } + + // check if any of the outputs type of an operator is a scalar value + // if yes, add additional castOps inside pipeline and outside pipeline + for (size_t i = 0; i < v->getNumResults(); i++) { + auto r = v->getResult(0); + // TODO: check if it includes all types used in daphne + if (r.getType().isIntOrIndexOrFloat()) { + auto m1x1 = mlir::daphne::MatrixType::get(func.getContext(), r.getType(), 1, 1, 1, + mlir::daphne::MatrixRepresentation::Dense); + auto loc = v->getLoc(); + + auto toCastOp = builder.create(loc, m1x1, r); + toCastOp->moveAfter(v); + + // xxxxxx + pipeline.push_back(toCastOp); + vCombineAttrs.push_back(mlir::daphne::VectorCombineAttr::get(func.getContext(), vCombines[i])); + auto cst1 = builder.create(loc, builder.getIndexType(), + builder.getIndexAttr(1l)); + outRows.push_back(cst1); + outCols.push_back(cst1); + results.push_back(toCastOp); + + auto fromCastOp = builder.create(loc, r.getType(), toCastOp); + r.replaceAllUsesExcept(fromCastOp, toCastOp); + + mlir::Operation *firstUseOp = nullptr; + for (const auto &use : fromCastOp->getUses()) { + auto user = use.getOwner(); + + if (!firstUseOp || user->isBeforeInBlock(firstUseOp)) { + firstUseOp = user; + } + } + + fromCastOp->moveBefore(firstUseOp); + } + } + } + + std::vector locs; + locs.reserve(_pipeline.size()); + for (auto op : pipeline) { + locs.push_back(op->getLoc()); + } + + auto loc = builder.getFusedLoc(locs); + auto pipelineOp = builder.create( + loc, mlir::ValueRange(results).getTypes(), operands, outRows, outCols, + builder.getArrayAttr(vSplitAttrs), builder.getArrayAttr(vCombineAttrs), nullptr); + mlir::Block *bodyBlock = 
builder.createBlock(&pipelineOp.getBody()); + + // remove information from input matrices of pipeline + for (size_t i = 0u; i < operands.size(); ++i) { + auto argTy = operands[i].getType(); + switch (vSplitAttrs[i].cast().getValue()) { + case mlir::daphne::VectorSplit::ROWS: { + auto matTy = argTy.cast(); + // only remove row information + argTy = matTy.withShape(-1, matTy.getNumCols()); + break; + } + case mlir::daphne::VectorSplit::COLS: { + auto matTy = argTy.cast(); + // only remove col information + argTy = matTy.withShape(matTy.getNumRows(), -1); + break; + } + case mlir::daphne::VectorSplit::NONE: + // keep any size information + break; + } + bodyBlock->addArgument(argTy, builder.getUnknownLoc()); + } + + auto argsIx = 0u; + auto resultsIx = 0u; + // for every op in pipeline + try { + + for (auto vIt = pipeline.begin(); vIt != pipeline.end(); ++vIt) { + auto v = *vIt; + auto numOperands = v->getNumOperands(); + auto numResults = v->getNumResults(); + + // move v before end of block + v->moveBefore(bodyBlock, bodyBlock->end()); + + // set operands to arguments of body block, if defOp is not part of the pipeline + for (auto i = 0u; i < numOperands; ++i) { + if (!valueIsPartOfPipeline(v->getOperand(i))) { + v->setOperand(i, bodyBlock->getArgument(argsIx++)); + } + } + + auto pipelineReplaceResults = pipelineOp->getResults().drop_front(resultsIx).take_front(numResults); + resultsIx += numResults; + for (auto z : llvm::zip(v->getResults(), pipelineReplaceResults)) { + auto old = std::get<0>(z); + auto replacement = std::get<1>(z); + + // TODO: switch to type based size inference instead + // FIXME: if output is dynamic sized, we can't do this + // replace `NumRowOp` and `NumColOp`s for output size inference + for (auto &use : old.getUses()) { + + auto *op = use.getOwner(); + + if (auto nrowOp = llvm::dyn_cast(op)) { + nrowOp.replaceAllUsesWith(pipelineOp.getOutRows()[replacement.getResultNumber()]); + nrowOp.erase(); + } + if (auto ncolOp = llvm::dyn_cast(op)) { 
+ ncolOp.replaceAllUsesWith(pipelineOp.getOutCols()[replacement.getResultNumber()]); + ncolOp.erase(); + } + } + // Replace only if not used by pipeline op + old.replaceUsesWithIf(replacement, [&](mlir::OpOperand &opOperand) { + return llvm::count(pipeline, opOperand.getOwner()) == 0; + }); + } + } + } catch (...) { + llvm::outs() << "TEST:" << "\n"; + func.print(llvm::outs()); + llvm::outs() << "\n"; + } + bodyBlock->walk([](mlir::Operation *op) { + for (auto resVal : op->getResults()) { + if (auto ty = resVal.getType().dyn_cast()) { + resVal.setType(ty.withShape(-1, -1)); + } + } + }); + builder.setInsertionPointToEnd(bodyBlock); + builder.create(loc, results); + if (!mlir::sortTopologically(bodyBlock)) { + throw std::runtime_error("topoSort"); + } + } + } + + //----------------------------------------------------------------- + // + //----------------------------------------------------------------- + + struct DEBUG { + + static std::string getColor(size_t pipelineId) { + std::vector colors = {"tomato", "lightgreen", "lightblue", "plum1", "mistyrose2", + "seashell", "hotpink", "lemonchiffon", "firebrick1", "ivory2", + "khaki1", "lightcyan", "olive", "yellow", "maroon", + "violet", "navajowhite1"}; + return colors[pipelineId % colors.size()]; + } + + static void drawPipelines(const std::vector &ops, + const std::map &operationToPipelineIx, + const std::map &decisionIxs, std::string filename) { + std::ofstream outfile(filename); + + outfile << "digraph G {" << std::endl; + + std::map opToNodeName; + + for (size_t i = 0; i < ops.size(); ++i) { + std::string nodeName = "node" + std::to_string(i); + opToNodeName[ops.at(i)] = nodeName; + + size_t pipelineId = operationToPipelineIx.at(ops[i]); + VectorIndex vectIx = decisionIxs.at(ops.at(i)); + std::string color = VectorUtils::DEBUG::getColor(pipelineId); + + outfile << nodeName << " [label=\"" << ops.at(i)->getName().getStringRef().str() + << "\\npIx: " << pipelineId << ", vectIx: " << vectIx << "\", fillcolor=" << 
color + << ", style=filled];" << std::endl; + } + + std::unordered_set outsideOps; + + for (size_t i = 0; i < ops.size(); ++i) { + mlir::Operation *op = ops.at(i); + auto consumerPipelineIx = operationToPipelineIx.at(op); + + for (const auto &operandValue : op->getOperands()) { + mlir::Operation *operandOp = operandValue.getDefiningOp(); + auto it = operationToPipelineIx.find(operandOp); + + if (it != operationToPipelineIx.end()) { + auto producerPipeplineIx = it->second; + outfile << opToNodeName.at(operandOp) << " -> " << opToNodeName.at(op); + + if (producerPipeplineIx != consumerPipelineIx) { + outfile << " [style=dotted]"; + } + outfile << ";" << std::endl; + } else { + // also show the surrounding ops, e.g. to make horizontal fusion visible + } + } + } + outfile << "}" << std::endl; + } + + static std::string printPtr(void *ptr) { + + std::ostringstream oss; + oss << std::hex << reinterpret_cast(ptr); + + std::string str = oss.str(); + + return str.substr(str.size() - 3); + } + + static void drawPipelines(const std::vector &ops, + const std::map &operationToPipeline, + const std::map &decisionIxs, std::string filename) { + std::ofstream outfile(filename); + + outfile << "digraph G {" << std::endl; + + std::map opToNodeName; + std::map pipelineToIx; + + for (size_t i = 0; i < ops.size(); ++i) { + std::string nodeName = "node" + std::to_string(i); + opToNodeName[ops.at(i)] = nodeName; + + auto pipeline = operationToPipeline.at(ops.at(i)); + size_t pipelineIx; + if (pipelineToIx.find(pipeline) == pipelineToIx.end()) { + pipelineIx = pipelineToIx.size(); + pipelineToIx.insert({pipeline, pipelineIx}); + } else { + pipelineIx = pipelineToIx.at(pipeline); + } + std::string color = VectorUtils::DEBUG::getColor(pipelineIx); + VectorIndex vectIx = decisionIxs.at(ops.at(i)); + + std::string pipeName = printPtr(pipeline); + + outfile << nodeName << " [label=\"" << ops.at(i)->getName().getStringRef().str() + << "\\npIx: " << pipeName << ", vectIx: " << vectIx << "\", 
fillcolor=" << color + << ", style=filled];" << std::endl; + } + + std::unordered_set outsideOps; + + for (size_t i = 0; i < ops.size(); ++i) { + mlir::Operation *op = ops.at(i); + auto consumerPipelineIx = operationToPipeline.at(op); + + for (const auto &operandValue : op->getOperands()) { + mlir::Operation *operandOp = operandValue.getDefiningOp(); + auto it = operationToPipeline.find(operandOp); + + if (it != operationToPipeline.end()) { + auto producerPipeplineIx = it->second; + outfile << opToNodeName.at(operandOp) << " -> " << opToNodeName.at(op); + + if (producerPipeplineIx != consumerPipelineIx) { + outfile << " [style=dotted]"; + } + outfile << ";" << std::endl; + } else { + // also show the surrounding ops, e.g. to make horizontal fusion visible + } + } + } + outfile << "}" << std::endl; + } + + static void drawPipelineOps(std::vector &ops, std::string filename) { + std::ofstream outfile(filename); + + outfile << "digraph GGroup {" << "\n"; + outfile << "compound=true;" << "\n"; + + std::map opToNodeName; + std::map pipeOpToNodeName; + std::map operationToPipeline; + // std::map argToName; + + for (size_t i = 0; i < ops.size(); ++i) { + std::string pipeName = "pipeOp" + std::to_string(i); + pipeOpToNodeName.insert({ops.at(i), pipeName}); + + std::string color = VectorUtils::DEBUG::getColor(i); + + outfile << "subgraph cluster_" << pipeName << " {\n"; + + outfile << "label=\"S: ["; + for (const auto &x : ops.at(i).getSplits()) { + auto attr = static_cast(llvm::dyn_cast(x).getValue()); + outfile << attr << ", "; + } + outfile << "]\\n"; + + outfile << " C: ["; + for (const auto &x : ops.at(i).getCombines()) { + auto attr = static_cast(llvm::dyn_cast(x).getValue()); + outfile << attr << ", "; + } + outfile << "]\";\n"; + + outfile << "node [style=filled,color=" << color << "];\n"; + outfile << "color=" << "lightgrey" << ";\n"; + size_t j = 0; + + mlir::Block *b = &ops.at(i).getBody().getBlocks().front(); + + for (const auto &arg : b->getArguments()) { + 
std::string argName = "arg" + std::to_string(arg.getArgNumber()); + std::string qualArgName = pipeName + "_" + argName; + outfile << qualArgName << "[label=\"" << argName << "\"shape=diamond,color=grey];\n"; + // argToName.insert({arg, qualArgName}); + } + + for (auto it = b->begin(); it != b->end(); ++it) { + mlir::Operation *op = &(*it); + std::string nodeName = pipeName + "_node" + std::to_string(j); + opToNodeName.insert({op, nodeName}); + operationToPipeline.insert({op, i}); + outfile << nodeName << " [label=\"" << op->getName().getStringRef().str() << "\"];\n"; + j++; + } + outfile << pipeName << "_inv [style=invis,shape=point]" << ";\n"; + outfile << "}" << "\n"; + } + + for (size_t i = 0; i < ops.size(); ++i) { + std::string pipeName = pipeOpToNodeName.at(ops.at(i)); + + mlir::Block *b = &ops.at(i).getBody().getBlocks().front(); + for (auto it = b->begin(); it != b->end(); ++it) { + mlir::Operation *op = &(*it); + + if (llvm::isa(op)) { + outfile << opToNodeName.at(op) << " -> " << pipeName << "_inv" << ";\n"; + } + + for (const auto &operandValue : op->getOperands()) { + auto operandOp = operandValue.getDefiningOp(); + auto it = operationToPipeline.find(operandOp); + + if (it != operationToPipeline.end()) { + outfile << opToNodeName.at(operandOp) << " -> " << opToNodeName.at(op); + outfile << ";" << std::endl; + } else { + if (auto arg = llvm::dyn_cast(operandValue)) { + std::string argName = "arg" + std::to_string(arg.getArgNumber()); + std::string qualArgName = pipeName + "_" + argName; + outfile << qualArgName << " -> " << opToNodeName.at(op) << ";\n"; + } + } + } + } + } + + for (size_t i = 0; i < ops.size(); ++i) { + std::string pipeName = pipeOpToNodeName.at(ops.at(i)); + auto op = ops.at(i); + + for (size_t j = 0; j < op.getSplits().size(); ++j) { + if (auto operandOp = op.getOperand(j).getDefiningOp()) { + if (auto defOp = llvm::dyn_cast(operandOp)) { + std::string pipeName2 = pipeOpToNodeName.at(defOp); + std::string argName = pipeName + "_arg" + 
std::to_string(j); + outfile << pipeName2 << "_inv" << " -> " << argName << "[ltail=cluster_" << pipeName2 + << "];\n"; + } + } + } + } + outfile << "}" << "\n"; + } + }; +}; \ No newline at end of file diff --git a/src/compiler/lowering/vectorize/VectorizeDefs.h b/src/compiler/lowering/vectorize/VectorizeDefs.h new file mode 100644 index 000000000..165ce4687 --- /dev/null +++ b/src/compiler/lowering/vectorize/VectorizeDefs.h @@ -0,0 +1,22 @@ +/* + * Copyright 2021 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +enum VectorizationType { + DAPHNE, + GREEDY_1 +}; \ No newline at end of file diff --git a/src/ir/daphneir/DaphneOps.td b/src/ir/daphneir/DaphneOps.td index d934f8463..39ef49843 100644 --- a/src/ir/daphneir/DaphneOps.td +++ b/src/ir/daphneir/DaphneOps.td @@ -210,7 +210,8 @@ class Daphne_EwUnaryOp traits = []> : DataTypeFromFirstArg, ShapeFromArg, CastArgsToResType, - NoMemoryEffect + NoMemoryEffect, + DeclareOpInterfaceMethods ])> { let arguments = (ins AnyTypeOf<[MatrixOf<[scalarType]>, scalarType, Unknown]>:$arg); let results = (outs AnyTypeOf<[MatrixOf<[scalarType]>, scalarType, Unknown]>:$res); @@ -229,7 +230,7 @@ def Daphne_EwAbsOp : Daphne_EwUnaryOp<"ewAbs", NumScalar, [ValueTypeFromFirstArg def Daphne_EwSignOp : Daphne_EwUnaryOp<"ewSign", NumScalar, [ValueTypeFromFirstArg]>; def Daphne_EwExpOp : Daphne_EwUnaryOp<"ewExp", NumScalar, [ValueTypeFromArgsFP]>; def Daphne_EwLnOp : Daphne_EwUnaryOp<"ewLn", NumScalar, [ValueTypeFromArgsFP]>; -def Daphne_EwSqrtOp : Daphne_EwUnaryOp<"ewSqrt", NumScalar, [ValueTypeFromArgsFP, DeclareOpInterfaceMethods]>; +def Daphne_EwSqrtOp : Daphne_EwUnaryOp<"ewSqrt", NumScalar, [ValueTypeFromArgsFP]>; // ---------------------------------------------------------------------------- // Logical @@ -374,28 +375,40 @@ class Daphne_OuterBinaryOp traits = [] // Arithmetic // ---------------------------------------------------------------------------- -def Daphne_OuterAddOp : Daphne_OuterBinaryOp<"outerAdd", NumScalar, [ValueTypeFromArgs]>; -def Daphne_OuterSubOp : Daphne_OuterBinaryOp<"outerSub", NumScalar, [ValueTypeFromArgs]>; -def Daphne_OuterMulOp : Daphne_OuterBinaryOp<"outerMul", NumScalar, [ValueTypeFromArgs]>; -def Daphne_OuterDivOp : Daphne_OuterBinaryOp<"outerDiv", NumScalar, [ValueTypeFromArgs]>; -def Daphne_OuterPowOp : Daphne_OuterBinaryOp<"outerPow", NumScalar, [ValueTypeFromArgs]>; -def Daphne_OuterModOp : Daphne_OuterBinaryOp<"outerMod", NumScalar, [ValueTypeFromArgs]>; -def Daphne_OuterLogOp : 
Daphne_OuterBinaryOp<"outerLog", NumScalar, [ValueTypeFromArgsFP]>; +def Daphne_OuterAddOp : Daphne_OuterBinaryOp<"outerAdd", NumScalar, [ValueTypeFromArgs, + DeclareOpInterfaceMethods]>; +def Daphne_OuterSubOp : Daphne_OuterBinaryOp<"outerSub", NumScalar, [ValueTypeFromArgs, + DeclareOpInterfaceMethods]>; +def Daphne_OuterMulOp : Daphne_OuterBinaryOp<"outerMul", NumScalar, [ValueTypeFromArgs, + DeclareOpInterfaceMethods]>; +def Daphne_OuterDivOp : Daphne_OuterBinaryOp<"outerDiv", NumScalar, [ValueTypeFromArgs, + DeclareOpInterfaceMethods]>; +def Daphne_OuterPowOp : Daphne_OuterBinaryOp<"outerPow", NumScalar, [ValueTypeFromArgs, + DeclareOpInterfaceMethods]>; +def Daphne_OuterModOp : Daphne_OuterBinaryOp<"outerMod", NumScalar, [ValueTypeFromArgs, + DeclareOpInterfaceMethods]>; +def Daphne_OuterLogOp : Daphne_OuterBinaryOp<"outerLog", NumScalar, [ValueTypeFromArgsFP, + DeclareOpInterfaceMethods]>; // ---------------------------------------------------------------------------- // Min/max // ---------------------------------------------------------------------------- -def Daphne_OuterMinOp : Daphne_OuterBinaryOp<"outerMin", AnyScalar, [ValueTypeFromArgs]>; -def Daphne_OuterMaxOp : Daphne_OuterBinaryOp<"outerMax", AnyScalar, [ValueTypeFromArgs]>; +def Daphne_OuterMinOp : Daphne_OuterBinaryOp<"outerMin", AnyScalar, [ValueTypeFromArgs, + DeclareOpInterfaceMethods]>; +def Daphne_OuterMaxOp : Daphne_OuterBinaryOp<"outerMax", AnyScalar, [ValueTypeFromArgs, + DeclareOpInterfaceMethods]>; // ---------------------------------------------------------------------------- // Logical // ---------------------------------------------------------------------------- -def Daphne_OuterAndOp : Daphne_OuterBinaryOp<"outerAnd", NumScalar, [ValueTypeFromArgsInt]>; -def Daphne_OuterOrOp : Daphne_OuterBinaryOp<"outerOr" , NumScalar, [ValueTypeFromArgsInt]>; -def Daphne_OuterXorOp : Daphne_OuterBinaryOp<"outerXor", NumScalar, [ValueTypeFromArgsInt]>; +def Daphne_OuterAndOp : 
Daphne_OuterBinaryOp<"outerAnd", NumScalar, [ValueTypeFromArgsInt, + DeclareOpInterfaceMethods]>; +def Daphne_OuterOrOp : Daphne_OuterBinaryOp<"outerOr" , NumScalar, [ValueTypeFromArgsInt, + DeclareOpInterfaceMethods]>; +def Daphne_OuterXorOp : Daphne_OuterBinaryOp<"outerXor", NumScalar, [ValueTypeFromArgsInt, + DeclareOpInterfaceMethods]>; // ---------------------------------------------------------------------------- // Strings @@ -408,7 +421,8 @@ def Daphne_OuterConcatOp : Daphne_OuterBinaryOp<"outerConcat", StrScalar>; // ---------------------------------------------------------------------------- class Daphne_OuterCmpOp traits = []> -: Daphne_OuterBinaryOp { +: Daphne_OuterBinaryOp])> { // TODO: We do not enforce (matrix of) boolean output any more, but should // think about that again. //let results = (outs AnyTypeOf<[MatrixOf<[BoolScalar]>, BoolScalar, Unknown]>:$res); @@ -448,13 +462,13 @@ class Daphne_AggOp traits = []> : Da // ---------------------------------------------------------------------------- class Daphne_AllAggOp traits = []> -: Daphne_AggOp { - let results = (outs scalarType:$res); +: Daphne_AggOp { + let results = (outs AnyType:$res); } -def Daphne_AllAggSumOp : Daphne_AllAggOp<"sumAll", NumScalar, [ValueTypeFromFirstArg]>; -def Daphne_AllAggMinOp : Daphne_AllAggOp<"minAll", NumScalar, [ValueTypeFromFirstArg]>; -def Daphne_AllAggMaxOp : Daphne_AllAggOp<"maxAll", NumScalar, [ValueTypeFromFirstArg]>; +def Daphne_AllAggSumOp : Daphne_AllAggOp<"sumAll", NumScalar, [ValueTypeFromFirstArg, DeclareOpInterfaceMethods]>; +def Daphne_AllAggMinOp : Daphne_AllAggOp<"minAll", NumScalar, [ValueTypeFromFirstArg, DeclareOpInterfaceMethods]>; +def Daphne_AllAggMaxOp : Daphne_AllAggOp<"maxAll", NumScalar, [ValueTypeFromFirstArg, DeclareOpInterfaceMethods]>; def Daphne_AllAggMeanOp : Daphne_AllAggOp<"meanAll", NumScalar, [ValueTypeFromArgsFP]>; def Daphne_AllAggVarOp : Daphne_AllAggOp<"varAll", NumScalar, [ValueTypeFromArgsFP]>; def Daphne_AllAggStddevOp : 
Daphne_AllAggOp<"stddevAll", NumScalar, [ValueTypeFromArgsFP]>; @@ -479,7 +493,8 @@ class Daphne_ColAggOp]>; -def Daphne_RowAggMinOp : Daphne_RowAggOp<"minRow" , AnyScalar, AnyScalar, [ValueTypeFromFirstArg, CastArgsToResType, DeclareOpInterfaceMethods]>; +def Daphne_RowAggMinOp : Daphne_RowAggOp<"minRow" , AnyScalar, AnyScalar, [ValueTypeFromFirstArg, CastArgsToResType, + DeclareOpInterfaceMethods]>; def Daphne_RowAggMaxOp : Daphne_RowAggOp<"maxRow" , AnyScalar, AnyScalar, [ValueTypeFromFirstArg, CastArgsToResType, CUDASupport, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]>; def Daphne_RowAggIdxMinOp : Daphne_RowAggOp<"idxminRow", NumScalar, Size, [ValueTypeSize]>; @@ -490,8 +505,10 @@ def Daphne_RowAggStddevOp : Daphne_RowAggOp<"stddevRow", NumScalar, NumScalar, [ def Daphne_ColAggSumOp : Daphne_ColAggOp<"sumCol" , NumScalar, NumScalar, [ValueTypeFromFirstArg, CastArgsToResType, CUDASupport, DeclareOpInterfaceMethods]>; -def Daphne_ColAggMinOp : Daphne_ColAggOp<"minCol" , AnyScalar, AnyScalar, [ValueTypeFromFirstArg, CastArgsToResType]>; -def Daphne_ColAggMaxOp : Daphne_ColAggOp<"maxCol" , AnyScalar, AnyScalar, [ValueTypeFromFirstArg, CastArgsToResType]>; +def Daphne_ColAggMinOp : Daphne_ColAggOp<"minCol" , AnyScalar, AnyScalar, [ValueTypeFromFirstArg, CastArgsToResType, + DeclareOpInterfaceMethods]>; +def Daphne_ColAggMaxOp : Daphne_ColAggOp<"maxCol" , AnyScalar, AnyScalar, [ValueTypeFromFirstArg, CastArgsToResType, + DeclareOpInterfaceMethods]>; def Daphne_ColAggIdxMinOp : Daphne_ColAggOp<"idxminCol", NumScalar, Size, [ValueTypeSize]>; def Daphne_ColAggIdxMaxOp : Daphne_ColAggOp<"idxmaxCol", NumScalar, Size, [ValueTypeSize]>; def Daphne_ColAggMeanOp : Daphne_ColAggOp<"meanCol" , NumScalar, NumScalar, [ValueTypeFromArgsFP, CastArgsToResType]>; @@ -1564,7 +1581,7 @@ def Daphne_VectorizedPipelineOp : Daphne_Op<"vectorizedPipeline", [AttrSizedOper TypedArrayAttrBase:$splits, TypedArrayAttrBase:$combines, Optional:$ctx); - let results = (outs 
Variadic:$outputs); + let results = (outs Variadic:$outputs); let regions = (region SizedRegion<1>:$body, AnyRegion:$cuda); let hasCanonicalizeMethod = 1; diff --git a/src/ir/daphneir/DaphneVectorizableOpInterface.cpp b/src/ir/daphneir/DaphneVectorizableOpInterface.cpp index cf835d368..b7563facd 100644 --- a/src/ir/daphneir/DaphneVectorizableOpInterface.cpp +++ b/src/ir/daphneir/DaphneVectorizableOpInterface.cpp @@ -30,65 +30,65 @@ using namespace mlir; // **************************************************************************** // For families of operations. -template std::vector getVectorSplits_EwBinaryOp(EwBinaryOp *op) { - // Matrix -> row-wise, Scalar -> none - auto lhsSplit = op->getLhs().getType().template isa() ? daphne::VectorSplit::ROWS - : daphne::VectorSplit::NONE; - auto rhsSplit = op->getRhs().getType().template isa() ? daphne::VectorSplit::ROWS - : daphne::VectorSplit::NONE; - return {lhsSplit, rhsSplit}; +// EwBinaryOp +template std::vector> getVectorSplits_EwBinaryOp(EwBinaryOp *op) { + bool isLhsMatrix = op->getLhs().getType().template isa(); + bool isRhsMatrix = op->getRhs().getType().template isa(); + + auto lhsSplitRow = isLhsMatrix ? daphne::VectorSplit::ROWS : daphne::VectorSplit::NONE; + auto rhsSplitRow = isRhsMatrix ? 
daphne::VectorSplit::ROWS : daphne::VectorSplit::NONE; + + return {{lhsSplitRow, rhsSplitRow}}; } -template std::vector getVectorCombines_EwBinaryOp(EwBinaryOp *op) { - return {daphne::VectorCombine::ROWS}; +template +std::vector> getVectorCombines_EwBinaryOp(EwBinaryOp *op) { + return {{daphne::VectorCombine::ROWS}}; } template -std::vector> createOpsOutputSizes_EwBinaryOp(EwBinaryOp *op, OpBuilder &builder) { +std::vector>> createOpsOutputSizes_EwBinaryOp(EwBinaryOp *op, OpBuilder &builder) { auto loc = op->getLoc(); auto sizeTy = builder.getIndexType(); auto lhsRows = builder.create(loc, sizeTy, op->getLhs()); auto lhsCols = builder.create(loc, sizeTy, op->getLhs()); // TODO: do max on #rows/#cols of lhs and rhs for broadcasting - return {{lhsRows, lhsCols}}; + return {{{lhsRows, lhsCols}}}; } -template std::vector getVectorSplits_EwUnaryOp(EwUnaryOp *op) { - return {daphne::VectorSplit::ROWS}; + +// EwUnaryOp +template std::vector> getVectorSplits_EwUnaryOp(EwUnaryOp *op) { + return {{daphne::VectorSplit::ROWS}}; } -template std::vector getVectorCombines_EwUnaryOp(EwUnaryOp *op) { - return {daphne::VectorCombine::ROWS}; +template std::vector> getVectorCombines_EwUnaryOp(EwUnaryOp *op) { + return {{daphne::VectorCombine::ROWS}}; } template -std::vector> createOpsOutputSizes_EwUnaryOp(EwUnaryOp *op, OpBuilder &builder) { +std::vector>> createOpsOutputSizes_EwUnaryOp(EwUnaryOp *op, OpBuilder &builder) { auto loc = op->getLoc(); auto sizeTy = builder.getIndexType(); auto rows = builder.create(loc, sizeTy, op->getArg()); auto cols = builder.create(loc, sizeTy, op->getArg()); // TODO: do max on #rows/#cols of lhs and rhs for broadcasting - return {{rows, cols}}; + return {{{rows, cols}}, {{rows, cols}}}; } -template std::vector getVectorSplits_RowAggOp(RowAggOp *op) { - return {daphne::VectorSplit::ROWS}; -} -template std::vector getVectorCombines_RowAggOp(RowAggOp *op) { - return {daphne::VectorCombine::ROWS}; -} -template -std::vector> 
createOpsOutputSizes_RowAggOp(RowAggOp *op, OpBuilder &builder) { - auto loc = op->getLoc(); - auto sizeTy = builder.getIndexType(); - auto rows = builder.create(loc, sizeTy, op->getArg()); - auto cst1 = builder.create(loc, sizeTy, builder.getIndexAttr(1l)); - return {{rows, cst1}}; + +// OuterBinary +template +std::vector> getVectorSplits_OuterBinaryOp(OuterBinaryOp *op) { + return {{daphne::VectorSplit::ROWS, daphne::VectorSplit::NONE}}; } -template std::vector getVectorSplits_ColAggOp(ColAggOp *op) { - return {daphne::VectorSplit::ROWS}; +template +std::vector> getVectorCombines_OuterBinaryOp(OuterBinaryOp *op) { + return {{daphne::VectorCombine::ROWS}}; } -template -std::vector> createOpsOutputSizes_ColAggOp(ColAggOp *op, OpBuilder &builder) { +template +std::vector>> createOpsOutputSizes_OuterBinaryOp(OuterBinaryOp *op, + OpBuilder &builder) { auto loc = op->getLoc(); auto sizeTy = builder.getIndexType(); - auto cst1 = builder.create(loc, sizeTy, builder.getIndexAttr(1l)); - auto cols = builder.create(loc, sizeTy, op->getArg()); - return {{cst1, cols}}; + auto rows = builder.create(loc, sizeTy, op->getLhs()); + auto cols = builder.create(loc, sizeTy, op->getRhs()); + // TODO: do max on #rows/#cols of lhs and rhs for broadcasting + return {{{rows, cols}}}; } // **************************************************************************** @@ -97,16 +97,19 @@ std::vector> createOpsOutputSizes_ColAggOp(ColAggOp *op, // ---------------------------------------------------------------------------- // Matrix multiplication -std::vector daphne::MatMulOp::getVectorSplits() { - return { - daphne::VectorSplit::ROWS, // lhs - daphne::VectorSplit::NONE, // rhs - daphne::VectorSplit::NONE, // transa - daphne::VectorSplit::NONE // transb - }; -} -std::vector daphne::MatMulOp::getVectorCombines() { return {daphne::VectorCombine::ROWS}; } -std::vector> daphne::MatMulOp::createOpsOutputSizes(OpBuilder &builder) { +// 
---------------------------------------------------------------------------- +std::vector> daphne::MatMulOp::getVectorSplits() { + return {{ + daphne::VectorSplit::ROWS, // lhs + daphne::VectorSplit::NONE, // rhs + daphne::VectorSplit::NONE, // transa + daphne::VectorSplit::NONE // transb + }}; +} +std::vector> daphne::MatMulOp::getVectorCombines() { + return {{daphne::VectorCombine::ROWS}}; +} +std::vector>> daphne::MatMulOp::createOpsOutputSizes(OpBuilder &builder) { auto loc = getLoc(); auto sizeTy = builder.getIndexType(); @@ -127,16 +130,21 @@ std::vector> daphne::MatMulOp::createOpsOutputSizes(OpBu cols = tb ? builder.create(loc, sizeTy, getRhs()).getResult() : builder.create(loc, sizeTy, getRhs()).getResult(); - return {{rows, cols}}; + return {{{rows, cols}}}; } // ---------------------------------------------------------------------------- // ---------------------------------------------------------------------------- // Binary +// ---------------------------------------------------------------------------- #define IMPL_SPLIT_COMBINE_EWBINARYOP(OP) \ - std::vector daphne::OP::getVectorSplits() { return getVectorSplits_EwBinaryOp(this); } \ - std::vector daphne::OP::getVectorCombines() { return getVectorCombines_EwBinaryOp(this); } \ - std::vector> daphne::OP::createOpsOutputSizes(OpBuilder &builder) { \ + std::vector> daphne::OP::getVectorSplits() { \ + return getVectorSplits_EwBinaryOp(this); \ + } \ + std::vector> daphne::OP::getVectorCombines() { \ + return getVectorCombines_EwBinaryOp(this); \ + } \ + std::vector>> daphne::OP::createOpsOutputSizes(OpBuilder &builder) { \ return createOpsOutputSizes_EwBinaryOp(this, builder); \ } @@ -176,40 +184,163 @@ IMPL_SPLIT_COMBINE_EWBINARYOP(EwGeOp) // ---------------------------------------------------------------------------- // Unary +// ---------------------------------------------------------------------------- #define IMPL_SPLIT_COMBINE_EWUNARYOP(OP) \ - std::vector daphne::OP::getVectorSplits() { return 
getVectorSplits_EwUnaryOp(this); } \ - std::vector daphne::OP::getVectorCombines() { return getVectorCombines_EwUnaryOp(this); } \ - std::vector> daphne::OP::createOpsOutputSizes(OpBuilder &builder) { \ + std::vector> daphne::OP::getVectorSplits() { \ + return getVectorSplits_EwUnaryOp(this); \ + } \ + std::vector> daphne::OP::getVectorCombines() { \ + return getVectorCombines_EwUnaryOp(this); \ + } \ + std::vector>> daphne::OP::createOpsOutputSizes(OpBuilder &builder) { \ return createOpsOutputSizes_EwUnaryOp(this, builder); \ } +// Arithmetic/general math + +IMPL_SPLIT_COMBINE_EWUNARYOP(EwMinusOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwAbsOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwSignOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwExpOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwLnOp) IMPL_SPLIT_COMBINE_EWUNARYOP(EwSqrtOp) +// Logical +IMPL_SPLIT_COMBINE_EWUNARYOP(EwNegOp) + +// Rounding +IMPL_SPLIT_COMBINE_EWUNARYOP(EwRoundOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwFloorOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwCeilOp) + +// Trigonometric +IMPL_SPLIT_COMBINE_EWUNARYOP(EwSinOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwCosOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwTanOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwSinhOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwCoshOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwTanhOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwAsinOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwAcosOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwAtanOp) + +// Comparison +// changes value type? 
+IMPL_SPLIT_COMBINE_EWUNARYOP(EwIsNanOp) + #undef IMPL_SPLIT_COMBINE_EWUNARYOP // ---------------------------------------------------------------------------- // ---------------------------------------------------------------------------- -// Aggregations -// TODO: splitting and combining by column probably makes more sense +// Full Aggregations +// ---------------------------------------------------------------------------- + +template std::vector> getVectorSplits_AllAggOp(AllAggOp *op) { + return {{daphne::VectorSplit::ROWS}, {daphne::VectorSplit::COLS}}; +} +template std::vector> getVectorCombines_AllAggOp(AllAggOp *op) { + return {{daphne::VectorCombine::ADD}, {daphne::VectorCombine::ADD}}; +} +template +std::vector>> createOpsOutputSizes_AllAggOp(AllAggOp *op, OpBuilder &builder) { + auto loc = op->getLoc(); + auto sizeTy = builder.getIndexType(); + auto cst1 = builder.create(loc, sizeTy, builder.getIndexAttr(1l)); + return {{{cst1, cst1}}, {{cst1, cst1}}}; +} + +#define IMPL_SPLIT_COMBINE_ALLAGG(OP) \ + std::vector> daphne::OP::getVectorSplits() { \ + return getVectorSplits_AllAggOp(this); \ + } \ + std::vector> daphne::OP::getVectorCombines() { \ + return getVectorCombines_AllAggOp(this); \ + } \ + std::vector>> daphne::OP::createOpsOutputSizes(OpBuilder &builder) { \ + return createOpsOutputSizes_AllAggOp(this, builder); \ + } + +// RowAgg +IMPL_SPLIT_COMBINE_ALLAGG(AllAggSumOp) +IMPL_SPLIT_COMBINE_ALLAGG(AllAggMaxOp) +IMPL_SPLIT_COMBINE_ALLAGG(AllAggMinOp) + +#undef IMPL_SPLIT_COMBINE_ALLAGG + +// ---------------------------------------------------------------------------- +// Dimension Aggregations +// ---------------------------------------------------------------------------- + +template std::vector> getVectorSplits_RowAggOp(RowAggOp *op) { + return {{daphne::VectorSplit::ROWS}, {daphne::VectorSplit::COLS}}; +} +template +std::vector>> createOpsOutputSizes_RowAggOp(RowAggOp *op, OpBuilder &builder) { + auto loc = op->getLoc(); + auto sizeTy = 
builder.getIndexType(); + auto rows = builder.create(loc, sizeTy, op->getArg()); + auto cst1 = builder.create(loc, sizeTy, builder.getIndexAttr(1l)); + return {{{rows, cst1}}}; +} + #define IMPL_SPLIT_COMBINE_ROWAGG(OP) \ - std::vector daphne::OP::getVectorSplits() { return getVectorSplits_RowAggOp(this); } \ - std::vector daphne::OP::getVectorCombines() { return getVectorCombines_RowAggOp(this); } \ - std::vector> daphne::OP::createOpsOutputSizes(OpBuilder &builder) { \ + std::vector> daphne::OP::getVectorSplits() { \ + return getVectorSplits_RowAggOp(this); \ + } \ + std::vector>> daphne::OP::createOpsOutputSizes(OpBuilder &builder) { \ return createOpsOutputSizes_RowAggOp(this, builder); \ } -#define IMPL_SPLIT_COMBINE_COLAGG(OP) \ - std::vector daphne::OP::getVectorSplits() { return getVectorSplits_ColAggOp(this); } \ - std::vector> daphne::OP::createOpsOutputSizes(OpBuilder &builder) { \ - return createOpsOutputSizes_ColAggOp(this, builder); \ - } // RowAgg IMPL_SPLIT_COMBINE_ROWAGG(RowAggMinOp) IMPL_SPLIT_COMBINE_ROWAGG(RowAggMaxOp) IMPL_SPLIT_COMBINE_ROWAGG(RowAggSumOp) +std::vector> daphne::RowAggSumOp::getVectorCombines() { + return {{daphne::VectorCombine::ROWS}}; +} +std::vector> daphne::RowAggMinOp::getVectorCombines() { + return {{daphne::VectorCombine::ROWS}}; +} +std::vector> daphne::RowAggMaxOp::getVectorCombines() { + return {{daphne::VectorCombine::ROWS}}; +} + +//----- + +template std::vector> getVectorSplits_ColAggOp(ColAggOp *op) { + return {{daphne::VectorSplit::ROWS}}; +} +template +std::vector>> createOpsOutputSizes_ColAggOp(ColAggOp *op, OpBuilder &builder) { + auto loc = op->getLoc(); + auto sizeTy = builder.getIndexType(); + auto cst1 = builder.create(loc, sizeTy, builder.getIndexAttr(1l)); + auto cols = builder.create(loc, sizeTy, op->getArg()); + return {{{cst1, cols}}}; +} + +#define IMPL_SPLIT_COMBINE_COLAGG(OP) \ + std::vector> daphne::OP::getVectorSplits() { \ + return getVectorSplits_ColAggOp(this); \ + } \ + std::vector>> 
daphne::OP::createOpsOutputSizes(OpBuilder &builder) { \ + return createOpsOutputSizes_ColAggOp(this, builder); \ + } + +IMPL_SPLIT_COMBINE_COLAGG(ColAggMinOp) +IMPL_SPLIT_COMBINE_COLAGG(ColAggMaxOp) IMPL_SPLIT_COMBINE_COLAGG(ColAggSumOp) -std::vector daphne::ColAggSumOp::getVectorCombines() { return {daphne::VectorCombine::ADD}; } + +std::vector> daphne::ColAggSumOp::getVectorCombines() { + return {{daphne::VectorCombine::ADD}}; +} +std::vector> daphne::ColAggMinOp::getVectorCombines() { + return {{daphne::VectorCombine::MIN}}; +} +std::vector> daphne::ColAggMaxOp::getVectorCombines() { + return {{daphne::VectorCombine::MAX}}; +} #undef IMPL_SPLIT_COMBINE_ROWAGG #undef IMPL_SPLIT_COMBINE_COLAGG @@ -217,72 +348,137 @@ std::vector daphne::ColAggSumOp::getVectorCombines() { re // ---------------------------------------------------------------------------- // Left and right indexing -std::vector daphne::ExtractColOp::getVectorSplits() { - return {daphne::VectorSplit::ROWS, daphne::VectorSplit::NONE}; +// ---------------------------------------------------------------------------- +std::vector> daphne::ExtractColOp::getVectorSplits() { + return {{daphne::VectorSplit::ROWS, daphne::VectorSplit::NONE}}; } -std::vector daphne::ExtractColOp::getVectorCombines() { return {daphne::VectorCombine::ROWS}; } -std::vector> daphne::ExtractColOp::createOpsOutputSizes(OpBuilder &builder) { +std::vector> daphne::ExtractColOp::getVectorCombines() { + return {{daphne::VectorCombine::ROWS}}; +} +std::vector>> daphne::ExtractColOp::createOpsOutputSizes(OpBuilder &builder) { auto loc = getLoc(); auto sizeTy = builder.getIndexType(); auto rows = builder.create(loc, sizeTy, getSource()); // TODO: support scalar and maybe (based on definition of `ExtractColOp`) // apply some kind of `unique()` op auto cols = builder.create(loc, sizeTy, getSelectedCols()); - return {{rows, cols}}; + return {{{rows, cols}}}; } // ---------------------------------------------------------------------------- // 
---------------------------------------------------------------------------- // Reorganization -std::vector daphne::TransposeOp::getVectorSplits() { return {daphne::VectorSplit::ROWS}; } -std::vector daphne::TransposeOp::getVectorCombines() { return {daphne::VectorCombine::COLS}; } -std::vector> daphne::TransposeOp::createOpsOutputSizes(OpBuilder &builder) { +// ---------------------------------------------------------------------------- +std::vector> daphne::TransposeOp::getVectorSplits() { + return {{daphne::VectorSplit::ROWS}}; +} +std::vector> daphne::TransposeOp::getVectorCombines() { + return {{daphne::VectorCombine::COLS}}; +} +std::vector>> daphne::TransposeOp::createOpsOutputSizes(OpBuilder &builder) { auto loc = getLoc(); auto sizeTy = builder.getIndexType(); auto rows = builder.create(loc, sizeTy, getArg()); auto cols = builder.create(loc, sizeTy, getArg()); - return {{cols, rows}}; + return {{{cols, rows}}}; } -std::vector daphne::ColBindOp::getVectorSplits() { - return {daphne::VectorSplit::ROWS, daphne::VectorSplit::ROWS}; +std::vector> daphne::ColBindOp::getVectorSplits() { + return {{daphne::VectorSplit::ROWS, daphne::VectorSplit::ROWS}}; +} +std::vector> daphne::ColBindOp::getVectorCombines() { + return {{daphne::VectorCombine::ROWS}}; } -std::vector daphne::ColBindOp::getVectorCombines() { return {daphne::VectorCombine::ROWS}; } -std::vector> daphne::ColBindOp::createOpsOutputSizes(OpBuilder &builder) { +std::vector>> daphne::ColBindOp::createOpsOutputSizes(OpBuilder &builder) { auto loc = getLoc(); auto i64Ty = builder.getIntegerType(64, true); auto sizeTy = builder.getIndexType(); auto rows = builder.create(loc, sizeTy, getLhs()); auto colsLhs = builder.create(loc, sizeTy, getLhs()); auto colsRhs = builder.create(loc, sizeTy, getRhs()); - return {{rows, builder.create( - loc, sizeTy, - builder.create(loc, builder.create(loc, i64Ty, colsLhs), - builder.create(loc, i64Ty, colsRhs)))}}; + return {{{rows, builder.create( + loc, sizeTy, + 
builder.create(loc, builder.create(loc, i64Ty, colsLhs), + builder.create(loc, i64Ty, colsRhs)))}}}; } // ---------------------------------------------------------------------------- +// ---------------------------------------------------------------------------- +// Outer binary (generalized outer product) +// ---------------------------------------------------------------------------- +#define IMPL_SPLIT_COMBINE_OUTERBINARY(OP) \ + std::vector> daphne::OP::getVectorSplits() { \ + return getVectorSplits_OuterBinaryOp(this); \ + } \ + std::vector> daphne::OP::getVectorCombines() { \ + return getVectorCombines_OuterBinaryOp(this); \ + } \ + std::vector>> daphne::OP::createOpsOutputSizes(OpBuilder &builder) { \ + return createOpsOutputSizes_OuterBinaryOp(this, builder); \ + } + +// Arithmetic + +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterAddOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterSubOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterMulOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterDivOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterPowOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterModOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterLogOp) + +// Min/max + +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterMinOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterMaxOp) + +// Logical + +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterAndOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterOrOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterXorOp) + +// Comparisons + +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterEqOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterNeqOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterLtOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterLeOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterGtOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterGeOp) + +#undef IMPL_SPLIT_COMBINE_OUTERBINARY + +// ---------------------------------------------------------------------------- + // ---------------------------------------------------------------------------- // Other -std::vector daphne::SyrkOp::getVectorSplits() { return {daphne::VectorSplit::ROWS}; } -std::vector daphne::SyrkOp::getVectorCombines() { 
return {daphne::VectorCombine::ADD}; } -std::vector> daphne::SyrkOp::createOpsOutputSizes(OpBuilder &builder) { +// ---------------------------------------------------------------------------- +std::vector> daphne::SyrkOp::getVectorSplits() { + return {{daphne::VectorSplit::ROWS}}; +} +std::vector> daphne::SyrkOp::getVectorCombines() { + return {{daphne::VectorCombine::ADD}}; +} +std::vector>> daphne::SyrkOp::createOpsOutputSizes(OpBuilder &builder) { auto loc = getLoc(); auto sizeTy = builder.getIndexType(); auto cols = builder.create(loc, sizeTy, getArg()); // TODO: do max on #rows/#cols of lhs and rhs for broadcasting - return {{cols, cols}}; + return {{{cols, cols}}}; } -std::vector daphne::GemvOp::getVectorSplits() { - return {daphne::VectorSplit::ROWS, daphne::VectorSplit::ROWS}; +std::vector> daphne::GemvOp::getVectorSplits() { + return {{daphne::VectorSplit::ROWS, daphne::VectorSplit::ROWS}}; +} +std::vector> daphne::GemvOp::getVectorCombines() { + return {{daphne::VectorCombine::ADD}}; } -std::vector daphne::GemvOp::getVectorCombines() { return {daphne::VectorCombine::ADD}; } -std::vector> daphne::GemvOp::createOpsOutputSizes(OpBuilder &builder) { +std::vector>> daphne::GemvOp::createOpsOutputSizes(OpBuilder &builder) { auto loc = getLoc(); auto sizeTy = builder.getIndexType(); auto cols = builder.create(loc, sizeTy, getMat()); auto one = builder.create(loc, builder.getIndexType(), builder.getIndexAttr(1)); - return {{cols, one}}; + return {{{cols, one}}}; } -// ---------------------------------------------------------------------------- +// ---------------------------------------------------------------------------- \ No newline at end of file diff --git a/src/ir/daphneir/DaphneVectorizableOpInterface.td b/src/ir/daphneir/DaphneVectorizableOpInterface.td index 747eb19b0..baae04dcd 100644 --- a/src/ir/daphneir/DaphneVectorizableOpInterface.td +++ b/src/ir/daphneir/DaphneVectorizableOpInterface.td @@ -22,16 +22,19 @@ include "mlir/IR/OpBase.td" def 
VECTOR_SPLIT_NONE : I64EnumAttrCase<"NONE", 0>; def VECTOR_SPLIT_ROWS : I64EnumAttrCase<"ROWS", 1>; +def VECTOR_SPLIT_COLS : I64EnumAttrCase<"COLS", 2>; -def VectorSplitAttr : I64EnumAttr<"VectorSplit", "", [VECTOR_SPLIT_NONE, VECTOR_SPLIT_ROWS]> { +def VectorSplitAttr : I64EnumAttr<"VectorSplit", "", [VECTOR_SPLIT_NONE, VECTOR_SPLIT_ROWS, VECTOR_SPLIT_COLS]> { let cppNamespace = "::mlir::daphne"; } def VECTOR_COMBINE_ROWS : I64EnumAttrCase<"ROWS", 1>; def VECTOR_COMBINE_COLS : I64EnumAttrCase<"COLS", 2>; def VECTOR_COMBINE_ADD : I64EnumAttrCase<"ADD", 3>; +def VECTOR_COMBINE_MAX : I64EnumAttrCase<"MAX", 4>; +def VECTOR_COMBINE_MIN : I64EnumAttrCase<"MIN", 5>; -def VectorCombineAttr : I64EnumAttr<"VectorCombine", "", [VECTOR_COMBINE_ROWS, VECTOR_COMBINE_COLS, VECTOR_COMBINE_ADD]> { +def VectorCombineAttr : I64EnumAttr<"VectorCombine", "", [VECTOR_COMBINE_ROWS, VECTOR_COMBINE_COLS, VECTOR_COMBINE_ADD, VECTOR_COMBINE_MAX, VECTOR_COMBINE_MIN]> { let cppNamespace = "::mlir::daphne"; } @@ -42,11 +45,11 @@ def VectorizableOpInterface : OpInterface<"Vectorizable"> { let methods = [ InterfaceMethod<"Get the vector split kind for each input.", - "std::vector", "getVectorSplits", (ins)>, + "std::vector>", "getVectorSplits", (ins)>, InterfaceMethod<"Get the vector combine kind for each output.", - "std::vector", "getVectorCombines", (ins)>, + "std::vector>", "getVectorCombines", (ins)>, InterfaceMethod<"Create values for #rows and #cols of each output. -1 for dynamic/unknown.", - "std::vector>", "createOpsOutputSizes", (ins "mlir::OpBuilder&":$builder)>, + "std::vector>>", "createOpsOutputSizes", (ins "mlir::OpBuilder&":$builder)>, // TODO: for complex operations (non element-wise) where the computation per vector is not equal to the operation // itself on the whole input, we will require a new method generating the operations in the pipeline. This is // the same behaviour as with `Distributable` Ops, and therefore combining them might make sense. 
diff --git a/src/ir/daphneir/Passes.h b/src/ir/daphneir/Passes.h index 3dbb67e4f..7911e3b0b 100644 --- a/src/ir/daphneir/Passes.h +++ b/src/ir/daphneir/Passes.h @@ -67,7 +67,12 @@ std::unique_ptr createRewriteToCallKernelOpPass(const DaphneUserConfig &cf std::unordered_map &usedLibPaths); std::unique_ptr createSelectMatrixRepresentationsPass(const DaphneUserConfig &cfg); std::unique_ptr createSpecializeGenericFunctionsPass(const DaphneUserConfig &cfg); -std::unique_ptr createVectorizeComputationsPass(); + +std::unique_ptr createDaphneVectorizeComputationsPass(); +std::unique_ptr createGreedy1VectorizeComputationsPass(const DaphneUserConfig& cfg); +std::unique_ptr createHorizontalFusionPass(); +std::unique_ptr createDrawPipelineOpsPass(const std::string filename); + #ifdef USE_CUDA std::unique_ptr createMarkCUDAOpsPass(const DaphneUserConfig &cfg); #endif diff --git a/src/parser/daphnedsl/DaphneDSLBuiltins.cpp b/src/parser/daphnedsl/DaphneDSLBuiltins.cpp index afe498a20..5c9a0456e 100644 --- a/src/parser/daphnedsl/DaphneDSLBuiltins.cpp +++ b/src/parser/daphnedsl/DaphneDSLBuiltins.cpp @@ -1248,6 +1248,17 @@ antlrcpp::Any DaphneDSLBuiltins::build(mlir::Location loc, const std::string &fu builder.create(loc, source.getType(), source, attr.dyn_cast())); } + // **************************************************************************** + // Profiling + // **************************************************************************** + + if (func == "startProfiling") { + return builder.create(loc); + } + if (func == "stopProfiling") { + return builder.create(loc); + } + // **************************************************************************** // List operations // **************************************************************************** diff --git a/src/runtime/local/vectorized/MTWrapper.h b/src/runtime/local/vectorized/MTWrapper.h index a802f2d87..236f7853f 100644 --- a/src/runtime/local/vectorized/MTWrapper.h +++ b/src/runtime/local/vectorized/MTWrapper.h @@ 
-68,6 +68,9 @@ template class MTWrapperBase { if (splits[i] == mlir::daphne::VectorSplit::ROWS) { len = std::max(len, inputs[i]->getNumRows()); mem_required += inputs[i]->getNumItems() * sizeof(typename DT::VT); + } else if (splits[i] == mlir::daphne::VectorSplit::COLS) { + len = std::max(len, inputs[i]->getNumCols()); + mem_required += inputs[i]->getNumItems() * sizeof(typename DT::VT); } } return std::make_pair(len, mem_required); diff --git a/src/runtime/local/vectorized/MTWrapper_dense.cpp b/src/runtime/local/vectorized/MTWrapper_dense.cpp index ffed73054..08382fc60 100644 --- a/src/runtime/local/vectorized/MTWrapper_dense.cpp +++ b/src/runtime/local/vectorized/MTWrapper_dense.cpp @@ -36,7 +36,13 @@ template std::unique_ptr q = std::make_unique(len); std::vector tmp_q{q.get()}; - auto batchSize8M = std::max(100ul, static_cast(std::ceil(8388608 / row_mem))); + + auto batchSize8M = ctx->config.batchSize; + if (batchSize8M == 0) { + batchSize8M = std::max(100ul, static_cast(std::ceil(8388608 / row_mem))); + } + //llvm::outs() << "si: " << batchSize8M << "\n"; + this->initCPPWorkers(tmp_q, batchSize8M, verbose, 1, 0, false); #ifdef USE_CUDA @@ -109,7 +115,13 @@ template } } - auto batchSize8M = std::max(100ul, static_cast(std::ceil(8388608 / row_mem))); + + auto batchSize8M = ctx->config.batchSize; + if (batchSize8M == 0) { + batchSize8M = std::max(100ul, static_cast(std::ceil(8388608 / row_mem))); + } + //llvm::outs() << "mu: " << batchSize8M << "\n"; + this->initCPPWorkers(qvector, batchSize8M, verbose, this->_numQueues, this->_queueMode, ctx->getUserConfig().pinWorkers); diff --git a/src/runtime/local/vectorized/Tasks.cpp b/src/runtime/local/vectorized/Tasks.cpp index 8291f2623..caebb3464 100644 --- a/src/runtime/local/vectorized/Tasks.cpp +++ b/src/runtime/local/vectorized/Tasks.cpp @@ -15,7 +15,18 @@ */ #include "runtime/local/vectorized/Tasks.h" +#include "ir/daphneir/Daphne.h" +#include "runtime/local/datastructures/DenseMatrix.h" +#include 
"runtime/local/kernels/BinaryOpCode.h" #include "runtime/local/kernels/EwBinaryMat.h" +#include +#include +#include +#include + +#ifdef USE_PAPI +#include +#endif template void CompiledPipelineTask>::execute(uint32_t fid, uint32_t batchSize) { // local add aggregation to minimize locking @@ -24,15 +35,15 @@ template void CompiledPipelineTask>::execute(uint3 std::vector **> outputs; for (auto &lres : localResults) outputs.push_back(&lres); - for (uint64_t r = _data._rl; r < _data._ru; r += batchSize) { + for (uint64_t d = _data._dl; d < _data._du; d += batchSize) { // create zero-copy views of inputs/outputs - uint64_t r2 = std::min(r + batchSize, _data._ru); + uint64_t d2 = std::min(d + batchSize, _data._du); - auto linputs = this->createFuncInputs(r, r2); + auto linputs = this->createFuncInputs(d, d2); // execute function on given data binding (batch size) _data._funcs[fid](outputs.data(), linputs.data(), _data._ctx); - accumulateOutputs(localResults, localAddRes, r, r2); + accumulateOutputs(localResults, localAddRes, d, d2); // cleanup for (auto &localResult : localResults) @@ -47,75 +58,119 @@ template void CompiledPipelineTask>::execute(uint3 } for (size_t o = 0; o < _data._numOutputs; ++o) { - if (_data._combines[o] == VectorCombine::ADD) { - auto &result = (*_res[o]); - _resLock.lock(); - if (result == nullptr) { - result = localAddRes[o]; - _resLock.unlock(); - } else { - ewBinaryMat(BinaryOpCode::ADD, result, result, localAddRes[o], _data._ctx); - _resLock.unlock(); - // cleanup - DataObjectFactory::destroy(localAddRes[o]); + + if (_data._combines[o] == VectorCombine::ROWS || _data._combines[o] == VectorCombine::COLS) + continue; + + auto &result = (*_res[o]); + _resLock.lock(); + if (result == nullptr) { + result = localAddRes[o]; + _resLock.unlock(); + } else { + switch (_data._combines[o]) { + case VectorCombine::ADD: + ewBinaryMat(BinaryOpCode::ADD, result, result, localAddRes[o], _data._ctx); + break; + case VectorCombine::MIN: + 
ewBinaryMat(BinaryOpCode::MIN, result, result, localAddRes[o], _data._ctx); + break; + case VectorCombine::MAX: + ewBinaryMat(BinaryOpCode::MAX, result, result, localAddRes[o], _data._ctx); + break; + default: + throw std::runtime_error("not implemented"); + break; } + _resLock.unlock(); + // cleanup + DataObjectFactory::destroy(localAddRes[o]); } } } -template uint64_t CompiledPipelineTask>::getTaskSize() { return _data._ru - _data._rl; } +template uint64_t CompiledPipelineTask>::getTaskSize() { return _data._du - _data._dl; } template void CompiledPipelineTask>::accumulateOutputs(std::vector *> &localResults, std::vector *> &localAddRes, - uint64_t rowStart, uint64_t rowEnd) { + uint64_t dimStart, uint64_t dimEnd) { // TODO: in-place computation via better compiled pipelines // TODO: multi-return for (auto o = 0u; o < _data._numOutputs; ++o) { auto &result = (*_res[o]); switch (_data._combines[o]) { - case VectorCombine::ROWS: { - auto slice = result->sliceRow(rowStart - _data._offset, rowEnd - _data._offset); - // TODO It's probably more efficient to memcpy than to get/set. - // But eventually, we don't want to copy at all. 
- for (auto i = 0u; i < slice->getNumRows(); ++i) { - for (auto j = 0u; j < slice->getNumCols(); ++j) { - slice->set(i, j, localResults[o]->get(i, j)); + case VectorCombine::ROWS: { + auto slice = result->sliceRow(dimStart - _data._offset, dimEnd - _data._offset); + + //PAPI_hl_region_begin("fixme_rows"); + VT *sliceValues = slice->getValues(); + VT *localResultsValues = localResults[o]->getValues(); + for (auto i = 0u; i < slice->getNumRows(); ++i) { + for (auto j = 0u; j < slice->getNumCols(); ++j) { + sliceValues[i * slice->getRowSkip() + j] = + localResultsValues[i * localResults[o]->getRowSkip() + j]; + } } + //PAPI_hl_region_end("fixme_rows"); + + DataObjectFactory::destroy(slice); + break; } - DataObjectFactory::destroy(slice); - break; - } - case VectorCombine::COLS: { - auto slice = result->sliceCol(rowStart - _data._offset, rowEnd - _data._offset); - // TODO It's probably more efficient to memcpy than to get/set. - // But eventually, we don't want to copy at all. - for (auto i = 0u; i < slice->getNumRows(); ++i) { - for (auto j = 0u; j < slice->getNumCols(); ++j) { - slice->set(i, j, localResults[o]->get(i, j)); + case VectorCombine::COLS: { + + auto slice = result->sliceCol(dimStart - _data._offset, dimEnd - _data._offset); + + //PAPI_hl_region_begin("fixme_cols"); + VT *sliceValues = slice->getValues(); + VT *localResultsValues = localResults[o]->getValues(); + for (auto i = 0u; i < slice->getNumRows(); ++i) { + for (auto j = 0u; j < slice->getNumCols(); ++j) { + sliceValues[i * slice->getRowSkip() + j] = + localResultsValues[i * localResults[o]->getRowSkip() + j]; + } } + //PAPI_hl_region_end("fixme_cols"); + + DataObjectFactory::destroy(slice); + break; } - DataObjectFactory::destroy(slice); - break; - } - case VectorCombine::ADD: { - if (localAddRes[o] == nullptr) { - // take lres and reset it to nullptr - localAddRes[o] = localResults[o]; - localResults[o] = nullptr; - } else { - ewBinaryMat(BinaryOpCode::ADD, localAddRes[o], localAddRes[o], 
localResults[o], nullptr); + case VectorCombine::ADD: { + accumulateAggregate(localAddRes[o], localResults[0], BinaryOpCode::ADD); + break; + } + case VectorCombine::MAX: { + accumulateAggregate(localAddRes[o], localResults[0], BinaryOpCode::MAX); + break; + } + case VectorCombine::MIN: { + accumulateAggregate(localAddRes[o], localResults[0], BinaryOpCode::MIN); + break; + } + default: { + throw std::runtime_error(("VectorCombine case `" + + std::to_string(static_cast(_data._combines[o])) + "` not supported")); } - break; - } - default: { - throw std::runtime_error(("VectorCombine case `" + - std::to_string(static_cast(_data._combines[o])) + "` not supported")); - } } } } +template +void CompiledPipelineTask>::accumulateAggregate(DenseMatrix*& localAddRes, + DenseMatrix*& localResult, + BinaryOpCode opCode) { + if (localAddRes == nullptr) { + // take lres and reset it to nullptr + localAddRes = localResult; + localResult = nullptr; + } else { + ewBinaryMat(opCode, localAddRes, localAddRes, localResult, nullptr); + } +} + + +//----------------------------------------------------------------------------- + template void CompiledPipelineTask>::execute(uint32_t fid, uint32_t batchSize) { std::vector localResNumRows(_data._numOutputs); std::vector localResNumCols(_data._numOutputs); @@ -125,7 +180,7 @@ template void CompiledPipelineTask>::execute(uint32_ if (_data._wholeResultCols[i] == -1) throw std::runtime_error("TODO: CompiledPipeLineTask (CSRMatrix) Rows " "_data._wholeResultCols[i] == -1"); - localResNumRows[i] = _data._ru - _data._rl; + localResNumRows[i] = _data._du - _data._dl; localResNumCols[i] = _data._wholeResultCols[i]; break; } @@ -134,7 +189,7 @@ template void CompiledPipelineTask>::execute(uint32_ throw std::runtime_error("TODO: CompiledPipeLineTask (CSRMatrix) Cols " "_data._wholeResultRows[i] == -1"); localResNumRows[i] = _data._wholeResultRows[i]; - localResNumCols[i] = _data._ru - _data._rl; + localResNumCols[i] = _data._du - _data._dl; break; } 
default: @@ -148,11 +203,11 @@ template void CompiledPipelineTask>::execute(uint32_ new VectorizedDataSink>(_data._combines[i], localResNumRows[i], localResNumCols[i]); std::vector *> lres(_data._numOutputs, nullptr); - for (uint64_t r = _data._rl; r < _data._ru; r += batchSize) { + for (uint64_t d = _data._dl; d < _data._du; d += batchSize) { // create zero-copy views of inputs/outputs - uint64_t r2 = std::min(r + batchSize, _data._ru); + uint64_t d2 = std::min(d + batchSize, _data._du); - auto linputs = this->createFuncInputs(r, r2); + auto linputs = this->createFuncInputs(d, d2); CSRMatrix ***outputs = new CSRMatrix **[_data._numOutputs]; for (size_t i = 0; i < _data._numOutputs; i++) outputs[i] = &(lres[i]); @@ -160,7 +215,7 @@ template void CompiledPipelineTask>::execute(uint32_ _data._funcs[fid](outputs, linputs.data(), _data._ctx); delete[] outputs; for (size_t i = 0; i < _data._numOutputs; i++) - localSinks[i]->add(lres[i], r - _data._rl, false); + localSinks[i]->add(lres[i], d - _data._dl, false); // cleanup for (size_t i = 0; i < _data._numOutputs; i++) @@ -171,12 +226,12 @@ template void CompiledPipelineTask>::execute(uint32_ // here. 
} for (size_t i = 0; i < _data._numOutputs; i++) { - _resultSinks[i]->add(localSinks[i]->consume(), _data._rl); + _resultSinks[i]->add(localSinks[i]->consume(), _data._dl); delete localSinks[i]; } } -template uint64_t CompiledPipelineTask>::getTaskSize() { return _data._ru - _data._rl; } +template uint64_t CompiledPipelineTask>::getTaskSize() { return _data._du - _data._dl; } template class CompiledPipelineTask>; template class CompiledPipelineTask>; diff --git a/src/runtime/local/vectorized/Tasks.h b/src/runtime/local/vectorized/Tasks.h index e7b4f783f..83f80fe3c 100644 --- a/src/runtime/local/vectorized/Tasks.h +++ b/src/runtime/local/vectorized/Tasks.h @@ -57,17 +57,17 @@ template struct CompiledPipelineTaskData { const int64_t *_outCols; const VectorSplit *_splits; const VectorCombine *_combines; - const uint64_t _rl; // row lower index - const uint64_t _ru; // row upper index + const uint64_t _dl; // dim lower index + const uint64_t _du; // dim upper index const int64_t *_wholeResultRows; // number of rows of the complete result const int64_t *_wholeResultCols; // number of cols of the complete result const uint64_t _offset; DCTX(_ctx); - [[maybe_unused]] CompiledPipelineTaskData
withDifferentRange(uint64_t newRl, uint64_t newRu) { + [[maybe_unused]] CompiledPipelineTaskData
withDifferentRange(uint64_t newDl, uint64_t newDu) { CompiledPipelineTaskData
flatCopy = *this; - flatCopy._rl = newRl; - flatCopy._ru = newRu; + flatCopy._dl = newDl; + flatCopy._du = newDu; return flatCopy; } }; @@ -83,10 +83,12 @@ template class CompiledPipelineTaskBase : public Task { protected: bool isBroadcast(mlir::daphne::VectorSplit splitMethod, Structure *input) { - return splitMethod == VectorSplit::NONE || (splitMethod == VectorSplit::ROWS && input->getNumRows() == 1); + return splitMethod == VectorSplit::NONE || + (splitMethod == VectorSplit::ROWS && input->getNumRows() == 1) || + (splitMethod == VectorSplit::COLS && input->getNumCols() == 1); } - std::vector createFuncInputs(uint64_t rowStart, uint64_t rowEnd) { + std::vector createFuncInputs(uint64_t dimStart, uint64_t dimEnd) { std::vector linputs; for (auto i = 0u; i < _data._numInputs; i++) { if (isBroadcast(_data._splits[i], _data._inputs[i])) { @@ -101,8 +103,12 @@ template class CompiledPipelineTaskBase : public Task { // alternative. _data._inputs[i]->increaseRefCounter(); } else if (VectorSplit::ROWS == _data._splits[i]) { - linputs.push_back(_data._inputs[i]->sliceRow(rowStart, rowEnd)); - } else { + linputs.push_back(_data._inputs[i]->sliceRow(dimStart, dimEnd)); + } + else if(VectorSplit::COLS == _data._splits[i]) { + linputs.push_back(_data._inputs[i]->sliceCol(dimStart, dimEnd)); + } + else { llvm_unreachable("Not all vector splits handled"); } } @@ -126,7 +132,9 @@ template class CompiledPipelineTask> : public Comp private: void accumulateOutputs(std::vector *> &localResults, std::vector *> &localAddRes, - uint64_t rowStart, uint64_t rowEnd); + uint64_t dimStart, uint64_t dimEnd); + void accumulateAggregate(DenseMatrix*& localAddRes, DenseMatrix* &localResult, + BinaryOpCode opCode); }; template class CompiledPipelineTask> : public CompiledPipelineTaskBase> {