From 927d89c8ec956149aba9ae74e4f1853756748223 Mon Sep 17 00:00:00 2001 From: niklas Date: Sat, 19 Oct 2024 15:07:53 +0000 Subject: [PATCH] init --- _chain.daph | 15 + run_horz.py | 133 +++ shared.py | 110 +++ sketch/bench/abs_t_exp.daph | 11 + sketch/bench/kmeans.daphne | 29 + sketch/bench/outerAdd_exp.daph | 12 + sketch/bench/outerAdd_sumCol_exp.daph | 19 + sketch/bench/outerAdd_t.daph | 12 + sketch/bench/outerAdd_t_exp.daph | 18 + sketch/bench/sqrt_sum.daph | 9 + sketch/bench/transpose_sum.daph | 9 + src/api/cli/DaphneUserConfig.h | 8 + src/api/internal/daphne_internal.cpp | 34 +- src/compiler/execution/DaphneIrExecutor.cpp | 24 +- src/compiler/lowering/CMakeLists.txt | 6 +- .../DaphneVectorizeComputationsPass.cpp} | 31 +- .../Greedy1VectorizeComputationsPass.cpp | 261 ++++++ .../vectorize/HorizontalFusionPass.cpp | 251 +++++ src/compiler/lowering/vectorize/VectorUtils.h | 886 ++++++++++++++++++ .../lowering/vectorize/VectorizeDefs.h | 22 + src/ir/daphneir/DaphneOps.td | 65 +- .../DaphneVectorizableOpInterface.cpp | 380 ++++++-- .../daphneir/DaphneVectorizableOpInterface.td | 13 +- src/ir/daphneir/Passes.h | 7 +- src/parser/daphnedsl/DaphneDSLBuiltins.cpp | 11 + src/runtime/local/vectorized/MTWrapper.h | 3 + .../local/vectorized/MTWrapper_dense.cpp | 16 +- src/runtime/local/vectorized/Tasks.cpp | 171 ++-- src/runtime/local/vectorized/Tasks.h | 28 +- 29 files changed, 2384 insertions(+), 210 deletions(-) create mode 100644 _chain.daph create mode 100644 run_horz.py create mode 100644 shared.py create mode 100644 sketch/bench/abs_t_exp.daph create mode 100644 sketch/bench/kmeans.daphne create mode 100644 sketch/bench/outerAdd_exp.daph create mode 100644 sketch/bench/outerAdd_sumCol_exp.daph create mode 100644 sketch/bench/outerAdd_t.daph create mode 100644 sketch/bench/outerAdd_t_exp.daph create mode 100644 sketch/bench/sqrt_sum.daph create mode 100644 sketch/bench/transpose_sum.daph rename src/compiler/lowering/{VectorizeComputationsPass.cpp => 
vectorize/DaphneVectorizeComputationsPass.cpp} (94%) create mode 100644 src/compiler/lowering/vectorize/Greedy1VectorizeComputationsPass.cpp create mode 100644 src/compiler/lowering/vectorize/HorizontalFusionPass.cpp create mode 100644 src/compiler/lowering/vectorize/VectorUtils.h create mode 100644 src/compiler/lowering/vectorize/VectorizeDefs.h diff --git a/_chain.daph b/_chain.daph new file mode 100644 index 000000000..72b8a29ee --- /dev/null +++ b/_chain.daph @@ -0,0 +1,15 @@ +X = fill(1.0, 30000, 30000); +startProfiling(); +v0 = t(X); +v1 = t(v0); +v2 = t(v1); +v3 = t(v2); +v4 = t(v3); +v5 = t(v4); +v6 = t(v5); +v7 = t(v6); +v8 = t(v7); +v9 = t(v8); +stopProfiling(); +print(v9[0,0]); + diff --git a/run_horz.py b/run_horz.py new file mode 100644 index 000000000..1e0faca88 --- /dev/null +++ b/run_horz.py @@ -0,0 +1,133 @@ +import sys +import numpy as np +import json +import datetime +import argparse +from tabulate import tabulate +import pandas as pd +import shared as sh + +#------------------------------------------------------------------------------ +# GLOBAL +#------------------------------------------------------------------------------ + +GENERATE_FUNCS = { + "ADD": lambda i, arg: [f"v{i} = {arg} + {i * 0.1};"], + "ADD_SUM": lambda i, arg: [f"i{i} = {arg} + {i * 0.1};", f"v{i} = sum(i{i});"] +} + +GENERATE_PRINT_FUNCS = { + "ADD": lambda i: [f"print(v{i}[0,0]);"], + "ADD_SUM": lambda i: [f"print(v{i});"] +} + +BASE_CWD = "./" +GLOBAL_ARGS = [] +BASE_COMMAND = lambda th, bs, no_hf: [ + "./run-daphne.sh", + "--timing", + "--vec", + "--vec-type=GREEDY_1", + f"--num-threads={th}", + f"--batchSize={bs}", +] + (["--no-hf"] if no_hf else []) + GLOBAL_ARGS + ["./_horz.daph"] + +#------------------------------------------------------------------------------ +# HELPER +#------------------------------------------------------------------------------ + +def generate_script(num_ops, tool, func, rows, cols): + + script = [] + + script.append(f"X = fill(1.0, {rows}, 
{cols});") + script.append(sh.TOOLS[tool]["START_OP"]) + + for j in range(0, num_ops): + script += GENERATE_FUNCS[func](j, "X") + script.append(sh.TOOLS[tool]["STOP_OP"]) + + for j in range(0, num_ops): + script += GENERATE_PRINT_FUNCS[func](j) + + script.append(sh.TOOLS[tool]["END_OP"]) + + return script + +#------------------------------------------------------------------------------ +# ARGS +#------------------------------------------------------------------------------ + +parser = argparse.ArgumentParser(description="Arguments") +parser.add_argument("--tool", type=str, choices=sh.TOOLS.keys(), help="", required=True) +parser.add_argument("--script", type=str, choices=GENERATE_FUNCS.keys(), help="", required=True) +parser.add_argument("--rows", type=int, default=10000, help="rows") +parser.add_argument("--cols", type=int, default=10000, help="rows") +parser.add_argument("--samples", type=int, default=3, help="") +parser.add_argument("--num-ops", type=int, default=12, help="") +parser.add_argument("--threads", type=int, default=1, help="") +parser.add_argument("--batchSize", type=int, default=0, help="") +parser.add_argument("--verbose-output", action="store_true") +parser.add_argument("--explain", action="store_true") + +#------------------------------------------------------------------------------ +# MAIN +#------------------------------------------------------------------------------ + +if __name__ == "__main__": + + args = parser.parse_args() + exp_start = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S") + + if args.explain: + GLOBAL_ARGS += ["--explain=vectorized"] + + output = [] + for no_hf in [False, True]: + + cmd = BASE_COMMAND(args.threads, args.batchSize, no_hf) + + command_output = {} + for ops in range(args.num_ops, args.num_ops+1): + + script = generate_script(ops, args.tool, args.script, args.rows, args.cols) + with open("_horz.daph", "w") as f: + for line in script: + f.write(line + '\n') + + timings = sh.runner(args, cmd, BASE_CWD) + + 
#command_output[ops] = timings + command_output = timings + + print() + + output.append({ + "cmd": cmd, + "timings": command_output, + + }) + + with open(exp_start + "-horz_timings.json", "w+") as f: + _output = { + "settings": { + "num-ops": args.num_ops, + "rows": args.rows, + "cols": args.cols, + "type": args.script, + "tool": args.tool, + "threads": args.threads, + "samples": args.samples, + "batchSize": args.batchSize + }, + "execs": output + } + json.dump(_output, f, indent=4) + f.close() + + for i in output: + print(" ".join(i["cmd"])) + df = pd.json_normalize(i["timings"], sep=".") + tools_cols = [col for col in df.columns if col.startswith("tool")] + df[tools_cols] = df[tools_cols].astype(int) + print(tabulate(df.describe(), headers="keys", tablefmt="psql", showindex=True)) \ No newline at end of file diff --git a/shared.py b/shared.py new file mode 100644 index 000000000..eb12fdcd5 --- /dev/null +++ b/shared.py @@ -0,0 +1,110 @@ +import os +import subprocess +import json +import pandas as pd +from tabulate import tabulate + +#------------------------------------------------------------------------------ +# RUN COMMAND +#------------------------------------------------------------------------------ + +def run_command(cmd, cwd, env): + + process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=cwd, env={**env, **os.environ}) + stdout, stderr = process.communicate() + + return stdout.decode(), stderr.decode() + +def runner(args, cmd, cwd): + + tool_env = TOOLS[args.tool]["ENV"] + env_str = " ".join(f"{k}=\"{v}\"" for k, v in tool_env.items()) + cmd_str = " ".join(cmd) + print(f"Run: {env_str} {cmd_str} {cwd}") + + timings = [] + for i in range(0, args.samples): + + stdout, stderr = run_command(cmd, cwd, tool_env) + + if args.verbose_output: + print(stdout) + print(stderr) + + timing = json.loads(stderr.split("\n")[-2]) + timing["tool"] = TOOLS[args.tool]["GET_INFO"](stdout) + + df = pd.json_normalize(timing, sep=".") + 
print(tabulate(df, headers="keys", tablefmt="psql", showindex=False)) + timings.append(timing) + + return timings + +#------------------------------------------------------------------------------ +# TOOLS +#------------------------------------------------------------------------------ + +def extract_f1xm3(stdout): + lines = stdout.split('\n') + + for line in reversed(lines): + if "F1XM3" in line: + number = line.split("F1XM3:")[1] + return int(number) + return None + +def extract_papi(stdout): + lines = stdout.split('\n') + + offset = 0 + for i, line in enumerate(lines): + if line.startswith("PAPI-HL Output:"): + offset = i + break + t = "".join(lines[offset+1:]) + j = json.loads(t) + out = j["threads"]["0"]["regions"]["0"] + del out["name"] + del out["parent_region_id"] + return out + +TOOLS = { + "PAPI_STD": { + "ENV": { + "PAPI_EVENTS": "perf::CYCLES,perf::INSTRUCTIONS,perf::CACHE-REFERENCES,perf::CACHE-MISSES,perf::BRANCHES,perf::BRANCH-MISSES", + "PAPI_REPORT": "1" + }, + "START_OP": "startProfiling();", + "STOP_OP": "stopProfiling();", + "END_OP": "", + "GET_INFO": extract_papi + }, + "PAPI_L1": { + "ENV": { + "PAPI_EVENTS": "perf::L1-dcache-load-misses,perf::L1-dcache-loads,perf::L1-dcache-prefetches,perf::L1-icache-load-misses,perf::L1-icache-loads", + "PAPI_REPORT": "1", + }, + "START_OP": "startProfiling();", + "STOP_OP": "stopProfiling();", + "END_OP": "", + "GET_INFO": extract_papi + }, + "PAPI_MPLX": { + "ENV": { + "PAPI_EVENTS": "perf::CYCLES,perf::INSTRUCTIONS,perf::CACHE-REFERENCES,perf::CACHE-MISSES,perf::BRANCHES,perf::BRANCH-MISSES,perf::L1-dcache-load-misses,perf::L1-dcache-loads,perf::L1-dcache-prefetches,perf::L1-icache-load-misses,perf::L1-icache-loads", + "PAPI_REPORT": "1", + "PAPI_MULTIPLEX": "1", + }, + "START_OP": "startProfiling();", + "STOP_OP": "stopProfiling();", + "END_OP": "", + "GET_INFO": extract_papi + }, + "NOW": { + "ENV": {}, + "START_OP": "start = now();", + "STOP_OP": "end = now();", + "END_OP": "print(\"F1XM3:\"+ (end - 
start));", + "GET_INFO": extract_f1xm3 + } +} \ No newline at end of file diff --git a/sketch/bench/abs_t_exp.daph b/sketch/bench/abs_t_exp.daph new file mode 100644 index 000000000..0358198fa --- /dev/null +++ b/sketch/bench/abs_t_exp.daph @@ -0,0 +1,11 @@ +X = rand($r, $c, 0.0, 1.0, 1, 12345); + + +i1 = abs(X); +i2 = t(i1); +i3 = exp(i2); +i4 = i3 + 2; + + +print(i4[0,0]); + \ No newline at end of file diff --git a/sketch/bench/kmeans.daphne b/sketch/bench/kmeans.daphne new file mode 100644 index 000000000..d722d21b4 --- /dev/null +++ b/sketch/bench/kmeans.daphne @@ -0,0 +1,29 @@ +// K-means clustering. + +// Arguments: +// - r ... number of records +// - c ... number of centroids +// - f ... number of features +// - i ... number of iterations + +// Data generation. +X = rand($r, $f, 0.0, 1.0, 1, 12345); +C = rand($c, $f, 0.0, 1.0, 1, 67890); + +// K-means clustering (decisive part). + +for(i in 1:$i) { + D = (X @ t(C)) * -2 + t(sum(C ^ 2, 0)); + minD = aggMin(D, 0); + P = D <= minD; + P = P / sum(P, 0); + P_denom = sum(P, 1); + C = (t(P) @ X) / t(P_denom); +} + + +// Result output. 
+print(C[0,0]); +print(C[1,1]); +print(C[2,2]); + \ No newline at end of file diff --git a/sketch/bench/outerAdd_exp.daph b/sketch/bench/outerAdd_exp.daph new file mode 100644 index 000000000..d52b39cca --- /dev/null +++ b/sketch/bench/outerAdd_exp.daph @@ -0,0 +1,12 @@ +X = rand($r, 1, 0.0, 1.0, 1, 12345); +Y = rand(1, $c, 0.0, 1.0, 1, 67890); + + +i1 = outerAdd(X,Y); +i2 = exp(i1); +i3 = i2 + 2; + + + +print(i3[0,0]); + \ No newline at end of file diff --git a/sketch/bench/outerAdd_sumCol_exp.daph b/sketch/bench/outerAdd_sumCol_exp.daph new file mode 100644 index 000000000..a3ae07afb --- /dev/null +++ b/sketch/bench/outerAdd_sumCol_exp.daph @@ -0,0 +1,19 @@ + +//r=40000, c=40000 +//NoVec: Killed +//GR1: Killed +//GR2: ~33s +X = rand($r, 1, 0.0, 1.0, 1, 12345); +Y = rand(1, $c, 0.0, 1.0, 1, 67890); + + +i1 = outerAdd(X,Y); +i2 = sum(i1, 1); +i3 = outerAdd(X,i2); +i4 = sqrt(i3); +i5 = i4 + 2; + + + +print(i5[0,0]); + \ No newline at end of file diff --git a/sketch/bench/outerAdd_t.daph b/sketch/bench/outerAdd_t.daph new file mode 100644 index 000000000..fd45b4ac9 --- /dev/null +++ b/sketch/bench/outerAdd_t.daph @@ -0,0 +1,12 @@ +X = rand($r, 1, 0.0, 1.0, 1, 12345); +Y = rand(1, $c, 0.0, 1.0, 1, 67890); + + +i1 = outerAdd(X,Y); +i2 = t(i1); +i3 = i2 + 2; + + + +print(i3[0,0]); + \ No newline at end of file diff --git a/sketch/bench/outerAdd_t_exp.daph b/sketch/bench/outerAdd_t_exp.daph new file mode 100644 index 000000000..052abe92e --- /dev/null +++ b/sketch/bench/outerAdd_t_exp.daph @@ -0,0 +1,18 @@ + +//r=40000, c=40000 +//NoVec: Killed +//GR1: Killed +//GR2: ~33s +X = rand($r, 1, 0.0, 1.0, 1, 12345); +Y = rand(1, $c, 0.0, 1.0, 1, 67890); + + +i1 = outerAdd(X,Y); +i2 = t(i1); +i3 = exp(i2); +i4 = i3 + 2; + + + +print(i4[0,0]); + \ No newline at end of file diff --git a/sketch/bench/sqrt_sum.daph b/sketch/bench/sqrt_sum.daph new file mode 100644 index 000000000..0a5d95e69 --- /dev/null +++ b/sketch/bench/sqrt_sum.daph @@ -0,0 +1,9 @@ +X = rand($r, $c, 0.0, 1.0, 1, 
12345); + + +i1 = sqrt(X); +i2 = sum(i1); + + +print(i2); + \ No newline at end of file diff --git a/sketch/bench/transpose_sum.daph b/sketch/bench/transpose_sum.daph new file mode 100644 index 000000000..d087eb29d --- /dev/null +++ b/sketch/bench/transpose_sum.daph @@ -0,0 +1,9 @@ +X = rand($r, $c, 0.0, 1.0, 1, 12345); + + +t = t(X); +s = sum(t); + + +print(s); + \ No newline at end of file diff --git a/src/api/cli/DaphneUserConfig.h b/src/api/cli/DaphneUserConfig.h index d33a18e30..e2ee07ea8 100644 --- a/src/api/cli/DaphneUserConfig.h +++ b/src/api/cli/DaphneUserConfig.h @@ -18,6 +18,9 @@ #include #include +#include +#include +#include #include #include #include @@ -38,6 +41,11 @@ struct DaphneUserConfig { // Remember to update UserConfig.json accordingly! bool use_cuda = false; bool use_vectorized_exec = false; + + bool no_horizontal_fusion = false; + VectorizationType vectorizationType = GREEDY_1; + size_t batchSize = 0; + bool use_distributed = false; bool use_obj_ref_mgnt = true; bool use_ipa_const_propa = true; diff --git a/src/api/internal/daphne_internal.cpp b/src/api/internal/daphne_internal.cpp index 5b533601b..384c991b7 100644 --- a/src/api/internal/daphne_internal.cpp +++ b/src/api/internal/daphne_internal.cpp @@ -24,7 +24,10 @@ #include #include #include -#include +#include +#include "compiler/execution/DaphneIrExecutor.h" +#include "compiler/lowering/vectorize/VectorizeDefs.h" +#include #include #include #include @@ -303,7 +306,27 @@ int startDAPHNE(int argc, const char **argv, DaphneLibResult *daphneLibRes, int clEnumVal(llvm, "Show DaphneIR after llvm lowering"), clEnumVal(mlir_codegen, "Show DaphneIR after MLIR codegen")), CommaSeparated); - + + static opt vectorizeTypeList( + "vec-type", cat(daphneOptions), + llvm::cl::desc("Apply specific Vectorization pass"), + llvm::cl::values( + clEnumVal(DAPHNE, "Use original DAPHNE Vectorization pass"), + clEnumVal(GREEDY_1, "Use first Greedy Vectorization pass")), + init(GREEDY_1) + ); + + static opt 
batchSize( + "batchSize", cat(daphneOptions), + desc( + "batchSize" + ) + ); + + static opt noHorizontalFusion( + "no-hf", cat(daphneOptions), + desc("No horizontal fusion")); + static llvm::cl::list scriptArgs1("args", cat(daphneOptions), desc("Alternative way of specifying arguments to the DaphneDSL " "script; must be a comma-separated list of name-value-pairs, " @@ -367,6 +390,9 @@ int startDAPHNE(int argc, const char **argv, DaphneLibResult *daphneLibRes, int logger = std::make_unique(user_config); user_config.use_vectorized_exec = useVectorizedPipelines; + user_config.vectorizationType = vectorizeTypeList; + user_config.batchSize = batchSize; + user_config.use_distributed = useDistributedRuntime; user_config.use_obj_ref_mgnt = !noObjRefMgnt; user_config.use_ipa_const_propa = !noIPAConstPropa; @@ -514,6 +540,10 @@ int startDAPHNE(int argc, const char **argv, DaphneLibResult *daphneLibRes, int user_config.use_fpgaopencl = true; } + if (noHorizontalFusion) { + user_config.no_horizontal_fusion = true; + } + if (enableProfiling) { #ifndef USE_PAPI throw std::runtime_error("you are trying to use profiling, but daphne " diff --git a/src/compiler/execution/DaphneIrExecutor.cpp b/src/compiler/execution/DaphneIrExecutor.cpp index 67fdac21d..e1e00ddc7 100644 --- a/src/compiler/execution/DaphneIrExecutor.cpp +++ b/src/compiler/execution/DaphneIrExecutor.cpp @@ -25,6 +25,7 @@ #include +#include "compiler/lowering/vectorize/VectorizeDefs.h" #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" #include "mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h" #include "mlir/Conversion/MathToLLVM/MathToLLVM.h" @@ -140,8 +141,27 @@ bool DaphneIrExecutor::runPasses(mlir::ModuleOp module) { if (userConfig_.use_vectorized_exec || userConfig_.use_distributed) { // TODO: add inference here if we have rewrites that could apply to // vectorized pipelines due to smaller sizes - pm.addNestedPass(mlir::daphne::createVectorizeComputationsPass()); + switch (userConfig_.vectorizationType) { 
+ + case DAPHNE: + pm.addNestedPass( + mlir::daphne::createDaphneVectorizeComputationsPass()); + break; + case GREEDY_1: + pm.addNestedPass( + mlir::daphne::createGreedy1VectorizeComputationsPass(userConfig_)); + break; + default: + pm.addNestedPass( + mlir::daphne::createGreedy1VectorizeComputationsPass(userConfig_)); + break; + } pm.addPass(mlir::createCanonicalizerPass()); + if (!userConfig_.no_horizontal_fusion) { + pm.addNestedPass + (mlir::daphne::createHorizontalFusionPass()); + pm.addPass(mlir::createCanonicalizerPass()); + } } if (userConfig_.explain_vectorized) pm.addPass(mlir::daphne::createPrintIRPass("IR after vectorization:")); @@ -193,7 +213,7 @@ bool DaphneIrExecutor::runPasses(mlir::ModuleOp module) { // Initialize the use of each distinct kernels library to false. usedLibPaths = userConfig_.kernelCatalog.getLibPaths(); - + try { if (failed(pm.run(module))) { module->dump(); diff --git a/src/compiler/lowering/CMakeLists.txt b/src/compiler/lowering/CMakeLists.txt index af7f0eb88..c69f2368f 100644 --- a/src/compiler/lowering/CMakeLists.txt +++ b/src/compiler/lowering/CMakeLists.txt @@ -25,7 +25,11 @@ add_mlir_dialect_library(MLIRDaphneTransforms PhyOperatorSelectionPass.cpp RewriteToCallKernelOpPass.cpp SpecializeGenericFunctionsPass.cpp - VectorizeComputationsPass.cpp + + vectorize/DaphneVectorizeComputationsPass.cpp + vectorize/Greedy1VectorizeComputationsPass.cpp + vectorize/HorizontalFusionPass.cpp + DaphneOptPass.cpp EwOpsLowering.cpp ModOpLowering.cpp diff --git a/src/compiler/lowering/VectorizeComputationsPass.cpp b/src/compiler/lowering/vectorize/DaphneVectorizeComputationsPass.cpp similarity index 94% rename from src/compiler/lowering/VectorizeComputationsPass.cpp rename to src/compiler/lowering/vectorize/DaphneVectorizeComputationsPass.cpp index 985c6442e..18d9c4075 100644 --- a/src/compiler/lowering/VectorizeComputationsPass.cpp +++ b/src/compiler/lowering/vectorize/DaphneVectorizeComputationsPass.cpp @@ -19,12 +19,11 @@ #include 
"ir/daphneir/Passes.h" #include -#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Transforms/DialectConversion.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/raw_ostream.h" -#include #include -#include using namespace mlir; @@ -179,12 +178,15 @@ void movePipelineInterleavedOperations(Block::iterator pipelinePosition, } } -struct VectorizeComputationsPass : public PassWrapper> { +struct DaphneVectorizeComputationsPass : public PassWrapper> { void runOnOperation() final; }; } // namespace -void VectorizeComputationsPass::runOnOperation() { +void DaphneVectorizeComputationsPass::runOnOperation() { + + llvm::outs() << "DaphneVectorizeComputationsPass" << "\n"; + auto func = getOperation(); // TODO: fuse pipelines that have the matching inputs, even if no output of // the one pipeline is used by the other. @@ -194,13 +196,15 @@ void VectorizeComputationsPass::runOnOperation() { // Find vectorizable operations and their inputs of vectorizable operations std::vector vectOps; func->walk([&](daphne::Vectorizable op) { - if (CompilerUtils::isMatrixComputation(op)) + if (CompilerUtils::isMatrixComputation(op) && !llvm::isa(op)) vectOps.emplace_back(op); }); std::vector vectorizables(vectOps.begin(), vectOps.end()); std::multimap possibleMerges; for (auto v : vectorizables) { - for (auto e : llvm::zip(v->getOperands(), v.getVectorSplits())) { + auto splits = v.getVectorSplits()[0]; + for (auto e : llvm::zip(v->getOperands(), splits)) { + auto operand = std::get<0>(e); auto defOp = operand.getDefiningOp(); if (defOp && v->getBlock() == defOp->getBlock() && CompilerUtils::isMatrixComputation(defOp)) { @@ -232,7 +236,7 @@ void VectorizeComputationsPass::runOnOperation() { auto split = std::get<1>(e); // find the corresponding `OpResult` to figure out combine auto opResult = *llvm::find(defOp->getResults(), operand); - auto combine = defOp.getVectorCombines()[opResult.getResultNumber()]; + auto combine = defOp.getVectorCombines()[0][opResult.getResultNumber()]; 
if (split == daphne::VectorSplit::ROWS) { if (combine == daphne::VectorCombine::ROWS) @@ -300,8 +304,9 @@ void VectorizeComputationsPass::runOnOperation() { movePipelineInterleavedOperations(builder.getInsertionPoint(), pipeline); for (auto vIt = pipeline.rbegin(); vIt != pipeline.rend(); ++vIt) { auto v = *vIt; - auto vSplits = v.getVectorSplits(); - auto vCombines = v.getVectorCombines(); + auto vSplits = v.getVectorSplits()[0]; + auto vCombines = v.getVectorCombines()[0]; + auto vOutSizes = v.createOpsOutputSizes(builder)[0]; // TODO: although we do create enum attributes, it might make // sense/make it easier to // just directly use an I64ArrayAttribute @@ -319,7 +324,7 @@ void VectorizeComputationsPass::runOnOperation() { for (auto result : v->getResults()) { results.push_back(result); } - for (auto outSize : v.createOpsOutputSizes(builder)) { + for (auto outSize : vOutSizes) { outRows.push_back(outSize.first); outCols.push_back(outSize.second); } @@ -404,6 +409,6 @@ void VectorizeComputationsPass::runOnOperation() { } } -std::unique_ptr daphne::createVectorizeComputationsPass() { - return std::make_unique(); +std::unique_ptr daphne::createDaphneVectorizeComputationsPass() { + return std::make_unique(); } diff --git a/src/compiler/lowering/vectorize/Greedy1VectorizeComputationsPass.cpp b/src/compiler/lowering/vectorize/Greedy1VectorizeComputationsPass.cpp new file mode 100644 index 000000000..84941f8a0 --- /dev/null +++ b/src/compiler/lowering/vectorize/Greedy1VectorizeComputationsPass.cpp @@ -0,0 +1,261 @@ +/* + * Copyright 2021 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "api/cli/DaphneUserConfig.h" +#include "compiler/lowering/vectorize/VectorUtils.h" +#include "ir/daphneir/Daphne.h" +#include "ir/daphneir/DaphneVectorizableOpInterface.h" +#include "ir/daphneir/Passes.h" + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Value.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "llvm/Support/Casting.h" +#include "llvm/Support/raw_ostream.h" +#include + +#include +#include + +using namespace mlir; + +namespace { + +//----------------------------------------------------------------- +// CONST +//----------------------------------------------------------------- + + +//----------------------------------------------------------------- +// Class functions +//----------------------------------------------------------------- + +struct Greedy1VectorizeComputationsPass + : public PassWrapper> { + void runOnOperation() final; + + const DaphneUserConfig& userConfig; + + explicit Greedy1VectorizeComputationsPass(const DaphneUserConfig& cfg) : userConfig(cfg) {} +}; + +void printStack(std::stack> s) { + llvm::outs() << "["; + while (!s.empty()) { + auto op = s.top(); + llvm::outs() << "(" << std::get<0>(op)->getName().getStringRef().str() << ", " << std::get<1>(op) << "), "; + s.pop(); + } + llvm::outs() << "]\n"; +} + +void printGraph(std::vector leafOps, std::string filename) { + 
std::stack stack; + std::ofstream dot(filename); + if (!dot.is_open()) { + throw std::runtime_error("test"); + } + + dot << "digraph G {\n"; + for (auto leaf : leafOps) { + stack.push(leaf); + } + + std::vector visited; + + while (!stack.empty()) { + auto op = stack.top(); + stack.pop(); + if (std::find(visited.begin(), visited.end(), op) != visited.end()) { + continue; + } + visited.push_back(op); + + auto v = llvm::dyn_cast(op); + for (unsigned i = 0; i < v->getNumOperands(); ++i) { + mlir::Value e = v->getOperand(i); + auto defOp = e.getDefiningOp(); + if (llvm::isa(e.getType()) && llvm::isa(defOp)) { + dot << "\"" << defOp->getName().getStringRef().str() << "+" << std::hex + << reinterpret_cast(defOp) << "\" -> \"" << op->getName().getStringRef().str() << "+" + << std::hex << reinterpret_cast(op) << "\" [label=\"" << i << "\"];\n"; + stack.push(defOp); + } + } + } + dot << "}"; + dot.close(); +} +} // namespace + +void Greedy1VectorizeComputationsPass::runOnOperation() { + + auto func = getOperation(); + + VectorIndex ZeroDecision = 0; + /*if (userConfig.colFirst) { + ZeroDecision = 1; + }*/ + + std::vector ops; + func->walk([&](daphne::Vectorizable op) { + for (auto opType : op->getOperandTypes()) { + if (!opType.isIntOrIndexOrFloat() && !llvm::isa(opType)) { + ops.emplace_back(op); + break; + } + } + }); + std::reverse(ops.begin(), ops.end()); + + // result + std::vector pipelines; + std::vector leafOps; + std::stack> stack; + + for (const auto &op : ops) { + auto users = op->getUsers(); + bool found = false; + for (auto u : users) { + if (std::find(ops.begin(), ops.end(), u) != ops.end()) { + found = true; + break; + } + } + if (!found) { + leafOps.push_back(op); + stack.push({op, nullptr, DisconnectReason::INVALID}); + } + } + + std::multimap mmProducerConsumerRelationships; + std::map operationToPipeline; + + // std::vector boundingOperations; + + while (!stack.empty()) { + auto t = stack.top(); + stack.pop(); + auto op = std::get<0>(t); + auto 
currPipeline = std::get<1>(t); + auto disReason = std::get<2>(t); + + if (operationToPipeline.find(op) != operationToPipeline.end()) { + auto producerPipeline = operationToPipeline.at(op); + mmProducerConsumerRelationships.insert({{currPipeline, producerPipeline}, disReason}); + continue; + } + + if (disReason != DisconnectReason::NONE) { + auto _pipeline = new Pipeline(); + pipelines.push_back(_pipeline); + + // check needed for empty init + if (currPipeline != nullptr) + mmProducerConsumerRelationships.insert({{currPipeline, _pipeline}, disReason}); + + currPipeline = _pipeline; + } + + operationToPipeline.insert({op, currPipeline}); + currPipeline->push_back(op); + + auto vectOp = llvm::dyn_cast(op); + + for (size_t i = 0; i < vectOp->getNumOperands(); ++i) { + auto operand = vectOp->getOperand(i); + + // llvm::outs() << op->getName().getStringRef().str() << " "; + + if (!llvm::isa(operand.getType())) + continue; + + if (llvm::isa(operand)) { + continue; + } + + // could it help to check if we check if operand.getDefiningOp is inside (global) ops vector? + if (auto vectDefOp = llvm::dyn_cast(operand.getDefiningOp())) { + // llvm::outs() << vectDefOp->getName().getStringRef().str() << "\n"; + + auto split = vectOp.getVectorSplits()[ZeroDecision][i]; + auto combine = vectDefOp.getVectorCombines()[ZeroDecision][0]; + + // same block missing + if (VectorUtils::matchingVectorSplitCombine(split, combine) && + vectDefOp->getBlock() == vectOp->getBlock()) { + if (vectDefOp->hasOneUse()) { + stack.push({vectDefOp, currPipeline, DisconnectReason::NONE}); + } else { + stack.push({vectDefOp, currPipeline, DisconnectReason::MULTIPLE_CONSUMERS}); + } + } else { + stack.push({vectDefOp, currPipeline, DisconnectReason::INVALID}); + } + } else { + // defOp is outside of consideration, top horz. 
fusion possible + // boundingOperations.push_back(op); + // llvm::outs() << "\n"; + } + } + } + + // Needed as Greedy1 is only considering the first possiblity + std::map decisionIxs; + for (const auto &op : ops) { + decisionIxs.insert({op, ZeroDecision}); + } + + // mmPCR to PCR + std::map producerConsumerRelationships = + VectorUtils::consolidateProducerConsumerRelationship(mmProducerConsumerRelationships); + + VectorUtils::greedyMergePipelinesProducerConsumer(pipelines, operationToPipeline, producerConsumerRelationships); + + // VectorUtils::DEBUG::printPipelines(pipelines); + + // Post Processing + + std::vector _pipelines; + _pipelines.resize(pipelines.size()); + + std::transform(pipelines.begin(), pipelines.end(), _pipelines.begin(), [](const auto &ptr) { return *ptr; }); + + // will crash if for some reason the pipelines itself are not topologically sorted + VectorUtils::createVectorizedPipelineOps(func, _pipelines, decisionIxs); + + return; + +} + +std::unique_ptr daphne::createGreedy1VectorizeComputationsPass(const DaphneUserConfig& cfg) { + return std::make_unique(cfg); +} \ No newline at end of file diff --git a/src/compiler/lowering/vectorize/HorizontalFusionPass.cpp b/src/compiler/lowering/vectorize/HorizontalFusionPass.cpp new file mode 100644 index 000000000..984b2cc9c --- /dev/null +++ b/src/compiler/lowering/vectorize/HorizontalFusionPass.cpp @@ -0,0 +1,251 @@ +/* + * Copyright 2021 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compiler/lowering/vectorize/VectorUtils.h" +#include "ir/daphneir/Daphne.h" +#include "ir/daphneir/Passes.h" + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include +#include +#include +#include +#include +#include +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Value.h" +#include "mlir/Transforms/DialectConversion.h" + +#include +#include +#include + +#include +#include "llvm/Support/Casting.h" + +#include +#include + +using namespace mlir; + +namespace +{ + + //----------------------------------------------------------------- + // Class + //----------------------------------------------------------------- + + struct HorizontalFusionPass : public PassWrapper> { + void runOnOperation() final; + }; + + //----------------------------------------------------------------- + // Helper function + //----------------------------------------------------------------- + + void moveOperationToBlock(mlir::Builder &builder, mlir::Block *src, mlir::Block *dest, std::vector &newResults) { + + // Iterate over all operations in src block and move them to dest block. + // Rewrite block arguments of operations to the dest block arguments + // and store values for the results for overriding of the old values. 
+ while(!src->empty()) { + auto op = src->begin(); + + for(size_t i = 0; i < op->getNumOperands(); ++i) { + auto operand = op->getOperand(i); + if (llvm::isa(operand)) { + auto blockArgument = dest->addArgument(operand.getType(), builder.getUnknownLoc()); + op->setOperand(i, blockArgument); + } + } + if (!llvm::isa(op)) { + op->moveBefore(dest, dest->end()); + } + else { + newResults.insert(newResults.end(), op->operand_begin(), op->operand_end()); + op->erase(); + return; + } + } + } + +} + +//----------------------------------------------------------------- +// Horizontal Fusion / Sibling Fusion (Scan-sharing of inputs) +//----------------------------------------------------------------- +// +// Two operations share a single operand from the same producer. +// +// producer +// / \ +// consumer1 consumer2 +// +// => (consumer1, consumer2) +// +// consumer1 and consumer2 cannot have a producer-consumer relationship directly or transitively, +// as if a merge of these operations where possible it would happen in Greedy1/Greedy2. + +void HorizontalFusionPass::runOnOperation() +{ + auto func = getOperation(); + + // After merging of pipelines, we need to rerun the pass + // to check for additional (changed) fusion possiblities. + bool changed = true; + while(changed) { + changed = false; + + std::vector pipelineOps; + func->walk([&](daphne::VectorizedPipelineOp op) { + pipelineOps.emplace_back(op); + }); + std::reverse(pipelineOps.begin(), pipelineOps.end()); + + //----------------------------------------------------------------- + // Identify horizontal fusion possibilities + //----------------------------------------------------------------- + + // Check for overlapping/intersection of operands between pipeline arguments. + // They need to be compatible according to the corresponding split of an argument. 
+ std::vector horizontalRelationships; + for (auto it1 = pipelineOps.begin(); it1 != pipelineOps.end(); ++it1) { + auto pipeOp1 = *it1; + + // Store defOps for the corresponding arguments of pipeOp1. + llvm::SmallVector defOpsArgs; + // Running over the split size for consideration of relevant args (excl. OutCols, OutRows). + for(size_t operandIx1 = 0; operandIx1 < pipeOp1.getSplits().size(); ++operandIx1) { + auto operand1 = pipeOp1->getOperand(operandIx1); + if (auto defOp = operand1.getDefiningOp()) { + defOpsArgs.push_back(defOp); + } + } + + for (auto it2 = next(it1); it2 != pipelineOps.end(); ++it2) { + auto pipeOp2 = *it2; + + // PipelineOps need to be in the same block. + if (pipeOp1->getBlock() != pipeOp2->getBlock()) + continue; + + // PipelineOps cannot (transitively) depend on each other. + if (VectorUtils::arePipelineOpsDependent(pipeOp1, pipeOp2)) + continue; + + // Checking for overlapping arguments. + for(size_t operandIx2 = 0; operandIx2 < pipeOp2.getSplits().size(); ++operandIx2) { + auto operand2 = pipeOp2->getOperand(operandIx2); + + if (auto defOp = operand2.getDefiningOp()) { + + // Check if defOp is also in the defOps for the pipeOp1 arguments. + auto fIt = std::find(defOpsArgs.begin(), defOpsArgs.end(), defOp); + if (fIt != defOpsArgs.end()) { + + size_t operandIx1 = std::distance(defOpsArgs.begin(), fIt); + + if (pipeOp1.getSplits()[operandIx1] == pipeOp2.getSplits()[operandIx2] && + pipeOp1.getSplits()[operandIx1].cast().getValue() != daphne::VectorSplit::NONE) { + horizontalRelationships.push_back({pipeOp1, pipeOp2}); + break; // We only need one case of arguments matching. 
+ } + } + } + } + } + } + + //----------------------------------------------------------------- + // Merge VectorizedPipelineOps + //----------------------------------------------------------------- + + for(auto pipeOpPair : horizontalRelationships) { + + auto [pipeOp1, pipeOp2] = pipeOpPair; + + mlir::Block* b1 = &pipeOp1.getBody().getBlocks().front(); + mlir::Block* b2 = &pipeOp2.getBody().getBlocks().front(); + + // Merge attributes and values + auto vSplitAttrs = std::vector(pipeOp1.getSplits().begin(), pipeOp1.getSplits().end()); + vSplitAttrs.insert(vSplitAttrs.end(), pipeOp2.getSplits().begin(), pipeOp2.getSplits().end()); + + auto vCombineAttrs = std::vector(pipeOp1.getCombines().begin(), pipeOp1.getCombines().end()); + vCombineAttrs.insert(vCombineAttrs.end(), pipeOp2.getCombines().begin(), pipeOp2.getCombines().end()); + + auto oldResults = std::vector(pipeOp1->getResults().begin(), pipeOp1->getResults().end()); + oldResults.insert(oldResults.end(), pipeOp2->getResults().begin(), pipeOp2->getResults().end()); + + auto operands = std::vector(pipeOp1->getOperands().begin(), pipeOp1->getOperands().begin() + pipeOp1.getSplits().size()); + operands.insert(operands.end(), pipeOp2->getOperands().begin(), pipeOp2->getOperands().begin() + pipeOp2.getSplits().size()); + + auto outRows = std::vector(pipeOp1.getOutRows().begin(), pipeOp1.getOutRows().end()); + outRows.insert(outRows.end(), pipeOp2.getOutRows().begin(), pipeOp2.getOutRows().end()); + + auto outCols = std::vector(pipeOp1.getOutCols().begin(), pipeOp1.getOutCols().end()); + outCols.insert(outCols.end(), pipeOp2.getOutCols().begin(), pipeOp2.getOutCols().end()); + + // Create new PipelineOp + mlir::OpBuilder builder(func); + auto loc = builder.getFusedLoc({pipeOp1.getLoc(), pipeOp2->getLoc()}); + auto pipelineOp = builder.create(loc, + mlir::ValueRange(oldResults).getTypes(), + operands, + outRows, + outCols, + builder.getArrayAttr(vSplitAttrs), + builder.getArrayAttr(vCombineAttrs), + nullptr); + 
mlir::Block *bodyBlock = builder.createBlock(&pipelineOp.getBody()); + + //Move operations to new PipelineOp block. + auto newResults = std::vector(); + moveOperationToBlock(builder, b1, bodyBlock, newResults); + moveOperationToBlock(builder, b2, bodyBlock, newResults); + + // Create new ReturnOp. + builder.setInsertionPointToEnd(bodyBlock); + builder.create(loc, newResults); + + // Rewrite all uses to new ReturnOp. + for (size_t i = 0; i < oldResults.size(); ++i) { + oldResults.at(i).replaceAllUsesWith(pipelineOp.getResult(i)); + } + + // Place to the location after the last PipelineOp of this pair. + // Is this sufficient? + pipelineOp->moveAfter(pipeOp1); + + // Clean up + pipeOp1->erase(); + pipeOp2->erase(); + + //suboptimal + changed = true; + break; + } + } + + return; +} + + +std::unique_ptr daphne::createHorizontalFusionPass() { + return std::make_unique(); +} \ No newline at end of file diff --git a/src/compiler/lowering/vectorize/VectorUtils.h b/src/compiler/lowering/vectorize/VectorUtils.h new file mode 100644 index 000000000..c8571d3ae --- /dev/null +++ b/src/compiler/lowering/vectorize/VectorUtils.h @@ -0,0 +1,886 @@ +/* + * Copyright 2021 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+#include "ir/daphneir/Daphne.h"
+#include "ir/daphneir/DaphneVectorizableOpInterface.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Transforms/TopologicalSortUtils.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "mlir/IR/Operation.h"
+#include "mlir/Support/LLVM.h"
+#include "llvm/IR/PassManagerInternal.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ManagedStatic.h"
+
+using VectorIndex = std::size_t;
+using Pipeline = std::vector;
+using PipelinePair = std::pair;
+
+using PipelineOpPair = std::pair;
+
+namespace std {
+template <> struct hash {
+ size_t operator()(const PipelinePair &p) const {
+ return std::hash{}(p.first) ^ std::hash{}(p.second);
+ }
+};
+} // namespace std
+
+enum class DisconnectReason { NONE, MULTIPLE_CONSUMERS, INVALID };
+
+enum class EdgeStatus { INVALID, ACTIVE, INACTIVE };
+
+struct VectorUtils {
+
+ /**
+ * @brief Checks if a VectorSplit and a VectorCombine are compatible.
+ *
+ * This function compares the provided VectorSplit and VectorCombine to
+ * determine if they match by remapping the split to a matching combine.
+ * Compatible pairs are ROWS-ROWS and COLS-COLS.
+ *
+ * @param split VectorSplit value representing the split of an operation.
+ * @param combine VectorCombine value representing the combine of an operation.
+ * @return true, if VectorSplit and VectorCombine are compatible
+ * @return false, otherwise
+ */
+
+ static bool matchingVectorSplitCombine(mlir::daphne::VectorSplit split, mlir::daphne::VectorCombine combine) {
+ // llvm::outs() << split << " " << combine << " ";
+ mlir::daphne::VectorCombine _operandCombine;
+ switch (split) {
+ case mlir::daphne::VectorSplit::ROWS:
+ _operandCombine = mlir::daphne::VectorCombine::ROWS;
+ break;
+ case mlir::daphne::VectorSplit::COLS:
+ _operandCombine = mlir::daphne::VectorCombine::COLS;
+ break;
+ default:
+ // No matching split/combine; basically resulting in separate pipelines
+ return false;
+ }
+ if (combine == _operandCombine) {
+ return true;
+ }
+ return false;
+ }
+
+ // Greedy merge along (valid) MULTIPLE_CONSUMER relationships
+ // by checking if resulting pipelines can be sorted topologically.
+ static void
+ greedyMergePipelinesProducerConsumer(std::vector &pipelines,
+ std::map &operationToPipeline,
+ std::map &producerConsumerRelationships) {
+ bool change = true;
+ while (change) {
+ change = false;
+
+ std::multimap mmPCR;
+ for (const auto &[pipePair, disReason] : producerConsumerRelationships) {
+
+ if (disReason == DisconnectReason::INVALID)
+ continue;
+
+ if (VectorUtils::tryTopologicalSortMerged(pipelines, producerConsumerRelationships, pipePair.first,
+ pipePair.second)) {
+ auto mergedPipeline =
+ VectorUtils::mergePipelines(pipelines, operationToPipeline, pipePair.first, pipePair.second);
+
+ for (const auto &[_pipePair, _disReason] : producerConsumerRelationships) {
+
+ // Ignore in case the current pair is pipePair
+ if (_pipePair.first == pipePair.first && _pipePair.second == pipePair.second)
+ continue;
+
+ // Rewrite Relationships
+ if (_pipePair.first == pipePair.first || _pipePair.first == pipePair.second) {
+ auto newPipePair = std::make_pair(mergedPipeline, _pipePair.second);
+ mmPCR.insert({newPipePair, _disReason});
+ } else if (_pipePair.second == pipePair.first || _pipePair.second
== pipePair.second) { + auto newPipePair = std::make_pair(_pipePair.first, mergedPipeline); + mmPCR.insert({newPipePair, _disReason}); + } else { + mmPCR.insert({_pipePair, _disReason}); + } + } + + change = true; + break; + } + } + + // In case of no change the mmPCR is not filled, ignore + if (change) + producerConsumerRelationships = VectorUtils::consolidateProducerConsumerRelationship(mmPCR); + + // VectorUtils::DEBUG::printPCR(producerConsumerRelationships); + // VectorUtils::DEBUG::printPipelines(pipelines); + } + } + + //------------------------------------------------------------------------------ + + // Function merges two pipelines into one by appending all operations from one pipeline to another + // Order is not really considered, as it is embodied in IR + static void mergePipelines(std::vector &pipelines, + std::map &operationToPipelineIx, size_t pipeIx1, + size_t pipeIx2) { + // llvm::outs() << mergeFromIx << " " << mergeIntoIx << "\n"; + if (pipeIx1 == pipeIx2) + return; + if (pipeIx2 > pipeIx1) + std::swap(pipeIx1, pipeIx2); + + std::vector *mergedPipeline(pipelines.at(pipeIx2)); + for (auto op : *pipelines.at(pipeIx1)) { + if (std::find(mergedPipeline->begin(), mergedPipeline->end(), op) == mergedPipeline->end()) { + mergedPipeline->push_back(op); + operationToPipelineIx[op] = pipeIx2; + } + } + pipelines.at(pipeIx2) = std::move(mergedPipeline); + pipelines.erase(pipelines.begin() + pipeIx1); + } + + static Pipeline *mergePipelines(std::vector &pipelines, + std::map &operationToPipeline, Pipeline *pipe1, + Pipeline *pipe2) { + if (pipe1 == pipe2) + return nullptr; + + for (auto op : *pipe2) { + if (std::find(pipe1->begin(), pipe1->end(), op) == pipe1->end()) { + pipe1->push_back(op); + operationToPipeline[op] = pipe1; + } + } + + auto pipeIx2 = std::find(pipelines.begin(), pipelines.end(), pipe2); + pipelines.erase(pipeIx2); + return pipe1; + } + + // only works if pipeline ops are topologically sorted in reverse + static bool 
arePipelineOpsDependent(mlir::daphne::VectorizedPipelineOp pipeOp1,
+ mlir::daphne::VectorizedPipelineOp pipeOp2) {
+
+ if (pipeOp1 == pipeOp2)
+ return true;
+
+ std::stack s;
+ std::unordered_set visited;
+
+ s.push(pipeOp1);
+ while (!s.empty()) {
+ mlir::Operation *currOp = s.top();
+ s.pop();
+
+ // Connection found
+ if (currOp == pipeOp2)
+ return true;
+
+ if (visited.insert(currOp).second) {
+ for (const auto &operand : currOp->getOperands()) {
+ if (auto defOp = operand.getDefiningOp()) {
+ s.push(defOp);
+ }
+ }
+ }
+ }
+
+ return false;
+ }
+
+ static bool tryTopologicalSortMerged(std::vector &pipelines,
+ std::map &rel, Pipeline *pipe1,
+ Pipeline *pipe2) {
+
+ // if (pipe2 > pipe1)
+ // std::swap(pipe1, pipe2);
+
+ // prealloc
+ std::map> pipeline_graph;
+ for (auto pipe : pipelines) {
+ if (pipe == pipe1)
+ pipe = pipe2;
+ pipeline_graph.insert({pipe, {}});
+ }
+
+ for (auto &[key, _] : rel) {
+ auto consumer = key.second;
+ auto producer = key.first;
+
+ if (consumer == pipe1) {
+ consumer = pipe2;
+ } else if (producer == pipe1) {
+ producer = pipe2;
+ }
+
+ if (producer == consumer)
+ continue;
+
+ if (pipeline_graph.find(consumer) == pipeline_graph.end()) {
+ pipeline_graph.insert({consumer, {producer}});
+ } else {
+ pipeline_graph.at(consumer).insert(producer);
+ }
+ }
+
+ /*for (auto node : pipeline_graph) {
+ llvm::outs() << "Key: " << node.first << ", Values: ";
+ for (auto dependency : node.second) {
+ llvm::outs() << dependency << " ";
+ }
+ llvm::outs() << "\n";
+ }
+ llvm::outs() << "\n";*/
+
+ return tryTopologicalSort(pipeline_graph);
+ }
+
+ static std::map
+ consolidateProducerConsumerRelationship(std::multimap mmPCR) {
+ std::map pcr;
+ for (const auto &[pipePair, disReason] : mmPCR) {
+ if (pcr.find(pipePair) == pcr.end()) {
+ pcr.insert({pipePair, disReason});
+ } else {
+ // Overwrite if INVALID as it dominates MULTI_CONSUMER relationship
+ if (disReason == DisconnectReason::INVALID) {
+ pcr.insert_or_assign(pipePair, disReason);
+ } + } + } + return pcr; + } + + //------------------------------------------------------------------------------ + + private: + // kahn: https://dev.to/leopfeiffer/topological-sort-with-kahns-algorithm-3dl1 + // https://leetcode.com/problems/course-schedule/solutions/483330/c-kahns-algorithm-topological-sort-with-easy-detailed-explanation-16-ms-beats-98/ + static bool tryTopologicalSort(std::map> pipeline_graph) { + + std::unordered_map inDegrees; + for (auto node : pipeline_graph) { + for (auto dependency : node.second) { + ++inDegrees[dependency]; + } + } + + std::queue queue; + for (auto node : pipeline_graph) { + if (inDegrees[node.first] == 0) { + queue.push(node.first); + } + } + + std::vector result; + while (!queue.empty()) { + size_t node = queue.front(); + queue.pop(); + result.push_back(node); + for (auto dependency : pipeline_graph.at(node)) { + if (--inDegrees[dependency] == 0) { + queue.push(dependency); + } + } + } + + return result.size() == pipeline_graph.size(); + } + + static bool tryTopologicalSort(std::map> pipeline_graph) { + + std::unordered_map inDegrees; + for (auto node : pipeline_graph) { + for (auto dependency : node.second) { + ++inDegrees[dependency]; + } + } + + std::queue queue; + for (auto node : pipeline_graph) { + if (inDegrees[node.first] == 0) { + queue.push(node.first); + } + } + + std::vector result; + while (!queue.empty()) { + Pipeline *node = queue.front(); + queue.pop(); + result.push_back(node); + for (auto dependency : pipeline_graph.at(node)) { + if (--inDegrees[dependency] == 0) { + queue.push(dependency); + } + } + } + + return result.size() == pipeline_graph.size(); + } + + public: + /** + * @brief Recursive function checking if the given value is transitively dependant on the operation `op`. 
+ * @param value The value to check + * @param op The operation to check + * @return true if there is a dependency, false otherwise + */ + static bool valueDependsOnResultOf(mlir::Value value, mlir::Operation *op) { + if (auto defOp = value.getDefiningOp()) { + if (defOp == op) + return true; +#if 1 + // TODO This crashes if defOp and op are not in the same block. + // At the same time, it does not seem to be strictly required. + // if (defOp->isBeforeInBlock(op)) + // Nevertheless, this modified line seems to be a good soft-filter; + // without that, the vectorization pass may take very long on + // programs with 100s of operations. + if (defOp->getBlock() == op->getBlock() && defOp->isBeforeInBlock(op)) + // can't have results of `op` as inputs, as it is defined before + return false; +#endif + for (auto operand : defOp->getOperands()) { + if (valueDependsOnResultOf(operand, op)) + return true; + } + } + return false; + } + + /** + * @brief Moves operation which are between the operations, which should be fused into a single pipeline, before + * or after the position where the pipeline will be placed. 
+ * @param pipelinePosition The position where the pipeline will be + * @param pipeline The pipeline for which this function should be executed + */ + static void movePipelineInterleavedOperations(mlir::Block::iterator pipelinePosition, + const std::vector pipeline) { + // first operation in pipeline vector is last in IR, and the last is the first + auto startPos = pipeline.back()->getIterator(); + auto endPos = pipeline.front()->getIterator(); + auto currSkip = pipeline.rbegin(); + + std::vector moveBeforeOps; + std::vector moveAfterOps; + + for (auto it = startPos; it != endPos; ++it) { + if (it == (*currSkip)->getIterator()) { + ++currSkip; + continue; + } + + bool dependsOnPipeline = false; + auto pipelineOpsBeforeIt = currSkip; + while (--pipelineOpsBeforeIt != pipeline.rbegin()) { + for (auto operand : it->getOperands()) { + if (valueDependsOnResultOf(operand, *pipelineOpsBeforeIt)) { + dependsOnPipeline = true; + break; + } + } + if (dependsOnPipeline) { + break; + } + } + + for (auto operand : it->getOperands()) { + if (valueDependsOnResultOf(operand, *pipelineOpsBeforeIt)) { + dependsOnPipeline = true; + break; + } + } + if (dependsOnPipeline) { + moveAfterOps.push_back(&(*it)); + } else { + moveBeforeOps.push_back(&(*it)); + } + } + + for (auto moveBeforeOp : moveBeforeOps) { + moveBeforeOp->moveBefore(pipelinePosition->getBlock(), pipelinePosition); + } + for (auto moveAfterOp : moveAfterOps) { + moveAfterOp->moveAfter(pipelinePosition->getBlock(), pipelinePosition); + pipelinePosition = moveAfterOp->getIterator(); + } + } + + static void createVectorizedPipelineOps(mlir::func::FuncOp func, std::vector pipelines, + std::map decisionIxs) { + mlir::OpBuilder builder(func); + + // Create the `VectorizedPipelineOp`s + for (auto _pipeline : pipelines) { + if (_pipeline.empty()) + continue; + + auto valueIsPartOfPipeline = [&](mlir::Value operand) { + return llvm::any_of(_pipeline, [&](mlir::Operation *lv) { return lv == operand.getDefiningOp(); }); + }; + 
std::vector vSplitAttrs; + std::vector vCombineAttrs; + std::vector locations; + std::vector results; + std::vector operands; + std::vector outRows; + std::vector outCols; + + // first op in pipeline is last in IR + builder.setInsertionPoint(_pipeline.front()); + // move all operations, between the operations that will be part of the pipeline, before or after the + // completed pipeline + VectorUtils::movePipelineInterleavedOperations(builder.getInsertionPoint(), _pipeline); + + // potential addition for + std::vector pipeline; + for (auto vIt = _pipeline.rbegin(); vIt != _pipeline.rend(); ++vIt) { + auto v = *vIt; + + auto vSplits = std::vector(); + auto vCombines = std::vector(); + auto opsOutputSizes = std::vector>(); + if (auto vec = llvm::dyn_cast(v)) { + size_t d = decisionIxs[v]; + vSplits = vec.getVectorSplits()[d]; + vCombines = vec.getVectorCombines()[d]; + opsOutputSizes = vec.createOpsOutputSizes(builder)[d]; + } else { + throw std::runtime_error("Vectorizable op not found"); + } + + pipeline.push_back(v); + + // TODO: although we do create enum attributes, it might make sense/make it easier to + // just directly use an I64ArrayAttribute + // Determination of operands of VectorizedPipelineOps! + for (auto i = 0u; i < v->getNumOperands(); ++i) { + auto operand = v->getOperand(i); + if (!valueIsPartOfPipeline(operand)) { + vSplitAttrs.push_back(mlir::daphne::VectorSplitAttr::get(func.getContext(), vSplits[i])); + operands.push_back(operand); + } + } + + // Determination of results of VectorizedPipelineOps! 
+ for (auto vCombine : vCombines) { + vCombineAttrs.push_back(mlir::daphne::VectorCombineAttr::get(func.getContext(), vCombine)); + } + locations.push_back(v->getLoc()); + for (auto result : v->getResults()) { + results.push_back(result); + } + for (auto outSize : opsOutputSizes) { + outRows.push_back(outSize.first); + outCols.push_back(outSize.second); + } + + // check if any of the outputs type of an operator is a scalar value + // if yes, add additional castOps inside pipeline and outside pipeline + for (size_t i = 0; i < v->getNumResults(); i++) { + auto r = v->getResult(0); + // TODO: check if it includes all types used in daphne + if (r.getType().isIntOrIndexOrFloat()) { + auto m1x1 = mlir::daphne::MatrixType::get(func.getContext(), r.getType(), 1, 1, 1, + mlir::daphne::MatrixRepresentation::Dense); + auto loc = v->getLoc(); + + auto toCastOp = builder.create(loc, m1x1, r); + toCastOp->moveAfter(v); + + // xxxxxx + pipeline.push_back(toCastOp); + vCombineAttrs.push_back(mlir::daphne::VectorCombineAttr::get(func.getContext(), vCombines[i])); + auto cst1 = builder.create(loc, builder.getIndexType(), + builder.getIndexAttr(1l)); + outRows.push_back(cst1); + outCols.push_back(cst1); + results.push_back(toCastOp); + + auto fromCastOp = builder.create(loc, r.getType(), toCastOp); + r.replaceAllUsesExcept(fromCastOp, toCastOp); + + mlir::Operation *firstUseOp = nullptr; + for (const auto &use : fromCastOp->getUses()) { + auto user = use.getOwner(); + + if (!firstUseOp || user->isBeforeInBlock(firstUseOp)) { + firstUseOp = user; + } + } + + fromCastOp->moveBefore(firstUseOp); + } + } + } + + std::vector locs; + locs.reserve(_pipeline.size()); + for (auto op : pipeline) { + locs.push_back(op->getLoc()); + } + + auto loc = builder.getFusedLoc(locs); + auto pipelineOp = builder.create( + loc, mlir::ValueRange(results).getTypes(), operands, outRows, outCols, + builder.getArrayAttr(vSplitAttrs), builder.getArrayAttr(vCombineAttrs), nullptr); + mlir::Block *bodyBlock = 
builder.createBlock(&pipelineOp.getBody()); + + // remove information from input matrices of pipeline + for (size_t i = 0u; i < operands.size(); ++i) { + auto argTy = operands[i].getType(); + switch (vSplitAttrs[i].cast().getValue()) { + case mlir::daphne::VectorSplit::ROWS: { + auto matTy = argTy.cast(); + // only remove row information + argTy = matTy.withShape(-1, matTy.getNumCols()); + break; + } + case mlir::daphne::VectorSplit::COLS: { + auto matTy = argTy.cast(); + // only remove col information + argTy = matTy.withShape(matTy.getNumRows(), -1); + break; + } + case mlir::daphne::VectorSplit::NONE: + // keep any size information + break; + } + bodyBlock->addArgument(argTy, builder.getUnknownLoc()); + } + + auto argsIx = 0u; + auto resultsIx = 0u; + // for every op in pipeline + try { + + for (auto vIt = pipeline.begin(); vIt != pipeline.end(); ++vIt) { + auto v = *vIt; + auto numOperands = v->getNumOperands(); + auto numResults = v->getNumResults(); + + // move v before end of block + v->moveBefore(bodyBlock, bodyBlock->end()); + + // set operands to arguments of body block, if defOp is not part of the pipeline + for (auto i = 0u; i < numOperands; ++i) { + if (!valueIsPartOfPipeline(v->getOperand(i))) { + v->setOperand(i, bodyBlock->getArgument(argsIx++)); + } + } + + auto pipelineReplaceResults = pipelineOp->getResults().drop_front(resultsIx).take_front(numResults); + resultsIx += numResults; + for (auto z : llvm::zip(v->getResults(), pipelineReplaceResults)) { + auto old = std::get<0>(z); + auto replacement = std::get<1>(z); + + // TODO: switch to type based size inference instead + // FIXME: if output is dynamic sized, we can't do this + // replace `NumRowOp` and `NumColOp`s for output size inference + for (auto &use : old.getUses()) { + + auto *op = use.getOwner(); + + if (auto nrowOp = llvm::dyn_cast(op)) { + nrowOp.replaceAllUsesWith(pipelineOp.getOutRows()[replacement.getResultNumber()]); + nrowOp.erase(); + } + if (auto ncolOp = llvm::dyn_cast(op)) { 
+ ncolOp.replaceAllUsesWith(pipelineOp.getOutCols()[replacement.getResultNumber()]); + ncolOp.erase(); + } + } + // Replace only if not used by pipeline op + old.replaceUsesWithIf(replacement, [&](mlir::OpOperand &opOperand) { + return llvm::count(pipeline, opOperand.getOwner()) == 0; + }); + } + } + } catch (...) { + llvm::outs() << "TEST:" << "\n"; + func.print(llvm::outs()); + llvm::outs() << "\n"; + } + bodyBlock->walk([](mlir::Operation *op) { + for (auto resVal : op->getResults()) { + if (auto ty = resVal.getType().dyn_cast()) { + resVal.setType(ty.withShape(-1, -1)); + } + } + }); + builder.setInsertionPointToEnd(bodyBlock); + builder.create(loc, results); + if (!mlir::sortTopologically(bodyBlock)) { + throw std::runtime_error("topoSort"); + } + } + } + + //----------------------------------------------------------------- + // + //----------------------------------------------------------------- + + struct DEBUG { + + static std::string getColor(size_t pipelineId) { + std::vector colors = {"tomato", "lightgreen", "lightblue", "plum1", "mistyrose2", + "seashell", "hotpink", "lemonchiffon", "firebrick1", "ivory2", + "khaki1", "lightcyan", "olive", "yellow", "maroon", + "violet", "navajowhite1"}; + return colors[pipelineId % colors.size()]; + } + + static void drawPipelines(const std::vector &ops, + const std::map &operationToPipelineIx, + const std::map &decisionIxs, std::string filename) { + std::ofstream outfile(filename); + + outfile << "digraph G {" << std::endl; + + std::map opToNodeName; + + for (size_t i = 0; i < ops.size(); ++i) { + std::string nodeName = "node" + std::to_string(i); + opToNodeName[ops.at(i)] = nodeName; + + size_t pipelineId = operationToPipelineIx.at(ops[i]); + VectorIndex vectIx = decisionIxs.at(ops.at(i)); + std::string color = VectorUtils::DEBUG::getColor(pipelineId); + + outfile << nodeName << " [label=\"" << ops.at(i)->getName().getStringRef().str() + << "\\npIx: " << pipelineId << ", vectIx: " << vectIx << "\", fillcolor=" << 
color + << ", style=filled];" << std::endl; + } + + std::unordered_set outsideOps; + + for (size_t i = 0; i < ops.size(); ++i) { + mlir::Operation *op = ops.at(i); + auto consumerPipelineIx = operationToPipelineIx.at(op); + + for (const auto &operandValue : op->getOperands()) { + mlir::Operation *operandOp = operandValue.getDefiningOp(); + auto it = operationToPipelineIx.find(operandOp); + + if (it != operationToPipelineIx.end()) { + auto producerPipeplineIx = it->second; + outfile << opToNodeName.at(operandOp) << " -> " << opToNodeName.at(op); + + if (producerPipeplineIx != consumerPipelineIx) { + outfile << " [style=dotted]"; + } + outfile << ";" << std::endl; + } else { + // also show the surrounding ops, e.g. to make horizontal fusion visible + } + } + } + outfile << "}" << std::endl; + } + + static std::string printPtr(void *ptr) { + + std::ostringstream oss; + oss << std::hex << reinterpret_cast(ptr); + + std::string str = oss.str(); + + return str.substr(str.size() - 3); + } + + static void drawPipelines(const std::vector &ops, + const std::map &operationToPipeline, + const std::map &decisionIxs, std::string filename) { + std::ofstream outfile(filename); + + outfile << "digraph G {" << std::endl; + + std::map opToNodeName; + std::map pipelineToIx; + + for (size_t i = 0; i < ops.size(); ++i) { + std::string nodeName = "node" + std::to_string(i); + opToNodeName[ops.at(i)] = nodeName; + + auto pipeline = operationToPipeline.at(ops.at(i)); + size_t pipelineIx; + if (pipelineToIx.find(pipeline) == pipelineToIx.end()) { + pipelineIx = pipelineToIx.size(); + pipelineToIx.insert({pipeline, pipelineIx}); + } else { + pipelineIx = pipelineToIx.at(pipeline); + } + std::string color = VectorUtils::DEBUG::getColor(pipelineIx); + VectorIndex vectIx = decisionIxs.at(ops.at(i)); + + std::string pipeName = printPtr(pipeline); + + outfile << nodeName << " [label=\"" << ops.at(i)->getName().getStringRef().str() + << "\\npIx: " << pipeName << ", vectIx: " << vectIx << "\", 
fillcolor=" << color + << ", style=filled];" << std::endl; + } + + std::unordered_set outsideOps; + + for (size_t i = 0; i < ops.size(); ++i) { + mlir::Operation *op = ops.at(i); + auto consumerPipelineIx = operationToPipeline.at(op); + + for (const auto &operandValue : op->getOperands()) { + mlir::Operation *operandOp = operandValue.getDefiningOp(); + auto it = operationToPipeline.find(operandOp); + + if (it != operationToPipeline.end()) { + auto producerPipeplineIx = it->second; + outfile << opToNodeName.at(operandOp) << " -> " << opToNodeName.at(op); + + if (producerPipeplineIx != consumerPipelineIx) { + outfile << " [style=dotted]"; + } + outfile << ";" << std::endl; + } else { + // also show the surrounding ops, e.g. to make horizontal fusion visible + } + } + } + outfile << "}" << std::endl; + } + + static void drawPipelineOps(std::vector &ops, std::string filename) { + std::ofstream outfile(filename); + + outfile << "digraph GGroup {" << "\n"; + outfile << "compound=true;" << "\n"; + + std::map opToNodeName; + std::map pipeOpToNodeName; + std::map operationToPipeline; + // std::map argToName; + + for (size_t i = 0; i < ops.size(); ++i) { + std::string pipeName = "pipeOp" + std::to_string(i); + pipeOpToNodeName.insert({ops.at(i), pipeName}); + + std::string color = VectorUtils::DEBUG::getColor(i); + + outfile << "subgraph cluster_" << pipeName << " {\n"; + + outfile << "label=\"S: ["; + for (const auto &x : ops.at(i).getSplits()) { + auto attr = static_cast(llvm::dyn_cast(x).getValue()); + outfile << attr << ", "; + } + outfile << "]\\n"; + + outfile << " C: ["; + for (const auto &x : ops.at(i).getCombines()) { + auto attr = static_cast(llvm::dyn_cast(x).getValue()); + outfile << attr << ", "; + } + outfile << "]\";\n"; + + outfile << "node [style=filled,color=" << color << "];\n"; + outfile << "color=" << "lightgrey" << ";\n"; + size_t j = 0; + + mlir::Block *b = &ops.at(i).getBody().getBlocks().front(); + + for (const auto &arg : b->getArguments()) { + 
std::string argName = "arg" + std::to_string(arg.getArgNumber()); + std::string qualArgName = pipeName + "_" + argName; + outfile << qualArgName << "[label=\"" << argName << "\"shape=diamond,color=grey];\n"; + // argToName.insert({arg, qualArgName}); + } + + for (auto it = b->begin(); it != b->end(); ++it) { + mlir::Operation *op = &(*it); + std::string nodeName = pipeName + "_node" + std::to_string(j); + opToNodeName.insert({op, nodeName}); + operationToPipeline.insert({op, i}); + outfile << nodeName << " [label=\"" << op->getName().getStringRef().str() << "\"];\n"; + j++; + } + outfile << pipeName << "_inv [style=invis,shape=point]" << ";\n"; + outfile << "}" << "\n"; + } + + for (size_t i = 0; i < ops.size(); ++i) { + std::string pipeName = pipeOpToNodeName.at(ops.at(i)); + + mlir::Block *b = &ops.at(i).getBody().getBlocks().front(); + for (auto it = b->begin(); it != b->end(); ++it) { + mlir::Operation *op = &(*it); + + if (llvm::isa(op)) { + outfile << opToNodeName.at(op) << " -> " << pipeName << "_inv" << ";\n"; + } + + for (const auto &operandValue : op->getOperands()) { + auto operandOp = operandValue.getDefiningOp(); + auto it = operationToPipeline.find(operandOp); + + if (it != operationToPipeline.end()) { + outfile << opToNodeName.at(operandOp) << " -> " << opToNodeName.at(op); + outfile << ";" << std::endl; + } else { + if (auto arg = llvm::dyn_cast(operandValue)) { + std::string argName = "arg" + std::to_string(arg.getArgNumber()); + std::string qualArgName = pipeName + "_" + argName; + outfile << qualArgName << " -> " << opToNodeName.at(op) << ";\n"; + } + } + } + } + } + + for (size_t i = 0; i < ops.size(); ++i) { + std::string pipeName = pipeOpToNodeName.at(ops.at(i)); + auto op = ops.at(i); + + for (size_t j = 0; j < op.getSplits().size(); ++j) { + if (auto operandOp = op.getOperand(j).getDefiningOp()) { + if (auto defOp = llvm::dyn_cast(operandOp)) { + std::string pipeName2 = pipeOpToNodeName.at(defOp); + std::string argName = pipeName + "_arg" + 
std::to_string(j); + outfile << pipeName2 << "_inv" << " -> " << argName << "[ltail=cluster_" << pipeName2 + << "];\n"; + } + } + } + } + outfile << "}" << "\n"; + } + }; +}; \ No newline at end of file diff --git a/src/compiler/lowering/vectorize/VectorizeDefs.h b/src/compiler/lowering/vectorize/VectorizeDefs.h new file mode 100644 index 000000000..165ce4687 --- /dev/null +++ b/src/compiler/lowering/vectorize/VectorizeDefs.h @@ -0,0 +1,22 @@ +/* + * Copyright 2021 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +enum VectorizationType { + DAPHNE, + GREEDY_1 +}; \ No newline at end of file diff --git a/src/ir/daphneir/DaphneOps.td b/src/ir/daphneir/DaphneOps.td index d934f8463..39ef49843 100644 --- a/src/ir/daphneir/DaphneOps.td +++ b/src/ir/daphneir/DaphneOps.td @@ -210,7 +210,8 @@ class Daphne_EwUnaryOp traits = []> : DataTypeFromFirstArg, ShapeFromArg, CastArgsToResType, - NoMemoryEffect + NoMemoryEffect, + DeclareOpInterfaceMethods ])> { let arguments = (ins AnyTypeOf<[MatrixOf<[scalarType]>, scalarType, Unknown]>:$arg); let results = (outs AnyTypeOf<[MatrixOf<[scalarType]>, scalarType, Unknown]>:$res); @@ -229,7 +230,7 @@ def Daphne_EwAbsOp : Daphne_EwUnaryOp<"ewAbs", NumScalar, [ValueTypeFromFirstArg def Daphne_EwSignOp : Daphne_EwUnaryOp<"ewSign", NumScalar, [ValueTypeFromFirstArg]>; def Daphne_EwExpOp : Daphne_EwUnaryOp<"ewExp", NumScalar, [ValueTypeFromArgsFP]>; def Daphne_EwLnOp : Daphne_EwUnaryOp<"ewLn", NumScalar, [ValueTypeFromArgsFP]>; -def Daphne_EwSqrtOp : Daphne_EwUnaryOp<"ewSqrt", NumScalar, [ValueTypeFromArgsFP, DeclareOpInterfaceMethods]>; +def Daphne_EwSqrtOp : Daphne_EwUnaryOp<"ewSqrt", NumScalar, [ValueTypeFromArgsFP]>; // ---------------------------------------------------------------------------- // Logical @@ -374,28 +375,40 @@ class Daphne_OuterBinaryOp traits = [] // Arithmetic // ---------------------------------------------------------------------------- -def Daphne_OuterAddOp : Daphne_OuterBinaryOp<"outerAdd", NumScalar, [ValueTypeFromArgs]>; -def Daphne_OuterSubOp : Daphne_OuterBinaryOp<"outerSub", NumScalar, [ValueTypeFromArgs]>; -def Daphne_OuterMulOp : Daphne_OuterBinaryOp<"outerMul", NumScalar, [ValueTypeFromArgs]>; -def Daphne_OuterDivOp : Daphne_OuterBinaryOp<"outerDiv", NumScalar, [ValueTypeFromArgs]>; -def Daphne_OuterPowOp : Daphne_OuterBinaryOp<"outerPow", NumScalar, [ValueTypeFromArgs]>; -def Daphne_OuterModOp : Daphne_OuterBinaryOp<"outerMod", NumScalar, [ValueTypeFromArgs]>; -def Daphne_OuterLogOp : 
Daphne_OuterBinaryOp<"outerLog", NumScalar, [ValueTypeFromArgsFP]>; +def Daphne_OuterAddOp : Daphne_OuterBinaryOp<"outerAdd", NumScalar, [ValueTypeFromArgs, + DeclareOpInterfaceMethods]>; +def Daphne_OuterSubOp : Daphne_OuterBinaryOp<"outerSub", NumScalar, [ValueTypeFromArgs, + DeclareOpInterfaceMethods]>; +def Daphne_OuterMulOp : Daphne_OuterBinaryOp<"outerMul", NumScalar, [ValueTypeFromArgs, + DeclareOpInterfaceMethods]>; +def Daphne_OuterDivOp : Daphne_OuterBinaryOp<"outerDiv", NumScalar, [ValueTypeFromArgs, + DeclareOpInterfaceMethods]>; +def Daphne_OuterPowOp : Daphne_OuterBinaryOp<"outerPow", NumScalar, [ValueTypeFromArgs, + DeclareOpInterfaceMethods]>; +def Daphne_OuterModOp : Daphne_OuterBinaryOp<"outerMod", NumScalar, [ValueTypeFromArgs, + DeclareOpInterfaceMethods]>; +def Daphne_OuterLogOp : Daphne_OuterBinaryOp<"outerLog", NumScalar, [ValueTypeFromArgsFP, + DeclareOpInterfaceMethods]>; // ---------------------------------------------------------------------------- // Min/max // ---------------------------------------------------------------------------- -def Daphne_OuterMinOp : Daphne_OuterBinaryOp<"outerMin", AnyScalar, [ValueTypeFromArgs]>; -def Daphne_OuterMaxOp : Daphne_OuterBinaryOp<"outerMax", AnyScalar, [ValueTypeFromArgs]>; +def Daphne_OuterMinOp : Daphne_OuterBinaryOp<"outerMin", AnyScalar, [ValueTypeFromArgs, + DeclareOpInterfaceMethods]>; +def Daphne_OuterMaxOp : Daphne_OuterBinaryOp<"outerMax", AnyScalar, [ValueTypeFromArgs, + DeclareOpInterfaceMethods]>; // ---------------------------------------------------------------------------- // Logical // ---------------------------------------------------------------------------- -def Daphne_OuterAndOp : Daphne_OuterBinaryOp<"outerAnd", NumScalar, [ValueTypeFromArgsInt]>; -def Daphne_OuterOrOp : Daphne_OuterBinaryOp<"outerOr" , NumScalar, [ValueTypeFromArgsInt]>; -def Daphne_OuterXorOp : Daphne_OuterBinaryOp<"outerXor", NumScalar, [ValueTypeFromArgsInt]>; +def Daphne_OuterAndOp : 
Daphne_OuterBinaryOp<"outerAnd", NumScalar, [ValueTypeFromArgsInt, + DeclareOpInterfaceMethods]>; +def Daphne_OuterOrOp : Daphne_OuterBinaryOp<"outerOr" , NumScalar, [ValueTypeFromArgsInt, + DeclareOpInterfaceMethods]>; +def Daphne_OuterXorOp : Daphne_OuterBinaryOp<"outerXor", NumScalar, [ValueTypeFromArgsInt, + DeclareOpInterfaceMethods]>; // ---------------------------------------------------------------------------- // Strings @@ -408,7 +421,8 @@ def Daphne_OuterConcatOp : Daphne_OuterBinaryOp<"outerConcat", StrScalar>; // ---------------------------------------------------------------------------- class Daphne_OuterCmpOp traits = []> -: Daphne_OuterBinaryOp { +: Daphne_OuterBinaryOp])> { // TODO: We do not enforce (matrix of) boolean output any more, but should // think about that again. //let results = (outs AnyTypeOf<[MatrixOf<[BoolScalar]>, BoolScalar, Unknown]>:$res); @@ -448,13 +462,13 @@ class Daphne_AggOp traits = []> : Da // ---------------------------------------------------------------------------- class Daphne_AllAggOp traits = []> -: Daphne_AggOp { - let results = (outs scalarType:$res); +: Daphne_AggOp { + let results = (outs AnyType:$res); } -def Daphne_AllAggSumOp : Daphne_AllAggOp<"sumAll", NumScalar, [ValueTypeFromFirstArg]>; -def Daphne_AllAggMinOp : Daphne_AllAggOp<"minAll", NumScalar, [ValueTypeFromFirstArg]>; -def Daphne_AllAggMaxOp : Daphne_AllAggOp<"maxAll", NumScalar, [ValueTypeFromFirstArg]>; +def Daphne_AllAggSumOp : Daphne_AllAggOp<"sumAll", NumScalar, [ValueTypeFromFirstArg, DeclareOpInterfaceMethods]>; +def Daphne_AllAggMinOp : Daphne_AllAggOp<"minAll", NumScalar, [ValueTypeFromFirstArg, DeclareOpInterfaceMethods]>; +def Daphne_AllAggMaxOp : Daphne_AllAggOp<"maxAll", NumScalar, [ValueTypeFromFirstArg, DeclareOpInterfaceMethods]>; def Daphne_AllAggMeanOp : Daphne_AllAggOp<"meanAll", NumScalar, [ValueTypeFromArgsFP]>; def Daphne_AllAggVarOp : Daphne_AllAggOp<"varAll", NumScalar, [ValueTypeFromArgsFP]>; def Daphne_AllAggStddevOp : 
Daphne_AllAggOp<"stddevAll", NumScalar, [ValueTypeFromArgsFP]>; @@ -479,7 +493,8 @@ class Daphne_ColAggOp]>; -def Daphne_RowAggMinOp : Daphne_RowAggOp<"minRow" , AnyScalar, AnyScalar, [ValueTypeFromFirstArg, CastArgsToResType, DeclareOpInterfaceMethods]>; +def Daphne_RowAggMinOp : Daphne_RowAggOp<"minRow" , AnyScalar, AnyScalar, [ValueTypeFromFirstArg, CastArgsToResType, + DeclareOpInterfaceMethods]>; def Daphne_RowAggMaxOp : Daphne_RowAggOp<"maxRow" , AnyScalar, AnyScalar, [ValueTypeFromFirstArg, CastArgsToResType, CUDASupport, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]>; def Daphne_RowAggIdxMinOp : Daphne_RowAggOp<"idxminRow", NumScalar, Size, [ValueTypeSize]>; @@ -490,8 +505,10 @@ def Daphne_RowAggStddevOp : Daphne_RowAggOp<"stddevRow", NumScalar, NumScalar, [ def Daphne_ColAggSumOp : Daphne_ColAggOp<"sumCol" , NumScalar, NumScalar, [ValueTypeFromFirstArg, CastArgsToResType, CUDASupport, DeclareOpInterfaceMethods]>; -def Daphne_ColAggMinOp : Daphne_ColAggOp<"minCol" , AnyScalar, AnyScalar, [ValueTypeFromFirstArg, CastArgsToResType]>; -def Daphne_ColAggMaxOp : Daphne_ColAggOp<"maxCol" , AnyScalar, AnyScalar, [ValueTypeFromFirstArg, CastArgsToResType]>; +def Daphne_ColAggMinOp : Daphne_ColAggOp<"minCol" , AnyScalar, AnyScalar, [ValueTypeFromFirstArg, CastArgsToResType, + DeclareOpInterfaceMethods]>; +def Daphne_ColAggMaxOp : Daphne_ColAggOp<"maxCol" , AnyScalar, AnyScalar, [ValueTypeFromFirstArg, CastArgsToResType, + DeclareOpInterfaceMethods]>; def Daphne_ColAggIdxMinOp : Daphne_ColAggOp<"idxminCol", NumScalar, Size, [ValueTypeSize]>; def Daphne_ColAggIdxMaxOp : Daphne_ColAggOp<"idxmaxCol", NumScalar, Size, [ValueTypeSize]>; def Daphne_ColAggMeanOp : Daphne_ColAggOp<"meanCol" , NumScalar, NumScalar, [ValueTypeFromArgsFP, CastArgsToResType]>; @@ -1564,7 +1581,7 @@ def Daphne_VectorizedPipelineOp : Daphne_Op<"vectorizedPipeline", [AttrSizedOper TypedArrayAttrBase:$splits, TypedArrayAttrBase:$combines, Optional:$ctx); - let results = (outs 
Variadic:$outputs); + let results = (outs Variadic:$outputs); let regions = (region SizedRegion<1>:$body, AnyRegion:$cuda); let hasCanonicalizeMethod = 1; diff --git a/src/ir/daphneir/DaphneVectorizableOpInterface.cpp b/src/ir/daphneir/DaphneVectorizableOpInterface.cpp index cf835d368..b7563facd 100644 --- a/src/ir/daphneir/DaphneVectorizableOpInterface.cpp +++ b/src/ir/daphneir/DaphneVectorizableOpInterface.cpp @@ -30,65 +30,65 @@ using namespace mlir; // **************************************************************************** // For families of operations. -template std::vector getVectorSplits_EwBinaryOp(EwBinaryOp *op) { - // Matrix -> row-wise, Scalar -> none - auto lhsSplit = op->getLhs().getType().template isa() ? daphne::VectorSplit::ROWS - : daphne::VectorSplit::NONE; - auto rhsSplit = op->getRhs().getType().template isa() ? daphne::VectorSplit::ROWS - : daphne::VectorSplit::NONE; - return {lhsSplit, rhsSplit}; +// EwBinaryOp +template std::vector> getVectorSplits_EwBinaryOp(EwBinaryOp *op) { + bool isLhsMatrix = op->getLhs().getType().template isa(); + bool isRhsMatrix = op->getRhs().getType().template isa(); + + auto lhsSplitRow = isLhsMatrix ? daphne::VectorSplit::ROWS : daphne::VectorSplit::NONE; + auto rhsSplitRow = isRhsMatrix ? 
daphne::VectorSplit::ROWS : daphne::VectorSplit::NONE; + + return {{lhsSplitRow, rhsSplitRow}}; } -template std::vector getVectorCombines_EwBinaryOp(EwBinaryOp *op) { - return {daphne::VectorCombine::ROWS}; +template +std::vector> getVectorCombines_EwBinaryOp(EwBinaryOp *op) { + return {{daphne::VectorCombine::ROWS}}; } template -std::vector> createOpsOutputSizes_EwBinaryOp(EwBinaryOp *op, OpBuilder &builder) { +std::vector>> createOpsOutputSizes_EwBinaryOp(EwBinaryOp *op, OpBuilder &builder) { auto loc = op->getLoc(); auto sizeTy = builder.getIndexType(); auto lhsRows = builder.create(loc, sizeTy, op->getLhs()); auto lhsCols = builder.create(loc, sizeTy, op->getLhs()); // TODO: do max on #rows/#cols of lhs and rhs for broadcasting - return {{lhsRows, lhsCols}}; + return {{{lhsRows, lhsCols}}}; } -template std::vector getVectorSplits_EwUnaryOp(EwUnaryOp *op) { - return {daphne::VectorSplit::ROWS}; + +// EwUnaryOp +template std::vector> getVectorSplits_EwUnaryOp(EwUnaryOp *op) { + return {{daphne::VectorSplit::ROWS}}; } -template std::vector getVectorCombines_EwUnaryOp(EwUnaryOp *op) { - return {daphne::VectorCombine::ROWS}; +template std::vector> getVectorCombines_EwUnaryOp(EwUnaryOp *op) { + return {{daphne::VectorCombine::ROWS}}; } template -std::vector> createOpsOutputSizes_EwUnaryOp(EwUnaryOp *op, OpBuilder &builder) { +std::vector>> createOpsOutputSizes_EwUnaryOp(EwUnaryOp *op, OpBuilder &builder) { auto loc = op->getLoc(); auto sizeTy = builder.getIndexType(); auto rows = builder.create(loc, sizeTy, op->getArg()); auto cols = builder.create(loc, sizeTy, op->getArg()); // TODO: do max on #rows/#cols of lhs and rhs for broadcasting - return {{rows, cols}}; + return {{{rows, cols}}, {{rows, cols}}}; } -template std::vector getVectorSplits_RowAggOp(RowAggOp *op) { - return {daphne::VectorSplit::ROWS}; -} -template std::vector getVectorCombines_RowAggOp(RowAggOp *op) { - return {daphne::VectorCombine::ROWS}; -} -template -std::vector> 
createOpsOutputSizes_RowAggOp(RowAggOp *op, OpBuilder &builder) { - auto loc = op->getLoc(); - auto sizeTy = builder.getIndexType(); - auto rows = builder.create(loc, sizeTy, op->getArg()); - auto cst1 = builder.create(loc, sizeTy, builder.getIndexAttr(1l)); - return {{rows, cst1}}; + +// OuterBinary +template +std::vector> getVectorSplits_OuterBinaryOp(OuterBinaryOp *op) { + return {{daphne::VectorSplit::ROWS, daphne::VectorSplit::NONE}}; } -template std::vector getVectorSplits_ColAggOp(ColAggOp *op) { - return {daphne::VectorSplit::ROWS}; +template +std::vector> getVectorCombines_OuterBinaryOp(OuterBinaryOp *op) { + return {{daphne::VectorCombine::ROWS}}; } -template -std::vector> createOpsOutputSizes_ColAggOp(ColAggOp *op, OpBuilder &builder) { +template +std::vector>> createOpsOutputSizes_OuterBinaryOp(OuterBinaryOp *op, + OpBuilder &builder) { auto loc = op->getLoc(); auto sizeTy = builder.getIndexType(); - auto cst1 = builder.create(loc, sizeTy, builder.getIndexAttr(1l)); - auto cols = builder.create(loc, sizeTy, op->getArg()); - return {{cst1, cols}}; + auto rows = builder.create(loc, sizeTy, op->getLhs()); + auto cols = builder.create(loc, sizeTy, op->getRhs()); + // TODO: do max on #rows/#cols of lhs and rhs for broadcasting + return {{{rows, cols}}}; } // **************************************************************************** @@ -97,16 +97,19 @@ std::vector> createOpsOutputSizes_ColAggOp(ColAggOp *op, // ---------------------------------------------------------------------------- // Matrix multiplication -std::vector daphne::MatMulOp::getVectorSplits() { - return { - daphne::VectorSplit::ROWS, // lhs - daphne::VectorSplit::NONE, // rhs - daphne::VectorSplit::NONE, // transa - daphne::VectorSplit::NONE // transb - }; -} -std::vector daphne::MatMulOp::getVectorCombines() { return {daphne::VectorCombine::ROWS}; } -std::vector> daphne::MatMulOp::createOpsOutputSizes(OpBuilder &builder) { +// 
---------------------------------------------------------------------------- +std::vector> daphne::MatMulOp::getVectorSplits() { + return {{ + daphne::VectorSplit::ROWS, // lhs + daphne::VectorSplit::NONE, // rhs + daphne::VectorSplit::NONE, // transa + daphne::VectorSplit::NONE // transb + }}; +} +std::vector> daphne::MatMulOp::getVectorCombines() { + return {{daphne::VectorCombine::ROWS}}; +} +std::vector>> daphne::MatMulOp::createOpsOutputSizes(OpBuilder &builder) { auto loc = getLoc(); auto sizeTy = builder.getIndexType(); @@ -127,16 +130,21 @@ std::vector> daphne::MatMulOp::createOpsOutputSizes(OpBu cols = tb ? builder.create(loc, sizeTy, getRhs()).getResult() : builder.create(loc, sizeTy, getRhs()).getResult(); - return {{rows, cols}}; + return {{{rows, cols}}}; } // ---------------------------------------------------------------------------- // ---------------------------------------------------------------------------- // Binary +// ---------------------------------------------------------------------------- #define IMPL_SPLIT_COMBINE_EWBINARYOP(OP) \ - std::vector daphne::OP::getVectorSplits() { return getVectorSplits_EwBinaryOp(this); } \ - std::vector daphne::OP::getVectorCombines() { return getVectorCombines_EwBinaryOp(this); } \ - std::vector> daphne::OP::createOpsOutputSizes(OpBuilder &builder) { \ + std::vector> daphne::OP::getVectorSplits() { \ + return getVectorSplits_EwBinaryOp(this); \ + } \ + std::vector> daphne::OP::getVectorCombines() { \ + return getVectorCombines_EwBinaryOp(this); \ + } \ + std::vector>> daphne::OP::createOpsOutputSizes(OpBuilder &builder) { \ return createOpsOutputSizes_EwBinaryOp(this, builder); \ } @@ -176,40 +184,163 @@ IMPL_SPLIT_COMBINE_EWBINARYOP(EwGeOp) // ---------------------------------------------------------------------------- // Unary +// ---------------------------------------------------------------------------- #define IMPL_SPLIT_COMBINE_EWUNARYOP(OP) \ - std::vector daphne::OP::getVectorSplits() { return 
getVectorSplits_EwUnaryOp(this); } \ - std::vector daphne::OP::getVectorCombines() { return getVectorCombines_EwUnaryOp(this); } \ - std::vector> daphne::OP::createOpsOutputSizes(OpBuilder &builder) { \ + std::vector> daphne::OP::getVectorSplits() { \ + return getVectorSplits_EwUnaryOp(this); \ + } \ + std::vector> daphne::OP::getVectorCombines() { \ + return getVectorCombines_EwUnaryOp(this); \ + } \ + std::vector>> daphne::OP::createOpsOutputSizes(OpBuilder &builder) { \ return createOpsOutputSizes_EwUnaryOp(this, builder); \ } +// Arithmetic/general math + +IMPL_SPLIT_COMBINE_EWUNARYOP(EwMinusOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwAbsOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwSignOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwExpOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwLnOp) IMPL_SPLIT_COMBINE_EWUNARYOP(EwSqrtOp) +// Logical +IMPL_SPLIT_COMBINE_EWUNARYOP(EwNegOp) + +// Rounding +IMPL_SPLIT_COMBINE_EWUNARYOP(EwRoundOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwFloorOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwCeilOp) + +// Trigonometric +IMPL_SPLIT_COMBINE_EWUNARYOP(EwSinOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwCosOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwTanOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwSinhOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwCoshOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwTanhOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwAsinOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwAcosOp) +IMPL_SPLIT_COMBINE_EWUNARYOP(EwAtanOp) + +// Comparison +// changes value type? 
+IMPL_SPLIT_COMBINE_EWUNARYOP(EwIsNanOp) + #undef IMPL_SPLIT_COMBINE_EWUNARYOP // ---------------------------------------------------------------------------- // ---------------------------------------------------------------------------- -// Aggregations -// TODO: splitting and combining by column probably makes more sense +// Full Aggregations +// ---------------------------------------------------------------------------- + +template std::vector> getVectorSplits_AllAggOp(AllAggOp *op) { + return {{daphne::VectorSplit::ROWS}, {daphne::VectorSplit::COLS}}; +} +template std::vector> getVectorCombines_AllAggOp(AllAggOp *op) { + return {{daphne::VectorCombine::ADD}, {daphne::VectorCombine::ADD}}; +} +template +std::vector>> createOpsOutputSizes_AllAggOp(AllAggOp *op, OpBuilder &builder) { + auto loc = op->getLoc(); + auto sizeTy = builder.getIndexType(); + auto cst1 = builder.create(loc, sizeTy, builder.getIndexAttr(1l)); + return {{{cst1, cst1}}, {{cst1, cst1}}}; +} + +#define IMPL_SPLIT_COMBINE_ALLAGG(OP) \ + std::vector> daphne::OP::getVectorSplits() { \ + return getVectorSplits_AllAggOp(this); \ + } \ + std::vector> daphne::OP::getVectorCombines() { \ + return getVectorCombines_AllAggOp(this); \ + } \ + std::vector>> daphne::OP::createOpsOutputSizes(OpBuilder &builder) { \ + return createOpsOutputSizes_AllAggOp(this, builder); \ + } + +// RowAgg +IMPL_SPLIT_COMBINE_ALLAGG(AllAggSumOp) +IMPL_SPLIT_COMBINE_ALLAGG(AllAggMaxOp) +IMPL_SPLIT_COMBINE_ALLAGG(AllAggMinOp) + +#undef IMPL_SPLIT_COMBINE_ALLAGG + +// ---------------------------------------------------------------------------- +// Dimension Aggregations +// ---------------------------------------------------------------------------- + +template std::vector> getVectorSplits_RowAggOp(RowAggOp *op) { + return {{daphne::VectorSplit::ROWS}, {daphne::VectorSplit::COLS}}; +} +template +std::vector>> createOpsOutputSizes_RowAggOp(RowAggOp *op, OpBuilder &builder) { + auto loc = op->getLoc(); + auto sizeTy = 
builder.getIndexType(); + auto rows = builder.create(loc, sizeTy, op->getArg()); + auto cst1 = builder.create(loc, sizeTy, builder.getIndexAttr(1l)); + return {{{rows, cst1}}}; +} + #define IMPL_SPLIT_COMBINE_ROWAGG(OP) \ - std::vector daphne::OP::getVectorSplits() { return getVectorSplits_RowAggOp(this); } \ - std::vector daphne::OP::getVectorCombines() { return getVectorCombines_RowAggOp(this); } \ - std::vector> daphne::OP::createOpsOutputSizes(OpBuilder &builder) { \ + std::vector> daphne::OP::getVectorSplits() { \ + return getVectorSplits_RowAggOp(this); \ + } \ + std::vector>> daphne::OP::createOpsOutputSizes(OpBuilder &builder) { \ return createOpsOutputSizes_RowAggOp(this, builder); \ } -#define IMPL_SPLIT_COMBINE_COLAGG(OP) \ - std::vector daphne::OP::getVectorSplits() { return getVectorSplits_ColAggOp(this); } \ - std::vector> daphne::OP::createOpsOutputSizes(OpBuilder &builder) { \ - return createOpsOutputSizes_ColAggOp(this, builder); \ - } // RowAgg IMPL_SPLIT_COMBINE_ROWAGG(RowAggMinOp) IMPL_SPLIT_COMBINE_ROWAGG(RowAggMaxOp) IMPL_SPLIT_COMBINE_ROWAGG(RowAggSumOp) +std::vector> daphne::RowAggSumOp::getVectorCombines() { + return {{daphne::VectorCombine::ROWS}}; +} +std::vector> daphne::RowAggMinOp::getVectorCombines() { + return {{daphne::VectorCombine::ROWS}}; +} +std::vector> daphne::RowAggMaxOp::getVectorCombines() { + return {{daphne::VectorCombine::ROWS}}; +} + +//----- + +template std::vector> getVectorSplits_ColAggOp(ColAggOp *op) { + return {{daphne::VectorSplit::ROWS}}; +} +template +std::vector>> createOpsOutputSizes_ColAggOp(ColAggOp *op, OpBuilder &builder) { + auto loc = op->getLoc(); + auto sizeTy = builder.getIndexType(); + auto cst1 = builder.create(loc, sizeTy, builder.getIndexAttr(1l)); + auto cols = builder.create(loc, sizeTy, op->getArg()); + return {{{cst1, cols}}}; +} + +#define IMPL_SPLIT_COMBINE_COLAGG(OP) \ + std::vector> daphne::OP::getVectorSplits() { \ + return getVectorSplits_ColAggOp(this); \ + } \ + std::vector>> 
daphne::OP::createOpsOutputSizes(OpBuilder &builder) { \ + return createOpsOutputSizes_ColAggOp(this, builder); \ + } + +IMPL_SPLIT_COMBINE_COLAGG(ColAggMinOp) +IMPL_SPLIT_COMBINE_COLAGG(ColAggMaxOp) IMPL_SPLIT_COMBINE_COLAGG(ColAggSumOp) -std::vector daphne::ColAggSumOp::getVectorCombines() { return {daphne::VectorCombine::ADD}; } + +std::vector> daphne::ColAggSumOp::getVectorCombines() { + return {{daphne::VectorCombine::ADD}}; +} +std::vector> daphne::ColAggMinOp::getVectorCombines() { + return {{daphne::VectorCombine::MIN}}; +} +std::vector> daphne::ColAggMaxOp::getVectorCombines() { + return {{daphne::VectorCombine::MAX}}; +} #undef IMPL_SPLIT_COMBINE_ROWAGG #undef IMPL_SPLIT_COMBINE_COLAGG @@ -217,72 +348,137 @@ std::vector daphne::ColAggSumOp::getVectorCombines() { re // ---------------------------------------------------------------------------- // Left and right indexing -std::vector daphne::ExtractColOp::getVectorSplits() { - return {daphne::VectorSplit::ROWS, daphne::VectorSplit::NONE}; +// ---------------------------------------------------------------------------- +std::vector> daphne::ExtractColOp::getVectorSplits() { + return {{daphne::VectorSplit::ROWS, daphne::VectorSplit::NONE}}; } -std::vector daphne::ExtractColOp::getVectorCombines() { return {daphne::VectorCombine::ROWS}; } -std::vector> daphne::ExtractColOp::createOpsOutputSizes(OpBuilder &builder) { +std::vector> daphne::ExtractColOp::getVectorCombines() { + return {{daphne::VectorCombine::ROWS}}; +} +std::vector>> daphne::ExtractColOp::createOpsOutputSizes(OpBuilder &builder) { auto loc = getLoc(); auto sizeTy = builder.getIndexType(); auto rows = builder.create(loc, sizeTy, getSource()); // TODO: support scalar and maybe (based on definition of `ExtractColOp`) // apply some kind of `unique()` op auto cols = builder.create(loc, sizeTy, getSelectedCols()); - return {{rows, cols}}; + return {{{rows, cols}}}; } // ---------------------------------------------------------------------------- // 
---------------------------------------------------------------------------- // Reorganization -std::vector daphne::TransposeOp::getVectorSplits() { return {daphne::VectorSplit::ROWS}; } -std::vector daphne::TransposeOp::getVectorCombines() { return {daphne::VectorCombine::COLS}; } -std::vector> daphne::TransposeOp::createOpsOutputSizes(OpBuilder &builder) { +// ---------------------------------------------------------------------------- +std::vector> daphne::TransposeOp::getVectorSplits() { + return {{daphne::VectorSplit::ROWS}}; +} +std::vector> daphne::TransposeOp::getVectorCombines() { + return {{daphne::VectorCombine::COLS}}; +} +std::vector>> daphne::TransposeOp::createOpsOutputSizes(OpBuilder &builder) { auto loc = getLoc(); auto sizeTy = builder.getIndexType(); auto rows = builder.create(loc, sizeTy, getArg()); auto cols = builder.create(loc, sizeTy, getArg()); - return {{cols, rows}}; + return {{{cols, rows}}}; } -std::vector daphne::ColBindOp::getVectorSplits() { - return {daphne::VectorSplit::ROWS, daphne::VectorSplit::ROWS}; +std::vector> daphne::ColBindOp::getVectorSplits() { + return {{daphne::VectorSplit::ROWS, daphne::VectorSplit::ROWS}}; +} +std::vector> daphne::ColBindOp::getVectorCombines() { + return {{daphne::VectorCombine::ROWS}}; } -std::vector daphne::ColBindOp::getVectorCombines() { return {daphne::VectorCombine::ROWS}; } -std::vector> daphne::ColBindOp::createOpsOutputSizes(OpBuilder &builder) { +std::vector>> daphne::ColBindOp::createOpsOutputSizes(OpBuilder &builder) { auto loc = getLoc(); auto i64Ty = builder.getIntegerType(64, true); auto sizeTy = builder.getIndexType(); auto rows = builder.create(loc, sizeTy, getLhs()); auto colsLhs = builder.create(loc, sizeTy, getLhs()); auto colsRhs = builder.create(loc, sizeTy, getRhs()); - return {{rows, builder.create( - loc, sizeTy, - builder.create(loc, builder.create(loc, i64Ty, colsLhs), - builder.create(loc, i64Ty, colsRhs)))}}; + return {{{rows, builder.create( + loc, sizeTy, + 
builder.create(loc, builder.create(loc, i64Ty, colsLhs), + builder.create(loc, i64Ty, colsRhs)))}}}; } // ---------------------------------------------------------------------------- +// ---------------------------------------------------------------------------- +// Outer binary (generalized outer product) +// ---------------------------------------------------------------------------- +#define IMPL_SPLIT_COMBINE_OUTERBINARY(OP) \ + std::vector> daphne::OP::getVectorSplits() { \ + return getVectorSplits_OuterBinaryOp(this); \ + } \ + std::vector> daphne::OP::getVectorCombines() { \ + return getVectorCombines_OuterBinaryOp(this); \ + } \ + std::vector>> daphne::OP::createOpsOutputSizes(OpBuilder &builder) { \ + return createOpsOutputSizes_OuterBinaryOp(this, builder); \ + } + +// Arithmetic + +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterAddOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterSubOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterMulOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterDivOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterPowOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterModOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterLogOp) + +// Min/max + +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterMinOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterMaxOp) + +// Logical + +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterAndOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterOrOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterXorOp) + +// Comparisons + +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterEqOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterNeqOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterLtOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterLeOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterGtOp) +IMPL_SPLIT_COMBINE_OUTERBINARY(OuterGeOp) + +#undef IMPL_SPLIT_COMBINE_OUTERBINARY + +// ---------------------------------------------------------------------------- + // ---------------------------------------------------------------------------- // Other -std::vector daphne::SyrkOp::getVectorSplits() { return {daphne::VectorSplit::ROWS}; } -std::vector daphne::SyrkOp::getVectorCombines() { 
return {daphne::VectorCombine::ADD}; } -std::vector> daphne::SyrkOp::createOpsOutputSizes(OpBuilder &builder) { +// ---------------------------------------------------------------------------- +std::vector> daphne::SyrkOp::getVectorSplits() { + return {{daphne::VectorSplit::ROWS}}; +} +std::vector> daphne::SyrkOp::getVectorCombines() { + return {{daphne::VectorCombine::ADD}}; +} +std::vector>> daphne::SyrkOp::createOpsOutputSizes(OpBuilder &builder) { auto loc = getLoc(); auto sizeTy = builder.getIndexType(); auto cols = builder.create(loc, sizeTy, getArg()); // TODO: do max on #rows/#cols of lhs and rhs for broadcasting - return {{cols, cols}}; + return {{{cols, cols}}}; } -std::vector daphne::GemvOp::getVectorSplits() { - return {daphne::VectorSplit::ROWS, daphne::VectorSplit::ROWS}; +std::vector> daphne::GemvOp::getVectorSplits() { + return {{daphne::VectorSplit::ROWS, daphne::VectorSplit::ROWS}}; +} +std::vector> daphne::GemvOp::getVectorCombines() { + return {{daphne::VectorCombine::ADD}}; } -std::vector daphne::GemvOp::getVectorCombines() { return {daphne::VectorCombine::ADD}; } -std::vector> daphne::GemvOp::createOpsOutputSizes(OpBuilder &builder) { +std::vector>> daphne::GemvOp::createOpsOutputSizes(OpBuilder &builder) { auto loc = getLoc(); auto sizeTy = builder.getIndexType(); auto cols = builder.create(loc, sizeTy, getMat()); auto one = builder.create(loc, builder.getIndexType(), builder.getIndexAttr(1)); - return {{cols, one}}; + return {{{cols, one}}}; } -// ---------------------------------------------------------------------------- +// ---------------------------------------------------------------------------- \ No newline at end of file diff --git a/src/ir/daphneir/DaphneVectorizableOpInterface.td b/src/ir/daphneir/DaphneVectorizableOpInterface.td index 747eb19b0..baae04dcd 100644 --- a/src/ir/daphneir/DaphneVectorizableOpInterface.td +++ b/src/ir/daphneir/DaphneVectorizableOpInterface.td @@ -22,16 +22,19 @@ include "mlir/IR/OpBase.td" def 
VECTOR_SPLIT_NONE : I64EnumAttrCase<"NONE", 0>; def VECTOR_SPLIT_ROWS : I64EnumAttrCase<"ROWS", 1>; +def VECTOR_SPLIT_COLS : I64EnumAttrCase<"COLS", 2>; -def VectorSplitAttr : I64EnumAttr<"VectorSplit", "", [VECTOR_SPLIT_NONE, VECTOR_SPLIT_ROWS]> { +def VectorSplitAttr : I64EnumAttr<"VectorSplit", "", [VECTOR_SPLIT_NONE, VECTOR_SPLIT_ROWS, VECTOR_SPLIT_COLS]> { let cppNamespace = "::mlir::daphne"; } def VECTOR_COMBINE_ROWS : I64EnumAttrCase<"ROWS", 1>; def VECTOR_COMBINE_COLS : I64EnumAttrCase<"COLS", 2>; def VECTOR_COMBINE_ADD : I64EnumAttrCase<"ADD", 3>; +def VECTOR_COMBINE_MAX : I64EnumAttrCase<"MAX", 4>; +def VECTOR_COMBINE_MIN : I64EnumAttrCase<"MIN", 5>; -def VectorCombineAttr : I64EnumAttr<"VectorCombine", "", [VECTOR_COMBINE_ROWS, VECTOR_COMBINE_COLS, VECTOR_COMBINE_ADD]> { +def VectorCombineAttr : I64EnumAttr<"VectorCombine", "", [VECTOR_COMBINE_ROWS, VECTOR_COMBINE_COLS, VECTOR_COMBINE_ADD, VECTOR_COMBINE_MAX, VECTOR_COMBINE_MIN]> { let cppNamespace = "::mlir::daphne"; } @@ -42,11 +45,11 @@ def VectorizableOpInterface : OpInterface<"Vectorizable"> { let methods = [ InterfaceMethod<"Get the vector split kind for each input.", - "std::vector", "getVectorSplits", (ins)>, + "std::vector>", "getVectorSplits", (ins)>, InterfaceMethod<"Get the vector combine kind for each output.", - "std::vector", "getVectorCombines", (ins)>, + "std::vector>", "getVectorCombines", (ins)>, InterfaceMethod<"Create values for #rows and #cols of each output. -1 for dynamic/unknown.", - "std::vector>", "createOpsOutputSizes", (ins "mlir::OpBuilder&":$builder)>, + "std::vector>>", "createOpsOutputSizes", (ins "mlir::OpBuilder&":$builder)>, // TODO: for complex operations (non element-wise) where the computation per vector is not equal to the operation // itself on the whole input, we will require a new method generating the operations in the pipeline. This is // the same behaviour as with `Distributable` Ops, and therefore combining them might make sense. 
diff --git a/src/ir/daphneir/Passes.h b/src/ir/daphneir/Passes.h index 3dbb67e4f..7911e3b0b 100644 --- a/src/ir/daphneir/Passes.h +++ b/src/ir/daphneir/Passes.h @@ -67,7 +67,12 @@ std::unique_ptr createRewriteToCallKernelOpPass(const DaphneUserConfig &cf std::unordered_map &usedLibPaths); std::unique_ptr createSelectMatrixRepresentationsPass(const DaphneUserConfig &cfg); std::unique_ptr createSpecializeGenericFunctionsPass(const DaphneUserConfig &cfg); -std::unique_ptr createVectorizeComputationsPass(); + +std::unique_ptr createDaphneVectorizeComputationsPass(); +std::unique_ptr createGreedy1VectorizeComputationsPass(const DaphneUserConfig& cfg); +std::unique_ptr createHorizontalFusionPass(); +std::unique_ptr createDrawPipelineOpsPass(const std::string filename); + #ifdef USE_CUDA std::unique_ptr createMarkCUDAOpsPass(const DaphneUserConfig &cfg); #endif diff --git a/src/parser/daphnedsl/DaphneDSLBuiltins.cpp b/src/parser/daphnedsl/DaphneDSLBuiltins.cpp index afe498a20..5c9a0456e 100644 --- a/src/parser/daphnedsl/DaphneDSLBuiltins.cpp +++ b/src/parser/daphnedsl/DaphneDSLBuiltins.cpp @@ -1248,6 +1248,17 @@ antlrcpp::Any DaphneDSLBuiltins::build(mlir::Location loc, const std::string &fu builder.create(loc, source.getType(), source, attr.dyn_cast())); } + // **************************************************************************** + // Profiling + // **************************************************************************** + + if (func == "startProfiling") { + return builder.create(loc); + } + if (func == "stopProfiling") { + return builder.create(loc); + } + // **************************************************************************** // List operations // **************************************************************************** diff --git a/src/runtime/local/vectorized/MTWrapper.h b/src/runtime/local/vectorized/MTWrapper.h index a802f2d87..236f7853f 100644 --- a/src/runtime/local/vectorized/MTWrapper.h +++ b/src/runtime/local/vectorized/MTWrapper.h @@ 
-68,6 +68,9 @@ template class MTWrapperBase { if (splits[i] == mlir::daphne::VectorSplit::ROWS) { len = std::max(len, inputs[i]->getNumRows()); mem_required += inputs[i]->getNumItems() * sizeof(typename DT::VT); + } else if (splits[i] == mlir::daphne::VectorSplit::COLS) { + len = std::max(len, inputs[i]->getNumCols()); + mem_required += inputs[i]->getNumItems() * sizeof(typename DT::VT); } } return std::make_pair(len, mem_required); diff --git a/src/runtime/local/vectorized/MTWrapper_dense.cpp b/src/runtime/local/vectorized/MTWrapper_dense.cpp index ffed73054..08382fc60 100644 --- a/src/runtime/local/vectorized/MTWrapper_dense.cpp +++ b/src/runtime/local/vectorized/MTWrapper_dense.cpp @@ -36,7 +36,13 @@ template std::unique_ptr q = std::make_unique(len); std::vector tmp_q{q.get()}; - auto batchSize8M = std::max(100ul, static_cast(std::ceil(8388608 / row_mem))); + + auto batchSize8M = ctx->config.batchSize; + if (batchSize8M == 0) { + batchSize8M = std::max(100ul, static_cast(std::ceil(8388608 / row_mem))); + } + //llvm::outs() << "si: " << batchSize8M << "\n"; + this->initCPPWorkers(tmp_q, batchSize8M, verbose, 1, 0, false); #ifdef USE_CUDA @@ -109,7 +115,13 @@ template } } - auto batchSize8M = std::max(100ul, static_cast(std::ceil(8388608 / row_mem))); + + auto batchSize8M = ctx->config.batchSize; + if (batchSize8M == 0) { + batchSize8M = std::max(100ul, static_cast(std::ceil(8388608 / row_mem))); + } + //llvm::outs() << "mu: " << batchSize8M << "\n"; + this->initCPPWorkers(qvector, batchSize8M, verbose, this->_numQueues, this->_queueMode, ctx->getUserConfig().pinWorkers); diff --git a/src/runtime/local/vectorized/Tasks.cpp b/src/runtime/local/vectorized/Tasks.cpp index 8291f2623..caebb3464 100644 --- a/src/runtime/local/vectorized/Tasks.cpp +++ b/src/runtime/local/vectorized/Tasks.cpp @@ -15,7 +15,18 @@ */ #include "runtime/local/vectorized/Tasks.h" +#include "ir/daphneir/Daphne.h" +#include "runtime/local/datastructures/DenseMatrix.h" +#include 
"runtime/local/kernels/BinaryOpCode.h" #include "runtime/local/kernels/EwBinaryMat.h" +#include +#include +#include +#include + +#ifdef USE_PAPI +#include +#endif template void CompiledPipelineTask>::execute(uint32_t fid, uint32_t batchSize) { // local add aggregation to minimize locking @@ -24,15 +35,15 @@ template void CompiledPipelineTask>::execute(uint3 std::vector **> outputs; for (auto &lres : localResults) outputs.push_back(&lres); - for (uint64_t r = _data._rl; r < _data._ru; r += batchSize) { + for (uint64_t d = _data._dl; d < _data._du; d += batchSize) { // create zero-copy views of inputs/outputs - uint64_t r2 = std::min(r + batchSize, _data._ru); + uint64_t d2 = std::min(d + batchSize, _data._du); - auto linputs = this->createFuncInputs(r, r2); + auto linputs = this->createFuncInputs(d, d2); // execute function on given data binding (batch size) _data._funcs[fid](outputs.data(), linputs.data(), _data._ctx); - accumulateOutputs(localResults, localAddRes, r, r2); + accumulateOutputs(localResults, localAddRes, d, d2); // cleanup for (auto &localResult : localResults) @@ -47,75 +58,119 @@ template void CompiledPipelineTask>::execute(uint3 } for (size_t o = 0; o < _data._numOutputs; ++o) { - if (_data._combines[o] == VectorCombine::ADD) { - auto &result = (*_res[o]); - _resLock.lock(); - if (result == nullptr) { - result = localAddRes[o]; - _resLock.unlock(); - } else { - ewBinaryMat(BinaryOpCode::ADD, result, result, localAddRes[o], _data._ctx); - _resLock.unlock(); - // cleanup - DataObjectFactory::destroy(localAddRes[o]); + + if (_data._combines[o] == VectorCombine::ROWS || _data._combines[o] == VectorCombine::COLS) + continue; + + auto &result = (*_res[o]); + _resLock.lock(); + if (result == nullptr) { + result = localAddRes[o]; + _resLock.unlock(); + } else { + switch (_data._combines[o]) { + case VectorCombine::ADD: + ewBinaryMat(BinaryOpCode::ADD, result, result, localAddRes[o], _data._ctx); + break; + case VectorCombine::MIN: + 
ewBinaryMat(BinaryOpCode::MIN, result, result, localAddRes[o], _data._ctx); + break; + case VectorCombine::MAX: + ewBinaryMat(BinaryOpCode::MAX, result, result, localAddRes[o], _data._ctx); + break; + default: + throw std::runtime_error("not implemented"); + break; } + _resLock.unlock(); + // cleanup + DataObjectFactory::destroy(localAddRes[o]); } } } -template uint64_t CompiledPipelineTask>::getTaskSize() { return _data._ru - _data._rl; } +template uint64_t CompiledPipelineTask>::getTaskSize() { return _data._du - _data._dl; } template void CompiledPipelineTask>::accumulateOutputs(std::vector *> &localResults, std::vector *> &localAddRes, - uint64_t rowStart, uint64_t rowEnd) { + uint64_t dimStart, uint64_t dimEnd) { // TODO: in-place computation via better compiled pipelines // TODO: multi-return for (auto o = 0u; o < _data._numOutputs; ++o) { auto &result = (*_res[o]); switch (_data._combines[o]) { - case VectorCombine::ROWS: { - auto slice = result->sliceRow(rowStart - _data._offset, rowEnd - _data._offset); - // TODO It's probably more efficient to memcpy than to get/set. - // But eventually, we don't want to copy at all. 
- for (auto i = 0u; i < slice->getNumRows(); ++i) { - for (auto j = 0u; j < slice->getNumCols(); ++j) { - slice->set(i, j, localResults[o]->get(i, j)); + case VectorCombine::ROWS: { + auto slice = result->sliceRow(dimStart - _data._offset, dimEnd - _data._offset); + + //PAPI_hl_region_begin("fixme_rows"); + VT *sliceValues = slice->getValues(); + VT *localResultsValues = localResults[o]->getValues(); + for (auto i = 0u; i < slice->getNumRows(); ++i) { + for (auto j = 0u; j < slice->getNumCols(); ++j) { + sliceValues[i * slice->getRowSkip() + j] = + localResultsValues[i * localResults[o]->getRowSkip() + j]; + } } + //PAPI_hl_region_end("fixme_rows"); + + DataObjectFactory::destroy(slice); + break; } - DataObjectFactory::destroy(slice); - break; - } - case VectorCombine::COLS: { - auto slice = result->sliceCol(rowStart - _data._offset, rowEnd - _data._offset); - // TODO It's probably more efficient to memcpy than to get/set. - // But eventually, we don't want to copy at all. - for (auto i = 0u; i < slice->getNumRows(); ++i) { - for (auto j = 0u; j < slice->getNumCols(); ++j) { - slice->set(i, j, localResults[o]->get(i, j)); + case VectorCombine::COLS: { + + auto slice = result->sliceCol(dimStart - _data._offset, dimEnd - _data._offset); + + //PAPI_hl_region_begin("fixme_cols"); + VT *sliceValues = slice->getValues(); + VT *localResultsValues = localResults[o]->getValues(); + for (auto i = 0u; i < slice->getNumRows(); ++i) { + for (auto j = 0u; j < slice->getNumCols(); ++j) { + sliceValues[i * slice->getRowSkip() + j] = + localResultsValues[i * localResults[o]->getRowSkip() + j]; + } } + //PAPI_hl_region_end("fixme_cols"); + + DataObjectFactory::destroy(slice); + break; } - DataObjectFactory::destroy(slice); - break; - } - case VectorCombine::ADD: { - if (localAddRes[o] == nullptr) { - // take lres and reset it to nullptr - localAddRes[o] = localResults[o]; - localResults[o] = nullptr; - } else { - ewBinaryMat(BinaryOpCode::ADD, localAddRes[o], localAddRes[o], 
localResults[o], nullptr); + case VectorCombine::ADD: { + accumulateAggregate(localAddRes[o], localResults[0], BinaryOpCode::ADD); + break; + } + case VectorCombine::MAX: { + accumulateAggregate(localAddRes[o], localResults[0], BinaryOpCode::MAX); + break; + } + case VectorCombine::MIN: { + accumulateAggregate(localAddRes[o], localResults[0], BinaryOpCode::MIN); + break; + } + default: { + throw std::runtime_error(("VectorCombine case `" + + std::to_string(static_cast(_data._combines[o])) + "` not supported")); } - break; - } - default: { - throw std::runtime_error(("VectorCombine case `" + - std::to_string(static_cast(_data._combines[o])) + "` not supported")); - } } } } +template +void CompiledPipelineTask>::accumulateAggregate(DenseMatrix*& localAddRes, + DenseMatrix*& localResult, + BinaryOpCode opCode) { + if (localAddRes == nullptr) { + // take lres and reset it to nullptr + localAddRes = localResult; + localResult = nullptr; + } else { + ewBinaryMat(opCode, localAddRes, localAddRes, localResult, nullptr); + } +} + + +//----------------------------------------------------------------------------- + template void CompiledPipelineTask>::execute(uint32_t fid, uint32_t batchSize) { std::vector localResNumRows(_data._numOutputs); std::vector localResNumCols(_data._numOutputs); @@ -125,7 +180,7 @@ template void CompiledPipelineTask>::execute(uint32_ if (_data._wholeResultCols[i] == -1) throw std::runtime_error("TODO: CompiledPipeLineTask (CSRMatrix) Rows " "_data._wholeResultCols[i] == -1"); - localResNumRows[i] = _data._ru - _data._rl; + localResNumRows[i] = _data._du - _data._dl; localResNumCols[i] = _data._wholeResultCols[i]; break; } @@ -134,7 +189,7 @@ template void CompiledPipelineTask>::execute(uint32_ throw std::runtime_error("TODO: CompiledPipeLineTask (CSRMatrix) Cols " "_data._wholeResultRows[i] == -1"); localResNumRows[i] = _data._wholeResultRows[i]; - localResNumCols[i] = _data._ru - _data._rl; + localResNumCols[i] = _data._du - _data._dl; break; } 
default: @@ -148,11 +203,11 @@ template void CompiledPipelineTask>::execute(uint32_ new VectorizedDataSink>(_data._combines[i], localResNumRows[i], localResNumCols[i]); std::vector *> lres(_data._numOutputs, nullptr); - for (uint64_t r = _data._rl; r < _data._ru; r += batchSize) { + for (uint64_t d = _data._dl; d < _data._du; d += batchSize) { // create zero-copy views of inputs/outputs - uint64_t r2 = std::min(r + batchSize, _data._ru); + uint64_t d2 = std::min(d + batchSize, _data._du); - auto linputs = this->createFuncInputs(r, r2); + auto linputs = this->createFuncInputs(d, d2); CSRMatrix ***outputs = new CSRMatrix **[_data._numOutputs]; for (size_t i = 0; i < _data._numOutputs; i++) outputs[i] = &(lres[i]); @@ -160,7 +215,7 @@ template void CompiledPipelineTask>::execute(uint32_ _data._funcs[fid](outputs, linputs.data(), _data._ctx); delete[] outputs; for (size_t i = 0; i < _data._numOutputs; i++) - localSinks[i]->add(lres[i], r - _data._rl, false); + localSinks[i]->add(lres[i], d - _data._dl, false); // cleanup for (size_t i = 0; i < _data._numOutputs; i++) @@ -171,12 +226,12 @@ template void CompiledPipelineTask>::execute(uint32_ // here. 
} for (size_t i = 0; i < _data._numOutputs; i++) { - _resultSinks[i]->add(localSinks[i]->consume(), _data._rl); + _resultSinks[i]->add(localSinks[i]->consume(), _data._dl); delete localSinks[i]; } } -template uint64_t CompiledPipelineTask>::getTaskSize() { return _data._ru - _data._rl; } +template uint64_t CompiledPipelineTask>::getTaskSize() { return _data._du - _data._dl; } template class CompiledPipelineTask>; template class CompiledPipelineTask>; diff --git a/src/runtime/local/vectorized/Tasks.h b/src/runtime/local/vectorized/Tasks.h index e7b4f783f..83f80fe3c 100644 --- a/src/runtime/local/vectorized/Tasks.h +++ b/src/runtime/local/vectorized/Tasks.h @@ -57,17 +57,17 @@ template struct CompiledPipelineTaskData { const int64_t *_outCols; const VectorSplit *_splits; const VectorCombine *_combines; - const uint64_t _rl; // row lower index - const uint64_t _ru; // row upper index + const uint64_t _dl; // dim lower index + const uint64_t _du; // dim upper index const int64_t *_wholeResultRows; // number of rows of the complete result const int64_t *_wholeResultCols; // number of cols of the complete result const uint64_t _offset; DCTX(_ctx); - [[maybe_unused]] CompiledPipelineTaskData
withDifferentRange(uint64_t newRl, uint64_t newRu) { + [[maybe_unused]] CompiledPipelineTaskData
withDifferentRange(uint64_t newDl, uint64_t newDu) { CompiledPipelineTaskData
flatCopy = *this; - flatCopy._rl = newRl; - flatCopy._ru = newRu; + flatCopy._dl = newDl; + flatCopy._du = newDu; return flatCopy; } }; @@ -83,10 +83,12 @@ template class CompiledPipelineTaskBase : public Task { protected: bool isBroadcast(mlir::daphne::VectorSplit splitMethod, Structure *input) { - return splitMethod == VectorSplit::NONE || (splitMethod == VectorSplit::ROWS && input->getNumRows() == 1); + return splitMethod == VectorSplit::NONE || + (splitMethod == VectorSplit::ROWS && input->getNumRows() == 1) || + (splitMethod == VectorSplit::COLS && input->getNumCols() == 1); } - std::vector createFuncInputs(uint64_t rowStart, uint64_t rowEnd) { + std::vector createFuncInputs(uint64_t dimStart, uint64_t dimEnd) { std::vector linputs; for (auto i = 0u; i < _data._numInputs; i++) { if (isBroadcast(_data._splits[i], _data._inputs[i])) { @@ -101,8 +103,12 @@ template class CompiledPipelineTaskBase : public Task { // alternative. _data._inputs[i]->increaseRefCounter(); } else if (VectorSplit::ROWS == _data._splits[i]) { - linputs.push_back(_data._inputs[i]->sliceRow(rowStart, rowEnd)); - } else { + linputs.push_back(_data._inputs[i]->sliceRow(dimStart, dimEnd)); + } + else if(VectorSplit::COLS == _data._splits[i]) { + linputs.push_back(_data._inputs[i]->sliceCol(dimStart, dimEnd)); + } + else { llvm_unreachable("Not all vector splits handled"); } } @@ -126,7 +132,9 @@ template class CompiledPipelineTask> : public Comp private: void accumulateOutputs(std::vector *> &localResults, std::vector *> &localAddRes, - uint64_t rowStart, uint64_t rowEnd); + uint64_t dimStart, uint64_t dimEnd); + void accumulateAggregate(DenseMatrix*& localAddRes, DenseMatrix* &localResult, + BinaryOpCode opCode); }; template class CompiledPipelineTask> : public CompiledPipelineTaskBase> {