//===- LoopsToGPUPass.cpp - Convert a loop nest to a GPU kernel -----------===//
//
// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h"
#include "mlir/Conversion/LoopsToGPU/LoopsToGPU.h"
#include "mlir/Dialect/AffineOps/AffineOps.h"
#include "mlir/Dialect/LoopOps/LoopOps.h"
#include "mlir/Dialect/StandardOps/Ops.h"
#include "mlir/Pass/Pass.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/CommandLine.h"
#define PASS_NAME "convert-loops-to-gpu"
#define LOOPOP_TO_GPU_PASS_NAME "convert-loop-op-to-gpu"
using namespace mlir;
using namespace mlir::loop;

static llvm::cl::OptionCategory clOptionsCategory(PASS_NAME " options");

static llvm::cl::opt<unsigned>
    clNumBlockDims("gpu-block-dims",
                   llvm::cl::desc("Number of GPU block dimensions for mapping"),
                   llvm::cl::cat(clOptionsCategory), llvm::cl::init(1u));
static llvm::cl::opt<unsigned> clNumThreadDims(
    "gpu-thread-dims",
    llvm::cl::desc("Number of GPU thread dimensions for mapping"),
    llvm::cl::cat(clOptionsCategory), llvm::cl::init(1u));

static llvm::cl::OptionCategory clLoopOpToGPUCategory(LOOPOP_TO_GPU_PASS_NAME
                                                      " options");
static llvm::cl::list<unsigned>
    clNumWorkGroups("gpu-num-workgroups",
                    llvm::cl::desc("Num workgroups in the GPU launch"),
                    llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated,
                    llvm::cl::cat(clLoopOpToGPUCategory));
static llvm::cl::list<unsigned>
    clWorkGroupSize("gpu-workgroup-size",
                    llvm::cl::desc("Workgroup Size in the GPU launch"),
                    llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated,
                    llvm::cl::cat(clLoopOpToGPUCategory));
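
// Example invocations (illustrative; assumes these passes are linked into
// mlir-opt via the registrations at the bottom of this file):
//   mlir-opt -convert-loops-to-gpu -gpu-block-dims=1 -gpu-thread-dims=1 in.mlir
//   mlir-opt -convert-loop-op-to-gpu -gpu-num-workgroups=2,2 \
//            -gpu-workgroup-size=32,4 in.mlir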

namespace {
// A pass that traverses top-level loops in the function and converts them to
// GPU launch operations. Nested launches are not allowed, so this does not
// walk the function recursively to avoid considering nested loops.
struct ForLoopMapper : public FunctionPass<ForLoopMapper> {
  ForLoopMapper(unsigned numBlockDims, unsigned numThreadDims)
      : numBlockDims(numBlockDims), numThreadDims(numThreadDims) {}

  void runOnFunction() override {
    for (Block &block : getFunction())
      for (Operation &op : llvm::make_early_inc_range(block)) {
        if (auto forOp = dyn_cast<AffineForOp>(&op)) {
          if (failed(convertAffineLoopNestToGPULaunch(forOp, numBlockDims,
                                                      numThreadDims)))
            signalPassFailure();
        } else if (auto forOp = dyn_cast<ForOp>(&op)) {
          if (failed(convertLoopNestToGPULaunch(forOp, numBlockDims,
                                                numThreadDims)))
            signalPassFailure();
        }
      }
  }

  unsigned numBlockDims;
  unsigned numThreadDims;
};
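
// For reference, with numBlockDims = 1 and numThreadDims = 1 a perfect 2-d
// nest such as
//   loop.for %i = %lb0 to %ub0 step %s0 {
//     loop.for %j = %lb1 to %ub1 step %s1 { ... }
//   }
// becomes a single gpu.launch in which %i is recovered from the block id and
// %j from the thread id. This is an illustrative sketch of the conversion
// performed by convertLoopNestToGPULaunch, not its verbatim output.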

// A pass that traverses top-level loops in the function and converts them to
// GPU launch operations. The top-level loops themselves do not have to be
// perfectly nested. The only requirement is that there be as many perfectly
// nested loops as the size of `numWorkGroups`. Within these, any loop nest
// has to be perfectly nested up to depth equal to the size of `workGroupSize`.
struct ImperfectlyNestedForLoopMapper
    : public FunctionPass<ImperfectlyNestedForLoopMapper> {
  ImperfectlyNestedForLoopMapper(ArrayRef<int64_t> numWorkGroups,
                                 ArrayRef<int64_t> workGroupSize)
      : numWorkGroups(numWorkGroups.begin(), numWorkGroups.end()),
        workGroupSize(workGroupSize.begin(), workGroupSize.end()) {}

  void runOnFunction() override {
    // Insert the num work groups and workgroup sizes as constant values. This
    // pass is only used for testing.
    FuncOp funcOp = getFunction();
    OpBuilder builder(funcOp.getOperation()->getRegion(0));
    SmallVector<Value, 3> numWorkGroupsVal, workGroupSizeVal;
    for (auto val : numWorkGroups) {
      auto constOp = builder.create<ConstantOp>(
          funcOp.getLoc(), builder.getIntegerAttr(builder.getIndexType(), val));
      numWorkGroupsVal.push_back(constOp);
    }
    for (auto val : workGroupSize) {
      auto constOp = builder.create<ConstantOp>(
          funcOp.getLoc(), builder.getIntegerAttr(builder.getIndexType(), val));
      workGroupSizeVal.push_back(constOp);
    }
    for (Block &block : getFunction()) {
      for (Operation &op : llvm::make_early_inc_range(block)) {
        if (auto forOp = dyn_cast<ForOp>(&op)) {
          if (failed(convertLoopToGPULaunch(forOp, numWorkGroupsVal,
                                            workGroupSizeVal))) {
            return signalPassFailure();
          }
        }
      }
    }
  }

  SmallVector<int64_t, 3> numWorkGroups;
  SmallVector<int64_t, 3> workGroupSize;
};
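
// For reference, with one entry in `numWorkGroups` and one in `workGroupSize`
// this pass accepts an imperfect nest of the form
//   loop.for %i ... {          // mapped to workgroups
//     <other ops>
//     loop.for %j ... { ... }  // mapped to workitems within a workgroup
//     <other ops>
//   }
// i.e. only the outermost `numWorkGroups.size()` loops and the inner
// `workGroupSize.size()` loop nests must be perfectly nested. This sketch is
// illustrative, derived from the constraint documented above.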
} // namespace

std::unique_ptr<OpPassBase<FuncOp>>
mlir::createSimpleLoopsToGPUPass(unsigned numBlockDims,
                                 unsigned numThreadDims) {
  return std::make_unique<ForLoopMapper>(numBlockDims, numThreadDims);
}

std::unique_ptr<OpPassBase<FuncOp>>
mlir::createLoopToGPUPass(ArrayRef<int64_t> numWorkGroups,
                          ArrayRef<int64_t> workGroupSize) {
  return std::make_unique<ImperfectlyNestedForLoopMapper>(numWorkGroups,
                                                          workGroupSize);
}
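
// Example client usage of the factory functions above (an illustrative
// sketch; the exact pass manager API depends on the MLIR revision in use):
//   PassManager pm(module.getContext());
//   pm.addPass(createSimpleLoopsToGPUPass(/*numBlockDims=*/1,
//                                         /*numThreadDims=*/1));
//   if (failed(pm.run(module)))
//     /* handle the error */;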

static PassRegistration<ForLoopMapper>
    registration(PASS_NAME, "Convert top-level loops to GPU kernels", [] {
      return std::make_unique<ForLoopMapper>(clNumBlockDims.getValue(),
                                             clNumThreadDims.getValue());
    });

static PassRegistration<ImperfectlyNestedForLoopMapper> loopOpToGPU(
    LOOPOP_TO_GPU_PASS_NAME, "Convert top-level loop::ForOp to GPU kernels",
    [] {
      SmallVector<int64_t, 3> numWorkGroups, workGroupSize;
      numWorkGroups.assign(clNumWorkGroups.begin(), clNumWorkGroups.end());
      workGroupSize.assign(clWorkGroupSize.begin(), clWorkGroupSize.end());
      return std::make_unique<ImperfectlyNestedForLoopMapper>(numWorkGroups,
                                                              workGroupSize);
    });