data_sharing.cu
11.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
//===----- data_sharing.cu - OpenMP GPU data sharing ------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of data sharing environments.
//
//===----------------------------------------------------------------------===//
#include "common/omptarget.h"
#include "target_impl.h"
// Report whether the calling thread is the master thread. The answer is always
// false when isSPMDExecutionMode is set; otherwise it is true only for the
// thread whose in-block id equals GetMasterThreadID().
INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
  if (isSPMDExecutionMode)
    return false;
  return GetMasterThreadID() == GetThreadIdInBlock();
}
////////////////////////////////////////////////////////////////////////////////
// Runtime functions for trunk data sharing scheme.
////////////////////////////////////////////////////////////////////////////////
// Reset the data sharing stack of every warp: for each warp id below
// DS_Max_Warp_Number, make both the current-slot pointer and the stack pointer
// refer to the slot the team descriptor preallocates for that warp, so the
// stack starts at the first byte of that slot's data area.
INLINE static void data_sharing_init_stack_common() {
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");

  omptarget_nvptx_TeamDescr &Team =
      omptarget_nvptx_threadPrivateContext->TeamContext();

  int Warp = 0;
  while (Warp < DS_Max_Warp_Number) {
    __kmpc_data_sharing_slot *Root = Team.GetPreallocatedSlotAddr(Warp);
    DataSharingState.SlotPtr[Warp] = Root;
    DataSharingState.StackPtr[Warp] = (void *)&Root->Data[0];
    ++Warp;
  }
}
// Initialize the data sharing data structures. This function needs to be
// called once at the beginning of a data sharing context (coincides with the
// kernel initialization). It is called only by the MASTER thread of each team
// in non-SPMD mode.
//
// Besides resetting the per-warp stacks it also calls Init() on
// omptarget_nvptx_globalArgs, the object the sharing-variables entry points in
// this file use to hand out the argument list.
EXTERN void __kmpc_data_sharing_init_stack() {
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
  // Point every warp's stack pointer at its statically preallocated slot.
  // NOTE(review): earlier comments state these slots live in shared memory and
  // are 256 bytes each; neither is visible from this file -- confirm against
  // the slot declaration.
  data_sharing_init_stack_common();
  omptarget_nvptx_globalArgs.Init();
}
// SPMD-mode counterpart of the data sharing stack initialization. It is meant
// to be called once at the beginning of a data sharing context (i.e. at kernel
// initialization) by the threads of the team.
//
// Only the thread with in-block id 0 resets the per-warp stacks to their
// statically preallocated slots; every caller then executes
// __kmpc_impl_threadfence_block().
// NOTE(review): that call looks like a memory fence rather than a barrier, so
// presumably callers synchronize before other threads use the stack -- confirm.
EXTERN void __kmpc_data_sharing_init_stack_spmd() {
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");

  const bool IsFirstThread = (GetThreadIdInBlock() == 0);
  if (IsFirstThread) {
    data_sharing_init_stack_common();
  }

  __kmpc_impl_threadfence_block();
}
// Reserve PushSize bytes (rounded up to a multiple of 8) on the calling warp's
// data sharing stack and return the start address of the new frame.
//
// Only the thread whose in-block id is a multiple of WARPSIZE updates the
// per-warp bookkeeping in DataSharingState (SlotPtr, StackPtr, FramePtr). The
// frame pointer it computes is then broadcast from lane 0 using the active
// mask captured on entry, so the participating lanes return the same address.
//
// If the request does not fit in the remainder of the current slot, a new slot
// of at least DS_Worker_Warp_Slot_Size data bytes is obtained from SafeMalloc
// and linked after the current one.
INLINE static void* data_sharing_push_stack_common(size_t PushSize) {
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
  // Only warp active master threads manage the stack.
  bool IsWarpMaster = (GetThreadIdInBlock() % WARPSIZE) == 0;
  // Add worst-case padding to PushSize so that future stack allocations are
  // correctly aligned.
  const size_t Alignment = 8;
  PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment;
  // Frame pointer must be visible to all workers in the same warp.
  const unsigned WID = GetWarpId();
  void *FrameP = 0;
  // Capture the active mask before the divergent branch below; it is the mask
  // handed to the shuffles that broadcast the frame pointer afterwards.
  __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
  if (IsWarpMaster) {
    // SlotP will point to either the shared memory slot or an existing
    // global memory slot. Both SlotP and StackP are references, so the
    // assignments below update DataSharingState in place.
    __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
    void *&StackP = DataSharingState.StackPtr[WID];
    // Check if we have room for the data in the current slot.
    const uintptr_t StartAddress = (uintptr_t)StackP;
    const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd;
    const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)PushSize;
    // If we requested more data than there is room for in the rest of the
    // slot, a new slot is created. (This code never re-uses an existing next
    // slot; it always allocates.)
    if (EndAddress < RequestedEndAddress) {
      __kmpc_data_sharing_slot *NewSlot = 0;
      size_t NewSize = PushSize;
      // Allocate at least the default size for each type of slot.
      // Master is a special case and even though there is only one thread,
      // it can share more things with the workers. For uniformity, it uses
      // the full size of a worker warp slot.
      size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size;
      if (DefaultSlotSize > NewSize)
        NewSize = DefaultSlotSize;
      NewSlot = (__kmpc_data_sharing_slot *) SafeMalloc(
          sizeof(__kmpc_data_sharing_slot) + NewSize,
          "Global memory slot allocation.");
      NewSlot->Next = 0;
      NewSlot->Prev = SlotP;
      // Remember where the previous slot's stack stood so that popping this
      // slot can restore it.
      NewSlot->PrevSlotStackPtr = StackP;
      NewSlot->DataEnd = &NewSlot->Data[0] + NewSize;
      // Make previous slot point to the newly allocated slot.
      SlotP->Next = NewSlot;
      // The current slot becomes the new slot.
      SlotP = NewSlot;
      // The stack pointer always points to the next free stack frame.
      StackP = &NewSlot->Data[0] + PushSize;
      // The frame pointer always points to the beginning of the frame.
      FrameP = DataSharingState.FramePtr[WID] = &NewSlot->Data[0];
    } else {
      // Add the data chunk to the current slot. The frame pointer is set to
      // point to the start of the new frame held in StackP.
      FrameP = DataSharingState.FramePtr[WID] = StackP;
      // Advance the stack pointer to the end of the new frame.
      StackP = (void *)RequestedEndAddress;
    }
  }
  // Get address from lane 0. The pointer is exchanged in int-sized pieces: one
  // shuffle for the first piece, plus a second one when pointers are 8 bytes.
  // NOTE(review): this assumes lane 0 is part of CurActive -- confirm with
  // callers.
  int *FP = (int *)&FrameP;
  FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], 0);
  if (sizeof(FrameP) == 8)
    FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], 0);
  return FrameP;
}
// Push one frame of DataSize bytes on the calling warp's data sharing stack
// and return the frame start as produced by data_sharing_push_stack_common.
// Unlike __kmpc_data_sharing_push_stack, no per-lane scaling or offset is
// applied. UseSharedMemory is accepted but not consulted here.
EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize,
                                                      int16_t UseSharedMemory) {
  void *Frame = data_sharing_push_stack_common(DataSize);
  return Frame;
}
// Reserve DataSize bytes of data sharing stack for the calling thread and
// return the address of that thread's chunk.
//
// When the runtime is uninitialized, or the caller is the master thread of a
// non-SPMD team, a frame of exactly DataSize bytes is pushed. Otherwise the
// caller acts on behalf of its warp and a frame holding one DataSize chunk per
// warp lane (WARPSIZE * DataSize bytes) is pushed. In both cases the returned
// address is the frame start plus GetLaneId() * DataSize.
//
// UseSharedMemory is accepted but not consulted by this implementation.
// TODO: change WARPSIZE to the number of active threads in the warp.
EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize,
                                            int16_t UseSharedMemory) {
  // Start from the per-warp footprint and shrink it for a lone caller.
  size_t PushSize = WARPSIZE * DataSize;
  if (isRuntimeUninitialized() || IsMasterThread(isSPMDMode()))
    PushSize = DataSize;

  // Base address of the frame shared by the warp.
  void *FrameBase = data_sharing_push_stack_common(PushSize);

  // Each lane owns the chunk at its own lane index inside the frame.
  const uintptr_t LaneOffset = (uintptr_t)(GetLaneId() * DataSize);
  return (void *)((uintptr_t)FrameBase + LaneOffset);
}
// Pop the frame that starts at FrameStart from the calling warp's data sharing
// stack.
//
// After a block-level fence, the thread whose in-block id is a multiple of
// WARPSIZE moves the warp's stack pointer back to FrameStart. If that leaves
// the current slot empty and the slot has a predecessor, the stack pointer is
// restored to where it stood in the predecessor, the predecessor becomes the
// current slot, and the emptied slot is released with SafeFree.
EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");

  __kmpc_impl_threadfence_block();

  // Every other thread has no bookkeeping to do.
  const bool IsWarpMaster = (GetThreadIdInBlock() % WARPSIZE) == 0;
  if (!IsWarpMaster)
    return;

  const unsigned Warp = GetWarpId();
  // References, so the updates below land directly in DataSharingState.
  __kmpc_data_sharing_slot *&CurSlot = DataSharingState.SlotPtr[Warp];
  void *&Top = DataSharingState.StackPtr[Warp];

  // Pop the frame.
  Top = FrameStart;

  // Keep the slot if it still holds data or if it is the first slot in the
  // chain (no predecessor to fall back to).
  if (Top != &CurSlot->Data[0] || !CurSlot->Prev)
    return;

  // Restore the predecessor's stack position before unlinking the slot.
  Top = CurSlot->PrevSlotStackPtr;
  CurSlot = CurSlot->Prev;
  SafeFree(CurSlot->Next, "Free slot.");
  CurSlot->Next = 0;
}
// Open a data sharing context: make sure the list of references to shared
// variables can hold nArgs entries and hand that list back through
// GlobalArgs. The list is subsequently passed to one or more threads.
// In L0 data sharing this is called by the master thread.
// In L1 data sharing this is called by the active warp master thread.
EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) {
  // Size the list first; the pointer is fetched only afterwards.
  omptarget_nvptx_globalArgs.EnsureSize(nArgs);

  void **ArgList = omptarget_nvptx_globalArgs.GetArgs();
  *GlobalArgs = ArgList;
}
// End a data sharing context. There is no need to keep the list of references
// to shared variables because the context in which those variables were shared
// has now ended. This is intended to clean up the list of references only,
// without affecting the actual global storage of the variables.
// In L0 data sharing this is called by the master thread.
// In L1 data sharing this is called by the active warp master thread.
EXTERN void __kmpc_end_sharing_variables() {
  // Counterpart of the Init() performed by __kmpc_data_sharing_init_stack.
  omptarget_nvptx_globalArgs.DeInit();
}
// Store in *GlobalArgs the current list of references to the globalized
// variables. This is how workers obtain those references; the list entries are
// passed to the outlined parallel function in the same order.
// Called by all workers.
EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
  void **ArgList = omptarget_nvptx_globalArgs.GetArgs();
  *GlobalArgs = ArgList;
}
// Obtain the team's frame of statically allocated memory. That memory is
// allocated by the compiler and used to implement globalization of variables
// in target, teams and distribute regions.
//
//  - is_shared set: *frame is simply set to buf.
//  - non-SPMD mode: the caller (asserted to be the master thread) acquires the
//    frame from the simple memory manager and then issues
//    __kmpc_impl_threadfence().
//  - SPMD mode: only the thread with in-block id 0 acquires the frame and
//    writes *frame; all callers then meet at __kmpc_impl_syncthreads().
EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
                                          const void *buf, size_t size,
                                          int16_t is_shared,
                                          const void **frame) {
  if (is_shared) {
    *frame = buf;
    return;
  }

  if (!isSPMDExecutionMode) {
    ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
            "Must be called only in the target master thread.");
    *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
    __kmpc_impl_threadfence();
    return;
  }

  // SPMD: a single thread acquires, then the whole team synchronizes.
  const bool IsFirstThread = (GetThreadIdInBlock() == 0);
  if (IsFirstThread)
    *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
  __kmpc_impl_syncthreads();
}
// Give back the frame obtained with __kmpc_get_team_static_memory.
//
//  - is_shared set: nothing to do.
//  - non-SPMD mode: the caller issues __kmpc_impl_threadfence(), is asserted
//    to be the master thread, and releases the frame.
//  - SPMD mode: all callers meet at __kmpc_impl_syncthreads() first, then only
//    the thread with in-block id 0 releases the frame.
EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
                                              int16_t is_shared) {
  if (is_shared)
    return;

  if (!isSPMDExecutionMode) {
    __kmpc_impl_threadfence();
    ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
            "Must be called only in the target master thread.");
    omptarget_nvptx_simpleMemoryManager.Release();
    return;
  }

  // SPMD: synchronize the team before a single thread releases.
  __kmpc_impl_syncthreads();
  const bool IsFirstThread = (GetThreadIdInBlock() == 0);
  if (IsFirstThread)
    omptarget_nvptx_simpleMemoryManager.Release();
}