gpu.h 13.8 KB
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459
#ifndef _GPU_H
#define _GPU_H

#include <isl/ast.h>
#include <isl/id.h>
#include <isl/id_to_ast_expr.h>

#include <pet.h>

#include "ppcg.h"
#include "ppcg_options.h"

/* An access to an outer array element or an iterator.
 * Accesses to iterators have an access relation that maps to an unnamed space.
 * An access may be both read and write.
 * If the access relation is empty, then the output dimension may
 * not be equal to the dimension of the corresponding array.
 */
struct gpu_stmt_access {
	/* Access reads elements */
	int read;
	/* Access writes elements */
	int write;
	/* All writes are definite writes. */
	int exact_write;
	/* Is a single, fixed element being accessed? */
	isl_bool fixed_element;
	/* The number of index expressions specified in the access. */
	int n_index;

	/* May access relation */
	isl_map *access;
	/* May access relation with as domain a mapping from iteration domain
	 * to a reference identifier.
	 */
	isl_map *tagged_access;
	/* The reference id of the corresponding pet_expr. */
	isl_id *ref_id;

	struct gpu_stmt_access *next;
};

/* A representation of a user statement.
 * "stmt" points to the corresponding pet statement.
 * "id" is the identifier of the instance set of the statement.
 * "accesses" is a linked list of accesses performed by the statement.
 * If the statement has been killed, i.e., if it will not be scheduled,
 * then this linked list may be empty even if the actual statement does
 * perform accesses.
 */
struct gpu_stmt {
	isl_id *id;
	struct pet_stmt *stmt;

	struct gpu_stmt_access *accesses;
};

/* Represents an outer array possibly accessed by a gpu_prog.
 */
struct gpu_array_info {
	/* The array data space. */
	isl_space *space;
	/* Element type. */
	char *type;
	/* Element size. */
	int size;
	/* Name of the array. */
	char *name;
	/* Declared extent of original array. */
	isl_set *declared_extent;
	/* AST expression for declared size of original array. */
	isl_ast_expr *declared_size;
	/* Extent of the array that needs to be copied. */
	isl_set *extent;
	/* Number of indices. */
	unsigned n_index;
	/* For each index, a bound on "extent" in that direction. */
	isl_multi_pw_aff *bound;
	/* The corresponding access AST expression, if the array needs
	 * to be allocated on the device.
	 */
	isl_ast_expr *bound_expr;

	/* All references to this array; point to elements of a linked list. */
	int n_ref;
	struct gpu_stmt_access **refs;

	/* Is this array accessed at all by the program? */
	int accessed;

	/* Is this a scalar that is read-only within the entire program? */
	int read_only_scalar;

	/* Are the elements of the array structures? */
	int has_compound_element;

	/* Are the elements only accessed through constant index expressions? */
	int only_fixed_element;

	/* Is the array local to the scop? */
	int local;
	/* Is the array local and should it be declared on the host? */
	int declare_local;

	/* Is the corresponding global device memory accessed in any way? */
	int global;

	/* Should the array be linearized? */
	int linearize;

	/* Order dependences on this array.
	 * Only used if live_range_reordering option is set.
	 * It is set to NULL otherwise.
	 */
	isl_union_map *dep_order;

    void *user;
};

/* Represents an outer array accessed by a ppcg_kernel, localized
 * to the context of this kernel.
 *
 * "array" points to the corresponding array in the gpu_prog.
 * The "n_group" "groups" are the reference groups associated to the array.
 * If "force_private" is set, then the array (in practice a scalar)
 * must be mapped to a register.
 * "global" is set if the global device memory corresponding
 * to this array is accessed by the kernel.
 * "bound" is equal to array->bound specialized to the current kernel.
 * "bound_expr" is the corresponding access AST expression.
 */
struct gpu_local_array_info {
	struct gpu_array_info *array;

	int n_group;
	struct gpu_array_ref_group **groups;

	int force_private;
	int global;

	unsigned n_index;
	isl_multi_pw_aff *bound;
	isl_ast_expr *bound_expr;
};

__isl_give isl_ast_expr *gpu_local_array_info_linearize_index(
	struct gpu_local_array_info *array, __isl_take isl_ast_expr *expr);

/* A sequence of "n" names of types.
 */
struct gpu_types {
	int n;
	char **name;
};

/* "read" and "write" contain the original access relations, possibly
 * involving member accesses.
 *
 * The elements of "array", as well as the ranges of "copy_in" and "copy_out"
 * only refer to the outer arrays of any possible member accesses.
 */
struct gpu_prog {
	isl_ctx *ctx;

	struct ppcg_scop *scop;

	/* Set of parameter values */
	isl_set *context;

	/* All potential read accesses in the entire program */
	isl_union_map *read;

	/* All potential write accesses in the entire program */
	isl_union_map *may_write;
	/* All definite write accesses in the entire program */
	isl_union_map *must_write;
	/* All tagged definite kills in the entire program */
	isl_union_map *tagged_must_kill;

	/* The set of inner array elements that may be preserved. */
	isl_union_set *may_persist;

	/* A mapping from all innermost arrays to their outer arrays. */
	isl_union_map *to_outer;
	/* A mapping from the outer arrays to all corresponding inner arrays. */
	isl_union_map *to_inner;
	/* A mapping from all intermediate arrays to their outer arrays,
	 * including an identity mapping from the anonymous 1D space to itself.
	 */
	isl_union_map *any_to_outer;

	/* Order dependences on non-scalars. */
	isl_union_map *array_order;

	/* Array of statements */
	int n_stmts;
	struct gpu_stmt *stmts;

	int n_array;
	struct gpu_array_info *array;
};

struct gpu_gen {
	isl_ctx *ctx;
	struct ppcg_options *options;

	/* Callback for printing of AST in appropriate format. */
	__isl_give isl_printer *(*print)(__isl_take isl_printer *p,
		struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
		struct gpu_types *types, void *user);
	void *print_user;

    isl_id_to_ast_expr *(*build_ast_expr)(void *stmt,
            isl_ast_build *build,
            isl_multi_pw_aff *(*fn_index)(
                __isl_take isl_multi_pw_aff *mpa, isl_id *id,
                void *user),
            void *user_index,
            isl_ast_expr *(*fn_expr)(isl_ast_expr *expr,
                isl_id *id, void *user),
        void *user_expr);

	struct gpu_prog *prog;
	/* The generated AST. */
	isl_ast_node *tree;

	/* The sequence of types for which a definition has been printed. */
	struct gpu_types types;

	/* User specified tile, grid and block sizes for each kernel */
	isl_union_map *sizes;

	/* Effectively used tile, grid and block sizes for each kernel */
	isl_union_map *used_sizes;

	/* Identifier of the next kernel. */
	int kernel_id;
};

enum ppcg_group_access_type {
	ppcg_access_global,
	ppcg_access_shared,
	ppcg_access_private
};

enum ppcg_kernel_stmt_type {
	ppcg_kernel_copy,
	ppcg_kernel_domain,
	ppcg_kernel_sync
};

/* Representation of special statements, in particular copy statements
 * and __syncthreads statements, inside a kernel.
 *
 * type represents the kind of statement
 *
 *
 * for ppcg_kernel_copy statements we have
 *
 * read is set if the statement should copy data from global memory
 * to shared memory or registers.
 *
 * index expresses an access to the array element that needs to be copied
 * local_index expresses the corresponding element in the tile
 *
 * array refers to the original array being copied
 * local_array is a pointer to the appropriate element in the "array"
 *	array of the ppcg_kernel to which this copy access belongs
 *
 *
 * for ppcg_kernel_domain statements we have
 *
 * stmt is the corresponding input statement
 *
 * n_access is the number of accesses in stmt
 * access is an array of local information about the accesses
 */
struct ppcg_kernel_stmt {
	enum ppcg_kernel_stmt_type type;

	union {
		struct {
			int read;
			isl_ast_expr *index;
			isl_ast_expr *local_index;
			struct gpu_array_info *array;
			struct gpu_local_array_info *local_array;
		} c;
		struct {
			struct gpu_stmt *stmt;
			isl_id_to_ast_expr *ref2expr;
		} d;
	} u;
};

/* Representation of a local variable in a kernel.
 */
struct ppcg_kernel_var {
	struct gpu_array_info *array;
	enum ppcg_group_access_type type;
	char *name;
	isl_vec *size;
};

/* Representation of a kernel.
 *
 * prog describes the original code from which the kernel is extracted.
 *
 * id is the sequence number of the kernel.
 *
 * block_ids contains the list of block identifiers for this kernel.
 * thread_ids contains the list of thread identifiers for this kernel.
 *
 * the first n_grid elements of grid_dim represent the specified size
 * of the grid.
 * the first n_block elements of block_dim represent the specified or
 * effective size of the block.
 * Note that in the input file, the sizes of the grid and the blocks
 * are specified in the order x, y, z, but internally, the sizes
 * are stored in reverse order, so that the last element always
 * refers to the x dimension.
 *
 * grid_size reflects the effective grid size.
 * grid_size_expr contains a corresponding access AST expression, built within
 * the context where the launch appears.
 *
 * context contains the values of the parameters and outer schedule dimensions
 * for which any statement instance in this kernel needs to be executed.
 *
 * n_sync is the number of synchronization operations that have
 * been introduced in the schedule tree corresponding to this kernel (so far).
 *
 * core contains the spaces of the statement domains that form
 * the core computation of the kernel.  It is used to navigate
 * the tree during the construction of the device part of the schedule
 * tree in gpu_create_kernel.
 *
 * expanded_domain contains the original statement instances,
 * i.e., those that appear in the domains of access relations,
 * that are involved in the kernel.
 * contraction maps those original statement instances to
 * the statement instances that are active at the point
 * in the schedule tree where the kernel is created.
 *
 * arrays is the set of possibly accessed outer array elements.
 *
 * space is the schedule space of the AST context.  That is, it represents
 * the loops of the generated host code containing the kernel launch.
 *
 * n_array is the total number of arrays in the input program and also
 * the number of element in the array array.
 * array contains information about each array that is local
 * to the current kernel.  If an array is not used in a kernel,
 * then the corresponding entry does not contain any information.
 *
 * any_force_private is set if any array in the kernel is marked force_private
 *
 * block_filter contains constraints on the domain elements in the kernel
 * that encode the mapping to block identifiers, where the block identifiers
 * are represented by "n_grid" parameters with as names the elements
 * of "block_ids".
 *
 * thread_filter contains constraints on the domain elements in the kernel
 * that encode the mapping to thread identifiers, where the thread identifiers
 * are represented by "n_block" parameters with as names the elements
 * of "thread_ids".
 *
 * copy_schedule corresponds to the schedule dimensions of
 * the (tiled) schedule for this kernel that have been taken into account
 * for computing private/shared memory tiles.
 * The domain corresponds to the original statement instances, i.e.,
 * those that appear in the leaves of the schedule tree.
 * copy_schedule_dim is the dimension of this schedule.
 *
 * sync_writes contains write references that require synchronization.
 * Each reference is represented by a universe set in a space [S[i,j] -> R[]]
 * with S[i,j] the statement instance space and R[] the array reference.
 */
struct ppcg_kernel {
	isl_ctx *ctx;
	struct ppcg_options *options;

	struct gpu_prog *prog;

	int id;

	isl_id_list *block_ids;
	isl_id_list *thread_ids;

	int n_grid;
	int n_block;
	int grid_dim[2];
	int block_dim[3];

	isl_multi_pw_aff *grid_size;
	isl_ast_expr *grid_size_expr;
	isl_set *context;

	int n_sync;
	isl_union_set *core;
	isl_union_set *arrays;

	isl_union_pw_multi_aff *contraction;
	isl_union_set *expanded_domain;

	isl_space *space;

	int n_array;
	struct gpu_local_array_info *array;

	int n_var;
	struct ppcg_kernel_var *var;

	int any_force_private;

	isl_union_set *block_filter;
	isl_union_set *thread_filter;
	isl_union_pw_multi_aff *copy_schedule;
	int copy_schedule_dim;

	isl_union_set *sync_writes;

	isl_ast_node *tree;
};

int gpu_array_is_scalar(struct gpu_array_info *array);
int gpu_array_is_read_only_scalar(struct gpu_array_info *array);
int gpu_array_requires_device_allocation(struct gpu_array_info *array);
__isl_give isl_set *gpu_array_positive_size_guard(struct gpu_array_info *array);
isl_bool gpu_array_can_be_private(struct gpu_array_info *array);

struct gpu_prog *gpu_prog_alloc(isl_ctx *ctx, struct ppcg_scop *scop);
void *gpu_prog_free(struct gpu_prog *prog);

int ppcg_kernel_requires_array_argument(struct ppcg_kernel *kernel, int i);

int generate_gpu(isl_ctx *ctx, const char *input, FILE *out,
	struct ppcg_options *options,
	__isl_give isl_printer *(*print)(__isl_take isl_printer *p,
		struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
		struct gpu_types *types, void *user), void *user);

__isl_give isl_schedule_node *gpu_create_kernel(struct gpu_gen *gen,
	__isl_take isl_schedule_node *node, int scale,
	__isl_keep isl_multi_val *sizes);

__isl_give isl_schedule *get_schedule(struct gpu_gen *gen);
int has_any_permutable_node(__isl_keep isl_schedule *schedule);
__isl_give isl_schedule *map_to_device(struct gpu_gen *gen,
                                       __isl_take isl_schedule *schedule,
                                      int to_from_device);
__isl_give isl_ast_node *generate_code(struct gpu_gen *gen,
                                       __isl_take isl_schedule *schedule);

__isl_give isl_union_set *compute_may_persist(struct gpu_prog *prog);
void collect_references(struct gpu_prog *prog, struct gpu_array_info *array);
void collect_order_dependences(struct gpu_prog *prog);
isl_bool only_fixed_element_accessed(struct gpu_array_info *array);
#endif