/*
  toolkit.cl
  Helper functions for CLPKM-generated code
*/
//
// Memory copy functions
//
void __clpkm_load_private_align_4(__global const void * __lvb,
                                  __private void * __dst,
                                  size_t __size) {
	// OpenCL 1.2 §6.3.k: the sizeof operator yields the size (in bytes) of
	// its operand, including any padding bytes needed for alignment.
	// Therefore, if the operand is 4-aligned, its size is a multiple of 4
	// and it can be copied in whole 32-bit words; e.g. a struct holding an
	// int and a char is padded from 5 bytes to 8
	__global const uint * __w_lvb = (__global const uint *) __lvb;
	__private uint * __w_dst = (__private uint *) __dst;
	while (__size >= 4) {
		* __w_dst++ = * __w_lvb++;
		__size -= 4;
	}
}
void __clpkm_store_private_align_4(__global void * __lvb,
                                   __private const void * __src,
                                   size_t __size) {
	__global uint * __w_lvb = (__global uint *) __lvb;
	__private const uint * __w_src = (__private const uint *) __src;
	while (__size >= 4) {
		* __w_lvb++ = * __w_src++;
		__size -= 4;
	}
}
void __clpkm_load_private_align_2(__global const void * __lvb,
                                  __private void * __dst,
                                  size_t __size) {
	// Same reasoning as above: a 2-aligned operand has a size that is a
	// multiple of 2, so it can be copied in 16-bit words
	__global const ushort * __w_lvb = (__global const ushort *) __lvb;
	__private ushort * __w_dst = (__private ushort *) __dst;
	while (__size >= 2) {
		* __w_dst++ = * __w_lvb++;
		__size -= 2;
	}
}
void __clpkm_store_private_align_2(__global void * __lvb,
                                   __private const void * __src,
                                   size_t __size) {
	__global ushort * __w_lvb = (__global ushort *) __lvb;
	__private const ushort * __w_src = (__private const ushort *) __src;
	while (__size >= 2) {
		* __w_lvb++ = * __w_src++;
		__size -= 2;
	}
}
// Byte-wise fallback for data with no alignment guarantee
void __clpkm_load_private(__global const void * __lvb,
                          __private void * __dst,
                          size_t __size) {
	__global const uchar * __w_lvb = (__global const uchar *) __lvb;
	__private uchar * __w_dst = (__private uchar *) __dst;
	while (__size > 0) {
		* __w_dst++ = * __w_lvb++;
		--__size;
	}
}
void __clpkm_store_private(__global void * __lvb,
                           __private const void * __src,
                           size_t __size) {
	__global uchar * __w_lvb = (__global uchar *) __lvb;
	__private const uchar * __w_src = (__private const uchar *) __src;
	while (__size > 0) {
		* __w_lvb++ = * __w_src++;
		--__size;
	}
}
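// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the toolkit): how instrumented code might
// spill a live private variable into the live-value buffer before yielding
// and reload it on resume. The kernel name and the per-work-item __offset
// layout are hypothetical; the real CLPKM instrumentation computes its own
// buffer layout.
// ---------------------------------------------------------------------------
__kernel void __example_private_spill(__global uchar * __lvb) {
	float4 __acc = (float4)(1.0f);	// live private state: 16 bytes, 4-aligned
	size_t __offset = get_global_id(0) * sizeof(float4);
	// Before yielding: spill private state out to the live-value buffer
	__clpkm_store_private_align_4(__lvb + __offset, &__acc, sizeof(__acc));
	// On the next launch: restore the spilled state and resume
	__clpkm_load_private_align_4(__lvb + __offset, &__acc, sizeof(__acc));
}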
// XXX: assumption: __local variables are always 4-aligned
// Cooperative copy: each work-item of the group copies every
// __batch_size-th 32-bit word, starting at its own linear local id
void __clpkm_store_local(__global void * __lvb, __local const void * __src,
                         size_t __size, size_t __loc_id, size_t __batch_size) {
	__global uint * __w_lvb = ((__global uint *) __lvb) + __loc_id;
	__local const uint * __w_src = ((__local const uint *) __src) + __loc_id;
	__local const uchar * __b_last = ((__local const uchar *) __src) + __size;
	// Copy whole words until fewer than 4 bytes remain for this work-item
	while (((__local const uchar *) __w_src) + 4 <= __b_last) {
		* __w_lvb = * __w_src;
		__w_lvb += __batch_size;
		__w_src += __batch_size;
	}
	// At most one work-item lands on a partial trailing word; it copies the
	// remaining 1-3 bytes one at a time
	__global uchar * __b_lvb = (__global uchar *) __w_lvb;
	__local const uchar * __b_src = (__local const uchar *) __w_src;
	if (__b_src < __b_last) {
		switch (__b_last - __b_src) {
		case 3:
			* __b_lvb++ = * __b_src++;
			/* fall through */
		case 2:
			* __b_lvb++ = * __b_src++;
			/* fall through */
		case 1:
			* __b_lvb++ = * __b_src++;
			break;
		default:
			__builtin_unreachable();
		}
	}
}
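// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the toolkit): every work-item of a group
// calls __clpkm_store_local with its own linear local id, so consecutive
// work-items copy interleaved 32-bit words of the shared buffer. The 1D
// NDRange and the __scratch buffer below are assumptions for the example.
// ---------------------------------------------------------------------------
__kernel void __example_local_spill(__global uchar * __lvb,
                                    __local float * __scratch) {
	size_t __loc_id = get_local_id(0);   // linear local id for a 1D NDRange
	size_t __grp_sz = get_local_size(0); // work-items per group
	__scratch[__loc_id] = (float) __loc_id;
	// All writes to __scratch must be visible before the cooperative copy
	barrier(CLK_LOCAL_MEM_FENCE);
	__clpkm_store_local(__lvb, __scratch, __grp_sz * sizeof(float),
	                    __loc_id, __grp_sz);
}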
//
// Work related functions
//
void __get_linear_id(size_t * __global_id, size_t * __group_id,
                     size_t * __local_id, size_t * __group_size) {
	uint __dim = get_work_dim();
	size_t __grp_id = 0; // group id
	size_t __loc_id = 0; // local id within a work-group
	size_t __grp_sz = 1; // num of work-items in each work-group
	// Walk from the highest dimension down to 0, so that dimension 0 varies
	// fastest in the linearized ids
	while (__dim-- > 0) {
		__grp_id = __grp_id * get_num_groups(__dim) + get_group_id(__dim);
		__loc_id = __loc_id * get_local_size(__dim) + get_local_id(__dim);
		__grp_sz = __grp_sz * get_local_size(__dim);
	}
	* __global_id = __grp_id * __grp_sz + __loc_id;
	* __group_id = __grp_id;
	* __local_id = __loc_id;
	* __group_size = __grp_sz;
}
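// Worked example: for a 2D NDRange with get_local_size(0) == 8 and
// get_local_size(1) == 4, the loop above yields
//     __loc_id = get_local_id(1) * 8 + get_local_id(0)
// so the work-item at local coordinates (3, 2) gets linear local id
// 2 * 8 + 3 == 19, and __grp_sz == 8 * 4 == 32.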
//
// CR (checkpoint/resume) related functions
//
// NVIDIA-only: read the hardware cycle counters via PTX inline assembly
#if 1
ulong clock64(void) {
	ulong __clock_val;
	asm volatile ("mov.u64 %0, %%clock64;"
	              : /* output */ "=l"(__clock_val)
	              : /* input */
	              : /* clobbers */ // "memory"
	              );
	return __clock_val;
}
ulong clock32_lo(void) {
	uint __clock_val;
	asm volatile ("mov.u32 %0, %%clock;"
	              : /* output */ "=r"(__clock_val)
	              : /* input */
	              : /* clobbers */ // "memory"
	              );
	return __clock_val;
}
ulong clock32_hi(void) {
	uint __clock_val;
	asm volatile ("mov.u32 %0, %%clock_hi;"
	              : /* output */ "=r"(__clock_val)
	              : /* input */
	              : /* clobbers */ // "memory"
	              );
	return __clock_val;
}
#endif
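// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the toolkit): elapsed time in the same
// coarsened units the CR functions below use. Dropping the low 10 bits
// trades resolution (~1024 cycles) for a longer wrap period once the value
// is truncated to 32 bits.
// ---------------------------------------------------------------------------
uint __example_elapsed_coarse(uint __start_coarse) {
	uint __now_coarse = (uint)(clock64() >> 10);
	// Unsigned subtraction stays correct across a single 32-bit wrap
	return __now_coarse - __start_coarse;
}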
void __clpkm_init_cost_ctr(uint * __cost_ctr, const uint __clpkm_tlv) {
	// Clock-based accounting: record the coarsened start time (units of
	// 1024 cycles) rather than zeroing an instruction-cost counter
	//* __cost_ctr = 0;
	* __cost_ctr = (uint)(clock64() >> 10);
}
void __clpkm_update_ctr(uint * __cost_ctr, uint __esti_cost) {
	// No-op under clock-based accounting; the counter-based variant would
	// accumulate the estimated cost here instead
	//* __cost_ctr += __esti_cost;
}
bool __clpkm_should_chkpnt(uint __cost_ctr, uint __clpkm_tlv) {
	// Checkpoint once the elapsed coarsened clock exceeds the time limit
	//return __cost_ctr > __clpkm_tlv;
	return ((uint)(clock64() >> 10) - __cost_ctr) > __clpkm_tlv;
}
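// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the toolkit): the shape of the
// check-and-yield loop one would expect CLPKM instrumentation to emit.
// __example_cr_loop and its dummy workload are hypothetical; real generated
// code would spill live values (see the copy helpers above) before returning.
// ---------------------------------------------------------------------------
void __example_cr_loop(__private uint * __state, uint __clpkm_tlv) {
	uint __cost_ctr;
	__clpkm_init_cost_ctr(&__cost_ctr, __clpkm_tlv);
	for (uint __i = 0; __i < 1000000u; ++__i) {
		* __state += __i;	// stand-in for one slice of kernel work
		__clpkm_update_ctr(&__cost_ctr, 1u);
		if (__clpkm_should_chkpnt(__cost_ctr, __clpkm_tlv))
			return;	// yield; the host requeues the kernel to resume
	}
}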