# Cost-model helpers: MAC counts, per-level buffer access counts and block sizes.
from operator import mul
from operator import add
from functools import reduce
import copy
import math
from . import loop_enum as le
from . import buffer_enum as be
def get_comp_cost(layer):
    """
    Compute the total # of MAC computation, it is independent of other optimizations.
    Also it is independent of input size and input/filter stride.

    Total # of computation = OX*OY*IC*OC*ON*FX*FY

    :param layer: object exposing ``wofm``, ``hofm``, ``nifm``, ``nofm``,
        ``nimg``, ``wfil`` and ``hfil``
    :return: the product of those seven attributes
    """
    return (
        layer.wofm
        * layer.hofm
        * layer.nifm
        * layer.nofm
        * layer.nimg
        * layer.wfil
        * layer.hfil
    )
def get_ideal_performance(layer, resource):
    """
    Compute the ideal runtime in cycles by assuming 100% PE array utilization.

    Ideal # of cycles = Total # of MAC computation / Total # of PEs

    #LMEI Needs to be modified later when adding precision-scalable PEs:
    the # of functional PEs will change depending on the precision mode.

    :param layer: layer description passed on to ``get_comp_cost``
    :param resource: object whose ``para_count_list`` holds the PE counts that
        are multiplied together
    :return: ideal number of cycles
    """
    total_comp = get_comp_cost(layer)
    number_pe = reduce(mul, resource.para_count_list, 1)
    # NOTE(review): the assignment of ``runtime`` was missing from this block.
    # The documented ratio is used and rounded up, on the assumption that a
    # partially filled last cycle still costs a full cycle - confirm.
    runtime = math.ceil(total_comp / number_pe)
    return runtime
def get_layer_size(layer):
    """
    Get size of ifmap, ofmap, filter of the layer.

    #LMEI ifmap_size should be computable from ofmap_size and the
    input stride (IS) / filter stride (FS):
        IX = IS*(OX-1) + FS*(FX-1) + 1
        wifm = wistd*(wofm-1) + wfstd*(wfil-1) + 1

    :param layer: object exposing ``wifm``, ``hifm``, ``wofm``, ``hofm``,
        ``wfil``, ``hfil``, ``nifm``, ``nofm`` and ``nimg``
    :return: ``[ifmap_size, ofmap_size, flmap_size]``
    """
    ifmap_size = layer.wifm * layer.hifm * layer.nifm * layer.nimg
    ofmap_size = layer.wofm * layer.hofm * layer.nofm * layer.nimg
    flmap_size = layer.wfil * layer.hfil * layer.nifm * layer.nofm
    return [ifmap_size, ofmap_size, flmap_size]
def get_hinted_para(level, hint):
    """
    Get the actual total spatial unrolling size from loop schedule.

    :param level: index selecting the per-level entry of each hinted loop
    :param hint: non-empty mapping from loop index to per-level entries; element
        ``[2]`` of the entry at ``level`` is the factor multiplied in
    :return: product of ``hint[loop][level][2]`` over all loops present in ``hint``
    """
    assert hint
    hinted_para = 1
    for loop in range(le.NUM):
        if loop in hint:
            hinted_para *= hint[loop][level][2]
    return hinted_para
def valid_dataflow(resource, hint):
    """
    Check if the actual spatial unrolling size from loop schedule meets the HW
    utilization requirement, by comparing it with the real HW parallelism size
    multiplied by the utilization threshold.

    :param resource: object exposing ``buffer_levels()``, ``paras[level].count``
        and ``utilization_threshold``
    :param hint: loop schedule hint forwarded to ``get_hinted_para``
    :return: ``False`` as soon as one level with a parallel count other than 1
        is under-utilized, ``True`` otherwise
    """
    num_levels = resource.buffer_levels()
    for level in range(num_levels):
        para_count = resource.paras[level].count
        # Levels with a count of exactly 1 are never rejected.
        if para_count != 1 and get_hinted_para(level, hint) < (
            para_count * resource.utilization_threshold
        ):
            return False
    return True
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
def get_if_access(resource, point, layer, mac_capacity=1):
    """
    Returns the number of accesses to the inputs for each level.

    General idea, per buffer level:
        total number of accesses = tiling at current level * block size
                                   * num_blocks * parallel count of the level
        block size = tiling of the levels below without the irrelevant loops
        num_blocks = tiling at the levels above the current level

    :param resource: object exposing ``buffer_levels()`` and ``paras[level].count``
    :param point: mapping point; ``loop_orders``, ``loop_blockings`` and
        ``loop_partitionings`` are indexed as ``[loop][level]``
    :param layer: layer description; ``wstd`` and ``hstd`` are read
    :param mac_capacity: accepted for interface compatibility, not used here
    :return: list with one access count per buffer level
    """
    irrelevant_loops = [le.OC, le.FX, le.FY]
    num_levels = resource.buffer_levels()
    access_counts_per_level = []

    for level in range(num_levels):

        # multiply all the tiling factors at the current level
        def multiply_tiling_factors():
            # find the innermost loop among [OX, OY, IC, ON]
            lowest_input_loop_index = min(
                point.loop_orders[le.OX][level],
                point.loop_orders[le.OY][level],
                point.loop_orders[le.IC][level],
                point.loop_orders[le.ON][level],
                # these are partially relevant
                point.loop_orders[le.FX][level],
                point.loop_orders[le.FY][level],
            )
            # we can ignore OC if it is at a lower level than the innermost input loop
            # FX, FY can't be ignored, because they are partially relevant
            tiling = 1
            for i in range(le.NUM):
                if i == le.OC:
                    if point.loop_orders[i][level] > lowest_input_loop_index:
                        # if the loop is at a higher level than the innermost
                        # input loop, we need to consider it
                        tiling *= point.loop_blockings[i][level]
                else:
                    tiling *= point.loop_blockings[i][level]
            return tiling

        # remove all the irrelevant loops from the tiling of the levels below
        def calculate_block_size():
            block_size = 1
            for lower_level in range(level - 1, -1, -1):
                for i in range(le.NUM):
                    if i in irrelevant_loops:
                        continue
                    if i == le.OX:
                        stride = layer.wstd
                        # stride should be ignored for L0 level or if FX/FY = 1
                        if (
                            lower_level == 0
                            or point.loop_blockings[le.FX][lower_level] == 1
                        ):
                            stride = 1
                        block_size *= point.loop_blockings[i][
                            lower_level
                        ] * stride + (point.loop_blockings[le.FX][lower_level] - 1)
                    # NOTE(review): the branch header for the vertical case was
                    # missing; `elif i == le.OY` is assumed by symmetry with OX.
                    elif i == le.OY:
                        stride = layer.hstd
                        # stride should be ignored for L0 level or if FX/FY = 1
                        if (
                            lower_level == 0
                            or point.loop_blockings[le.FY][lower_level] == 1
                        ):
                            stride = 1
                        block_size *= point.loop_blockings[i][
                            lower_level
                        ] * stride + (point.loop_blockings[le.FY][lower_level] - 1)
                    else:
                        block_size *= point.loop_blockings[i][lower_level]
                    # NOTE(review): original nesting was lost; the spatial factor
                    # is assumed to apply to every relevant loop - confirm.
                    block_size *= point.loop_partitionings[i][lower_level]
            return block_size

        def get_num_blocks():
            # get tiling of the levels above the current level
            num_blocks = 1
            for i in range(level + 1, num_levels):
                for j in range(le.NUM):
                    num_blocks *= point.loop_blockings[j][i]
            return num_blocks

        access_counts_per_level.append(
            multiply_tiling_factors()
            * calculate_block_size()
            * get_num_blocks()
            * resource.paras[level].count
        )

    return access_counts_per_level
def get_if_access_old(level, point, layer, mac_capacity=1):
    """
    Get per element # of access of Input at current level.

    Not accurate because [FX, FY] are not totally irrelevant terms for ifmap.
    #LMEI Need to be modified by using the concept of the dataset.

    :param level: buffer level index
    :param point: mapping point; ``loop_orders``, ``loop_blockings`` and
        ``loop_partitionings`` are indexed as ``[loop][level]``
    :param layer: layer description; ``wfil``, ``hfil``, ``nofm``, ``wstd``
        and ``hstd`` are read
    :param mac_capacity: when 0 and ``level`` is 0, the closed-form count is used
    :return: per element access count (floor-divided by the stride product)
    """
    if level == 0 and mac_capacity == 0:
        return layer.wfil * layer.hfil * layer.nofm // (layer.wstd * layer.hstd)

    # NOTE(review): the tail of this min() call was missing; ON is included
    # because the input-related loops are [OX, OY, IC, ON] - confirm.
    ex_order_index = min(
        point.loop_orders[le.OX][level],
        point.loop_orders[le.OY][level],
        point.loop_orders[le.IC][level],
        point.loop_orders[le.ON][level],
    )

    # if FX, FY, OC are at a lower level than the innermost input loop, they are irrelevant
    fx_exclusive = point.loop_orders[le.FX][level] < ex_order_index
    fy_exclusive = point.loop_orders[le.FY][level] < ex_order_index
    oc_exclusive = point.loop_orders[le.OC][level] < ex_order_index

    # A True flag shifts the slice start by one, skipping the current level.
    fx_acc = reduce(mul, point.loop_blockings[le.FX][level + fx_exclusive :], 1)
    fy_acc = reduce(mul, point.loop_blockings[le.FY][level + fy_exclusive :], 1)
    oc_acc = reduce(mul, point.loop_blockings[le.OC][level + oc_exclusive :], 1)

    # No loop orders among unrolled loops, they have the same order
    fx_par = reduce(mul, point.loop_partitionings[le.FX][level:], 1)
    fy_par = reduce(mul, point.loop_partitionings[le.FY][level:], 1)
    oc_par = reduce(mul, point.loop_partitionings[le.OC][level:], 1)

    # The original expression statement discarded its value; return it.
    return (
        fx_acc * fy_acc * oc_acc * fx_par * fy_par * oc_par
        // (layer.wstd * layer.hstd)
    )
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
def get_of_access(resource, point, layer, mac_capacity=1):
    """
    Returns the number of accesses to the outputs for each level.

    Per level the count is the product of
      * the tiling at the level itself (FX, FY, IC only count when ordered
        above the innermost of OX, OY, OC, ON),
      * the block size: blocking and partitioning factors of the non-reduction
        loops over all levels below,
      * the number of blocks: every blocking factor of the levels above,
      * ``resource.paras[level].count``.

    ``layer`` and ``mac_capacity`` are not used by this routine.
    """
    reduction_loops = [le.FX, le.FY, le.IC]
    level_count = resource.buffer_levels()
    accesses = []

    for level in range(level_count):
        innermost_output_order = min(
            point.loop_orders[loop][level]
            for loop in (le.OX, le.OY, le.OC, le.ON)
        )

        # tiling at the current level
        current_tiling = 1
        for loop in range(le.NUM):
            if (
                loop in reduction_loops
                and point.loop_orders[loop][level] <= innermost_output_order
            ):
                # reduction loop nested inside every output loop: skipped
                continue
            current_tiling *= point.loop_blockings[loop][level]

        # footprint of one block, built from the levels underneath
        # NOTE(review): partitioning is taken as applying to the relevant loops only - confirm
        footprint = 1
        for below in reversed(range(level)):
            for loop in range(le.NUM):
                if loop in reduction_loops:
                    continue
                footprint *= point.loop_blockings[loop][below]
                footprint *= point.loop_partitionings[loop][below]

        # iterations contributed by the levels above
        outer_iterations = 1
        for above in range(level + 1, level_count):
            for loop in range(le.NUM):
                outer_iterations *= point.loop_blockings[loop][above]

        accesses.append(
            current_tiling
            * footprint
            * outer_iterations
            * resource.paras[level].count
        )

    return accesses
def get_of_access_old(level, point, layer, mac_capacity=1):
    """
    Get per element # of access of Output at current level.

    For output:
        Relevant terms [OX, OY, OC, ON]
        irrelevant terms [FX, FY, IC]

    Calculating rule:
        At lowest mem level (directly talk to MAC), calculate per element access
        by timing all irrelevant terms [FX, FY, IC] together.

        For the rest higher mem levels,
        firstly, check if there is stationary possibility
        (irrelevant loops [FX, FY, IC] are at the innermost position of this level);
        if there is, exclude the irrelevant loop(s) from the current level's # of
        per element access computing, because they have been taken into account
        in lower level's # of per element access computing.

        secondly, calculate the current level's # of per element access
        by multiplying all the irrelevant terms from current level to the highest
        level, including both temporal unrolling part and spatial unrolling part
        (parallelism).
    """
    # NOTE(review): the guard of this early return was missing; the condition
    # used by the sibling *_old routines is assumed - confirm.
    if level == 0 and mac_capacity == 0:
        return layer.wfil * layer.hfil * layer.nifm

    # The tail of this min() call was missing; ON is restored because the
    # relevant terms for the output are [OX, OY, OC, ON].
    ex_order_index = min(
        point.loop_orders[le.OX][level],
        point.loop_orders[le.OY][level],
        point.loop_orders[le.OC][level],
        point.loop_orders[le.ON][level],
    )

    fx_exclusive = point.loop_orders[le.FX][level] < ex_order_index
    fy_exclusive = point.loop_orders[le.FY][level] < ex_order_index
    ic_exclusive = point.loop_orders[le.IC][level] < ex_order_index

    # A True flag shifts the slice start by one, skipping the current level.
    fx_acc = reduce(mul, point.loop_blockings[le.FX][level + fx_exclusive :], 1)
    fy_acc = reduce(mul, point.loop_blockings[le.FY][level + fy_exclusive :], 1)
    ic_acc = reduce(mul, point.loop_blockings[le.IC][level + ic_exclusive :], 1)

    fx_par = reduce(mul, point.loop_partitionings[le.FX][level:], 1)
    fy_par = reduce(mul, point.loop_partitionings[le.FY][level:], 1)
    ic_par = reduce(mul, point.loop_partitionings[le.IC][level:], 1)

    return fx_acc * fy_acc * ic_acc * fx_par * fy_par * ic_par
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
def get_fl_access(resource, point, layer, mac_capacity=1):
    """
    Returns the number of accesses to the weights for each level.

    Same scheme as the other per-level access routines: tiling at the level
    times block size of the levels below times number of blocks of the levels
    above times ``resource.paras[level].count``. OX, OY and ON are the loops
    treated as irrelevant here.

    ``layer`` and ``mac_capacity`` are not used by this routine.
    """
    broadcast_loops = [le.OX, le.OY, le.ON]
    all_loops = range(le.NUM)
    level_count = resource.buffer_levels()
    result = []

    for level in range(level_count):
        innermost_weight_order = min(
            point.loop_orders[le.FX][level],
            point.loop_orders[le.FY][level],
            point.loop_orders[le.IC][level],
            point.loop_orders[le.OC][level],
        )

        # An irrelevant loop only counts when ordered above the innermost weight loop.
        tiling_here = reduce(
            mul,
            (
                point.loop_blockings[loop][level]
                for loop in all_loops
                if loop not in broadcast_loops
                or point.loop_orders[loop][level] > innermost_weight_order
            ),
            1,
        )

        # Block size: relevant loops only, across every level underneath.
        # NOTE(review): partitioning is taken as applying to the relevant loops only - confirm
        weight_block = 1
        for below in range(level - 1, -1, -1):
            for loop in all_loops:
                if loop not in broadcast_loops:
                    weight_block *= (
                        point.loop_blockings[loop][below]
                        * point.loop_partitionings[loop][below]
                    )

        # Number of blocks: every blocking factor of the levels above.
        blocks_above = reduce(
            mul,
            (
                point.loop_blockings[loop][above]
                for above in range(level + 1, level_count)
                for loop in all_loops
            ),
            1,
        )

        result.append(
            tiling_here * weight_block * blocks_above * resource.paras[level].count
        )

    return result
def get_fl_access_old(level, point, layer, mac_capacity=1):
    """
    Get per element # of access of Weight at current level.

    For filter:
        Relevant terms [FX, FY, IC, OC]
        irrelevant terms [OX, OY, ON]

    Calculating rule:
        At lowest mem level (directly talk to MAC), calculate per element access
        by timing all irrelevant terms [OX, OY, ON] together.

        For the rest higher mem levels,
        firstly, check if there is stationary possibility
        (irrelevant loops for filter [OX, OY, ON] are at the innermost position
        of this level); if there is, exclude the irrelevant loop(s) from the
        current level's # of per element access computing, because they have
        been taken into account in lower level's # of per element access computing.

        secondly, calculate the current level's # of per element access
        by multiplying all the irrelevant terms from current level to the highest
        level, including both temporal unrolling part and spatial unrolling part
        (parallelism).
    """
    if level == 0 and mac_capacity == 0:
        return layer.wofm * layer.hofm * layer.nimg

    # The tail of this min() call was missing; OC is restored because the
    # relevant terms for the filter are [FX, FY, IC, OC].
    ex_order_index = min(
        point.loop_orders[le.FX][level],
        point.loop_orders[le.FY][level],
        point.loop_orders[le.IC][level],
        point.loop_orders[le.OC][level],
    )

    ox_exclusive = point.loop_orders[le.OX][level] < ex_order_index
    oy_exclusive = point.loop_orders[le.OY][level] < ex_order_index
    on_exclusive = point.loop_orders[le.ON][level] < ex_order_index

    # A True flag shifts the slice start by one, skipping the current level.
    ox_acc = reduce(mul, point.loop_blockings[le.OX][level + ox_exclusive :], 1)
    oy_acc = reduce(mul, point.loop_blockings[le.OY][level + oy_exclusive :], 1)
    on_acc = reduce(mul, point.loop_blockings[le.ON][level + on_exclusive :], 1)

    ox_par = reduce(mul, point.loop_partitionings[le.OX][level:], 1)
    oy_par = reduce(mul, point.loop_partitionings[le.OY][level:], 1)
    on_par = reduce(mul, point.loop_partitionings[le.ON][level:], 1)

    return ox_acc * oy_acc * on_acc * ox_par * oy_par * on_par
def opt_get_if_access(level, point, ba_arr, pa_arr):
    """
    Get # access of if block at current level.

    The repeated access to ifmap is determined by the blocking factors and
    parallelism counts of those loops other than ifmap-related loops outside of
    this level.

    At the same buffer level, if the other loops are outside of the innermost
    loop of ifmap-related loops, their blocking factors and parallelism counts
    at this level should also contribute to the number of accesses.

    :param level: buffer level index
    :param point: mapping point; ``loop_orders[loop][level]`` is read
    :param ba_arr: accumulated blocking factors; ``ba_arr[loop][k]`` stands in
        for ``reduce(mul, point.loop_blockings[loop][k:], 1)``
    :param pa_arr: accumulated partitioning factors, indexed the same way
    """
    # NOTE(review): the tail of this min() call was missing; ON is included as
    # the fourth ifmap-related loop - confirm.
    ex_order_index = min(
        point.loop_orders[le.OX][level],
        point.loop_orders[le.OY][level],
        point.loop_orders[le.IC][level],
        point.loop_orders[le.ON][level],
    )

    fx_exclusive = point.loop_orders[le.FX][level] < ex_order_index
    fy_exclusive = point.loop_orders[le.FY][level] < ex_order_index
    oc_exclusive = point.loop_orders[le.OC][level] < ex_order_index

    # A True flag moves one entry up, dropping this level's blocking factor.
    fx_acc = ba_arr[le.FX][level + fx_exclusive]
    fy_acc = ba_arr[le.FY][level + fy_exclusive]
    oc_acc = ba_arr[le.OC][level + oc_exclusive]

    # Partitioning is always taken from the current level upwards.
    fx_par = pa_arr[le.FX][level]
    fy_par = pa_arr[le.FY][level]
    oc_par = pa_arr[le.OC][level]

    return fx_acc * fy_acc * oc_acc * fx_par * fy_par * oc_par
def opt_get_of_access(level, point, ba_arr, pa_arr):
    """
    Get # access of of block at current level.

    Mirrors ``opt_get_if_access`` with [FX, FY, IC] as the loops whose
    accumulated factors are multiplied.

    :param level: buffer level index
    :param point: mapping point; ``loop_orders[loop][level]`` is read
    :param ba_arr: accumulated blocking factors; ``ba_arr[loop][k]`` stands in
        for ``reduce(mul, point.loop_blockings[loop][k:], 1)``
    :param pa_arr: accumulated partitioning factors, indexed the same way
    """
    # NOTE(review): the tail of this min() call was missing; ON is included as
    # the fourth ofmap-related loop - confirm.
    ex_order_index = min(
        point.loop_orders[le.OX][level],
        point.loop_orders[le.OY][level],
        point.loop_orders[le.OC][level],
        point.loop_orders[le.ON][level],
    )

    fx_exclusive = point.loop_orders[le.FX][level] < ex_order_index
    fy_exclusive = point.loop_orders[le.FY][level] < ex_order_index
    ic_exclusive = point.loop_orders[le.IC][level] < ex_order_index

    # TODO
    # A True flag moves one entry up, dropping this level's blocking factor.
    fx_acc = ba_arr[le.FX][level + fx_exclusive]
    fy_acc = ba_arr[le.FY][level + fy_exclusive]
    ic_acc = ba_arr[le.IC][level + ic_exclusive]

    # Partitioning is always taken from the current level upwards.
    fx_par = pa_arr[le.FX][level]
    fy_par = pa_arr[le.FY][level]
    ic_par = pa_arr[le.IC][level]

    return fx_acc * fy_acc * ic_acc * fx_par * fy_par * ic_par
def opt_get_fl_access(level, point, ba_arr, pa_arr):
    """
    Get # access of fl block at current level.

    Mirrors ``opt_get_if_access`` with [OX, OY, ON] as the loops whose
    accumulated factors are multiplied.

    :param level: buffer level index
    :param point: mapping point; ``loop_orders[loop][level]`` is read
    :param ba_arr: accumulated blocking factors; ``ba_arr[loop][k]`` stands in
        for ``reduce(mul, point.loop_blockings[loop][k:], 1)``
    :param pa_arr: accumulated partitioning factors, indexed the same way
    """
    # NOTE(review): the tail of this min() call was missing; OC is included as
    # the fourth filter-related loop - confirm.
    ex_order_index = min(
        point.loop_orders[le.FX][level],
        point.loop_orders[le.FY][level],
        point.loop_orders[le.IC][level],
        point.loop_orders[le.OC][level],
    )

    ox_exclusive = point.loop_orders[le.OX][level] < ex_order_index
    oy_exclusive = point.loop_orders[le.OY][level] < ex_order_index
    on_exclusive = point.loop_orders[le.ON][level] < ex_order_index

    # A True flag moves one entry up, dropping this level's blocking factor.
    ox_acc = ba_arr[le.OX][level + ox_exclusive]
    oy_acc = ba_arr[le.OY][level + oy_exclusive]
    on_acc = ba_arr[le.ON][level + on_exclusive]

    # Partitioning is always taken from the current level upwards.
    ox_par = pa_arr[le.OX][level]
    oy_par = pa_arr[le.OY][level]
    on_par = pa_arr[le.ON][level]

    return ox_acc * oy_acc * on_acc * ox_par * oy_par * on_par
def get_if_size(blocking_accum_list, partitioning_accum_list, partitioning_list, layer):
    """
    Get size of if block at current level including both temporal and spatial loop part.

    blocking -> temporal loop part
    partitioning -> spatial loop part

    #LMEI to support filter stride(FS) later.
    Right now, FS/wfstd = 1 in
        IX = IS*(OX-1) + FS*(FX-1) + 1 or
        wifm = wistd*(wofm-1) + wfstd*(wfil-1) + 1

    #LMEI (new HW template) no need for Input Duplication when OC partitions,
    by letting one reg broadcast Input to a row of OC partitioned PE
    and removing the inner PE ifmap register.

    :param blocking_accum_list: accumulated blocking factor per loop
    :param partitioning_accum_list: accumulated partitioning factor per loop
    :param partitioning_list: partitioning factor per loop at the current level
    :param layer: layer description; ``wstd`` and ``hstd`` are read
    """
    fx_acc = blocking_accum_list[le.FX] * partitioning_accum_list[le.FX]
    fy_acc = blocking_accum_list[le.FY] * partitioning_accum_list[le.FY]
    ox_acc = blocking_accum_list[le.OX] * partitioning_accum_list[le.OX]
    oy_acc = blocking_accum_list[le.OY] * partitioning_accum_list[le.OY]

    # Input window covered by ox_acc x oy_acc outputs under an fx_acc x fy_acc filter.
    width = fx_acc + (ox_acc - 1) * layer.wstd
    height = fy_acc + (oy_acc - 1) * layer.hstd

    return (
        width
        * height
        * blocking_accum_list[le.IC]
        * partitioning_accum_list[le.IC]
        * blocking_accum_list[le.ON]
        * partitioning_accum_list[le.ON]
        * partitioning_list[le.OC]
    )  # Duplication when OC partitions
def get_of_size(blocking_accum_list, partitioning_accum_list, partitioning_list):
    """
    Get size of of block at current level including both temporal and spatial loop part.

    #LMEI (new HW template) no need for Output Duplication when IC, FX or FY partitions,
    by letting output data from a row of IC, FX or FY partitioned PE add together
    and removing the inner PE ofmap register.

    :param blocking_accum_list: accumulated blocking factor per loop
    :param partitioning_accum_list: accumulated partitioning factor per loop
    :param partitioning_list: partitioning factor per loop at the current level
    """
    return (
        blocking_accum_list[le.OX]
        * partitioning_accum_list[le.OX]
        * blocking_accum_list[le.OY]
        * partitioning_accum_list[le.OY]
        * blocking_accum_list[le.OC]
        * partitioning_accum_list[le.OC]
        * blocking_accum_list[le.ON]
        * partitioning_accum_list[le.ON]
        * partitioning_list[le.IC]
        * partitioning_list[le.FX]
        * partitioning_list[le.FY]
    )  # Duplication when IC, FX or FY partitions
def get_fl_size(blocking_accum_list, partitioning_accum_list, partitioning_list):
    """
    Get size of fl block at current level.

    #LMEI (new HW template) no need for Weight Duplication when OX, OY or ON partitions,
    by letting one reg broadcast Weight to a row of OX, OY or ON partitioned PE
    and removing the inner PE weight register.

    :param blocking_accum_list: accumulated blocking factor per loop
    :param partitioning_accum_list: accumulated partitioning factor per loop
    :param partitioning_list: partitioning factor per loop at the current level
    """
    return (
        blocking_accum_list[le.FX]
        * partitioning_accum_list[le.FX]
        * blocking_accum_list[le.FY]
        * partitioning_accum_list[le.FY]
        * blocking_accum_list[le.IC]
        * partitioning_accum_list[le.IC]
        * blocking_accum_list[le.OC]
        * partitioning_accum_list[le.OC]
        * partitioning_list[le.OX]
        * partitioning_list[le.OY]
        * partitioning_list[le.ON]
    )  # Duplication when OX, OY or ON partitions
def get_if_bank_size(blocking_accum_list, layer):
    """
    Get size of if block at current level.

    blocking -> temporal loop part

    #LMEI to support filter stride(FS) later.
    Right now, FS/wfstd = 1 in
        IX = IS*(OX-1) + FS*(FX-1) + 1 or
        wifm = wistd*(wofm-1) + wfstd*(wfil-1) + 1

    :param blocking_accum_list: accumulated blocking factor per loop
    :param layer: layer description; ``wstd`` and ``hstd`` are read
    """
    fx_acc = blocking_accum_list[le.FX]
    fy_acc = blocking_accum_list[le.FY]
    ox_acc = blocking_accum_list[le.OX]
    oy_acc = blocking_accum_list[le.OY]

    width = fx_acc + (ox_acc - 1) * layer.wstd
    height = fy_acc + (oy_acc - 1) * layer.hstd

    return width * height * blocking_accum_list[le.IC] * blocking_accum_list[le.ON]
def get_of_bank_size(blocking_accum_list):
    """
    Get size of of block at current level.

    blocking -> temporal loop part

    :param blocking_accum_list: accumulated blocking factor per loop
    :return: product of the OX, OY, OC and ON entries
    """
    return (
        blocking_accum_list[le.OX]
        * blocking_accum_list[le.OY]
        * blocking_accum_list[le.OC]
        * blocking_accum_list[le.ON]
    )
def get_fl_bank_size(blocking_accum_list):
    """
    Get size of fl block at current level.

    blocking -> temporal loop part

    :param blocking_accum_list: accumulated blocking factor per loop
    :return: product of the FX, FY, IC and OC entries
    """
    return (
        blocking_accum_list[le.FX]
        * blocking_accum_list[le.FY]
        * blocking_accum_list[le.IC]
        * blocking_accum_list[le.OC]
    )
def get_array_access_and_cost(level, para, access_list, point):
    """
    Get the access at array level from the access at the
    lower level of memory hierarchy.

    :param level: index of the parallel level inside ``point``
    :param para: parallel-level description; ``access_mode``, ``array_dim``
        and ``array_access_cost`` are read
    :param access_list: ``[if_block_access, of_block_access, fl_block_access]``
    :param point: mapping point; ``loop_partitionings`` and ``para_loop_dim``
        are read
    :return: ``[array_access, costs]``. ``array_access`` starts with the
        nearest-PE ``[if, of, fl]`` triple; in access mode 1 one further triple
        per array dimension follows and ``costs`` is the nearest-PE cost
        followed by the across-block costs. In access mode 2 ``costs`` holds
        the nearest-PE cost only.
    :raises ValueError: if ``para.access_mode`` is neither 1 nor 2 (the
        original fell through and returned ``None``)
    """
    para_mode = para.access_mode
    if para_mode not in (1, 2):
        raise ValueError("unsupported array access mode: %r" % (para_mode,))

    array_dim = para.array_dim
    para_cost = para.array_access_cost * 1.0
    nearest_pe_cost = para_cost

    if_block_access, of_block_access, fl_block_access = access_list
    partitions = list(zip(*point.loop_partitionings))[level]
    para_dim = point.para_loop_dim[level]

    # NOTE(review): these three initialisations were missing from the block.
    # The neutral factor 1 per loop and an empty far list follow from how the
    # values are used below; the zero default cost per array dimension is an
    # assumption - confirm.
    partitions_nearest = [1] * le.NUM
    partitions_far = []
    across_block_cost = [0] * array_dim

    if para_mode == 1:
        for i, para_index in enumerate(para_dim):
            partitions_far.append([1] * le.NUM)
            if len(para_index) == 1:
                partitions_nearest[para_index[0]] = partitions[para_index[0]]
            else:
                inner_loop, outer_loop = para_index
                partitions_nearest[inner_loop] = partitions[inner_loop]
                partitions_far[i][outer_loop] = partitions[outer_loop]
                across_block_cost[i] = para_cost * partitions[inner_loop]
    else:
        for para_index in para_dim:
            for j in para_index:
                partitions_nearest[j] = partitions[j]

    # Nearest-PE accesses: each data type is scaled by the partitions of the
    # loops that are irrelevant to it. Shared by both access modes.
    array_access = [
        [
            if_block_access
            * partitions_nearest[le.FX]
            * partitions_nearest[le.FY]
            * partitions_nearest[le.OC],
            of_block_access
            * partitions_nearest[le.FX]
            * partitions_nearest[le.FY]
            * partitions_nearest[le.IC],
            fl_block_access
            * partitions_nearest[le.OX]
            * partitions_nearest[le.OY]
            * partitions_nearest[le.ON],
        ]
    ]

    if para_mode == 2:
        return [array_access, [nearest_pe_cost]]

    for i in range(array_dim):
        far = partitions_far[i]
        # A product of 1 means nothing is partitioned across blocks in this
        # dimension, so it contributes no far access.
        if_partitions_far = far[le.FX] * far[le.FY] * far[le.OC]
        if_partitions_far = if_partitions_far if if_partitions_far != 1 else 0

        of_partitions_far = far[le.FX] * far[le.FY] * far[le.IC]
        of_partitions_far = of_partitions_far if of_partitions_far != 1 else 0

        fl_partitions_far = far[le.OX] * far[le.OY] * far[le.ON]
        fl_partitions_far = fl_partitions_far if fl_partitions_far != 1 else 0

        array_access.append(
            [
                if_block_access * if_partitions_far,
                of_block_access * of_partitions_far,
                fl_block_access * fl_partitions_far,
            ]
        )

    return [array_access, [nearest_pe_cost] + across_block_cost]
def get_access(point, layer, resource):
    """
    Get the total access of each block at each level,
    return the list as
    [[if_block_access, of_block_access, fl_block_access], ...].

    Assume all the buffers are inclusive, so buffers in lower level
    appear in higher level as well.

    For the parallelism case assume read from next memory level.
    Support more access modes in parallelism case.

    :param point: mapping point forwarded to the per-data-type access routines
    :param layer: layer description forwarded to the same routines
    :param resource: object exposing ``mac_capacity`` and ``paras``
        (each with an ``access_mode``)
    :return: ``[access_list, array_costs]``
    """
    # TODO support more customized memory
    # TODO more access at overlapped boundary
    mac_capacity = resource.mac_capacity

    if_accesses = get_if_access(resource, point, layer, mac_capacity)
    of_accesses = get_of_access(resource, point, layer, mac_capacity)
    fl_accesses = get_fl_access(resource, point, layer, mac_capacity)
    access_list = list(zip(if_accesses, of_accesses, fl_accesses))

    para_mode_level = [i for i, e in enumerate(resource.paras) if e.access_mode != 0]
    array_costs = []

    # Each inserted array-level entry shifts the following levels by one.
    delta = 0
    for level in para_mode_level:
        next_index = level + delta + 1
        # NOTE(review): the guard of this branch was missing; an out-of-range
        # check is assumed because the other branch indexes access_list - confirm.
        if next_index >= len(access_list):
            next_level_access = [1, 1, 1]
        else:
            # zip() yields tuples, so build a mutable list before the item
            # assignment below (copy.copy of a tuple stays immutable).
            next_level_access = list(access_list[next_index])
            next_level_access[1] = (next_level_access[1] + 1) / 2
        array_access, array_cost = get_array_access_and_cost(
            level, resource.paras[level], next_level_access, point
        )
        array_costs.append(array_cost)
        access_list.insert(next_index, array_access)
        delta += 1

    return [access_list, array_costs]
def opt_get_access(num_levels, point, mac_capacity):
    """
    See the comments of ``get_access``. This function is just an
    optimized version of that function.

    :param num_levels: number of buffer levels
    :param point: mapping point; ``loop_blockings`` and ``loop_partitionings``
        are indexed as ``[loop][level]``
    :param mac_capacity: currently unused (see TODO below)
    :return: array of shape ``(num_levels, 3)`` whose columns are the if, of
        and fl access values; the of column is ``2 * access - 1``
    """
    # No module-level numpy import is visible in this file, so import locally.
    import numpy as np

    # TODO support mac_capacity

    # Accumulated arrays are reversed cumulative products with a trailing 1:
    # entry k holds the product of the factors of levels k..num_levels-1, and
    # the extra last entry (1) is what "level + 1" indexes at the top level.
    blocking_accum_arr = []
    partitioning_accum_arr = []
    for i in range(le.NUM):
        ba_current_level = [1]
        pa_current_level = [1]
        ba_tmp = 1
        pa_tmp = 1
        # NOTE(review): this loop header was missing; top-down iteration is
        # implied by the final reversal and the "reversed cumprod" note - confirm.
        for level in range(num_levels - 1, -1, -1):
            ba_tmp = ba_tmp * point.loop_blockings[i][level]
            pa_tmp = pa_tmp * point.loop_partitionings[i][level]
            ba_current_level.append(ba_tmp)
            pa_current_level.append(pa_tmp)
        blocking_accum_arr.append(ba_current_level[::-1])
        partitioning_accum_arr.append(pa_current_level[::-1])

    access_arr = np.zeros((num_levels, 3))
    for level in range(num_levels):
        access_arr[level][0] = opt_get_if_access(
            level, point, blocking_accum_arr, partitioning_accum_arr
        )
        access_arr[level][1] = (
            2
            * opt_get_of_access(
                level, point, blocking_accum_arr, partitioning_accum_arr
            )
            - 1
        )
        access_arr[level][2] = opt_get_fl_access(
            level, point, blocking_accum_arr, partitioning_accum_arr
        )

    # The original block built the array but never returned it.
    return access_arr
def get_bank_size(point, layer, level):
    """
    Calculate the bank size of ifmap, ofmap, filter at current level.

    Only the temporal (blocking) factors of levels ``0..level`` are accumulated.

    :return: ``(if_bank_size, of_bank_size, fl_bank_size)``
    """
    temporal_extent = [
        reduce(mul, point.loop_blocking(loop)[: level + 1], 1)
        for loop in range(le.NUM)
    ]
    return (
        get_if_bank_size(temporal_extent, layer),
        get_of_bank_size(temporal_extent),
        get_fl_bank_size(temporal_extent),
    )
def get_block_size(point, layer, level):
"""
Calculate the size of ifmap, ofmap, filter at current level
"""
blocking_accum_list = []
partitioning_accum_list = []
partitioning_reshape = list(zip(*point.loop_partitionings))
partitioning_list = partitioning_reshape[level]
for i in range(le.NUM):
blocking_accum_list.append(reduce(mul, point.loop_blocking(i)[: level + 1], 1))
partitioning_accum_list.append(
reduce(mul, point.loop_partitioning(i)[: level + 1], 1)
) # FIXME inclusive mode also duplicates data
if_block_size = get_if_size(
blocking_accum_list, partitioning_accum_list, partitioning_list, layer
)