from operator import mul
from operator import add
from functools import reduce
import copy
import math
from . import loop_enum as le
from . import buffer_enum as be
def get_comp_cost(layer):
    """
    Compute the total # of MAC computations; it is independent of any other optimizations.
    It is also independent of the input size and the input/filter strides.
    Total # of computations = OX*OY*IC*OC*ON*FX*FY
    """
cost = (
layer.wofm
* layer.hofm
* layer.nifm
* layer.nofm
* layer.nimg
* layer.wfil
* layer.hfil
)
return cost
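
# Worked example (hypothetical layer, for illustration only): a 3x3 convolution layer
# with a 14x14 ofmap, 256 input channels, 512 output channels and batch size 1 costs
# 14*14*256*512*1*3*3 = 231,211,008 MACs.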
def get_ideal_performance(layer, resource):
    """
    Compute the ideal runtime in cycles, assuming 100% PE array utilization.
    Ideal # of cycles = Total # of MAC computations / Total # of PEs

    #LMEI Needs to be modified later when adding precision-scalable PEs:
    the # of functional PEs will change depending on the precision mode.
    """
    total_comp = get_comp_cost(layer)
    number_pe = reduce(mul, resource.para_count_list, 1)
    runtime = total_comp / number_pe
    return runtime
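
# Worked example (hypothetical numbers, for illustration only): with the 231,211,008 MACs
# of the layer above and a 16x16 PE array (reduce(mul, [16, 16], 1) = 256 PEs), the ideal
# runtime is 231,211,008 / 256 = 903,168 cycles.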
def get_layer_size(layer):
    """
    Get the sizes of the ifmap, ofmap, and filter of the layer.

    #LMEI ifmap_size should be computable from ofmap_size and the input stride (IS) / filter stride (FS):
    IX = IS*(OX-1) + FS*(FX-1) + 1
    wifm = wistd*(wofm-1) + wfstd*(wfil-1) + 1
    """
ifmap_size = layer.wifm * layer.hifm * layer.nifm * layer.nimg
ofmap_size = layer.wofm * layer.hofm * layer.nofm * layer.nimg
flmap_size = layer.wfil * layer.hfil * layer.nifm * layer.nofm
return [ifmap_size, ofmap_size, flmap_size]
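
# Worked example of the ifmap-size relation noted above (hypothetical sizes): with a 14x14
# ofmap, a 3x3 filter, input stride 1 and filter stride 1,
# wifm = 1*(14-1) + 1*(3-1) + 1 = 16, i.e. a 16x16 ifmap tile per channel and image.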
def get_hinted_para(level, hint):
    """
    Get the actual total spatial unrolling size at this level from the loop schedule hint.
    """
assert hint
hinted_para = 1
for loop in range(le.NUM):
if loop in hint:
hinted_loop_para = hint[loop][level][2]
hinted_para *= hinted_loop_para
return hinted_para
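
# Example (hypothetical hint, for illustration only): hint[loop][level][2] is read here as the
# spatial unrolling factor of `loop` at `level`, so a schedule that unrolls OX by 4 and OC by 16
# at level 0 gives get_hinted_para(0, hint) = 4 * 16 = 64.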
def valid_dataflow(resource, hint):
    """
    Check whether the actual spatial unrolling size from the loop schedule meets the HW
    utilization requirement, by comparing it against the real HW parallelism size multiplied
    by the utilization threshold.
    """
num_levels = resource.buffer_levels()
for level in range(num_levels):
if resource.paras[level].count != 1 and get_hinted_para(level, hint) < (
resource.paras[level].count * resource.utilization_threshold
):
return False
return True
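
# Example (hypothetical resource, for illustration only): for a level with 256 parallel units and
# utilization_threshold = 0.75, a schedule that only unrolls 128 loop iterations spatially at that
# level fails the check (128 < 256 * 0.75 = 192), while one that unrolls 224 passes.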
def get_if_access(level, point, layer, mac_capacity=1):
"""
Get per element # of access of Input at current level
Not accurate because [FX, FY] is not totally irrelevant terms for ifmap..
#LMEI Need to be modified by using the concept of the dataset.
if level == 0 and mac_capacity == 0:
return layer.wfil * layer.hfil * layer.nofm // (layer.wstd * layer.hstd)
    ex_order_index = min(
        point.loop_orders[le.OX][level],
        point.loop_orders[le.OY][level],
        point.loop_orders[le.IC][level],
        point.loop_orders[le.ON][level],
    )
fx_exclusive = point.loop_orders[le.FX][level] < ex_order_index
fy_exclusive = point.loop_orders[le.FY][level] < ex_order_index
oc_exclusive = point.loop_orders[le.OC][level] < ex_order_index
fx_acc = reduce(mul, point.loop_blockings[le.FX][level + fx_exclusive :], 1)
fy_acc = reduce(mul, point.loop_blockings[le.FY][level + fy_exclusive :], 1)
oc_acc = reduce(mul, point.loop_blockings[le.OC][level + oc_exclusive :], 1)
# No loop orders among unrolled loops, they have the same order
fx_par = reduce(mul, point.loop_partitionings[le.FX][level:], 1)
fy_par = reduce(mul, point.loop_partitionings[le.FY][level:], 1)
oc_par = reduce(mul, point.loop_partitionings[le.OC][level:], 1)
    return fx_acc * fy_acc * oc_acc * fx_par * fy_par * oc_par // (layer.wstd * layer.hstd)
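
# Note on the return expression of get_if_access above: dividing by (layer.wstd * layer.hstd)
# appears to approximate the fact that, with strided convolution, each ifmap element is reused
# by only about FX*FY/(wstd*hstd) output positions rather than FX*FY; as the docstring says,
# the ifmap count is only an approximation.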
def get_of_access(level, point, layer, mac_capacity=1):
"""
Get per element # of access of Output at current level
For output:
Relevant terms [OX, OY, OC, ON]
irrelevant terms [FX, FY, IC]
Calculating rule:
At lowest mem level (directly talk to MAC), calculate per element access
by timing all irrelevant terms [FX, FY, IC] together
For the rest higher mem levels,
firstly, check if there is stationary possibility
(irrelevant loops for filter [FX, FY, IC] are at the innermost position of this level)
if there is, exclude the irrelevant loop(s) from the current level's # of per element access computing
because they have been taken into account in lower level's # of per element access computing
secondly, calculate the current level's # of per element access
by multiplying all the irrelevant terms from current level to the highest level
including both temporal unrolling part and spatial unrolling part (parallelism).
return layer.wfil * layer.hfil * layer.nifm
    ex_order_index = min(
        point.loop_orders[le.OX][level],
        point.loop_orders[le.OY][level],
        point.loop_orders[le.OC][level],
        point.loop_orders[le.ON][level],
    )
fx_exclusive = point.loop_orders[le.FX][level] < ex_order_index
fy_exclusive = point.loop_orders[le.FY][level] < ex_order_index
ic_exclusive = point.loop_orders[le.IC][level] < ex_order_index
fx_acc = reduce(mul, point.loop_blockings[le.FX][level + fx_exclusive :], 1)
fy_acc = reduce(mul, point.loop_blockings[le.FY][level + fy_exclusive :], 1)
ic_acc = reduce(mul, point.loop_blockings[le.IC][level + ic_exclusive :], 1)
fx_par = reduce(mul, point.loop_partitionings[le.FX][level:], 1)
fy_par = reduce(mul, point.loop_partitionings[le.FY][level:], 1)
ic_par = reduce(mul, point.loop_partitionings[le.IC][level:], 1)
return fx_acc * fy_acc * ic_acc * fx_par * fy_par * ic_par
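
# Example of the stationarity rule above (hypothetical loop order, for illustration only): if at
# some level the FY loop sits inside all of the output-relevant loops (loop_orders[le.FY][level]
# < ex_order_index), the partial sums stay stationary across FY, so fy_exclusive is True and this
# level's FY blocking factor is skipped (the blocking slice starts at level + 1) when multiplying
# up the per-element ofmap access count.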
def get_fl_access(level, point, layer, mac_capacity=1):
"""
Get per element # of access of Weight at current level
For filter:
Relevant terms [FX, FY, IC, OC]
irrelevant terms [OX, OY, ON]
Calculating rule:
At lowest mem level (directly talk to MAC), calculate per element access
by timing all irrelevant terms [OX, OY, ON] together
For the rest higher mem levels,
firstly, check if there is stationary possibility
(irrelevant loops for filter [OX, OY, ON] are at the innermost position of this level)
if there is, exclude the irrelevant loop(s) from the current level's # of per element access computing
because they have been taken into account in lower level's # of per element access computing
secondly, calculate the current level's # of per element access
by multiplying all the irrelevant terms from current level to the highest level
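    # The function body is cut off in this view. What follows is a minimal sketch of the
    # filter-access computation, reconstructed by analogy with get_if_access/get_of_access
    # above and NOT the original implementation: the relevant loops [FX, FY, IC, OC] set the
    # stationarity threshold, and the irrelevant loops [OX, OY, ON] are multiplied up.
    if level == 0 and mac_capacity == 0:
        return layer.wofm * layer.hofm * layer.nimg
    ex_order_index = min(
        point.loop_orders[le.FX][level],
        point.loop_orders[le.FY][level],
        point.loop_orders[le.IC][level],
        point.loop_orders[le.OC][level],
    )
    ox_exclusive = point.loop_orders[le.OX][level] < ex_order_index
    oy_exclusive = point.loop_orders[le.OY][level] < ex_order_index
    on_exclusive = point.loop_orders[le.ON][level] < ex_order_index
    ox_acc = reduce(mul, point.loop_blockings[le.OX][level + ox_exclusive :], 1)
    oy_acc = reduce(mul, point.loop_blockings[le.OY][level + oy_exclusive :], 1)
    on_acc = reduce(mul, point.loop_blockings[le.ON][level + on_exclusive :], 1)
    ox_par = reduce(mul, point.loop_partitionings[le.OX][level:], 1)
    oy_par = reduce(mul, point.loop_partitionings[le.OY][level:], 1)
    on_par = reduce(mul, point.loop_partitionings[le.ON][level:], 1)
    return ox_acc * oy_acc * on_acc * ox_par * oy_par * on_par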