Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
'''
Cost model.
'''
#import numpy as np
from operator import mul
from operator import add
from functools import reduce
import copy
import math
from . import loop_enum as le
from . import buffer_enum as be
def get_comp_cost(layer):
'''
Compute the total # of MAC computation, it is independent of other optimizations
Also it is independent of input size and input/filter stride
Total # of computation = OX*OY*IC*OC*ON*FX*FY
'''
cost = layer.wofm * layer.hofm * layer.nifm * layer.nofm \
* layer.nimg * layer.wfil * layer.hfil
return cost
def get_ideal_performance(layer, resource):
'''
Compute the ideal runtime in cycles by assuming 100% PE array utilization
Ideal # of cycles = Total # of MAC computation / Total # of PEs
#LMEI Need to be modified if later when adding precision-scalable PE.
# of functional PE will change depending on different precision modes.
'''
total_comp = get_comp_cost(layer)
number_pe = reduce(mul, resource.para_count_list, 1)
runtime = math.ceil(total_comp *1.0 / number_pe)
return runtime
def get_layer_size(layer):
'''
Get size of ifmap, ofmap, filter of the layer
#LMEI ifmap_size should be able to calculate based on ofmap_size and input stride(IS) /filter stride(FS)
IX = IS*(OX-1) + FS*(FX-1) + 1
wifm = wistd*(wofm-1) + wfstd*(wfil-1) + 1
'''
ifmap_size = layer.wifm * layer.hifm * layer.nifm * layer.nimg
ofmap_size = layer.wofm * layer.hofm * layer.nofm * layer.nimg
flmap_size = layer.wfil * layer.hfil * layer.nifm * layer.nofm
return [ifmap_size, ofmap_size, flmap_size]
def get_hinted_para(level, hint):
'''
Get the actual total spatial unrolling size from loop schedule
'''
assert hint
hinted_para = 1
for loop in range(le.NUM):
if loop in hint:
hinted_loop_para = hint[loop][level][2]
hinted_para *= hinted_loop_para
return hinted_para
def valid_dataflow(resource, hint):
'''
Check if the actual spatial unrolling size from loop schedule meets the HW utilization requirement
by comparing it with real HW parallelism size * utilization threshold.
'''
num_levels = resource.buffer_levels()
for level in range(num_levels):
if resource.paras[level].count != 1 and \
get_hinted_para(level, hint) < (resource.paras[level].count * resource.utilization_threshold):
return False
return True
def get_if_access(level, point, layer, mac_capacity = 1):
'''
Get per element # of access of Input at current level
Not accurate because [FX, FY] is not totally irrelevant terms for ifmap..
#LMEI Need to be modified by using the concept of the dataset.
'''
if level == 0 and mac_capacity == 0:
return layer.wfil * layer.hfil * layer.nofm / (layer.wstd * layer.hstd)
ex_order_index = min(point.loop_orders[le.OX][level],
point.loop_orders[le.OY][level],
point.loop_orders[le.IC][level],
point.loop_orders[le.ON][level])
fx_exclusive = point.loop_orders[le.FX][level] < ex_order_index
fy_exclusive = point.loop_orders[le.FY][level] < ex_order_index
oc_exclusive = point.loop_orders[le.OC][level] < ex_order_index
fx_acc = reduce(mul, point.loop_blockings[le.FX][level+fx_exclusive:], 1)
fy_acc = reduce(mul, point.loop_blockings[le.FY][level+fy_exclusive:], 1)
oc_acc = reduce(mul, point.loop_blockings[le.OC][level+oc_exclusive:], 1)
# No loop orders among unrolled loops, they have the same order
fx_par = reduce(mul, point.loop_partitionings[le.FX][level:], 1)
fy_par = reduce(mul, point.loop_partitionings[le.FY][level:], 1)
oc_par = reduce(mul, point.loop_partitionings[le.OC][level:], 1)
return fx_acc * fy_acc * oc_acc * fx_par * fy_par * oc_par / (layer.wstd * layer.hstd)
def get_of_access(level, point, layer, mac_capacity = 1):
'''
Get per element # of access of Output at current level
For output:
Relevant terms [OX, OY, OC, ON]
irrelevant terms [FX, FY, IC]
Calculating rule:
At lowest mem level (directly talk to MAC), calculate per element access
by timing all irrelevant terms [FX, FY, IC] together
For the rest higher mem levels,
firstly, check if there is stationary possibility
(irrelevant loops for filter [FX, FY, IC] are at the innermost position of this level)
if there is, exclude the irrelevant loop(s) from the current level's # of per element access computing
because they have been taken into account in lower level's # of per element access computing
secondly, calculate the current level's # of per element access
by multiplying all the irrelevant terms from current level to the highest level
including both temporal unrolling part and spatial unrolling part (parallelism).
'''
if level == 0 and mac_capacity == 0 :
return layer.wfil * layer.hfil * layer.nifm
ex_order_index = min(point.loop_orders[le.OX][level],
point.loop_orders[le.OY][level],
point.loop_orders[le.OC][level],
point.loop_orders[le.ON][level])
fx_exclusive = point.loop_orders[le.FX][level] < ex_order_index
fy_exclusive = point.loop_orders[le.FY][level] < ex_order_index
ic_exclusive = point.loop_orders[le.IC][level] < ex_order_index
fx_acc = reduce(mul, point.loop_blockings[le.FX][level+fx_exclusive:], 1)
fy_acc = reduce(mul, point.loop_blockings[le.FY][level+fy_exclusive:], 1)
ic_acc = reduce(mul, point.loop_blockings[le.IC][level+ic_exclusive:], 1)
fx_par = reduce(mul, point.loop_partitionings[le.FX][level:], 1)
fy_par = reduce(mul, point.loop_partitionings[le.FY][level:], 1)
ic_par = reduce(mul, point.loop_partitionings[le.IC][level:], 1)
return fx_acc * fy_acc * ic_acc * fx_par * fy_par * ic_par
def get_fl_access(level, point, layer, mac_capacity = 1):
'''
Get per element # of access of Weight at current level
For filter:
Relevant terms [FX, FY, IC, OC]
irrelevant terms [OX, OY, ON]
Calculating rule:
At lowest mem level (directly talk to MAC), calculate per element access
by timing all irrelevant terms [OX, OY, ON] together
For the rest higher mem levels,
firstly, check if there is stationary possibility
(irrelevant loops for filter [OX, OY, ON] are at the innermost position of this level)
if there is, exclude the irrelevant loop(s) from the current level's # of per element access computing
because they have been taken into account in lower level's # of per element access computing
secondly, calculate the current level's # of per element access
by multiplying all the irrelevant terms from current level to the highest level
including both temporal unrolling part and spatial unrolling part (parallelism).
'''
if level == 0 and mac_capacity == 0:
return layer.wofm * layer.hofm * layer.nimg
ex_order_index = min(point.loop_orders[le.FX][level],
point.loop_orders[le.FY][level],
point.loop_orders[le.IC][level],
point.loop_orders[le.OC][level])
ox_exclusive = point.loop_orders[le.OX][level] < ex_order_index
oy_exclusive = point.loop_orders[le.OY][level] < ex_order_index
on_exclusive = point.loop_orders[le.ON][level] < ex_order_index
ox_acc = reduce(mul, point.loop_blockings[le.OX][level+ox_exclusive:], 1)
Loading
Loading full blame...