Commit 17fc485b authored by xuanyoya
Setup project
# CNN-blocking
A tool for optimizing CNN loop blocking.
```
usage: run_optimizer.py [-h] [-s SCHEDULE] [-v]
                        {basic,mem_explore,dataflow_explore} arch network

positional arguments:
  {basic,mem_explore,dataflow_explore}
                        optimizer type
  arch                  architecture specification
  network               layer specification

optional arguments:
  -h, --help            show this help message and exit
  -s SCHEDULE, --schedule SCHEDULE
                        restriction of the schedule space; optional, but
                        restricting the schedule space accelerates the
                        script significantly
  -v, --verbose         verbosity
```
# Examples
## To optimize loop blocking.
- Dataflow: C | K
- Memory Architecture: 3 levels
- Network: AlexNet Conv3 Batch16
```
python ./tools/run_optimizer.py basic ./examples/arch/3_level_mem_basic_example.json ./examples/layer/alex_conv3_batch16.json -s ./examples/schedule/dataflow_C_K.json -v
```
## To optimize memory capacity.
- Dataflow: C | K
- Memory Architecture: 3 levels
- Network: AlexNet Conv3 Batch16
```
python ./tools/run_optimizer.py mem_explore ./examples/arch/3_level_mem_explore_example.json ./examples/layer/alex_conv3_batch16.json -s ./examples/schedule/eyeriss_alex_conv3.json -v
```
## To explore dataflow.
- Dataflow: All
- Memory Architecture: Eyeriss
- Network: AlexNet Conv3 Batch16
```
python ./tools/run_optimizer.py dataflow_explore ./examples/arch/3_level_mem_basic_example.json ./examples/layer/alex_conv3_batch16.json -v
```
or, naming the output pickle file explicitly:
```
python ./tools/run_optimizer.py dataflow_explore ./examples/arch/3_level_mem_basic_example.json ./examples/layer/alex_conv3_batch16.json -n user_defined_pickle_filename -v
```
'''
cnn_mapping package
'''
from .mapping_point import MappingPoint
from .resource import Resource
from .layer import Layer
from .schedule import Schedule
from . import loop_enum as le
from . import utils
from . import extract_input
from . import cost_model
from . import mapping_point_generator
from . import optimizer
'''
Buffer enum type.
Buffers include ifmap (IF), ofmap (OF), filter (FL).
'''
IF = 0
OF = 1
FL = 2
NUM = 3
from collections import deque
class Cache(object):
'''
Helper class that caches computed values to reduce runtime
'''
def __init__(self, num_levels, size):
self.num_levels = num_levels
self.cache_map = [dict() for x in range(num_levels)]
self.cache_queue = [deque() for i in range(num_levels)]
self.size = size
def read_cache(self, level, data):
'''
Read from the cache only; the cache is not modified
'''
if len(self.cache_map[level]) == 0 or data not in self.cache_map[level]:
return None
else:
return self.cache_map[level][data]
def write_cache(self, level, data, value):
'''
Write to the cache, updating both the map and the queue
'''
assert len(self.cache_map[level]) == len(self.cache_queue[level])
self.cache_map[level][data] = value
self.cache_queue[level].append(data)
if len(self.cache_queue[level]) > self.size:
pop_ele = self.cache_queue[level].popleft()
del self.cache_map[level][pop_ele]
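A minimal usage sketch (not part of the commit), assuming the Cache class above is importable from the package; the module path and the cost function below are made up for illustration:
```
from cnn_mapping.cache import Cache   # assumed module path

cache = Cache(num_levels=3, size=1024)

def cached_cost(level, key):
    # look up first; on a miss, compute and store (FIFO eviction once a
    # level holds more than `size` entries)
    value = cache.read_cache(level, key)
    if value is None:
        value = expensive_cost(key)   # hypothetical expensive computation
        cache.write_cache(level, key, value)
    return value
```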
import json
import os
import loop_enum as le
def extract_arch_info(arch_file):
with open(arch_file) as json_data_file:
data = json.load(json_data_file)
assert data["mem_levels"] == len(data["capacity"]), \
"capacity list is invalid, too many or too few elements"
assert data["mem_levels"] == len(data["access_cost"]), \
"access_cost list is invalid, too many or too few elements"
assert data["mem_levels"] == len(data["parallel_count"]), \
"parallel_count list is invalid, too many or too few elements"
# convert capacities from bytes to numbers of words, given the precision in bits
num_bytes = data["precision"]/8
capacity_list = [x/num_bytes for x in data["capacity"]]
data["capacity"] = capacity_list
if "static_cost" not in data:
data["static_cost"] = [0,] * data["mem_levels"]
else:
assert data["mem_levels"] == len(data["static_cost"]), \
"static_cost list is invalid, too many or too few elements"
if "mac_capacity" not in data:
data["mac_capacity"] = 0
if "parallel_mode" not in data:
data["parallel_mode"] = [0,] * data["mem_levels"]
for level in xrange(data["mem_levels"]):
if data["parallel_count"][level] != 1:
data["parallel_mode"][level] = 1
else:
assert data["mem_levels"] == len(data["parallel_mode"]), \
"parallel_mode list is invalid, too many or too few elements"
if "array_dim" not in data:
data["array_dim"] = None
if "utilization_threshold" not in data:
data["utilization_threshold"] = 0.75
if "replication" not in data:
data["replication"] = True
return data
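A rough round-trip sketch (not part of the commit) of the normalization performed above: capacities given in bytes become word counts based on the precision, and omitted fields are filled with defaults. The spec mirrors the example architecture JSONs included later in this commit.
```
import json
import tempfile

from cnn_mapping import extract_input

spec = {
    "mem_levels": 3,
    "capacity": [512, 131072, 1073741824],   # bytes per buffer level
    "access_cost": [1, 6, 200],               # cost per access
    "parallel_count": [16, 1, 1],             # parallel units per level
    "precision": 16                           # bits per word
}
with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as f:
    json.dump(spec, f)

info = extract_input.extract_arch_info(f.name)
# info["capacity"] is now in words: [256, 65536, 536870912]
# omitted fields get defaults: static_cost=[0, 0, 0], mac_capacity=0,
# parallel_mode=[1, 0, 0] (level 0 has parallel_count > 1),
# array_dim=None, utilization_threshold=0.75, replication=True
```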
def extract_network_info(network_file):
with open(network_file) as json_data_file:
data = json.load(json_data_file)
if "batch_size" not in data:
data["batch_size"] = 1
if "stride_width" not in data:
data["stride_width"] = 1
if "stride_height" not in data:
data["stride_height"] = 1
layer_summary = data.values()
data['layer_info'] = layer_summary
data['layer_name'] = os.path.splitext(os.path.basename(network_file))[0]
return data
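The layer files referenced in the README (e.g. alex_conv3_batch16.json) are not shown in this commit. Assuming their top-level keys are the ones consumed by Layer.layer() further below, a plausible example would be:
```
# Hypothetical contents of a layer file such as alex_conv3_batch16.json;
# the field names come from Layer.layer(), the values are the standard
# AlexNet conv3 dimensions with a batch of 16.
layer_spec = {
    "input_fmap_channel": 256,
    "output_fmap_channel": 384,
    "fmap_width": 13,
    "fmap_height": 13,
    "window_width": 3,
    "window_height": 3,
    "batch_size": 16
    # stride_width / stride_height default to 1 when omitted
}
```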
def extract_schedule_info(schedule_file, num_levels):
with open(schedule_file) as json_data_file:
data = json.load(json_data_file)
schedule = {}
hint = data["schedule_hint"]
schedule_hint = {}
for loop in hint:
schedule_hint[le.loop_table[loop]] = [None,]*num_levels
for level in hint[loop]:
level_index = int(level.lstrip('level'))
schedule_hint[le.loop_table[loop]][level_index] = [None,]*3
if "order" in hint[loop][level]:
schedule_hint[le.loop_table[loop]][level_index][0] = hint[loop][level]["order"]
if "blocking_size" in hint[loop][level]:
schedule_hint[le.loop_table[loop]][level_index][1] = hint[loop][level]["blocking_size"]
if "partitioning_size" in hint[loop][level]:
schedule_hint[le.loop_table[loop]][level_index][2] = hint[loop][level]["partitioning_size"]
schedule["schedule_hint"] = schedule_hint
if "partition_loops" not in data:
schedule["partition_loops"] = None
else:
schedule["partition_loops"] = data["partition_loops"]
#TODO partition at dimension
return schedule
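The schedule files passed with -s are likewise not included here. Based on the parsing above, a schedule hint constrains individual loops per memory level with an order, a blocking size and/or a partitioning size; an illustrative (made-up) restriction could look like this:
```
# Illustrative schedule restriction in the structure parsed above; the JSON
# file on disk would contain this same dictionary. The values are made up,
# not taken from the real dataflow_C_K.json or eyeriss_alex_conv3.json.
schedule_spec = {
    "schedule_hint": {
        "OC": {"level0": {"order": 0, "blocking_size": 4, "partitioning_size": 4}},
        "IC": {"level0": {"order": 1, "partitioning_size": 4}}
    },
    "partition_loops": ["OC", "IC"]
}
```
extract_schedule_info turns each such entry into a per-level [order, blocking_size, partitioning_size] triple keyed by the loop enum value.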
def extract_info(args):
arch_info = extract_arch_info(args.arch)
network_info = extract_network_info(args.network)
schedule_info = extract_schedule_info(args.schedule, arch_info["mem_levels"]) if args.schedule else None
return arch_info, network_info, schedule_info
'''
Layer specification.
'''
class Layer(object):
'''
NN layer parameters.
nifm: # ifmap channels.
nofm: # ofmap channels.
wifm: ifmap width.
hifm: ifmap height.
wofm: ofmap width.
hofm: ofmap height.
wfil: weight filter width.
hfil: weight filter height.
nimg: # input images (batch).
wstd: stride size in width dimension.
hstd: stride size in height dimension.
'''
def __init__(self, nifm, nofm, wofm, hofm, wfil, hfil, nimg=1, wstd=1, hstd=1):
self.nifm = nifm
self.nofm = nofm
self.wofm = wofm
self.hofm = hofm
self.wifm = wfil + (wofm - 1) * wstd
self.hifm = hfil + (hofm - 1) * hstd
self.wfil = wfil
self.hfil = hfil
self.nimg = nimg
self.wstd = wstd
self.hstd = hstd
assert self.wofm > 0
assert self.hofm > 0
assert self.nimg > 0
self.sizes = [wfil, hfil, wofm, hofm, nofm, nifm, nimg]
@classmethod
def layer(cls, info):
return cls(info["input_fmap_channel"], info["output_fmap_channel"],
info["fmap_width"], info["fmap_height"], info["window_width"],
info["window_height"], info["batch_size"],
info["stride_width"], info["stride_height"])
class FCLayer(Layer):
'''
NN fully-connected layer parameters.
(wifm, hifm) = (wfil, hfil), wstd = hstd = 1, wofm = hofm = 1.
'''
def __init__(self, nifm, nofm, wfil, hfil, nimg=1):
Layer.__init__(self, nifm, nofm, 1, 1, wfil, hfil, nimg)
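For illustration, the AlexNet conv3 layer with batch 16 referenced in the README can be built directly; the dimensions are assumed from the standard AlexNet definition rather than taken from the example JSON.
```
from cnn_mapping import Layer

# AlexNet conv3, batch 16: 256 input channels, 384 output channels,
# 13x13 output feature map, 3x3 filters, stride 1 (assumed values).
conv3 = Layer(nifm=256, nofm=384, wofm=13, hofm=13, wfil=3, hfil=3, nimg=16)

# The input feature map size is derived from the output size and the filter:
# wifm = wfil + (wofm - 1) * wstd = 3 + 12 * 1 = 15
assert conv3.wifm == 15 and conv3.hifm == 15
```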
'''
Loop enum type.
Loops include filter width (FX), filter height (FY),
output width (OX), output height (OY),
output channel (OC), input channel (IC),
batch (ON).
'''
FX = 0
FY = 1
OX = 2
OY = 3
OC = 4
IC = 5
ON = 6
NUM = 7
table = {0: 'FX',
1: 'FY',
2: 'OX',
3: 'OY',
4: 'OC',
5: 'IC',
6: 'ON' }
loop_table = { 'FX': 0,
'FY': 1,
'OX': 2,
'OY': 3,
'OC': 4,
'IC': 5,
'ON': 6}
'''
Type for a specific mapping point.
'''
class MappingPoint(object):
'''
Configurations of a specific mapping.
Mapping includes the complete description of the loop order and loop
blocking factors of each buffer level, loop partitioning onto each level of
parallel units, etc.
Each loop order and each set of loop blocking factors correspond to one
loop across all buffer levels, since it does not make much sense to block
the same loop more than once at a single buffer level.
Each set of loop partitioning factors corresponds to the degree of
parallelism of one loop across all levels.
Partition mode is the access mode used in the parallel case:
0 (default): access the next level of memory
1: access the buffers of neighboring processing units
def __init__(self, loop_order_list, loop_blockings_list,
loop_partitionings_list, para_loop_dim_list=None):
# NOTE(mgao12): no value validation here; cost model needs to abandon
# invalid mapping.
self.loop_orders = loop_order_list
self.loop_blockings = loop_blockings_list
self.loop_partitionings = loop_partitionings_list
self.para_loop_dim = para_loop_dim_list
def loop_order(self, loop):
'''
Loop order of the given loop.
Return a tuple of the order indices for the same loop at all buffer
levels, i.e., if a tuple for loop OX is returned, and the first
element of this tuple t[0] = 2, then loop OX is the third innermost
loop at the first buffer level.
Tuples are organized in the same order as the loop enum.
E.g., for a two-level memory hierarchy, each tuple contains two
elements; [(0, 0), (1, 1), (2, 4), (3, 5), (4, 3), (5, 2), (6, 6)]
means that for the first loop (FX = 0), FX is the innermost loop at
both levels (tuple (0, 0)), etc. I.e., it describes a loop structure
like:
for on
  for oy
    for ox
      for oc
        for ic
          for fy
            for fx
              for on
                for ic
                  for oc
                    for oy
                      for ox
                        for fy
                          for fx
                            ...
'''
return self.loop_orders[loop]
def loop_blocking(self, loop):
'''
Loop blocking factors of the given loop.
Return a tuple of factors for the given loop at all buffer levels, from
inside level to outside level.
Tuples are organized in the same order as the loop enum.
E.g., [(4, 2), (8, 1), ...] means for the first loop (FX = 0), the
blocking factor is 4 for the innermost level, and 2 for the next level;
for the second loop (FY = 1), the blocking factor is 8 for the
innermost level, and 1 for the next level.
'''
return self.loop_blockings[loop]
def loop_partitioning(self, loop):
'''
Loop partitioning factors of the given loop.
Return a tuple of factors for the given loop at all buffer levels, from
inside level to outside level.
Tuples are organized in the same order as the loop enum.
E.g., [(4, 2), (8, 1), ...] means the first loop (FX = 0) is
parallelized over 4 units at the first parallel level, and over 2 at the
next level, etc.
'''
return self.loop_partitionings[loop]
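A small, hypothetical two-level example that follows the conventions documented above (per-loop tuples in loop-enum order, inner buffer level first); the blocking factors are illustrative, chosen to multiply out to AlexNet conv3 / batch-16 loop bounds:
```
from cnn_mapping import MappingPoint

# Loop enum order: FX, FY, OX, OY, OC, IC, ON
loop_orders = [(0, 0), (1, 1), (2, 4), (3, 5), (4, 3), (5, 2), (6, 6)]
loop_blockings = [(3, 1), (3, 1), (13, 1), (1, 13), (16, 24), (16, 16), (1, 16)]
loop_partitionings = [(1, 1)] * 7   # no parallel partitioning in this sketch

point = MappingPoint(loop_orders, loop_blockings, loop_partitionings)
point.loop_order(2)     # -> (2, 4): OX is the 3rd innermost loop at level 0
point.loop_blocking(4)  # -> (16, 24): OC blocking factors, inner level first
```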
'''
Top level function of optimization framework
'''
import mapping_point_generator
import cost_model
import loop_enum as le
import buffer_enum as be
def opt_optimizer(resource, layer, hint=None, verbose=False):
'''
Evaluate the cost of each mapping point,
record the mapping_point with the smallest cost
'''
if hint is not None and hint.partition_loops is None:
valid = cost_model.valid_dataflow(resource, hint.schedule_hint)
assert valid == True, "Specified schedule doesn't satisfy the utilization threshold, please check partitioning_size"
smallest_cost, perf, best_mapping_point = mapping_point_generator.opt_mapping_point_generator_function(resource, layer, hint, verbose)
total_cost = cost_model.get_cost(resource, best_mapping_point, layer, verbose)
if verbose:
print "Optimal energy (pJ): ", smallest_cost
print "Runtime (cycles):", perf
print "Best mapping_point: ", best_mapping_point.loop_blockings, best_mapping_point.loop_partitionings, best_mapping_point.loop_orders
return [smallest_cost, best_mapping_point]
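A hedged end-to-end sketch of how these pieces appear to fit together. The actual tools/run_optimizer.py driver is not part of this excerpt, so the wiring below is an assumption based on the extract_* functions and the classmethods defined in this package; the file paths come from the README examples.
```
import cnn_mapping as cm

arch_info = cm.extract_input.extract_arch_info('./examples/arch/3_level_mem_basic_example.json')
layer_info = cm.extract_input.extract_network_info('./examples/layer/alex_conv3_batch16.json')
schedule_info = cm.extract_input.extract_schedule_info('./examples/schedule/dataflow_C_K.json',
                                                       arch_info["mem_levels"])

resource = cm.Resource.arch(arch_info)
layer = cm.Layer.layer(layer_info)
schedule = cm.Schedule.schedule(schedule_info)

# opt_optimizer returns [smallest_cost, best_mapping_point]
cost, best_point = cm.optimizer.opt_optimizer(resource, layer, schedule, verbose=True)
```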
'''
Hardware resource types.
'''
#import numpy as np
from collections import namedtuple
from operator import mul
import math
class Buffer(namedtuple('Buffer',
['capacity', 'access_cost', 'unit_static_cost'])):
'''
Buffer specification.
Immutable type.
Buffer attributes include capacity, access cost, unit static cost.
Capacity is for a single buffer (if the current level has parallelism,
it is the capacity of the buffer bank inside each parallel unit);
access cost is the cost per access;
unit static cost is the static cost per time unit.
'''
pass
class Parallelism(namedtuple('Parallelism',
['count', 'access_mode', 'array_access_cost', 'array_dim', 'array_width'])):
'''
Parallelism specification.
Immutable type.
Parallelism attributes include count and access_mode.
Count is the number of parallel units.
Access mode is the mode of accessing non-private data,
for example whether to access a neighboring PE's buffer or
to go to the next-level buffer.
Array access cost is the cost of accessing array-level buffers.
Array dimension is the dimension of the PE array, i.e. whether it is 1D or 2D.
Array width is the width of the PE array: for a 1D array it equals the
parallelism count; for a 2D array it is sqrt(count).
Note: shared buffer level is the level
index of the lowest shared buffer for this parallelism.
'''
pass
class Resource(object):
'''
Hardware resource specification.
Hardware resource includes buffer hierarchy and parallel processing units.
mac_capacity, 0 or 1, determines whether the MAC can buffer one output.
'''
def __init__(self, buf_capacity_list, buf_access_cost_list,
buf_unit_static_cost_list, para_count_list,
mac_capacity=1, partition_mode=None, array_access_cost=None,
array_dim = None, utilization_threshold = 0.75, replication=True):
# Buffers.
assert len(buf_capacity_list) == len(buf_access_cost_list)
assert len(buf_capacity_list) == len(buf_unit_static_cost_list)
assert len(buf_capacity_list) == len(para_count_list)
self.bufs = [Buffer(*t) for t in zip(buf_capacity_list, \
buf_access_cost_list, buf_unit_static_cost_list)]
self.num_levels = len(self.bufs)
# Parallelism.
array_access_costs = [None] * len(para_count_list)
if not partition_mode :
partition_mode = [0] * len(para_count_list)
else :
array_level = 0
for i in xrange(self.num_levels):
# when using a non-default partition mode, the parallelism
# count at that level needs to be larger than 1
assert partition_mode[i] == 0 or para_count_list[i] > 1
if partition_mode[i] == 1 or partition_mode[i] == 2:
array_access_costs[i] = array_access_cost[array_level]
array_level += 1
self.para_index = [i for i, e in enumerate(para_count_list) if e != 1]
if not array_dim:
array_dim = [2 if e != 1 else 1 for e in para_count_list]
array_width = [para_count_list[i] if array_dim[i] == 1 else int(math.sqrt(para_count_list[i])) for i in xrange(self.num_levels)]
self.paras = [Parallelism(*t) for t in zip(para_count_list, \
partition_mode, array_access_costs, array_dim, array_width)]
self.access_cost = buf_access_cost_list
self.mac_capacity = mac_capacity
self.array_access_cost = array_access_cost
self.para_count_list = para_count_list
self.utilization_threshold = utilization_threshold
self.replication = replication
@classmethod
def arch(cls, info):
return cls(info["capacity"], info["access_cost"], info["static_cost"],
info["parallel_count"], info["mac_capacity"], info["parallel_mode"],
info["parallel_cost"], info["array_dim"], info["utilization_threshold"], info["replication"])
def buffer_levels(self):
'''
Return total levels of buffers in the hierarchy.
'''
return self.num_levels
def buffer(self, level):
'''
Return the specification of the buffer of the given level.
'''
return self.bufs[level]
def parallelism(self, level):
'''
Return the specification of the parallelism of the given level.
'''
return self.paras[level]
def total_parallelism(self):
'''
Return the specification of the total parallelism.
'''
return reduce(mul, self.para_count_list, 1)
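As an illustration, the first example architecture shown later in this commit (512 B / 128 KB / 1 GB buffers, 16 parallel units at level 0, 16-bit precision) corresponds roughly to the following direct construction, with capacities already converted from bytes to words as extract_arch_info does:
```
from cnn_mapping import Resource

# Roughly the first example arch JSON below after extraction
# (capacities in 16-bit words rather than bytes).
resource = Resource(buf_capacity_list=[256, 65536, 536870912],
                    buf_access_cost_list=[1, 6, 200],
                    buf_unit_static_cost_list=[0, 0, 0],
                    para_count_list=[16, 1, 1],
                    mac_capacity=0,
                    partition_mode=[1, 0, 0],
                    array_access_cost=[2])

resource.buffer_levels()      # -> 3
resource.total_parallelism()  # -> 16
```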
'''
Schedule hint
'''
import loop_enum as le
class Schedule(object):
def __init__(self, schedule_hint, partition_loops=None):
self.schedule_hint = schedule_hint
if partition_loops != None:
self.partition_loops = []
for l in partition_loops:
self.partition_loops.append(le.loop_table[l])
else:
self.partition_loops = partition_loops
num_levels = len(schedule_hint.values()[0])
hint_para_index = {}
for loop in schedule_hint:
for level in xrange(num_levels):
if schedule_hint[loop][level] != None and schedule_hint[loop][level][2] != None:
if level not in hint_para_index:
hint_para_index[level] = [loop]
else:
hint_para_index[level].append(loop)
self.hint_para_index = hint_para_index
@classmethod
def schedule(cls, info):
return cls(info["schedule_hint"], info["partition_loops"])
import loop_enum as le
import buffer_enum as be
def print_loop_nest(point):
'''
Print, for each buffer level, the loops of the given mapping point whose
blocking or partitioning factor is not 1, as (loop name, blocking factor,
partitioning factor) tuples ordered from innermost to outermost, followed
by the parallel loop dimension assignment.
'''
loop_orders = zip(*point.loop_orders)
loop_blockings = zip(*point.loop_blockings)
loop_partitionings = zip(*point.loop_partitionings)
para_dims = point.para_loop_dim
num_level = len(loop_orders)
order_lists = []
for level in xrange(num_level):
order_list = [None] * le.NUM
for order in xrange(le.NUM):
if loop_blockings[level][order] != 1 or loop_partitionings[level][order] != 1 :
order_list[loop_orders[level][order]] = (le.table[order],
loop_blockings[level][order],
loop_partitionings[level][order])
order_lists.append(order_list)
print order_lists, para_dims
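A short usage sketch, reusing the hypothetical mapping point from the MappingPoint example above and assuming this helper lives in cnn_mapping.utils:
```
from cnn_mapping import utils   # assumed module for print_loop_nest

# Prints one list per buffer level (innermost loop first) with
# (loop name, blocking factor, partitioning factor) for every loop whose
# factors are not both 1, followed by the parallel loop dimensions.
utils.print_loop_nest(point)
```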
{
"mem_levels": 3,
"capacity":[512, 131072, 1073741824],
"access_cost":[1, 6, 200],
"static_cost":[0, 0, 0],
"parallel_count":[16, 1, 1],
"mac_capacity":0,
"parallel_mode": [1, 0, 0],
"parallel_cost":[2],
"precision":16
}
{
"mem_levels": 3,
"capacity":[64, 131072, 1073741824],
"access_cost":[0.1, 6, 200],
"static_cost":[0, 0, 0],
"parallel_count":[256, 1, 1],
"array_dim": [1, 1, 1],
"mac_capacity":0,
"parallel_mode": [1, 0, 0],
"parallel_cost":[0.25],
"precision":16
}
{
"mem_levels": 3,
"capacity":[512, 131072, 1073741824],
"access_cost":[0.96, 20, 200],
"static_cost":[0, 0, 0],
"parallel_count":[256, 1, 1],
"array_dim": [1, 1, 1],
"mac_capacity":0,
"parallel_mode": [1, 0, 0],
"parallel_cost":[0.035],
"precision":16
}
{
"mem_levels": 3,
"capacity":[64, 131072, 1073741824],
"access_cost":[0.1, 6, 200],
"static_cost":[0, 0, 0],
"parallel_count":[256, 1, 1],
"mac_capacity":0,
"parallel_mode": [1, 0, 0],
"parallel_cost":[0.25],
"precision":16
}
{
"mem_levels": 3,
"capacity":[64, 131072, 1073741824],
"access_cost":[0.12, 20, 200],
"static_cost":[0, 0, 0],
"parallel_count":[256, 1, 1],
"mac_capacity":0,
"parallel_mode": [1, 0, 0],
"parallel_cost":[0.035],
"precision":16
}
{
"mem_levels": 3,
"capacity":[512, 131072, 1073741824],
"access_cost":[0.96, 20, 200],
"static_cost":[0, 0, 0],
"parallel_count":[4096, 1, 1],
"mac_capacity":0,
"parallel_mode": [1, 0, 0],
"parallel_cost":[0.035],
"precision":16
}