Commit 17fc485b authored by xuanyoya
Setup project
# CNN-blocking
A tool for optimizing CNN loop blocking.
```
usage: run_optimizer.py [-h] [-s SCHEDULE] [-v]
                        {basic,mem_explore,dataflow_explore} arch network

positional arguments:
  {basic,mem_explore,dataflow_explore}
                        optimizer type
  arch                  architecture specification
  network               layer specification

optional arguments:
  -h, --help            show this help message and exit
  -s SCHEDULE, --schedule SCHEDULE
                        restriction of the schedule space; optional, but
                        restricting the schedule space accelerates the
                        script significantly
  -v, --verbose         verbosity
```
# Examples
## To optimize loop blocking.
- Dataflow: C | K
- Memory Architecture: 3 levels
- Network: AlexNet Conv3 Batch16
```
python ./tools/run_optimizer.py basic ./examples/arch/3_level_mem_basic_example.json ./examples/layer/alex_conv3_batch16.json -s ./examples/schedule/dataflow_C_K.json -v
```
## To optimize memory capacity.
- Dataflow: C | K
- Memory Architecture: 3 levels
- Network: AlexNet Conv3 Batch16
```
python ./tools/run_optimizer.py mem_explore ./examples/arch/3_level_mem_explore_example.json ./examples/layer/alex_conv3_batch16.json -s ./examples/schedule/eyeriss_alex_conv3.json -v
```
## To explore dataflow.
- Dataflow: All
- Memory Architecture: Eyeriss
- Network: AlexNet Conv3 Batch16
```
python ./tools/run_optimizer.py dataflow_explore ./examples/arch/3_level_mem_basic_example.json ./examples/layer/alex_conv3_batch16.json -v
```
or, naming the output pickle file explicitly:
```
python ./tools/run_optimizer.py dataflow_explore ./examples/arch/3_level_mem_basic_example.json ./examples/layer/alex_conv3_batch16.json -n user_defined_pickle_filename -v
```
'''
cnn_mapping package
'''
from .mapping_point import MappingPoint
from .resource import Resource
from .layer import Layer
from .schedule import Schedule
from . import loop_enum as le
from . import utils
from . import extract_input
from . import cost_model
from . import mapping_point_generator
from . import optimizer
'''
Buffer enum type.
Buffers include ifmap (IF), ofmap (OF), filter (FL).
'''
IF = 0
OF = 1
FL = 2
NUM = 3
from collections import deque
class Cache(object):
'''
Helper class that caches computed values to reduce runtime
'''
def __init__(self, num_levels, size):
self.num_levels = num_levels
self.cache_map = [dict() for x in range(num_levels)]
self.cache_queue = [deque() for i in range(num_levels)]
self.size = size
def read_cache(self, level, data):
'''
Read from the cache only; the cache is not modified
'''
if len(self.cache_map[level]) == 0 or data not in self.cache_map[level]:
return None
else:
return self.cache_map[level][data]
def write_cache(self, level, data, value):
'''
Write to the cache, updating both the map and the queue
'''
assert len(self.cache_map[level]) == len(self.cache_queue[level])
self.cache_map[level][data] = value
self.cache_queue[level].append(data)
if len(self.cache_queue[level]) > self.size:
pop_ele = self.cache_queue[level].popleft()
del self.cache_map[level][pop_ele]
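A minimal usage sketch (not part of the commit), assuming the Cache class above is importable from the package; the module path and the cost function below are made up for illustration:
```
from cnn_mapping.cache import Cache   # assumed module path

cache = Cache(num_levels=3, size=1024)

def cached_cost(level, key):
    # look up first; on a miss, compute and store (FIFO eviction once a
    # level holds more than `size` entries)
    value = cache.read_cache(level, key)
    if value is None:
        value = expensive_cost(key)   # hypothetical expensive computation
        cache.write_cache(level, key, value)
    return value
```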
import json
import os
import loop_enum as le
def extract_arch_info(arch_file):
with open(arch_file) as json_data_file:
data = json.load(json_data_file)
assert data["mem_levels"] == len(data["capacity"]), \
"capacity list is invalid, too many or too few elements"
assert data["mem_levels"] == len(data["access_cost"]), \
"access_cost list is invalid, too many or too few elements"
assert data["mem_levels"] == len(data["parallel_count"]), \
"parallel_count list is invalid, too many or too few elements"
# convert capacities from bytes to numbers of words, given the precision in bits
num_bytes = data["precision"]/8
capacity_list = [x/num_bytes for x in data["capacity"]]
data["capacity"] = capacity_list
if "static_cost" not in data:
data["static_cost"] = [0,] * data["mem_levels"]
else:
assert data["mem_levels"] == len(data["static_cost"]), \
"static_cost list is invalid, too many or too few elements"
if "mac_capacity" not in data:
data["mac_capacity"] = 0
if "parallel_mode" not in data:
data["parallel_mode"] = [0,] * data["mem_levels"]
for level in xrange(data["mem_levels"]):
if data["parallel_count"][level] != 1:
data["parallel_mode"][level] = 1
else:
assert data["mem_levels"] == len(data["parallel_mode"]), \
"parallel_mode list is invalid, too many or too few elements"
if "array_dim" not in data:
data["array_dim"] = None
if "utilization_threshold" not in data:
data["utilization_threshold"] = 0.75
if "replication" not in data:
data["replication"] = True
return data
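A rough round-trip sketch (not part of the commit) of the normalization performed above: capacities given in bytes become word counts based on the precision, and omitted fields are filled with defaults. The spec mirrors the example architecture JSONs included later in this commit.
```
import json
import tempfile

from cnn_mapping import extract_input

spec = {
    "mem_levels": 3,
    "capacity": [512, 131072, 1073741824],   # bytes per buffer level
    "access_cost": [1, 6, 200],               # cost per access
    "parallel_count": [16, 1, 1],             # parallel units per level
    "precision": 16                           # bits per word
}
with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as f:
    json.dump(spec, f)

info = extract_input.extract_arch_info(f.name)
# info["capacity"] is now in words: [256, 65536, 536870912]
# omitted fields get defaults: static_cost=[0, 0, 0], mac_capacity=0,
# parallel_mode=[1, 0, 0] (level 0 has parallel_count > 1),
# array_dim=None, utilization_threshold=0.75, replication=True
```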
def extract_network_info(network_file):
with open(network_file) as json_data_file:
data = json.load(json_data_file)
if "batch_size" not in data:
data["batch_size"] = 1
if "stride_width" not in data:
data["stride_width"] = 1
if "stride_height" not in data:
data["stride_height"] = 1
layer_summary = data.values()
data['layer_info'] = layer_summary
data['layer_name'] = os.path.splitext(os.path.basename(network_file))[0]
return data
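The layer files referenced in the README (e.g. alex_conv3_batch16.json) are not shown in this commit. Assuming their top-level keys are the ones consumed by Layer.layer() further below, a plausible example would be:
```
# Hypothetical contents of a layer file such as alex_conv3_batch16.json;
# the field names come from Layer.layer(), the values are the standard
# AlexNet conv3 dimensions with a batch of 16.
layer_spec = {
    "input_fmap_channel": 256,
    "output_fmap_channel": 384,
    "fmap_width": 13,
    "fmap_height": 13,
    "window_width": 3,
    "window_height": 3,
    "batch_size": 16
    # stride_width / stride_height default to 1 when omitted
}
```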
def extract_schedule_info(schedule_file, num_levels):
with open(schedule_file) as json_data_file:
data = json.load(json_data_file)
schedule = {}
hint = data["schedule_hint"]
schedule_hint = {}
for loop in hint:
schedule_hint[le.loop_table[loop]] = [None,]*num_levels
for level in hint[loop]:
level_index = int(level.lstrip('level'))
schedule_hint[le.loop_table[loop]][level_index] = [None,]*3
if "order" in hint[loop][level]:
schedule_hint[le.loop_table[loop]][level_index][0] = hint[loop][level]["order"]
if "blocking_size" in hint[loop][level]:
schedule_hint[le.loop_table[loop]][level_index][1] = hint[loop][level]["blocking_size"]
if "partitioning_size" in hint[loop][level]:
schedule_hint[le.loop_table[loop]][level_index][2] = hint[loop][level]["partitioning_size"]
schedule["schedule_hint"] = schedule_hint
if "partition_loops" not in data:
schedule["partition_loops"] = None
else:
schedule["partition_loops"] = data["partition_loops"]
#TODO partition at dimension
return schedule
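The schedule files passed with -s are likewise not included here. Based on the parsing above, a schedule hint constrains individual loops per memory level with an order, a blocking size and/or a partitioning size; an illustrative (made-up) restriction could look like this:
```
# Illustrative schedule restriction in the structure parsed above; the JSON
# file on disk would contain this same dictionary. The values are made up,
# not taken from the real dataflow_C_K.json or eyeriss_alex_conv3.json.
schedule_spec = {
    "schedule_hint": {
        "OC": {"level0": {"order": 0, "blocking_size": 4, "partitioning_size": 4}},
        "IC": {"level0": {"order": 1, "partitioning_size": 4}}
    },
    "partition_loops": ["OC", "IC"]
}
```
extract_schedule_info turns each such entry into a per-level [order, blocking_size, partitioning_size] triple keyed by the loop enum value.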
def extract_info(args):
arch_info = extract_arch_info(args.arch)
network_info = extract_network_info(args.network)
schedule_info = extract_schedule_info(args.schedule, arch_info["mem_levels"]) if args.schedule else None
return arch_info, network_info, schedule_info
'''
Layer specification.
'''
class Layer(object):
'''
NN layer parameters.
nifm: # ifmap channels.
nofm: # ofmap channels.
wifm: ifmap width.
hifm: ifmap height.
wofm: ofmap width.
hofm: ofmap height.
wfil: weight filter width.
hfil: weight filter height.
nimg: # input images (batch).
wstd: stride size in width dimension.
hstd: stride size in height dimension.
'''
def __init__(self, nifm, nofm, wofm, hofm, wfil, hfil, nimg=1, wstd=1, hstd=1):
self.nifm = nifm
self.nofm = nofm
self.wofm = wofm
self.hofm = hofm
self.wifm = wfil + (wofm - 1) * wstd
self.hifm = hfil + (hofm - 1) * hstd
self.wfil = wfil
self.hfil = hfil
self.nimg = nimg
self.wstd = wstd
self.hstd = hstd
assert self.wofm > 0
assert self.hofm > 0
assert self.nimg > 0
self.sizes = [wfil, hfil, wofm, hofm, nofm, nifm, nimg]
@classmethod
def layer(cls, info):
return cls(info["input_fmap_channel"], info["output_fmap_channel"],
info["fmap_width"], info["fmap_height"], info["window_width"],
info["window_height"], info["batch_size"],
info["stride_width"], info["stride_height"])
class FCLayer(Layer):
'''
NN fully-connected layer parameters.
(wifm, hifm) = (wfil, hfil), wstd = hstd = 1, wofm = hofm = 1.
'''
def __init__(self, nifm, nofm, wfil, hfil, nimg=1):
Layer.__init__(self, nifm, nofm, 1, 1, wfil, hfil, nimg)
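For illustration, the AlexNet conv3 layer with batch 16 referenced in the README can be built directly; the dimensions are assumed from the standard AlexNet definition rather than taken from the example JSON.
```
from cnn_mapping import Layer

# AlexNet conv3, batch 16: 256 input channels, 384 output channels,
# 13x13 output feature map, 3x3 filters, stride 1 (assumed values).
conv3 = Layer(nifm=256, nofm=384, wofm=13, hofm=13, wfil=3, hfil=3, nimg=16)

# The input feature map size is derived from the output size and the filter:
# wifm = wfil + (wofm - 1) * wstd = 3 + 12 * 1 = 15
assert conv3.wifm == 15 and conv3.hifm == 15
```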
'''
Loop enum type.
Loops include filter width (FX), filter height (FY),
output width (OX), output height (OY),
output channel (OC), input channel (IC),
batch (ON).
'''
FX = 0
FY = 1
OX = 2
OY = 3
OC = 4
IC = 5
ON = 6
NUM = 7
table = {0: 'FX',
1: 'FY',
2: 'OX',
3: 'OY',
4: 'OC',
5: 'IC',
6: 'ON' }
loop_table = { 'FX': 0,
'FY': 1,
'OX': 2,
'OY': 3,
'OC': 4,
'IC': 5,
'ON': 6}
'''
Type for a specific mapping point.
'''
class MappingPoint(object):
'''
Configurations of a specific mapping.
Mapping includes the complete description of the loop order and loop
blocking factors of each buffer level, loop partitioning onto each level of
parallel units, etc.
Each loop order and each set of loop blocking factors correspond to one
loop across all buffer levels, since it does not make much sense to block
the same loop more than once at a single buffer level.
Each set of loop partitioning factors corresponds to the degree of
parallelism of one loop across all levels.
Partition mode is the access mode used in the parallel case:
0 (default): access the next level of memory
1: access the buffers of neighboring processing units
def __init__(self, loop_order_list, loop_blockings_list,
loop_partitionings_list, para_loop_dim_list=None):
# NOTE(mgao12): no value validation here; cost model needs to abandon
# invalid mapping.
self.loop_orders = loop_order_list
self.loop_blockings = loop_blockings_list
self.loop_partitionings = loop_partitionings_list
self.para_loop_dim = para_loop_dim_list
def loop_order(self, loop):
'''
Loop order of the given loop.
Return a tuple of the order indices for the same loop at all buffer
levels, i.e., if a tuple for loop OX is returned, and the first
element of this tuple t[0] = 2, then loop OX is the third innermost
loop at the first buffer level.
Tuples are organized in the same order as the loop enum.
E.g., for a two-level memory hierarchy, each tuple contains two
elements; [(0, 0), (1, 1), (2, 4), (3, 5), (4, 3), (5, 2), (6, 6)]
means that for the first loop (FX = 0), FX is the innermost loop at
both levels (tuple (0, 0)), etc. I.e., it describes a loop structure
like:
for on
  for oy
    for ox
      for oc
        for ic
          for fy
            for fx
              for on
                for ic
                  for oc
                    for oy
                      for ox
                        for fy
                          for fx
                            ...
'''
return self.loop_orders[loop]
def loop_blocking(self, loop):
'''
Loop blocking factors of the given loop.
Return a tuple of factors for the given loop at all buffer levels, from
inside level to outside level.
Tuples are organized in the same order as the loop enum.
E.g., [(4, 2), (8, 1), ...] means for the first loop (FX = 0), the
blocking factor is 4 for the innermost level, and 2 for the next level;
for the second loop (FY = 1), the blocking factor is 8 for the
innermost level, and 1 for the next level.
'''
return self.loop_blockings[loop]
def loop_partitioning(self, loop):
'''
Loop partitioning factors of the given loop.
Return a tuple of factors for the given loop at all buffer levels, from
inside level to outside level.
Tuples are organized in the same order as the loop enum.
E.g., [(4, 2), (8, 1), ...] means the first loop (FX = 0) is
parallelized over 4 units at the first parallel level, and over 2 at the
next level, etc.
'''
return self.loop_partitionings[loop]
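A small, hypothetical two-level example that follows the conventions documented above (per-loop tuples in loop-enum order, inner buffer level first); the blocking factors are illustrative, chosen to multiply out to AlexNet conv3 / batch-16 loop bounds:
```
from cnn_mapping import MappingPoint

# Loop enum order: FX, FY, OX, OY, OC, IC, ON
loop_orders = [(0, 0), (1, 1), (2, 4), (3, 5), (4, 3), (5, 2), (6, 6)]
loop_blockings = [(3, 1), (3, 1), (13, 1), (1, 13), (16, 24), (16, 16), (1, 16)]
loop_partitionings = [(1, 1)] * 7   # no parallel partitioning in this sketch

point = MappingPoint(loop_orders, loop_blockings, loop_partitionings)
point.loop_order(2)     # -> (2, 4): OX is the 3rd innermost loop at level 0
point.loop_blocking(4)  # -> (16, 24): OC blocking factors, inner level first
```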
'''
Top level function of optimization framework
'''
import mapping_point_generator
import cost_model
import loop_enum as le
import buffer_enum as be
def opt_optimizer(resource, layer, hint=None, verbose=False):
'''
Evaluate the cost of each mapping point,
record the mapping_point with the smallest cost
'''
if hint is not None and hint.partition_loops is None:
valid = cost_model.valid_dataflow(resource, hint.schedule_hint)
assert valid == True, "Specified schedule doesn't satisfy the utilization threshold, please check partitioning_size"
smallest_cost, perf, best_mapping_point = mapping_point_generator.opt_mapping_point_generator_function(resource, layer, hint, verbose)
total_cost = cost_model.get_cost(resource, best_mapping_point, layer, verbose)
if verbose:
print "Optimal energy (pJ): ", smallest_cost
print "Runtime (cycles):", perf
print "Best mapping_point: ", best_mapping_point.loop_blockings, best_mapping_point.loop_partitionings, best_mapping_point.loop_orders
return [smallest_cost, best_mapping_point]
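A hedged end-to-end sketch of how these pieces appear to fit together. The actual tools/run_optimizer.py driver is not part of this excerpt, so the wiring below is an assumption based on the extract_* functions and the classmethods defined in this package; the file paths come from the README examples.
```
import cnn_mapping as cm

arch_info = cm.extract_input.extract_arch_info('./examples/arch/3_level_mem_basic_example.json')
layer_info = cm.extract_input.extract_network_info('./examples/layer/alex_conv3_batch16.json')
schedule_info = cm.extract_input.extract_schedule_info('./examples/schedule/dataflow_C_K.json',
                                                       arch_info["mem_levels"])

resource = cm.Resource.arch(arch_info)
layer = cm.Layer.layer(layer_info)
schedule = cm.Schedule.schedule(schedule_info)

# opt_optimizer returns [smallest_cost, best_mapping_point]
cost, best_point = cm.optimizer.opt_optimizer(resource, layer, schedule, verbose=True)
```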
'''
Hardware resource types.
'''
#import numpy as np
from collections import namedtuple
from operator import mul
import math
class Buffer(namedtuple('Buffer',
['capacity', 'access_cost', 'unit_static_cost'])):
'''
Buffer specification.
Immutable type.
Buffer attributes include capacity, access cost, unit static cost.
Capacity is for a single buffer (if the current level has parallelism,
it is the capacity of the buffer bank inside each parallel unit);
access cost is the cost per access;
unit static cost is the static cost per time unit.
'''
pass
class Parallelism(namedtuple('Parallelism',
['count', 'access_mode', 'array_access_cost', 'array_dim', 'array_width'])):
'''
Parallelism specification.
Immutable type.
Parallelism attributes include count and access_mode.
Count is the number of parallel units.
Access mode is the mode of accessing non-private data,
for example whether to access a neighboring PE's buffer or
to go to the next-level buffer.
Array access cost is the cost of accessing array-level buffers.
Array dimension is the dimension of the PE array, i.e. whether it is 1D or 2D.
Array width is the width of the PE array: for a 1D array it equals the
parallelism count; for a 2D array it is sqrt(count).
Note: shared buffer level is the level
index of the lowest shared buffer for this parallelism.
'''
pass
class Resource(object):
'''
Hardware resource specification.
Hardware resource includes buffer hierarchy and parallel processing units.
mac_capacity, 0 or 1, determines whether the MAC can buffer one output.
'''
def __init__(self, buf_capacity_list, buf_access_cost_list,
buf_unit_static_cost_list, para_count_list,
mac_capacity=1, partition_mode=None, array_access_cost=None,
array_dim = None, utilization_threshold = 0.75, replication=True):
# Buffers.
assert len(buf_capacity_list) == len(buf_access_cost_list)
assert len(buf_capacity_list) == len(buf_unit_static_cost_list)
assert len(buf_capacity_list) == len(para_count_list)
self.bufs = [Buffer(*t) for t in zip(buf_capacity_list, \
buf_access_cost_list, buf_unit_static_cost_list)]
self.num_levels = len(self.bufs)
# Parallelism.
array_access_costs = [None] * len(para_count_list)
if not partition_mode :
partition_mode = [0] * len(para_count_list)
else :
array_level = 0
for i in xrange(self.num_levels):
# when using a non-default partition mode, the parallelism
# count at that level needs to be larger than 1
assert partition_mode[i] == 0 or para_count_list[i] > 1
if partition_mode[i] == 1 or partition_mode[i] == 2:
array_access_costs[i] = array_access_cost[array_level]
array_level += 1
self.para_index = [i for i, e in enumerate(para_count_list) if e != 1]
if not array_dim:
array_dim = [2 if e != 1 else 1 for e in para_count_list]
array_width = [para_count_list[i] if array_dim[i] == 1 else int(math.sqrt(para_count_list[i])) for i in xrange(self.num_levels)]
self.paras = [Parallelism(*t) for t in zip(para_count_list, \
partition_mode, array_access_costs, array_dim, array_width)]
self.access_cost = buf_access_cost_list
self.mac_capacity = mac_capacity
self.array_access_cost = array_access_cost
self.para_count_list = para_count_list
self.utilization_threshold = utilization_threshold
self.replication = replication
@classmethod
def arch(cls, info):
return cls(info["capacity"], info["access_cost"], info["static_cost"],
info["parallel_count"], info["mac_capacity"], info["parallel_mode"],
info["parallel_cost"], info["array_dim"], info["utilization_threshold"], info["replication"])
def buffer_levels(self):
'''
Return total levels of buffers in the hierarchy.
'''
return self.num_levels
def buffer(self, level):
'''
Return the specification of the buffer of the given level.
'''
return self.bufs[level]
def parallelism(self, level):
'''
Return the specification of the parallelism of the given level.
'''
return self.paras[level]
def total_parallelism(self):
'''
Return the specification of the total parallelism.
'''
return reduce(mul, self.para_count_list, 1)
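As an illustration, the first example architecture shown later in this commit (512 B / 128 KB / 1 GB buffers, 16 parallel units at level 0, 16-bit precision) corresponds roughly to the following direct construction, with capacities already converted from bytes to words as extract_arch_info does:
```
from cnn_mapping import Resource

# Roughly the first example arch JSON below after extraction
# (capacities in 16-bit words rather than bytes).
resource = Resource(buf_capacity_list=[256, 65536, 536870912],
                    buf_access_cost_list=[1, 6, 200],
                    buf_unit_static_cost_list=[0, 0, 0],
                    para_count_list=[16, 1, 1],
                    mac_capacity=0,
                    partition_mode=[1, 0, 0],
                    array_access_cost=[2])

resource.buffer_levels()      # -> 3
resource.total_parallelism()  # -> 16
```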
'''
Schedule hint
'''
import loop_enum as le
class Schedule(object):
def __init__(self, schedule_hint, partition_loops=None):
self.schedule_hint = schedule_hint
if partition_loops != None:
self.partition_loops = []
for l in partition_loops:
self.partition_loops.append(le.loop_table[l])
else:
self.partition_loops = partition_loops
num_levels = len(schedule_hint.values()[0])
hint_para_index = {}
for loop in schedule_hint:
for level in xrange(num_levels):
if schedule_hint[loop][level] != None and schedule_hint[loop][level][2] != None:
if level not in hint_para_index:
hint_para_index[level] = [loop]
else:
hint_para_index[level].append(loop)
self.hint_para_index = hint_para_index
@classmethod
def schedule(cls, info):
return cls(info["schedule_hint"], info["partition_loops"])
import loop_enum as le
import buffer_enum as be
def print_loop_nest(point):
'''
Print, for each buffer level, the loops of the given mapping point whose
blocking or partitioning factor is not 1, as (loop name, blocking factor,
partitioning factor) tuples ordered from innermost to outermost, followed
by the parallel loop dimension assignment.
'''
loop_orders = zip(*point.loop_orders)
loop_blockings = zip(*point.loop_blockings)
loop_partitionings = zip(*point.loop_partitionings)
para_dims = point.para_loop_dim
num_level = len(loop_orders)
order_lists = []
for level in xrange(num_level):
order_list = [None] * le.NUM
for order in xrange(le.NUM):
if loop_blockings[level][order] != 1 or loop_partitionings[level][order] != 1 :
order_list[loop_orders[level][order]] = (le.table[order],
loop_blockings[level][order],
loop_partitionings[level][order])
order_lists.append(order_list)
print order_lists, para_dims
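A short usage sketch, reusing the hypothetical mapping point from the MappingPoint example above and assuming this helper lives in cnn_mapping.utils:
```
from cnn_mapping import utils   # assumed module for print_loop_nest

# Prints one list per buffer level (innermost loop first) with
# (loop name, blocking factor, partitioning factor) for every loop whose
# factors are not both 1, followed by the parallel loop dimensions.
utils.print_loop_nest(point)
```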
{
"mem_levels": 3,
"capacity":[512, 131072, 1073741824],
"access_cost":[1, 6, 200],
"static_cost":[0, 0, 0],
"parallel_count":[16, 1, 1],
"mac_capacity":0,
"parallel_mode": [1, 0, 0],
"parallel_cost":[2],
"precision":16
}
{
"mem_levels": 3,
"capacity":[64, 131072, 1073741824],
"access_cost":[0.1, 6, 200],
"static_cost":[0, 0, 0],
"parallel_count":[256, 1, 1],
"array_dim": [1, 1, 1],
"mac_capacity":0,
"parallel_mode": [1, 0, 0],
"parallel_cost":[0.25],
"precision":16
}
{
"mem_levels": 3,
"capacity":[512, 131072, 1073741824],
"access_cost":[0.96, 20, 200],
"static_cost":[0, 0, 0],
"parallel_count":[256, 1, 1],
"array_dim": [1, 1, 1],
"mac_capacity":0,
"parallel_mode": [1, 0, 0],
"parallel_cost":[0.035],
"precision":16
}
{
"mem_levels": 3,
"capacity":[64, 131072, 1073741824],
"access_cost":[0.1, 6, 200],
"static_cost":[0, 0, 0],
"parallel_count":[256, 1, 1],
"mac_capacity":0,
"parallel_mode": [1, 0, 0],
"parallel_cost":[0.25],
"precision":16
}
{
"mem_levels": 3,
"capacity":[64, 131072, 1073741824],
"access_cost":[0.12, 20, 200],
"static_cost":[0, 0, 0],
"parallel_count":[256, 1, 1],
"mac_capacity":0,
"parallel_mode": [1, 0, 0],
"parallel_cost":[0.035],
"precision":16
}
{
"mem_levels": 3,
"capacity":[512, 131072, 1073741824],
"access_cost":[0.96, 20, 200],
"static_cost":[0, 0, 0],
"parallel_count":[4096, 1, 1],
"mac_capacity":0,
"parallel_mode": [1, 0, 0],
"parallel_cost":[0.035],
"precision":16
}