Commit c1a9ce4c authored by sgauthamr2001

Turn Interstellar into a Python package

parent 6b0672af
1 merge request: !5 bug_fixes::kartik
setup.py 0 → 100644
from setuptools import setup, find_packages
setup(
name="interstellar-dnn-mapping",
version="0.1.dev0",
author="Kartik Prabhu",
author_email="kprabhu7@stanford.edu",
description="Design space exploration for DNN accelerators",
long_description=open("README.md", "r", encoding="utf-8").read(),
package_dir={"": "src"},
packages=find_packages("src"),
)
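
# Hedged usage note (not part of the original setup.py): with this file in
# place, `pip install -e .` from the repository root installs the package in
# editable mode, after which the import used by the benchmark scripts,
# `import interstellar as cm`, resolves against the sources under src/.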
'''
cnn_mapping package
'''
from . import loop_enum as le
from . import utils
from . import extract_input
from . import cost_model
from . import mapping_point_generator
from . import optimizer
from .mapping_point import MappingPoint
from .resource import Resource
from .layer import Layer
from .schedule import Schedule
'''
Buffer enum type.
Buffers include ifmap (IF), ofmap (OF), filter (FL).
'''
IF = 0
OF = 1
FL = 2
NUM = 3
from collections import deque
class Cache(object):
'''
Helper class that caches computed values to reduce runtime.
'''
def __init__(self, num_levels, size):
self.num_levels = num_levels
self.cache_map = [dict() for x in range(num_levels)]
self.cache_queue = [deque() for i in range(num_levels)]
self.size = size
    def read_cache(self, level, data):
        '''
        Read-only lookup: return the cached value, or None on a miss.
        '''
        return self.cache_map[level].get(data)
    def write_cache(self, level, data, value):
        '''
        Insert a value, updating both the map and the FIFO eviction queue.
        '''
        assert len(self.cache_map[level]) == len(self.cache_queue[level])
        self.cache_map[level][data] = value
        self.cache_queue[level].append(data)
        # FIFO eviction: once a level exceeds its size, drop the oldest entry.
        if len(self.cache_queue[level]) > self.size:
            pop_ele = self.cache_queue[level].popleft()
            del self.cache_map[level][pop_ele]
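
if __name__ == "__main__":
    # Hedged usage sketch (illustrative values, not part of the original file):
    # a two-level cache that keeps at most 4 entries per level.
    cache = Cache(num_levels=2, size=4)
    assert cache.read_cache(0, "key") is None   # miss before any write
    cache.write_cache(0, "key", 42)
    assert cache.read_cache(0, "key") == 42     # hit after the write
    for i in range(5):                          # one more entry than the size bound
        cache.write_cache(1, i, i * i)
    assert cache.read_cache(1, 0) is None       # oldest entry was evicted (FIFO)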
This diff is collapsed.
import json
import os
from . import loop_enum as le
def extract_arch_info(arch_file):
with open(arch_file) as json_data_file:
data = json.load(json_data_file)
assert data["mem_levels"] == len(data["capacity"]), \
"capacity list is invalid, too many or too few elements"
assert data["mem_levels"] == len(data["access_cost"]), \
"access_cost list is invalid, too many or too few elements"
assert data["mem_levels"] == len(data["parallel_count"]), \
"parallel_count list is invalid, too many or too few elements"
num_bytes = data["precision"] / 8
if type(data["capacity"][0]) is list:
capacity_list = [ [x / num_bytes for x in data["capacity"][i]] for i in range(len(data["capacity"])) ]
else:
capacity_list = [x / num_bytes for x in data["capacity"] ]
data["capacity"] = capacity_list
if "static_cost" not in data:
data["static_cost"] = [0, ] * data["mem_levels"]
else:
assert data["mem_levels"] == len(data["static_cost"]), \
"static_cost list is invalid, too many or too few elements"
if "mac_capacity" not in data:
data["mac_capacity"] = 0
if "parallel_mode" not in data:
data["parallel_mode"] = [0, ] * data["mem_levels"]
for level in range(data["mem_levels"]):
if data["parallel_count"][level] != 1:
data["parallel_mode"][level] = 1
else:
assert data["mem_levels"] == len(data["parallel_mode"]), \
"parallel_mode list is invalid, too many or too few elements"
if "array_dim" not in data:
data["array_dim"] = None
if "utilization_threshold" not in data:
data["utilization_threshold"] = 0.0
if "replication" not in data:
data["replication"] = True
if "invalid_underutilized" not in data:
data["invalid_underutilized"] = True
if "memory_partitions" not in data:
data["memory_partitions"] = [[0,0,0],[0,0,0],[0,0,0]]
return data
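
# Hedged example (illustrative values, not from the original commit) of the
# JSON document extract_arch_info() parses: one capacity / access_cost /
# parallel_count entry per memory level. Capacities are given in bytes and are
# converted to element counts using "precision" (bits per element); fields not
# shown (static_cost, mac_capacity, parallel_mode, ...) get the defaults
# filled in above.
#
#     {
#         "mem_levels": 3,
#         "capacity": [512, 131072, 16777216],
#         "access_cost": [1.0, 6.0, 200.0],
#         "parallel_count": [256, 1, 1],
#         "precision": 16
#     }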
def extract_network_info(network_file):
with open(network_file) as json_data_file:
data = json.load(json_data_file)
if "batch_size" not in data:
data["batch_size"] = 1
if "stride_width" not in data:
data["stride_width"] = 1
if "stride_height" not in data:
data["stride_height"] = 1
    # Snapshot the layer entries before adding derived keys; in Python 3,
    # dict.values() is a live view that would otherwise reference the dict itself.
    layer_summary = list(data.values())
    data['layer_info'] = layer_summary
data['layer_name'] = os.path.splitext(os.path.basename(network_file))[0]
return data
def extract_schedule_info(schedule_file, num_levels):
with open(schedule_file) as json_data_file:
data = json.load(json_data_file)
schedule = {}
hint = data["schedule_hint"]
schedule_hint = {}
for loop in hint:
schedule_hint[le.loop_table[loop]] = [None,]*num_levels
for level in hint[loop]:
level_index = int(level.lstrip('level'))
schedule_hint[le.loop_table[loop]][level_index] = [None,]*3
if "order" in hint[loop][level]:
schedule_hint[le.loop_table[loop]][level_index][0] = hint[loop][level]["order"]
if "blocking_size" in hint[loop][level]:
schedule_hint[le.loop_table[loop]][level_index][1] = hint[loop][level]["blocking_size"]
if "partitioning_size" in hint[loop][level]:
schedule_hint[le.loop_table[loop]][level_index][2] = hint[loop][level]["partitioning_size"]
schedule["schedule_hint"] = schedule_hint
if "partition_loops" not in data:
schedule["partition_loops"] = None
else:
schedule["partition_loops"] = data["partition_loops"]
#TODO partition at dimension
return schedule
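
# Hedged example (illustrative values) of the schedule JSON parsed above.
# Level keys are named "level0", "level1", ...; any of "order",
# "blocking_size" and "partitioning_size" may be omitted. With num_levels = 3,
# the hint below is parsed into schedule_hint = {4: [[0, 16, 4], None, None]},
# since loop_enum maps 'OC' to index 4.
#
#     {
#         "schedule_hint": {
#             "OC": {"level0": {"order": 0, "blocking_size": 16, "partitioning_size": 4}}
#         },
#         "partition_loops": ["OX", "OY"]
#     }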
def extract_info(args):
arch_info = extract_arch_info(args.arch)
network_info = extract_network_info(args.network)
schedule_info = extract_schedule_info(args.schedule, arch_info["mem_levels"]) if args.schedule else None
return arch_info, network_info, schedule_info
'''
Layer specification.
'''
class Layer(object):
'''
NN layer parameters.
nifm: # ifmap channels.
nofm: # ofmap channels.
wifm: ifmap width.
hifm: ifmap height.
wofm: ofmap width.
hofm: ofmap height.
wfil: weight filter width.
hfil: weight filter height.
nimg: # input images (batch).
wstd: stride size in width dimension.
hstd: stride size in height dimension.
'''
def __init__(self, nifm, nofm, wofm, hofm, wfil, hfil, nimg=1, wstd=1, hstd=1):
self.nifm = nifm
self.nofm = nofm
self.wofm = wofm
self.hofm = hofm
self.wifm = wfil + (wofm - 1) * wstd
self.hifm = hfil + (hofm - 1) * hstd
self.wfil = wfil
self.hfil = hfil
self.nimg = nimg
self.wstd = wstd
self.hstd = hstd
assert self.wofm > 0
assert self.hofm > 0
assert self.nimg > 0
        # Same order as the loop enum: FX, FY, OX, OY, OC, IC, ON.
        self.sizes = [wfil, hfil, wofm, hofm, nofm, nifm, nimg]
@classmethod
def layer(cls, info):
return cls(info["input_fmap_channel"], info["output_fmap_channel"],
info["fmap_width"], info["fmap_height"], info["window_width"],
info["window_height"], info["batch_size"],
info["stride_width"], info["stride_height"])
class FCLayer(Layer):
'''
NN fully-connected layer parameters.
(wifm, hifm) = (wfil, hfil), wstd = hstd = 1, wofm = hofm = 1.
'''
def __init__(self, nifm, nofm, wfil, hfil, nimg=1):
Layer.__init__(self, nifm, nofm, 1, 1, wfil, hfil, nimg)
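
if __name__ == "__main__":
    # Hedged usage sketch (illustrative values): a 3x3 convolution layer with
    # 64 input and 128 output channels on a 56x56 output feature map.
    conv = Layer(nifm=64, nofm=128, wofm=56, hofm=56, wfil=3, hfil=3)
    # The ifmap size is derived from the ofmap size, filter size and stride.
    assert conv.wifm == 3 + (56 - 1) * 1 == 58
    # A fully-connected layer collapses the ofmap to a single 1x1 position.
    fc = FCLayer(nifm=512, nofm=1000, wfil=7, hfil=7)
    assert (fc.wofm, fc.hofm, fc.wifm, fc.hifm) == (1, 1, 7, 7)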
'''
Loop enum type.
Loops include filter width (FX), filter height (FY),
output width (OX), output height (OY),
output channel (OC), input channel (IC),
batch (ON).
'''
FX = 0
FY = 1
OX = 2
OY = 3
OC = 4
IC = 5
ON = 6
NUM = 7
table = {0: 'FX',
1: 'FY',
2: 'OX',
3: 'OY',
4: 'OC',
5: 'IC',
6: 'ON' }
loop_table = { 'FX': 0,
'FY': 1,
'OX': 2,
'OY': 3,
'OC': 4,
'IC': 5,
'ON': 6}
'''
Type for a specific mapping point.
'''
class MappingPoint(object):
'''
Configurations of a specific mapping.
Mapping includes the complete description of the loop order and loop
blocking factors of each buffer level, loop partitioning onto each level of
parallel units, etc.
    Each loop order and each set of loop blocking factors correspond to one
    loop across all buffer levels, since it does not make sense to block a
    loop more than once at a single buffer level.
    Each set of loop partitioning factors corresponds to the degree of
    parallelism of one loop across all levels.
    Partition mode is the access mode in the parallel case:
    0 (default): access the next level of memory
    1: access buffers in neighboring processing units
    #LMEI other parallelism cases to be added
'''
def __init__(self, loop_order_list, loop_blockings_list,
loop_partitionings_list, para_loop_dim_list=None):
# NOTE(mgao12): no value validation here; cost model needs to abandon
# invalid mapping.
self.loop_orders = loop_order_list
self.loop_blockings = loop_blockings_list
self.loop_partitionings = loop_partitionings_list
self.para_loop_dim = para_loop_dim_list
def loop_order(self, loop):
'''
Loop order of the given loop.
Return a tuple of the order indices for the same loop at all buffer
levels, i.e., if a tuple for loop OX is returned, and the first
element of this tuple t[0] = 2, then loop OX is the third innermost
loop at the first buffer level.
Tuples are organized as the same order as loop enum order.
E.g., for a two-level memory hierarchy, each tuple contains two
elements, [(0, 0), (1, 1), (2, 4), (3, 5), (4, 3), (5, 2), (6, 6)]
        means that the first loop (FX = 0) is the innermost loop at both
        levels (tuple (0, 0)), etc. That is, it describes a loop structure such as:
for on
for oy
for ox
for oc
for ic
for fy
for fx
for on
for ic
for oc
for oy
for ox
for fy
for fx
...
'''
return self.loop_orders[loop]
def loop_blocking(self, loop):
'''
Loop blocking factors of the given loop.
Return a tuple of factors for the given loop at all buffer levels, from
inside level to outside level.
Tuples are organized as the same order as loop enum order.
E.g., [(4, 2), (8, 1), ...] means for the first loop (FX = 0), the
blocking factor is 4 for the innermost level, and 2 for the next level;
for the second loop (FY = 1), the blocking factor is 8 for the
innermost level, and 1 for the next level.
'''
return self.loop_blockings[loop]
def loop_partitioning(self, loop):
'''
Loop partitioning factors of the given loop.
Return a tuple of factors for the given loop at all buffer levels, from
inside level to outside level.
Tuples are organized as the same order as loop enum order.
E.g., [(4, 2), (8, 1), ...] means for the first loop (FX), it is
parallelized in 4 units for the first parallel level, and 2 for the
next level, etc..
'''
return self.loop_partitionings[loop]
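
if __name__ == "__main__":
    from . import loop_enum as le
    # Hedged sketch (illustrative values) of a mapping point for a two-level
    # buffer hierarchy, reusing the loop orders from the loop_order() docstring.
    orders = [(0, 0), (1, 1), (2, 4), (3, 5), (4, 3), (5, 2), (6, 6)]
    blockings = [(3, 1), (3, 1), (4, 14), (4, 14), (16, 8), (8, 8), (1, 1)]
    partitionings = [(1, 1)] * le.NUM
    point = MappingPoint(orders, blockings, partitionings)
    assert point.loop_order(le.OX) == (2, 4)
    assert point.loop_blocking(le.OC) == (16, 8)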
This diff is collapsed.
'''
Top level function of optimization framework
'''
from . import mapping_point_generator
from . import cost_model
from . import loop_enum as le
from . import buffer_enum as be
def opt_optimizer(resource, layer, hint=None, verbose=False):
'''
Evaluate the cost of each mapping point,
record the mapping_point with the smallest cost
'''
smallest_cost, perf, best_mapping_point = mapping_point_generator.opt_mapping_point_generator_function(resource, layer, hint, verbose)
access_list, array_cost = cost_model.get_access(best_mapping_point, layer, resource)
# total_cost = cost_model.get_cost(resource, best_mapping_point, layer, verbose)
# assert total_cost == smallest_cost
if verbose:
print("Optimal_Energy_(pJ): ", smallest_cost)
print("Runtime_(cycles):", perf)
#print("Best mapping_point: ", best_mapping_point.loop_blockings, best_mapping_point.loop_partitionings, best_mapping_point.loop_orders)
return [smallest_cost, best_mapping_point, perf]
def optimizer(resource, layer, hint=None, verbose=False):
    smallest_cost = float("inf")
    best_mapping_point = None  # stays None if the generator yields no valid mapping point
mp_generator = mapping_point_generator.mapping_point_generator_function(resource, layer, hint, verbose)
#counter = 0
for mapping_point in mp_generator:
#counter += 1
cost = cost_model.get_cost(resource, mapping_point, layer, verbose)
#if verbose:
# print "Current cost: ", cost
# print "Current mapping_point: ", mapping_point.loop_blockings, mapping_point.loop_orders
if cost < smallest_cost:
smallest_cost = cost
best_mapping_point = mapping_point
if verbose:
print("Current smallest cost: ", smallest_cost)
print("Current best mapping_point: ", mapping_point.loop_blockings, mapping_point.loop_orders)
#print counter
if verbose:
print(smallest_cost)
#print("Best mapping_point: ", best_mapping_point.loop_blockings, mapping_point.loop_partitionings, best_mapping_point.loop_orders)
return [smallest_cost, best_mapping_point]
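
# Hedged usage sketch (not runnable on its own; it needs a Resource and a
# Layer built from the other modules): both entry points search the mappings
# produced by mapping_point_generator and return the lowest-cost mapping found
# for the given hardware resource and layer, e.g.
#
#     cost, best_point = optimizer(resource, layer, verbose=True)
#     cost, best_point, cycles = opt_optimizer(resource, layer, verbose=True)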
'''
Hardware resource types.
'''
#import numpy as np
from collections import namedtuple
from operator import mul
import math
from functools import reduce
class Buffer(namedtuple('Buffer',
['capacity', 'access_cost', 'unit_static_cost'])):
'''
Buffer specification.
Immutable type.
Buffer attributes include capacity, access cost, unit static cost.
Capacity is for a single buffer (If current level has parallelism,
then it is the capacity of the buffer bank inside each parallel
units); access cost is the cost per access;
unit static cost is the static cost per time unit.
'''
pass
class Parallelism(namedtuple('Parallelism',
['count', 'access_mode', 'array_access_cost', 'array_dim', 'array_width'])):
'''
Parallelism specification.
Immutable type.
Parallelism attributes include count and access_mode.
Count is the number of parallel units.
    Access mode is how non-private data is accessed, for example whether it
    is fetched from a neighboring PE or from the next buffer level.
    Array access cost is the cost of accessing array-level buffers.
    Array dimension is the dimension of the PE array (1D or 2D).
    Array width is the width of the PE array: for a 1D array it equals the
    parallelism count; for a 2D (square) array it is sqrt(count).
Note: shared buffer level is the level
index of the lowest shared buffer for this parallelism.
'''
pass
class Resource(object):
'''
Hardware resource specification.
Hardware resource includes buffer hierarchy and parallel processing units.
buf_capacity_list: [1st level buffer size, 2nd level ...] (UNIT: Byte)
buf_access_cost_list: [1st level mem per access cost, 2nd level ...] (UNIT: pJ)
buf_unit_static_cost_list: [1st level mem static cost per time unit, 2nd level ...] (UNIT: pJ)
para_count_list: [1st level number of parallel units, 2nd level ...]
mac_capacity: [0, 1], determines whether MAC can buffer 1 output. (UNIT: Element)
    partition_mode: (aka 'parallel_mode' outside the class) selects the hardware parallel template
        [0: no parallelism, only hierarchical memory fetch,
         1: fetch from buffers in neighboring parallel units,
         2: broadcast.]
    array_access_cost: (aka 'parallel_cost' outside the class)
        per-access cost of fetching data from a neighboring PE
    array_dim: array dimension (currently 1D and square-shaped 2D are supported)
    utilization_threshold: minimum ratio of utilized units to total units at a parallel level
    replication: [True, False], whether an additional (third) loop dimension may be spatially unrolled
'''
def __init__(self, buf_capacity_list, buf_access_cost_list,
buf_unit_static_cost_list, para_count_list,
                 mac_capacity=1, partition_mode=None, array_access_cost=None,
                 array_dim=None, utilization_threshold=0, replication=True,
                 memory_partitions=[[0, 0, 0], [0, 0, 0], [0, 0, 0]],
                 invalid_underutilized=True):
# Buffers.
assert len(buf_capacity_list) == len(buf_access_cost_list)
assert len(buf_capacity_list) == len(buf_unit_static_cost_list)
assert len(buf_capacity_list) == len(para_count_list)
self.bufs = [Buffer(*t) for t in list(zip(buf_capacity_list, \
buf_access_cost_list, buf_unit_static_cost_list))]
self.num_levels = len(self.bufs)
# Parallelism.
array_access_costs = [None] * len(para_count_list)
if not partition_mode :
partition_mode = [0] * len(para_count_list)
else :
array_level = 0
for i in range(self.num_levels):
                # when using a non-default partition mode, the parallelism
                # count needs to be larger than 1
assert partition_mode[i] == 0 or para_count_list[i] <= 1 \
or (partition_mode[i] > 0 and para_count_list[i] > 1)
if partition_mode[i] == 1 or partition_mode[i] == 2:
array_access_costs[i] = array_access_cost[array_level]
array_level += 1
# "para_index" indicates which level do we have parallelism in
self.para_index = [i for i, e in enumerate(para_count_list) if e != 1]
# 2D array is default setting for paralleled level
# Define 1D array in arch file manually if needed, e.g. "array_dim": [1, 1, 1] ([@ mem level 1, 2, 3])
if not array_dim:
array_dim = [2 if e != 1 else 1 for e in para_count_list]
# LMEI always assume square-shape array, could change later
array_width = [para_count_list[i] if array_dim[i] == 1 else int(math.sqrt(para_count_list[i])) for i in range(self.num_levels)]
self.paras = [Parallelism(*t) for t in list(zip(para_count_list, \
partition_mode, array_access_costs, array_dim, array_width))]
self.access_cost = buf_access_cost_list
# If list does not contain 3 separate access costs for (inputs, weights, psum)
# assume they all have the same cost
if type(buf_access_cost_list[0]) is not list:
self.access_cost = [ [x]*3 for x in buf_access_cost_list ]
self.mac_capacity = mac_capacity
self.array_access_cost = array_access_cost
self.para_count_list = para_count_list
self.utilization_threshold = utilization_threshold
        # Copy so that appending below does not mutate the caller's list (or the shared default).
        self.memory_partitions = list(memory_partitions)
        # Do not check invalid_underutilized at the last memory level.
        self.memory_partitions.append([None] * 3)
self.replication = replication
self.invalid_underutilized = invalid_underutilized
@classmethod
def arch(cls, info):
return cls(info["capacity"], info["access_cost"], info["static_cost"],
info["parallel_count"], info["mac_capacity"], info["parallel_mode"],
info["parallel_cost"], info["array_dim"], info["utilization_threshold"], info["replication"],info["memory_partitions"], info['invalid_underutilized'])
def buffer_levels(self):
'''
Return total levels of buffers in the hierarchy.
'''
return self.num_levels
def buffer(self, level):
'''
Return the specification of the buffer of the given level.
'''
return self.bufs[level]
def parallelism(self, level):
'''
Return the specification of the parallelism of the given level.
'''
return self.paras[level]
def total_parallelism(self):
'''
Return the specification of the total parallelism.
'''
return reduce(mul, self.para_count_list, 1)
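
if __name__ == "__main__":
    # Hedged usage sketch (illustrative values): a three-level hierarchy with a
    # 256-PE array at the innermost level; capacities are in bytes, access and
    # static costs in pJ, as described in the class docstring.
    r = Resource(buf_capacity_list=[512, 131072, 16777216],
                 buf_access_cost_list=[1.0, 6.0, 200.0],
                 buf_unit_static_cost_list=[0, 0, 0],
                 para_count_list=[256, 1, 1])
    assert r.buffer_levels() == 3
    assert r.total_parallelism() == 256
    assert r.parallelism(0).array_width == 16   # square 2D array by default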
'''
Schedule hint
'''
from . import loop_enum as le
class Schedule(object):
'''
    schedule_hint: {loop index: [[loop order, loop blocking, loop partitioning @ 1st mem level],
                                 [@ 2nd mem level], [@ 3rd mem level], ...]}
loop blocking -> temporal loop size
loop partitioning -> spatial loop size (spatial unrolling / parallelism)
partition_loops: the loops which are allowed to be replicated (on top of the defined loop partitioning)
to improve HW utilization
hint_para_index: {mem level: [spatially unrolled loop indexes]}
'''
def __init__(self, schedule_hint, partition_loops=None):
self.schedule_hint = schedule_hint
        if partition_loops is not None:
            # Translate loop names (e.g. 'OX') into loop_enum indices.
            self.partition_loops = [le.loop_table[l] for l in partition_loops]
        else:
            self.partition_loops = None
num_levels = len(list(schedule_hint.values())[0])
hint_para_index = {}
for loop in schedule_hint:
for level in range(num_levels):
                if schedule_hint[loop][level] is not None and schedule_hint[loop][level][2] is not None:
if level not in hint_para_index:
hint_para_index[level] = [loop]
else:
hint_para_index[level].append(loop)
self.hint_para_index = hint_para_index
@classmethod
def schedule(cls, info):
return cls(info["schedule_hint"], info["partition_loops"])
from . import loop_enum as le
from . import buffer_enum as be
def print_loop_nest(point):
loop_orders = list(zip(*point.loop_orders))
loop_blockings = list(zip(*point.loop_blockings))
loop_partitionings = list(zip(*point.loop_partitionings))
para_dims = point.para_loop_dim
num_level = len(loop_orders)
order_lists = []
for level in range(num_level):
order_list = [None] * le.NUM
for order in range(le.NUM):
if loop_blockings[level][order] != 1 or loop_partitionings[level][order] != 1 :
order_list[loop_orders[level][order]] = (le.table[order],
loop_blockings[level][order],
loop_partitionings[level][order])
order_lists.append(order_list)
print(order_lists, para_dims)
def print_best_schedule(point):
loop_orders = list(zip(*point.loop_orders))
loop_blockings = list(zip(*point.loop_blockings))
loop_partitionings = list(zip(*point.loop_partitionings))
para_dims = point.para_loop_dim
num_level = len(loop_orders)
order_lists = []
for level in range(num_level):
print("\tLevel_Number: {}".format(level))
order_list = [None] * le.NUM
for order in range(le.NUM):
if loop_blockings[level][order] != 1 or loop_partitionings[level][order] != 1 :
order_list[loop_orders[level][order]] = (le.table[order],
loop_blockings[level][order],
loop_partitionings[level][order])
print("\t\tLoop_Name: {}, Loop_Bound: {}, Unrolling: {}".format(le.table[order], loop_blockings[level][order], loop_partitionings[level][order]))
order_lists.append(order_list)
#print(order_lists)
@@ -3,7 +3,7 @@ import numpy as np
import argparse
import math
import time
-import cnn_mapping as cm
+import interstellar as cm
import datetime
import json
......