Commit c1a9ce4c authored by sgauthamr2001

Turn Interstellar into a Python package

parent 6b0672af
1 merge request: !5 bug_fixes::kartik
setup.py 0 → 100644
from setuptools import setup, find_packages
setup(
name="interstellar-dnn-mapping",
version="0.1.dev0",
author="Kartik Prabhu",
author_email="kprabhu7@stanford.edu",
description="Design space exploration for DNN accelerators",
long_description=open("README.md", "r", encoding="utf-8").read(),
package_dir={"": "src"},
packages=find_packages("src"),
)
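
# Hedged usage note (not part of the original setup.py): with this file in
# place, `pip install -e .` from the repository root installs the package in
# editable mode, after which the import used by the benchmark scripts,
# `import interstellar as cm`, resolves against the sources under src/.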
'''
cnn_mapping package
'''
from . import loop_enum as le
from . import utils
from . import extract_input
from . import cost_model
from . import mapping_point_generator
from . import optimizer
from .mapping_point import MappingPoint
from .resource import Resource
from .layer import Layer
from .schedule import Schedule
'''
Buffer enum type.
Buffers include ifmap (IF), ofmap (OF), filter (FL).
'''
IF = 0
OF = 1
FL = 2
NUM = 3
from collections import deque
class Cache(object):
'''
Helper class that caches computed values to reduce runtime.
'''
def __init__(self, num_levels, size):
self.num_levels = num_levels
self.cache_map = [dict() for x in range(num_levels)]
self.cache_queue = [deque() for i in range(num_levels)]
self.size = size
    def read_cache(self, level, data):
        '''
        Read-only lookup: return the cached value, or None on a miss.
        '''
        return self.cache_map[level].get(data)
    def write_cache(self, level, data, value):
        '''
        Insert a value, updating both the map and the FIFO eviction queue.
        '''
        assert len(self.cache_map[level]) == len(self.cache_queue[level])
        self.cache_map[level][data] = value
        self.cache_queue[level].append(data)
        # FIFO eviction: once a level exceeds its size, drop the oldest entry.
        if len(self.cache_queue[level]) > self.size:
            pop_ele = self.cache_queue[level].popleft()
            del self.cache_map[level][pop_ele]
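
if __name__ == "__main__":
    # Hedged usage sketch (illustrative values, not part of the original file):
    # a two-level cache that keeps at most 4 entries per level.
    cache = Cache(num_levels=2, size=4)
    assert cache.read_cache(0, "key") is None   # miss before any write
    cache.write_cache(0, "key", 42)
    assert cache.read_cache(0, "key") == 42     # hit after the write
    for i in range(5):                          # one more entry than the size bound
        cache.write_cache(1, i, i * i)
    assert cache.read_cache(1, 0) is None       # oldest entry was evicted (FIFO)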
This diff is collapsed.
import json
import os
from . import loop_enum as le
def extract_arch_info(arch_file):
with open(arch_file) as json_data_file:
data = json.load(json_data_file)
assert data["mem_levels"] == len(data["capacity"]), \
"capacity list is invalid, too many or too few elements"
assert data["mem_levels"] == len(data["access_cost"]), \
"access_cost list is invalid, too many or too few elements"
assert data["mem_levels"] == len(data["parallel_count"]), \
"parallel_count list is invalid, too many or too few elements"
num_bytes = data["precision"] / 8
if type(data["capacity"][0]) is list:
capacity_list = [ [x / num_bytes for x in data["capacity"][i]] for i in range(len(data["capacity"])) ]
else:
capacity_list = [x / num_bytes for x in data["capacity"] ]
data["capacity"] = capacity_list
if "static_cost" not in data:
data["static_cost"] = [0, ] * data["mem_levels"]
else:
assert data["mem_levels"] == len(data["static_cost"]), \
"static_cost list is invalid, too many or too few elements"
if "mac_capacity" not in data:
data["mac_capacity"] = 0
if "parallel_mode" not in data:
data["parallel_mode"] = [0, ] * data["mem_levels"]
for level in range(data["mem_levels"]):
if data["parallel_count"][level] != 1:
data["parallel_mode"][level] = 1
else:
assert data["mem_levels"] == len(data["parallel_mode"]), \
"parallel_mode list is invalid, too many or too few elements"
if "array_dim" not in data:
data["array_dim"] = None
if "utilization_threshold" not in data:
data["utilization_threshold"] = 0.0
if "replication" not in data:
data["replication"] = True
if "invalid_underutilized" not in data:
data["invalid_underutilized"] = True
if "memory_partitions" not in data:
data["memory_partitions"] = [[0,0,0],[0,0,0],[0,0,0]]
return data
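
# Hedged example (illustrative values, not from the original commit) of the
# JSON document extract_arch_info() parses: one capacity / access_cost /
# parallel_count entry per memory level. Capacities are given in bytes and are
# converted to element counts using "precision" (bits per element); fields not
# shown (static_cost, mac_capacity, parallel_mode, ...) get the defaults
# filled in above.
#
#     {
#         "mem_levels": 3,
#         "capacity": [512, 131072, 16777216],
#         "access_cost": [1.0, 6.0, 200.0],
#         "parallel_count": [256, 1, 1],
#         "precision": 16
#     }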
def extract_network_info(network_file):
with open(network_file) as json_data_file:
data = json.load(json_data_file)
if "batch_size" not in data:
data["batch_size"] = 1
if "stride_width" not in data:
data["stride_width"] = 1
if "stride_height" not in data:
data["stride_height"] = 1
    # Snapshot the layer entries before adding derived keys; in Python 3,
    # dict.values() is a live view that would otherwise reference the dict itself.
    layer_summary = list(data.values())
    data['layer_info'] = layer_summary
data['layer_name'] = os.path.splitext(os.path.basename(network_file))[0]
return data
def extract_schedule_info(schedule_file, num_levels):
with open(schedule_file) as json_data_file:
data = json.load(json_data_file)
schedule = {}
hint = data["schedule_hint"]
schedule_hint = {}
for loop in hint:
schedule_hint[le.loop_table[loop]] = [None,]*num_levels
for level in hint[loop]:
level_index = int(level.lstrip('level'))
schedule_hint[le.loop_table[loop]][level_index] = [None,]*3
if "order" in hint[loop][level]:
schedule_hint[le.loop_table[loop]][level_index][0] = hint[loop][level]["order"]
if "blocking_size" in hint[loop][level]:
schedule_hint[le.loop_table[loop]][level_index][1] = hint[loop][level]["blocking_size"]
if "partitioning_size" in hint[loop][level]:
schedule_hint[le.loop_table[loop]][level_index][2] = hint[loop][level]["partitioning_size"]
schedule["schedule_hint"] = schedule_hint
if "partition_loops" not in data:
schedule["partition_loops"] = None
else:
schedule["partition_loops"] = data["partition_loops"]
#TODO partition at dimension
return schedule
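
# Hedged example (illustrative values) of the schedule JSON parsed above.
# Level keys are named "level0", "level1", ...; any of "order",
# "blocking_size" and "partitioning_size" may be omitted. With num_levels = 3,
# the hint below is parsed into schedule_hint = {4: [[0, 16, 4], None, None]},
# since loop_enum maps 'OC' to index 4.
#
#     {
#         "schedule_hint": {
#             "OC": {"level0": {"order": 0, "blocking_size": 16, "partitioning_size": 4}}
#         },
#         "partition_loops": ["OX", "OY"]
#     }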
def extract_info(args):
arch_info = extract_arch_info(args.arch)
network_info = extract_network_info(args.network)
schedule_info = extract_schedule_info(args.schedule, arch_info["mem_levels"]) if args.schedule else None
return arch_info, network_info, schedule_info
'''
Layer specification.
'''
class Layer(object):
'''
NN layer parameters.
nifm: # ifmap channels.
nofm: # ofmap channels.
wifm: ifmap width.
hifm: ifmap height.
wofm: ofmap width.
hofm: ofmap height.
wfil: weight filter width.
hfil: weight filter height.
nimg: # input images (batch).
wstd: stride size in width dimension.
hstd: stride size in height dimension.
'''
def __init__(self, nifm, nofm, wofm, hofm, wfil, hfil, nimg=1, wstd=1, hstd=1):
self.nifm = nifm
self.nofm = nofm
self.wofm = wofm
self.hofm = hofm
self.wifm = wfil + (wofm - 1) * wstd
self.hifm = hfil + (hofm - 1) * hstd
self.wfil = wfil
self.hfil = hfil
self.nimg = nimg
self.wstd = wstd
self.hstd = hstd
assert self.wofm > 0
assert self.hofm > 0
assert self.nimg > 0
        # Same order as the loop enum: FX, FY, OX, OY, OC, IC, ON.
        self.sizes = [wfil, hfil, wofm, hofm, nofm, nifm, nimg]
@classmethod
def layer(cls, info):
return cls(info["input_fmap_channel"], info["output_fmap_channel"],
info["fmap_width"], info["fmap_height"], info["window_width"],
info["window_height"], info["batch_size"],
info["stride_width"], info["stride_height"])
class FCLayer(Layer):
'''
NN fully-connected layer parameters.
(wifm, hifm) = (wfil, hfil), wstd = hstd = 1, wofm = hofm = 1.
'''
def __init__(self, nifm, nofm, wfil, hfil, nimg=1):
Layer.__init__(self, nifm, nofm, 1, 1, wfil, hfil, nimg)
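
if __name__ == "__main__":
    # Hedged usage sketch (illustrative values): a 3x3 convolution layer with
    # 64 input and 128 output channels on a 56x56 output feature map.
    conv = Layer(nifm=64, nofm=128, wofm=56, hofm=56, wfil=3, hfil=3)
    # The ifmap size is derived from the ofmap size, filter size and stride.
    assert conv.wifm == 3 + (56 - 1) * 1 == 58
    # A fully-connected layer collapses the ofmap to a single 1x1 position.
    fc = FCLayer(nifm=512, nofm=1000, wfil=7, hfil=7)
    assert (fc.wofm, fc.hofm, fc.wifm, fc.hifm) == (1, 1, 7, 7)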
'''
Loop enum type.
Loops include filter width (FX), filter height (FY),
output width (OX), output height (OY),
output channel (OC), input channel (IC),
batch (ON).
'''
FX = 0
FY = 1
OX = 2
OY = 3
OC = 4
IC = 5
ON = 6
NUM = 7
table = {0: 'FX',
1: 'FY',
2: 'OX',
3: 'OY',
4: 'OC',
5: 'IC',
6: 'ON' }
loop_table = { 'FX': 0,
'FY': 1,
'OX': 2,
'OY': 3,
'OC': 4,
'IC': 5,
'ON': 6}
'''
Type for a specific mapping point.
'''
class MappingPoint(object):
'''
Configurations of a specific mapping.
Mapping includes the complete description of the loop order and loop
blocking factors of each buffer level, loop partitioning onto each level of
parallel units, etc.
    Each loop order and each set of loop blocking factors correspond to one
    loop across all buffer levels, since it does not make sense to block a
    loop more than once at a single buffer level.
    Each set of loop partitioning factors corresponds to the degree of
    parallelism of one loop across all levels.
    Partition mode is the access mode in the parallel case:
    0 (default): access the next level of memory
    1: access buffers in neighboring processing units
    #LMEI other parallelism cases to be added
'''
def __init__(self, loop_order_list, loop_blockings_list,
loop_partitionings_list, para_loop_dim_list=None):
# NOTE(mgao12): no value validation here; cost model needs to abandon
# invalid mapping.
self.loop_orders = loop_order_list
self.loop_blockings = loop_blockings_list
self.loop_partitionings = loop_partitionings_list
self.para_loop_dim = para_loop_dim_list
def loop_order(self, loop):
'''
Loop order of the given loop.
Return a tuple of the order indices for the same loop at all buffer
levels, i.e., if a tuple for loop OX is returned, and the first
element of this tuple t[0] = 2, then loop OX is the third innermost
loop at the first buffer level.
Tuples are organized as the same order as loop enum order.
E.g., for a two-level memory hierarchy, each tuple contains two
elements, [(0, 0), (1, 1), (2, 4), (3, 5), (4, 3), (5, 2), (6, 6)]
        means that the first loop (FX = 0) is the innermost loop at both
        levels (tuple (0, 0)), etc. That is, it describes a loop structure such as:
for on
for oy
for ox
for oc
for ic
for fy
for fx
for on
for ic
for oc
for oy
for ox
for fy
for fx
...
'''
return self.loop_orders[loop]
def loop_blocking(self, loop):
'''
Loop blocking factors of the given loop.
Return a tuple of factors for the given loop at all buffer levels, from
inside level to outside level.
Tuples are organized as the same order as loop enum order.
E.g., [(4, 2), (8, 1), ...] means for the first loop (FX = 0), the
blocking factor is 4 for the innermost level, and 2 for the next level;
for the second loop (FY = 1), the blocking factor is 8 for the
innermost level, and 1 for the next level.
'''
return self.loop_blockings[loop]
def loop_partitioning(self, loop):
'''
Loop partitioning factors of the given loop.
Return a tuple of factors for the given loop at all buffer levels, from
inside level to outside level.
Tuples are organized as the same order as loop enum order.
E.g., [(4, 2), (8, 1), ...] means for the first loop (FX), it is
parallelized in 4 units for the first parallel level, and 2 for the
next level, etc..
'''
return self.loop_partitionings[loop]
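
if __name__ == "__main__":
    from . import loop_enum as le
    # Hedged sketch (illustrative values) of a mapping point for a two-level
    # buffer hierarchy, reusing the loop orders from the loop_order() docstring.
    orders = [(0, 0), (1, 1), (2, 4), (3, 5), (4, 3), (5, 2), (6, 6)]
    blockings = [(3, 1), (3, 1), (4, 14), (4, 14), (16, 8), (8, 8), (1, 1)]
    partitionings = [(1, 1)] * le.NUM
    point = MappingPoint(orders, blockings, partitionings)
    assert point.loop_order(le.OX) == (2, 4)
    assert point.loop_blocking(le.OC) == (16, 8)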
This diff is collapsed.
'''
Top level function of optimization framework
'''
from . import mapping_point_generator
from . import cost_model
from . import loop_enum as le
from . import buffer_enum as be
def opt_optimizer(resource, layer, hint=None, verbose=False):
'''
Evaluate the cost of each mapping point,
record the mapping_point with the smallest cost
'''
smallest_cost, perf, best_mapping_point = mapping_point_generator.opt_mapping_point_generator_function(resource, layer, hint, verbose)
access_list, array_cost = cost_model.get_access(best_mapping_point, layer, resource)
# total_cost = cost_model.get_cost(resource, best_mapping_point, layer, verbose)
# assert total_cost == smallest_cost
if verbose:
print("Optimal_Energy_(pJ): ", smallest_cost)
print("Runtime_(cycles):", perf)
#print("Best mapping_point: ", best_mapping_point.loop_blockings, best_mapping_point.loop_partitionings, best_mapping_point.loop_orders)
return [smallest_cost, best_mapping_point, perf]
def optimizer(resource, layer, hint=None, verbose=False):
    smallest_cost = float("inf")
    best_mapping_point = None  # stays None if the generator yields no valid mapping point
mp_generator = mapping_point_generator.mapping_point_generator_function(resource, layer, hint, verbose)
#counter = 0
for mapping_point in mp_generator:
#counter += 1
cost = cost_model.get_cost(resource, mapping_point, layer, verbose)
#if verbose:
# print "Current cost: ", cost
# print "Current mapping_point: ", mapping_point.loop_blockings, mapping_point.loop_orders
if cost < smallest_cost:
smallest_cost = cost
best_mapping_point = mapping_point
if verbose:
print("Current smallest cost: ", smallest_cost)
print("Current best mapping_point: ", mapping_point.loop_blockings, mapping_point.loop_orders)
#print counter
if verbose:
print(smallest_cost)
#print("Best mapping_point: ", best_mapping_point.loop_blockings, mapping_point.loop_partitionings, best_mapping_point.loop_orders)
return [smallest_cost, best_mapping_point]
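
# Hedged usage sketch (not runnable on its own; it needs a Resource and a
# Layer built from the other modules): both entry points search the mappings
# produced by mapping_point_generator and return the lowest-cost mapping found
# for the given hardware resource and layer, e.g.
#
#     cost, best_point = optimizer(resource, layer, verbose=True)
#     cost, best_point, cycles = opt_optimizer(resource, layer, verbose=True)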
'''
Hardware resource types.
'''
#import numpy as np
from collections import namedtuple
from operator import mul
import math
from functools import reduce
class Buffer(namedtuple('Buffer',
['capacity', 'access_cost', 'unit_static_cost'])):
'''
Buffer specification.
Immutable type.
Buffer attributes include capacity, access cost, unit static cost.
Capacity is for a single buffer (If current level has parallelism,
then it is the capacity of the buffer bank inside each parallel
units); access cost is the cost per access;
unit static cost is the static cost per time unit.
'''
pass
class Parallelism(namedtuple('Parallelism',
['count', 'access_mode', 'array_access_cost', 'array_dim', 'array_width'])):
'''
Parallelism specification.
Immutable type.
Parallelism attributes include count and access_mode.
Count is the number of parallel units.
    Access mode is how non-private data is accessed, for example whether it
    is fetched from a neighboring PE or from the next buffer level.
    Array access cost is the cost of accessing array-level buffers.
    Array dimension is the dimension of the PE array (1D or 2D).
    Array width is the width of the PE array: for a 1D array it equals the
    parallelism count; for a 2D (square) array it is sqrt(count).
Note: shared buffer level is the level
index of the lowest shared buffer for this parallelism.
'''
pass
class Resource(object):
'''
Hardware resource specification.
Hardware resource includes buffer hierarchy and parallel processing units.
buf_capacity_list: [1st level buffer size, 2nd level ...] (UNIT: Byte)
buf_access_cost_list: [1st level mem per access cost, 2nd level ...] (UNIT: pJ)
buf_unit_static_cost_list: [1st level mem static cost per time unit, 2nd level ...] (UNIT: pJ)
para_count_list: [1st level number of parallel units, 2nd level ...]
mac_capacity: [0, 1], determines whether MAC can buffer 1 output. (UNIT: Element)
    partition_mode: (aka 'parallel_mode' outside the class) selects the hardware parallel template
        [0: no parallelism, only hierarchical memory fetch,
         1: fetch from buffers in neighboring parallel units,
         2: broadcast.]
    array_access_cost: (aka 'parallel_cost' outside the class)
        per-access cost of fetching data from a neighboring PE
    array_dim: array dimension (currently 1D and square-shaped 2D are supported)
    utilization_threshold: minimum ratio of utilized units to total units at a parallel level
    replication: [True, False], whether an additional (third) loop dimension may be spatially unrolled
'''
def __init__(self, buf_capacity_list, buf_access_cost_list,
buf_unit_static_cost_list, para_count_list,
                 mac_capacity=1, partition_mode=None, array_access_cost=None,
                 array_dim=None, utilization_threshold=0, replication=True,
                 memory_partitions=[[0, 0, 0], [0, 0, 0], [0, 0, 0]],
                 invalid_underutilized=True):
# Buffers.
assert len(buf_capacity_list) == len(buf_access_cost_list)
assert len(buf_capacity_list) == len(buf_unit_static_cost_list)
assert len(buf_capacity_list) == len(para_count_list)
self.bufs = [Buffer(*t) for t in list(zip(buf_capacity_list, \
buf_access_cost_list, buf_unit_static_cost_list))]
self.num_levels = len(self.bufs)
# Parallelism.
array_access_costs = [None] * len(para_count_list)
if not partition_mode :
partition_mode = [0] * len(para_count_list)
else :
array_level = 0
for i in range(self.num_levels):
                # when using a non-default partition mode, the parallelism
                # count needs to be larger than 1
assert partition_mode[i] == 0 or para_count_list[i] <= 1 \
or (partition_mode[i] > 0 and para_count_list[i] > 1)
if partition_mode[i] == 1 or partition_mode[i] == 2:
array_access_costs[i] = array_access_cost[array_level]
array_level += 1
# "para_index" indicates which level do we have parallelism in
self.para_index = [i for i, e in enumerate(para_count_list) if e != 1]
# 2D array is default setting for paralleled level
# Define 1D array in arch file manually if needed, e.g. "array_dim": [1, 1, 1] ([@ mem level 1, 2, 3])
if not array_dim:
array_dim = [2 if e != 1 else 1 for e in para_count_list]
# LMEI always assume square-shape array, could change later
array_width = [para_count_list[i] if array_dim[i] == 1 else int(math.sqrt(para_count_list[i])) for i in range(self.num_levels)]
self.paras = [Parallelism(*t) for t in list(zip(para_count_list, \
partition_mode, array_access_costs, array_dim, array_width))]
self.access_cost = buf_access_cost_list
# If list does not contain 3 separate access costs for (inputs, weights, psum)
# assume they all have the same cost
if type(buf_access_cost_list[0]) is not list:
self.access_cost = [ [x]*3 for x in buf_access_cost_list ]
self.mac_capacity = mac_capacity
self.array_access_cost = array_access_cost
self.para_count_list = para_count_list
self.utilization_threshold = utilization_threshold
        # Copy so that appending below does not mutate the caller's list (or the shared default).
        self.memory_partitions = list(memory_partitions)
        # Do not check invalid_underutilized at the last memory level.
        self.memory_partitions.append([None] * 3)
self.replication = replication
self.invalid_underutilized = invalid_underutilized
@classmethod
def arch(cls, info):
return cls(info["capacity"], info["access_cost"], info["static_cost"],
info["parallel_count"], info["mac_capacity"], info["parallel_mode"],
info["parallel_cost"], info["array_dim"], info["utilization_threshold"], info["replication"],info["memory_partitions"], info['invalid_underutilized'])
def buffer_levels(self):
'''
Return total levels of buffers in the hierarchy.
'''
return self.num_levels
def buffer(self, level):
'''
Return the specification of the buffer of the given level.
'''
return self.bufs[level]
def parallelism(self, level):
'''
Return the specification of the parallelism of the given level.
'''
return self.paras[level]
def total_parallelism(self):
'''
Return the specification of the total parallelism.
'''
return reduce(mul, self.para_count_list, 1)
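
if __name__ == "__main__":
    # Hedged usage sketch (illustrative values): a three-level hierarchy with a
    # 256-PE array at the innermost level; capacities are in bytes, access and
    # static costs in pJ, as described in the class docstring.
    r = Resource(buf_capacity_list=[512, 131072, 16777216],
                 buf_access_cost_list=[1.0, 6.0, 200.0],
                 buf_unit_static_cost_list=[0, 0, 0],
                 para_count_list=[256, 1, 1])
    assert r.buffer_levels() == 3
    assert r.total_parallelism() == 256
    assert r.parallelism(0).array_width == 16   # square 2D array by default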
'''
Schedule hint
'''
from . import loop_enum as le
class Schedule(object):
'''
    schedule_hint: {loop index: [[loop order, loop blocking, loop partitioning @ 1st mem level],
                                 [@ 2nd mem level], [@ 3rd mem level], ...]}
loop blocking -> temporal loop size
loop partitioning -> spatial loop size (spatial unrolling / parallelism)
partition_loops: the loops which are allowed to be replicated (on top of the defined loop partitioning)
to improve HW utilization
hint_para_index: {mem level: [spatially unrolled loop indexes]}
'''
def __init__(self, schedule_hint, partition_loops=None):
self.schedule_hint = schedule_hint
        if partition_loops is not None:
            # Translate loop names (e.g. 'OX') into loop_enum indices.
            self.partition_loops = [le.loop_table[l] for l in partition_loops]
        else:
            self.partition_loops = None
num_levels = len(list(schedule_hint.values())[0])
hint_para_index = {}
for loop in schedule_hint:
for level in range(num_levels):
                if schedule_hint[loop][level] is not None and schedule_hint[loop][level][2] is not None:
if level not in hint_para_index:
hint_para_index[level] = [loop]
else:
hint_para_index[level].append(loop)
self.hint_para_index = hint_para_index
@classmethod
def schedule(cls, info):
return cls(info["schedule_hint"], info["partition_loops"])
from . import loop_enum as le
from . import buffer_enum as be
def print_loop_nest(point):
loop_orders = list(zip(*point.loop_orders))
loop_blockings = list(zip(*point.loop_blockings))
loop_partitionings = list(zip(*point.loop_partitionings))
para_dims = point.para_loop_dim
num_level = len(loop_orders)
order_lists = []
for level in range(num_level):
order_list = [None] * le.NUM
for order in range(le.NUM):
if loop_blockings[level][order] != 1 or loop_partitionings[level][order] != 1 :
order_list[loop_orders[level][order]] = (le.table[order],
loop_blockings[level][order],
loop_partitionings[level][order])
order_lists.append(order_list)
print(order_lists, para_dims)
def print_best_schedule(point):
loop_orders = list(zip(*point.loop_orders))
loop_blockings = list(zip(*point.loop_blockings))
loop_partitionings = list(zip(*point.loop_partitionings))
para_dims = point.para_loop_dim
num_level = len(loop_orders)
order_lists = []
for level in range(num_level):
print("\tLevel_Number: {}".format(level))
order_list = [None] * le.NUM
for order in range(le.NUM):
if loop_blockings[level][order] != 1 or loop_partitionings[level][order] != 1 :
order_list[loop_orders[level][order]] = (le.table[order],
loop_blockings[level][order],
loop_partitionings[level][order])
print("\t\tLoop_Name: {}, Loop_Bound: {}, Unrolling: {}".format(le.table[order], loop_blockings[level][order], loop_partitionings[level][order]))
order_lists.append(order_list)
#print(order_lists)
@@ -3,7 +3,7 @@ import numpy as np
import argparse
import math
import time
-import cnn_mapping as cm
+import interstellar as cm
import datetime
import json
......