"""
Cost model.
"""

# import numpy as np
from operator import mul
from operator import add
from functools import reduce
import copy
import math

from . import loop_enum as le
from . import buffer_enum as be


def get_comp_cost(layer):
    """
    Return the total number of MAC operations of the layer.

    The result is the product of the seven loop bounds read from ``layer``
    (OX*OY*IC*OC*ON*FX*FY), so it does not depend on any scheduling
    choice, nor on the input size or the input/filter stride.
    """
    loop_bounds = (
        layer.wofm,
        layer.hofm,
        layer.nifm,
        layer.nofm,
        layer.nimg,
        layer.wfil,
        layer.hfil,
    )
    return reduce(mul, loop_bounds)


def get_ideal_performance(layer, resource):
    """
    Return the ideal runtime in cycles, assuming 100% PE array utilization.

    Ideal # of cycles = ceil(total # of MACs / total # of PEs), where the
    PE count is the product of ``resource.para_count_list``.

    #LMEI Needs to be modified later when adding precision-scalable PEs:
    the # of functional PEs will then depend on the precision mode.
    """
    mac_count = get_comp_cost(layer)

    pe_count = 1
    for count in resource.para_count_list:
        pe_count = pe_count * count

    return math.ceil(mac_count * 1.0 / pe_count)


def get_layer_size(layer):
    """
    Return ``[ifmap_size, ofmap_size, filter_size]`` of the layer, in elements.

    #LMEI ifmap_size could instead be derived from the ofmap size and the
    input stride (IS) / filter stride (FS):
    IX = IS*(OX-1) + FS*(FX-1) + 1
    wifm = wistd*(wofm-1) + wfstd*(wfil-1) + 1
    """
    batch = layer.nimg
    in_channels = layer.nifm
    out_channels = layer.nofm

    return [
        layer.wifm * layer.hifm * in_channels * batch,
        layer.wofm * layer.hofm * out_channels * batch,
        layer.wfil * layer.hfil * in_channels * out_channels,
    ]


def get_hinted_para(level, hint):
    """
    Return the total spatial unrolling size requested by the schedule hint
    at ``level``.

    For every loop index present in ``hint``, the third field of
    ``hint[loop][level]`` is taken as that loop's unrolling factor; the
    factors of all hinted loops are multiplied together.
    """
    assert hint

    unroll_factors = [
        hint[loop][level][2] for loop in range(le.NUM) if loop in hint
    ]
    return reduce(mul, unroll_factors, 1)


def valid_dataflow(resource, hint):
    """
    Check that the hinted spatial unrolling meets the HW utilization requirement.

    For every buffer level whose parallelism count is not 1, the hinted
    unrolling size must be at least
    ``count * resource.utilization_threshold``. Returns False as soon as
    one level falls short, True otherwise.
    """
    for level in range(resource.buffer_levels()):
        hw_count = resource.paras[level].count
        if hw_count == 1:
            # No parallelism at this level, nothing to check.
            continue
        required = hw_count * resource.utilization_threshold
        if get_hinted_para(level, hint) < required:
            return False

    return True

def get_if_access(resource, point, layer, mac_capacity=1):
    """
    Returns the number of accesses to the inputs for each level.

    For every buffer level the count is the product of four terms:

    * tiling     -- the blocking factors of all loops at this level; the OC
                    factor is skipped unless OC's order index at this level is
                    greater than the smallest order index among
                    OX, OY, IC, ON, FX and FY.
    * block size -- the input block held by the levels below: for each lower
                    level, the blocking and partitioning factors of the loops
                    other than OC, FX, FY, where the OX extent is widened by
                    the FX halo (OX + FX - 1) and the OY extent by the FY
                    halo (OY + FY - 1).
    * num blocks -- the blocking factors of all loops at the levels above.
    * ``resource.paras[level].count``.

    ``layer`` and ``mac_capacity`` are accepted for signature compatibility
    and are not used.

    Returns a list with one entry per buffer level (index 0 first).
    """

    irrelevant_loops = [le.OC, le.FX, le.FY]

    num_levels = resource.buffer_levels()
    orders = point.loop_orders
    blockings = point.loop_blockings
    partitionings = point.loop_partitionings

    access_counts_per_level = []

    for level in range(num_levels):
        # Innermost loop among the input-related loops. FX and FY are only
        # partially relevant to the input, so they take part in this search
        # and are never dropped from the tiling below.
        lowest_input_loop_index = min(
            orders[loop][level]
            for loop in (le.OX, le.OY, le.IC, le.ON, le.FX, le.FY)
        )

        tiling = 1
        for loop in range(le.NUM):
            if loop == le.OC and orders[loop][level] <= lowest_input_loop_index:
                # OC is not ordered above the innermost input loop: ignore it.
                continue
            tiling *= blockings[loop][level]

        # Input block size contributed by the levels below.
        block_size = 1
        for lower_level in range(level - 1, -1, -1):
            for loop in range(le.NUM):
                if loop in irrelevant_loops:
                    continue
                extent = blockings[loop][lower_level]
                if loop == le.OX:
                    # Sliding window along the width: OX + FX - 1.
                    extent += blockings[le.FX][lower_level] - 1
                elif loop == le.OY:
                    # Sliding window along the height: OY + FY - 1.
                    # (The previous code paired FY with OX and then counted
                    # OY a second time without its halo.)
                    extent += blockings[le.FY][lower_level] - 1
                block_size *= extent
                block_size *= partitionings[loop][lower_level]

        # Number of blocks = tiling of all the levels above this one.
        num_blocks = 1
        for upper_level in range(level + 1, num_levels):
            for loop in range(le.NUM):
                num_blocks *= blockings[loop][upper_level]

        access_counts_per_level.append(
            tiling * block_size * num_blocks * resource.paras[level].count
        )

    return access_counts_per_level


def get_if_access_old(level, point, layer, mac_capacity=1):
    """
    Get per element # of access of Input at current level (older model).

    Not accurate because [FX, FY] are not totally irrelevant terms for ifmap.
    #LMEI Needs to be modified by using the concept of the dataset.

    The result is the product of the FX, FY and OC blocking factors (from
    this level upward) and partitioning factors (from this level upward),
    integer-divided by ``layer.wstd * layer.hstd``. A loop whose order index
    at this level is smaller than that of every input loop [OX, OY, IC, ON]
    has its blocking factor at this level left out.
    """
    if level == 0 and mac_capacity == 0:
        # Special case: level 0 with mac_capacity == 0 uses the full layer
        # bounds of FX, FY and OC directly.
        full_reuse = layer.wfil * layer.hfil * layer.nofm
        return full_reuse // (layer.wstd * layer.hstd)

    orders = point.loop_orders
    reuse_loops = (le.FX, le.FY, le.OC)

    # Order index of the innermost loop among [OX, OY, IC, ON].
    innermost_input = min(
        orders[loop][level] for loop in (le.OX, le.OY, le.IC, le.ON)
    )

    temporal = 1
    for loop in reuse_loops:
        # Start one level higher when the loop sits inside the innermost
        # input loop at this level.
        start = level + (1 if orders[loop][level] < innermost_input else 0)
        temporal *= reduce(mul, point.loop_blockings[loop][start:], 1)

    # Unrolled loops have no relative order, so they always count from
    # the current level.
    spatial = 1
    for loop in reuse_loops:
        spatial *= reduce(mul, point.loop_partitionings[loop][level:], 1)

    return temporal * spatial // (layer.wstd * layer.hstd)
def get_of_access(resource, point, layer, mac_capacity=1):
    """
    Returns the number of accesses to the outputs for each level.

    Same scheme as the input/filter variants: for every buffer level,
    accesses = tiling at this level * block size * number of blocks *
    ``resource.paras[level].count``, where the loops irrelevant to the
    output are [FX, FY, IC].

    ``layer`` and ``mac_capacity`` are accepted for signature compatibility
    and are not used.
    """
    irrelevant_loops = [le.FX, le.FY, le.IC]
    all_loops = range(le.NUM)

    orders = point.loop_orders
    blockings = point.loop_blockings
    partitionings = point.loop_partitionings

    num_levels = resource.buffer_levels()
    access_counts_per_level = []

    for level in range(num_levels):
        # Innermost loop among the output-related loops at this level.
        innermost_relevant = min(
            orders[loop][level] for loop in (le.OX, le.OY, le.OC, le.ON)
        )

        # Tiling at this level: an irrelevant loop only counts when its order
        # index is greater than that of the innermost relevant loop.
        tiling = reduce(
            mul,
            (
                blockings[loop][level]
                for loop in all_loops
                if loop not in irrelevant_loops
                or orders[loop][level] > innermost_relevant
            ),
            1,
        )

        # Block size: relevant loops only, over all the levels below.
        block_size = reduce(
            mul,
            (
                blockings[loop][lower] * partitionings[loop][lower]
                for lower in range(level)
                for loop in all_loops
                if loop not in irrelevant_loops
            ),
            1,
        )

        # Number of blocks: every blocking factor of the levels above.
        num_blocks = reduce(
            mul,
            (
                blockings[loop][upper]
                for upper in range(level + 1, num_levels)
                for loop in all_loops
            ),
            1,
        )

        access_counts_per_level.append(
            tiling * block_size * num_blocks * resource.paras[level].count
        )

    return access_counts_per_level


def get_of_access_old(level, point, layer, mac_capacity=1):
    """
    Get per element # of access of Output at current level (older model).

    For output:
    relevant terms   [OX, OY, OC, ON]
    irrelevant terms [FX, FY, IC]

    Calculating rule:
    At the lowest mem level with ``mac_capacity == 0``, the per element
    access is the product of the full layer bounds of the irrelevant
    terms [FX, FY, IC].

    Otherwise, first check for a stationary possibility: an irrelevant loop
    whose order index at this level is smaller than that of every relevant
    loop is excluded from this level's count, because it has already been
    accounted for at the lower level.

    Then multiply all the irrelevant terms from the current level to the
    highest level, including both the temporal part (blocking) and the
    spatial part (partitioning).
    """
    if level == 0 and mac_capacity == 0:
        return layer.wfil * layer.hfil * layer.nifm

    orders = point.loop_orders
    reuse_loops = (le.FX, le.FY, le.IC)

    innermost_relevant = min(
        orders[loop][level] for loop in (le.OX, le.OY, le.OC, le.ON)
    )

    temporal = 1
    for loop in reuse_loops:
        start = level + (1 if orders[loop][level] < innermost_relevant else 0)
        temporal *= reduce(mul, point.loop_blockings[loop][start:], 1)

    spatial = 1
    for loop in reuse_loops:
        spatial *= reduce(mul, point.loop_partitionings[loop][level:], 1)

    return temporal * spatial


def get_fl_access(resource, point, layer, mac_capacity=1):
    """
    Returns the number of accesses to the filter (weights) for each level.

    For every buffer level,
    accesses = tiling at this level * block size * number of blocks *
    ``resource.paras[level].count``, where the loops irrelevant to the
    filter are [OX, OY, ON].

    ``layer`` and ``mac_capacity`` are accepted for signature compatibility
    and are not used.
    """
    irrelevant_loops = [le.OX, le.OY, le.ON]
    all_loops = range(le.NUM)

    orders = point.loop_orders
    blockings = point.loop_blockings
    partitionings = point.loop_partitionings

    num_levels = resource.buffer_levels()
    access_counts_per_level = []

    for level in range(num_levels):
        # Innermost loop among the filter-related loops at this level.
        innermost_relevant = min(
            orders[loop][level] for loop in (le.FX, le.FY, le.IC, le.OC)
        )

        # OX, OY, ON only count when their order index is greater than that
        # of the innermost filter loop.
        tiling = reduce(
            mul,
            (
                blockings[loop][level]
                for loop in all_loops
                if loop not in irrelevant_loops
                or orders[loop][level] > innermost_relevant
            ),
            1,
        )

        # Block size: filter-related loops only, over all the levels below.
        block_size = reduce(
            mul,
            (
                blockings[loop][lower] * partitionings[loop][lower]
                for lower in range(level)
                for loop in all_loops
                if loop not in irrelevant_loops
            ),
            1,
        )

        # Number of blocks: every blocking factor of the levels above.
        num_blocks = reduce(
            mul,
            (
                blockings[loop][upper]
                for upper in range(level + 1, num_levels)
                for loop in all_loops
            ),
            1,
        )

        access_counts_per_level.append(
            tiling * block_size * num_blocks * resource.paras[level].count
        )

    return access_counts_per_level


def get_fl_access_old(level, point, layer, mac_capacity=1):
    """
    Get per element # of access of Weight at current level (older model).

    For filter:
    relevant terms   [FX, FY, IC, OC]
    irrelevant terms [OX, OY, ON]

    Calculating rule:
    At the lowest mem level with ``mac_capacity == 0``, the per element
    access is the product of the full layer bounds of the irrelevant
    terms [OX, OY, ON].

    Otherwise, first check for a stationary possibility: an irrelevant loop
    whose order index at this level is smaller than that of every relevant
    loop is excluded from this level's count, because it has already been
    accounted for at the lower level.

    Then multiply all the irrelevant terms from the current level to the
    highest level, including both the temporal part (blocking) and the
    spatial part (partitioning).
    """
    if level == 0 and mac_capacity == 0:
        return layer.wofm * layer.hofm * layer.nimg

    orders = point.loop_orders
    reuse_loops = (le.OX, le.OY, le.ON)

    innermost_relevant = min(
        orders[loop][level] for loop in (le.FX, le.FY, le.IC, le.OC)
    )

    temporal = 1
    for loop in reuse_loops:
        start = level + (1 if orders[loop][level] < innermost_relevant else 0)
        temporal *= reduce(mul, point.loop_blockings[loop][start:], 1)

    spatial = 1
    for loop in reuse_loops:
        spatial *= reduce(mul, point.loop_partitionings[loop][level:], 1)

    return temporal * spatial


def opt_get_if_access(level, point, ba_arr, pa_arr):
    """
    Get # access of if block at current level.

    The repeated access to ifmap is determined by the blocking factors and
    parallelism counts of the loops other than the ifmap-related ones
    (i.e. FX, FY, OC) outside of this level.

    At the same buffer level, if such a loop is outside of the innermost
    ifmap-related loop, its blocking factor at this level also contributes
    to the number of accesses.

    ``ba_arr[loop][k]`` / ``pa_arr[loop][k]`` are looked up in place of the
    products ``reduce(mul, loop_blockings[loop][k:], 1)`` /
    ``reduce(mul, loop_partitionings[loop][k:], 1)`` -- presumably they are
    the precomputed accumulations; confirm against the caller.
    """
    orders = point.loop_orders
    innermost_input = min(
        orders[loop][level] for loop in (le.OX, le.OY, le.IC, le.ON)
    )

    accesses = 1
    # Temporal part: skip this level's factor when the loop sits inside the
    # innermost ifmap-related loop.
    for loop in (le.FX, le.FY, le.OC):
        skip = 1 if orders[loop][level] < innermost_input else 0
        accesses *= ba_arr[loop][level + skip]
    # Spatial part: always taken from the current level.
    for loop in (le.FX, le.FY, le.OC):
        accesses *= pa_arr[loop][level]

    return accesses


def opt_get_of_access(level, point, ba_arr, pa_arr):
    """
    Get # access of of block at current level.

    Same rule as the ifmap routine, with [OX, OY, OC, ON] as the
    ofmap-related loops and [FX, FY, IC] as the loops causing repeated
    accesses.

    ``ba_arr[loop][k]`` / ``pa_arr[loop][k]`` are looked up in place of the
    products ``reduce(mul, loop_blockings[loop][k:], 1)`` /
    ``reduce(mul, loop_partitionings[loop][k:], 1)`` -- presumably they are
    the precomputed accumulations; confirm against the caller.
    """
    orders = point.loop_orders
    innermost_output = min(
        orders[loop][level] for loop in (le.OX, le.OY, le.OC, le.ON)
    )

    # TODO (carried over from the original implementation, no details given)
    accesses = 1
    # Temporal part: skip this level's factor when the loop sits inside the
    # innermost ofmap-related loop.
    for loop in (le.FX, le.FY, le.IC):
        skip = 1 if orders[loop][level] < innermost_output else 0
        accesses *= ba_arr[loop][level + skip]
    # Spatial part: always taken from the current level.
    for loop in (le.FX, le.FY, le.IC):
        accesses *= pa_arr[loop][level]

    return accesses


def opt_get_fl_access(level, point, ba_arr, pa_arr):
    """
    Get # access of fl block at current level.

    Same rule as the ifmap routine, with [FX, FY, IC, OC] as the
    filter-related loops and [OX, OY, ON] as the loops causing repeated
    accesses.

    ``ba_arr[loop][k]`` / ``pa_arr[loop][k]`` are looked up in place of the
    products ``reduce(mul, loop_blockings[loop][k:], 1)`` /
    ``reduce(mul, loop_partitionings[loop][k:], 1)`` -- presumably they are
    the precomputed accumulations; confirm against the caller.
    """
    orders = point.loop_orders
    innermost_filter = min(
        orders[loop][level] for loop in (le.FX, le.FY, le.IC, le.OC)
    )

    accesses = 1
    # Temporal part: skip this level's factor when the loop sits inside the
    # innermost filter-related loop.
    for loop in (le.OX, le.OY, le.ON):
        skip = 1 if orders[loop][level] < innermost_filter else 0
        accesses *= ba_arr[loop][level + skip]
    # Spatial part: always taken from the current level.
    for loop in (le.OX, le.OY, le.ON):
        accesses *= pa_arr[loop][level]

    return accesses


def get_if_size(blocking_accum_list, partitioning_accum_list, partitioning_list, layer):
    """
    Get size of if block at current level, including both the temporal and
    the spatial loop part.

    blocking     -> temporal loop part
    partitioning -> spatial  loop part

    The block is (FX + (OX-1)*wstd) wide and (FY + (OY-1)*hstd) high, times
    the IC and ON extents, and is duplicated by the OC partitioning factor.

    #LMEI to support filter stride (FS) later;
    right now, FS/wfstd = 1 in
    IX = IS*(OX-1) + FS*(FX-1) + 1 or
    wifm = wistd*(wofm-1) + wfstd*(wfil-1) + 1

    #LMEI (new HW template) no need for Input Duplication when OC partitions,
     by letting one reg broadcast the Input to a row of OC-partitioned PEs
     and removing the inner PE ifmap register
    """

    def extent(loop):
        # Temporal and spatial accumulated factors of one loop combined.
        return blocking_accum_list[loop] * partitioning_accum_list[loop]

    width = extent(le.FX) + (extent(le.OX) - 1) * layer.wstd
    height = extent(le.FY) + (extent(le.OY) - 1) * layer.hstd

    block = width * height * extent(le.IC) * extent(le.ON)
    # Duplication when OC partitions
    return block * partitioning_list[le.OC]


def get_of_size(blocking_accum_list, partitioning_accum_list, partitioning_list):
    """
    Get size of of block at current level, including both the temporal and
    the spatial loop part.

    #LMEI (new HW template) no need for Output Duplication when IC, FX or FY
     partitions, by letting the output data from a row of IC-, FX- or
     FY-partitioned PEs add together and removing the inner PE ofmap register
    """
    size = 1
    for loop in (le.OX, le.OY, le.OC, le.ON):
        size *= blocking_accum_list[loop] * partitioning_accum_list[loop]

    # Duplication when IC, FX or FY partitions
    for loop in (le.IC, le.FX, le.FY):
        size *= partitioning_list[loop]

    return size


def get_fl_size(blocking_accum_list, partitioning_accum_list, partitioning_list):
    """
    Get size of fl block at current level, including both the temporal and
    the spatial loop part.

    #LMEI (new HW template) no need for Weight Duplication when OX, OY or ON
     partitions, by letting one reg broadcast the Weight to a row of OX-, OY-
     or ON-partitioned PEs and removing the inner PE weight register
    """
    size = 1
    for loop in (le.FX, le.FY, le.IC, le.OC):
        size *= blocking_accum_list[loop] * partitioning_accum_list[loop]

    # Duplication when OX, OY or ON partitions
    for loop in (le.OX, le.OY, le.ON):
        size *= partitioning_list[loop]

    return size


def get_if_bank_size(blocking_accum_list, layer):
    """
    Get size of if block at current level, temporal loop part only.

    blocking -> temporal loop part

    #LMEI to support filter stride (FS) later;
    right now, FS/wfstd = 1 in
    IX = IS*(OX-1) + FS*(FX-1) + 1 or
    wifm = wistd*(wofm-1) + wfstd*(wfil-1) + 1
    """
    acc = blocking_accum_list

    width = acc[le.FX] + (acc[le.OX] - 1) * layer.wstd
    height = acc[le.FY] + (acc[le.OY] - 1) * layer.hstd
    plane = width * height

    return plane * acc[le.IC] * acc[le.ON]


def get_of_bank_size(blocking_accum_list):
    """
    Get size of of block at current level, temporal loop part only.

    blocking -> temporal loop part
    """
    output_loops = (le.OX, le.OY, le.OC, le.ON)
    return reduce(mul, (blocking_accum_list[loop] for loop in output_loops))


def get_fl_bank_size(blocking_accum_list):
    """
    Get size of fl block at current level, temporal loop part only.

    blocking -> temporal loop part
    """
    filter_loops = (le.FX, le.FY, le.IC, le.OC)
    return reduce(mul, (blocking_accum_list[loop] for loop in filter_loops))


def get_array_access_and_cost(level, para, access_list, point):
    """
    Get the access at array level from the access at the
    lower level of memory hierarchy.

    ``access_list`` is ``[if_block_access, of_block_access, fl_block_access]``.
    Each access count is scaled by the unrolling factors (at ``level``) of
    the loops that are irrelevant to that data type:
    ifmap -> FX, FY, OC; ofmap -> FX, FY, IC; filter -> OX, OY, ON.

    Returns ``[array_access, array_cost]``:

    * access mode 1: ``array_access`` holds the "nearest" entry followed by
      one "far" entry per array dimension, and ``array_cost`` is
      ``[nearest_pe_cost] + across_block_cost``. When a dimension unrolls two
      loops, the first is counted as nearest and the second as far, and that
      dimension's cost is the per-access cost times the first loop's factor.
      A far factor of exactly 1 is turned into 0, so that entry is 0.
    * access mode 2: every unrolled loop counts as nearest; a single access
      entry and a single cost are returned.

    NOTE(review): the physical meaning of "nearest" vs. "far" is not spelled
    out in this file -- confirm against the HW template description.
    """
    mode = para.access_mode
    # Only access modes 1 and 2 are handled below.
    assert mode == 1 or mode == 2

    num_dims = para.array_dim
    hop_cost = para.array_access_cost * 1.0

    if_access, of_access, fl_access = access_list
    # Unrolling factor of every loop at this level.
    level_partitions = list(zip(*point.loop_partitionings))[level]
    loops_per_dim = point.para_loop_dim[level]

    nearest = [1] * le.NUM
    far_per_dim = []
    far_cost = [0] * num_dims

    def nearest_access():
        # [ifmap, ofmap, filter] accesses scaled by the nearest factors.
        return [
            if_access * nearest[le.FX] * nearest[le.FY] * nearest[le.OC],
            of_access * nearest[le.FX] * nearest[le.FY] * nearest[le.IC],
            fl_access * nearest[le.OX] * nearest[le.OY] * nearest[le.ON],
        ]

    if mode == 1:
        for dim, loops in enumerate(loops_per_dim):
            far = [1] * le.NUM
            far_per_dim.append(far)
            if len(loops) == 1:
                only_loop = loops[0]
                nearest[only_loop] = level_partitions[only_loop]
            else:
                inner_loop, outer_loop = loops
                nearest[inner_loop] = level_partitions[inner_loop]
                far[outer_loop] = level_partitions[outer_loop]
                far_cost[dim] = hop_cost * level_partitions[inner_loop]

        array_access = [nearest_access()]

        irrelevant_groups = (
            (if_access, (le.FX, le.FY, le.OC)),
            (of_access, (le.FX, le.FY, le.IC)),
            (fl_access, (le.OX, le.OY, le.ON)),
        )
        for dim in range(num_dims):
            far = far_per_dim[dim]
            far_entry = []
            for block_access, (first, second, third) in irrelevant_groups:
                factor = far[first] * far[second] * far[third]
                if factor == 1:
                    # Nothing unrolled as "far" for this data type.
                    factor = 0
                far_entry.append(block_access * factor)
            array_access.append(far_entry)

        return [array_access, [hop_cost] + far_cost]

    elif mode == 2:
        for loops in loops_per_dim:
            for loop in loops:
                nearest[loop] = level_partitions[loop]

        return [[nearest_access()], [hop_cost]]


def get_access(point, layer, resource):
    """
    Get the total access of each block at each level.

    The per-level accesses come from get_if_access, get_of_access and
    get_fl_access and are zipped into (if_block_access, of_block_access,
    fl_block_access) entries.  For every entry of ``resource.paras`` whose
    ``access_mode`` is non-zero, an extra array-level entry (as returned by
    get_array_access_and_cost) is inserted right after that level.

    Assume all the buffers are inclusive, so buffers in lower level
    appear in higher level as well.

    For the parallelism case assume read from next memory level.

    Args:
        point: mapping point, forwarded to the access helpers.
        layer: layer description, forwarded to the access helpers.
        resource: hardware description; ``buffer_levels()``,
            ``mac_capacity`` and ``paras`` are read from it.

    Returns:
        ``[access_list, array_costs]`` where ``access_list`` is the
        per-level access list including the inserted array-level entries,
        and ``array_costs`` holds one cost entry per parallel level.
    """
    # TODO support more customized memory
    # TODO more access at overlapped boundary

    num_levels = resource.buffer_levels()
    mac_capacity = resource.mac_capacity

    if_accesses = get_if_access(resource, point, layer, mac_capacity)
    of_accesses = get_of_access(resource, point, layer, mac_capacity)
    fl_accesses = get_fl_access(resource, point, layer, mac_capacity)
    access_list = list(zip(if_accesses, of_accesses, fl_accesses))

    # Levels whose parallel array uses a non-default access mode.
    para_mode_level = [i for i, e in enumerate(resource.paras) if e.access_mode != 0]

    array_costs = []
    # Every array-level entry inserted so far shifts the following levels
    # by one position; ``delta`` is the number of such insertions.
    for delta, level in enumerate(para_mode_level):
        insert_at = level + delta + 1
        if insert_at >= num_levels:
            next_level_access = [1, 1, 1]
        else:
            # zip() produced tuples, so take a mutable list copy before
            # adjusting the ofmap entry (item assignment on the tuple
            # itself raises TypeError).
            next_level_access = list(access_list[insert_at])
            # NOTE(review): presumably undoes a ``2 * x - 1`` encoding of
            # the ofmap access count -- confirm against get_of_access.
            next_level_access[1] = (next_level_access[1] + 1) / 2
        array_access, array_cost = get_array_access_and_cost(
            level, resource.paras[level], next_level_access, point
        )
        array_costs.append(array_cost)
        access_list.insert(insert_at, array_access)

    return [access_list, array_costs]

sgauthamr2001's avatar
sgauthamr2001 committed

def opt_get_access(num_levels, point, mac_capacity):
    """
    Optimized variant of get_access: per-level block access counts.

    For every loop the blocking and partitioning factors are first turned
    into reversed cumulative products: entry ``[i][level]`` is the product
    of the factors of loop ``i`` from ``level`` up to the last level, and
    the extra entry at index ``num_levels`` is 1.  Both tables are then
    handed to opt_get_if_access, opt_get_of_access and opt_get_fl_access.

    Args:
        num_levels: number of levels to evaluate.
        point: mapping point; ``point.loop_blockings[i][level]`` and
            ``point.loop_partitionings[i][level]`` are read for every loop
            ``i`` in ``range(le.NUM)``.
        mac_capacity: currently unused (see TODO below).

    Returns:
        A list with one ``[if_access, of_access, fl_access]`` entry per
        level.  The ofmap entry is stored as ``2 * access - 1``.  Plain
        lists are used because numpy is not imported by this module.
    """
    # TODO support mac_capacity

    # blocking_accum_arr / partitioning_accum_arr are reversed cumulative
    # products of the per-level factors, one row per loop.
    blocking_accum_arr = []
    partitioning_accum_arr = []
    for i in range(le.NUM):
        ba_current_level = [1]
        pa_current_level = [1]
        ba_tmp = 1
        pa_tmp = 1
        # Walk from the last level down to level 0, accumulating products.
        for level in range(num_levels - 1, -1, -1):
            ba_tmp = ba_tmp * point.loop_blockings[i][level]
            pa_tmp = pa_tmp * point.loop_partitionings[i][level]
            ba_current_level.append(ba_tmp)
            pa_current_level.append(pa_tmp)

        blocking_accum_arr.append(ba_current_level[::-1])
        partitioning_accum_arr.append(pa_current_level[::-1])

    access_arr = []
    for level in range(num_levels):
        if_access = opt_get_if_access(
            level, point, blocking_accum_arr, partitioning_accum_arr
        )
        of_access = (
            2
            * opt_get_of_access(
                level, point, blocking_accum_arr, partitioning_accum_arr
            )
            - 1
        )
        fl_access = opt_get_fl_access(
            level, point, blocking_accum_arr, partitioning_accum_arr
        )
        access_arr.append([if_access, of_access, fl_access])

    return access_arr
sgauthamr2001's avatar
sgauthamr2001 committed

def get_bank_size(point, layer, level):
    """
    Return the (ifmap, ofmap, filter) bank sizes at the given level.

    For each loop, the blocking factors of levels 0..level are multiplied
    together; the resulting per-loop list is passed to the
    get_*_bank_size helpers.
    """
    depth = level + 1
    accumulated_blocking = [
        reduce(mul, point.loop_blocking(loop)[:depth], 1)
        for loop in range(le.NUM)
    ]

    return (
        get_if_bank_size(accumulated_blocking, layer),
        get_of_bank_size(accumulated_blocking),
        get_fl_bank_size(accumulated_blocking),
    )

sgauthamr2001's avatar
sgauthamr2001 committed

def get_block_size(point, layer, level):
    """
    Compute the ifmap, ofmap and filter block sizes at the given level.

    Returns a tuple (if_block_size, of_block_size, fl_block_size) obtained
    from get_if_size, get_of_size and get_fl_size.
    """
    # Partitioning factors of all loops at exactly this level.
    level_partitioning = list(zip(*point.loop_partitionings))[level]

    depth = level + 1

    def prefix_product(factors):
        # Product of the factors of levels 0..level (1 if there are none).
        return reduce(mul, factors[:depth], 1)

    # FIXME inclusive mode also duplicates data
    per_loop = [
        (
            prefix_product(point.loop_blocking(loop)),
            prefix_product(point.loop_partitioning(loop)),
        )
        for loop in range(le.NUM)
    ]
    blocking_products = [blocking for blocking, _ in per_loop]
    partitioning_products = [partitioning for _, partitioning in per_loop]

    size_args = (blocking_products, partitioning_products, level_partitioning)
    if_block_size = get_if_size(*size_args, layer)
    of_block_size = get_of_size(*size_args)
    fl_block_size = get_fl_size(*size_args)

    return (if_block_size, of_block_size, fl_block_size)


def get_block_sizes(num_levels, point, layer):
sgauthamr2001's avatar
sgauthamr2001 committed
    """
    Get size of ifmap, ofmap, filter
sgauthamr2001's avatar
sgauthamr2001 committed
    """
    bank_list = []