cost_model.py

"""
Cost model.
"""

# import numpy as np
from operator import mul
from operator import add
from functools import reduce
import copy
import math

from . import loop_enum as le
from . import buffer_enum as be


def get_comp_cost(layer):
    """
    Compute the total # of MAC computation, it is independent of other optimizations

    Also it is independent of input size and input/filter stride
    Total # of computation = OX*OY*IC*OC*ON*FX*FY
    """
    cost = (
        layer.wofm
        * layer.hofm
        * layer.nifm
        * layer.nofm
        * layer.nimg
        * layer.wfil
        * layer.hfil
    )
    return cost


def get_ideal_performance(layer, resource):
    """
    Compute the ideal runtime in cycles by assuming 100% PE array utilization
    Ideal # of cycles = Total # of MAC computation / Total # of PEs

    #LMEI Need to be modified if later when adding precision-scalable PE.
    # of functional PE will change depending on different precision modes.
    """
    total_comp = get_comp_cost(layer)
    number_pe = reduce(mul, resource.para_count_list, 1)
    runtime = math.ceil(total_comp * 1.0 / number_pe)

    return runtime


def get_layer_size(layer):
    """
    Get size of ifmap, ofmap, filter of the layer

    #LMEI ifmap_size should be able to calculate based on ofmap_size and input stride(IS) /filter stride(FS)
    IX = IS*(OX-1) + FS*(FX-1) + 1
    wifm = wistd*(wofm-1) + wfstd*(wfil-1) + 1
    """

    ifmap_size = layer.wifm * layer.hifm * layer.nifm * layer.nimg
    ofmap_size = layer.wofm * layer.hofm * layer.nofm * layer.nimg
    flmap_size = layer.wfil * layer.hfil * layer.nifm * layer.nofm

    return [ifmap_size, ofmap_size, flmap_size]


def get_hinted_para(level, hint):
    """
    Get the actual total spatial unrolling size from loop schedule
    """
    assert hint

    hinted_para = 1
    for loop in range(le.NUM):
        if loop in hint:
            hinted_loop_para = hint[loop][level][2]
            hinted_para *= hinted_loop_para

    return hinted_para


def valid_dataflow(resource, hint):
    """
    Check if the actual spatial unrolling size from loop schedule meets the HW utilization requirement
    by comparing it with real HW parallelism size * utilization threshold.
    """
    num_levels = resource.buffer_levels()

    for level in range(num_levels):
        if resource.paras[level].count != 1 and get_hinted_para(level, hint) < (
            resource.paras[level].count * resource.utilization_threshold
        ):
            return False

    return True


def get_if_access(resource, point, layer, mac_capacity=1):
    """
    Returns the number of accesses to the inputs for each level.
    """

    irrelevant_loops = [le.OC, le.FX, le.FY]

    num_levels = resource.buffer_levels()
    access_counts_per_level = []

    for level in range(num_levels):
        # general idea: total number of accesses = tiling at current level * block size * num_blocks
        # block size = tiling without the irrelevant loops
        # num_blocks = tiling at the levels above the current level

        # multiply all the tiling factors at the current level

        def multiply_tiling_factors():
            # find the innermost loop among [OX, OY, IC, ON]
            lowest_input_loop_index = min(
                point.loop_orders[le.OX][level],
                point.loop_orders[le.OY][level],
                point.loop_orders[le.IC][level],
                point.loop_orders[le.ON][level],
                # these are partially relevant
                point.loop_orders[le.FX][level],
                point.loop_orders[le.FY][level],
            )

            # we can ignore OC if it is at a lower level than the innermost input loop
            # FX, FY can't be ignored, because they are partially relevant
            tiling = 1
            for i in range(le.NUM):
                if i in [le.OC]:
                    if point.loop_orders[i][level] > lowest_input_loop_index:
                        # if the loop is at a higher level than the innermost input loop, we need to consider it
                        tiling *= point.loop_blockings[i][level]
                else:
                    tiling *= point.loop_blockings[i][level]
            return tiling

        # remove all the irrelevant loops from the tiling of the levels below
        def calculate_block_size():
            block_size = 1

            for lower_level in range(level - 1, -1, -1):
                for i in range(le.NUM):
                    if i not in irrelevant_loops:
                        if i == le.OX:
                            block_size *= point.loop_blockings[i][lower_level] + (
                                point.loop_blockings[le.FX][lower_level] - 1
                            )
                        elif i == le.OY:
                            block_size *= point.loop_blockings[i][lower_level] + (
                                point.loop_blockings[le.FY][lower_level] - 1
                            )
                        else:
                            block_size *= point.loop_blockings[i][lower_level]
                        block_size *= point.loop_partitionings[i][lower_level]

            return block_size

        def get_num_blocks():
            # get tiling of the levels above the current level
            num_blocks = 1
            for i in range(level + 1, num_levels):
                for j in range(le.NUM):
                    num_blocks *= point.loop_blockings[j][i]
            return num_blocks

        access_counts_per_level.append(
            multiply_tiling_factors()
            * calculate_block_size()
            * get_num_blocks()
            * resource.paras[level].count
        )

    # print("Accesses at each level: ", access_counts_per_level)

    return access_counts_per_level


def get_if_access_old(level, point, layer, mac_capacity=1):
    """
    Get per element # of access of Input at current level

    Not accurate because [FX, FY] is not totally irrelevant terms for ifmap..
    #LMEI Need to be modified by using the concept of the dataset.
    """

    if level == 0 and mac_capacity == 0:
        return layer.wfil * layer.hfil * layer.nofm // (layer.wstd * layer.hstd)

    # find which loop is innermost among [OX, OY, IC, ON]
    ex_order_index = min(
        point.loop_orders[le.OX][level],
        point.loop_orders[le.OY][level],
        point.loop_orders[le.IC][level],
        point.loop_orders[le.ON][level],
    )

    # if FX, FY, OC are at a lower level than the innermost input loop, they are irrelevant
    fx_exclusive = point.loop_orders[le.FX][level] < ex_order_index
    fy_exclusive = point.loop_orders[le.FY][level] < ex_order_index