Skip to content
Snippets Groups Projects
cost_model.py 43.5 KiB
Newer Older
sgauthamr2001's avatar
sgauthamr2001 committed
"""
Cost model.
sgauthamr2001's avatar
sgauthamr2001 committed
"""

# import numpy as np
from operator import mul
from operator import add
from functools import reduce
import copy
import math

from . import loop_enum as le
from . import buffer_enum as be


def get_comp_cost(layer):
sgauthamr2001's avatar
sgauthamr2001 committed
    """
    Compute the total # of MAC computation, it is independent of other optimizations

    Also it is independent of input size and input/filter stride
    Total # of computation = OX*OY*IC*OC*ON*FX*FY
sgauthamr2001's avatar
sgauthamr2001 committed
    """
    cost = (
        layer.wofm
        * layer.hofm
        * layer.nifm
        * layer.nofm
        * layer.nimg
        * layer.wfil
        * layer.hfil
    )
    return cost


def get_ideal_performance(layer, resource):
sgauthamr2001's avatar
sgauthamr2001 committed
    """
    Compute the ideal runtime in cycles by assuming 100% PE array utilization
    Ideal # of cycles = Total # of MAC computation / Total # of PEs

    #LMEI Need to be modified if later when adding precision-scalable PE.
    # of functional PE will change depending on different precision modes.
sgauthamr2001's avatar
sgauthamr2001 committed
    """
    total_comp = get_comp_cost(layer)
    number_pe = reduce(mul, resource.para_count_list, 1)
sgauthamr2001's avatar
sgauthamr2001 committed
    runtime = math.ceil(total_comp * 1.0 / number_pe)

    return runtime


def get_layer_size(layer):
sgauthamr2001's avatar
sgauthamr2001 committed
    """
    Get size of ifmap, ofmap, filter of the layer

    #LMEI ifmap_size should be able to calculate based on ofmap_size and input stride(IS) /filter stride(FS)
    IX = IS*(OX-1) + FS*(FX-1) + 1
    wifm = wistd*(wofm-1) + wfstd*(wfil-1) + 1
sgauthamr2001's avatar
sgauthamr2001 committed
    """

    ifmap_size = layer.wifm * layer.hifm * layer.nifm * layer.nimg
    ofmap_size = layer.wofm * layer.hofm * layer.nofm * layer.nimg
    flmap_size = layer.wfil * layer.hfil * layer.nifm * layer.nofm

    return [ifmap_size, ofmap_size, flmap_size]


def get_hinted_para(level, hint):
sgauthamr2001's avatar
sgauthamr2001 committed
    """
    Get the actual total spatial unrolling size from loop schedule
sgauthamr2001's avatar
sgauthamr2001 committed
    """
    assert hint

    hinted_para = 1
    for loop in range(le.NUM):
        if loop in hint:
            hinted_loop_para = hint[loop][level][2]
            hinted_para *= hinted_loop_para

    return hinted_para


def valid_dataflow(resource, hint):
sgauthamr2001's avatar
sgauthamr2001 committed
    """
    Check if the actual spatial unrolling size from loop schedule meets the HW utilization requirement
    by comparing it with real HW parallelism size * utilization threshold.
sgauthamr2001's avatar
sgauthamr2001 committed
    """
    num_levels = resource.buffer_levels()

    for level in range(num_levels):
sgauthamr2001's avatar
sgauthamr2001 committed
        if resource.paras[level].count != 1 and get_hinted_para(level, hint) < (
            resource.paras[level].count * resource.utilization_threshold
        ):
            return False

    return True

sgauthamr2001's avatar
sgauthamr2001 committed

def get_if_access(level, point, layer, mac_capacity=1):
    """
    Get per element # of access of Input at current level

    Not accurate because [FX, FY] is not totally irrelevant terms for ifmap..
    #LMEI Need to be modified by using the concept of the dataset.
sgauthamr2001's avatar
sgauthamr2001 committed
    """

    if level == 0 and mac_capacity == 0:
sgauthamr2001's avatar
sgauthamr2001 committed
        return layer.wfil * layer.hfil * layer.nofm // (layer.wstd * layer.hstd)
sgauthamr2001's avatar
sgauthamr2001 committed
    ex_order_index = min(
        point.loop_orders[le.OX][level],
        point.loop_orders[le.OY][level],
        point.loop_orders[le.IC][level],
sgauthamr2001's avatar
sgauthamr2001 committed
        point.loop_orders[le.ON][level],
    )

    fx_exclusive = point.loop_orders[le.FX][level] < ex_order_index
    fy_exclusive = point.loop_orders[le.FY][level] < ex_order_index
    oc_exclusive = point.loop_orders[le.OC][level] < ex_order_index

sgauthamr2001's avatar
sgauthamr2001 committed
    fx_acc = reduce(mul, point.loop_blockings[le.FX][level + fx_exclusive :], 1)
    fy_acc = reduce(mul, point.loop_blockings[le.FY][level + fy_exclusive :], 1)
    oc_acc = reduce(mul, point.loop_blockings[le.OC][level + oc_exclusive :], 1)

    # No loop orders among unrolled loops, they have the same order
    fx_par = reduce(mul, point.loop_partitionings[le.FX][level:], 1)
    fy_par = reduce(mul, point.loop_partitionings[le.FY][level:], 1)
    oc_par = reduce(mul, point.loop_partitionings[le.OC][level:], 1)

sgauthamr2001's avatar
sgauthamr2001 committed
    return (
sgauthamr2001's avatar
sgauthamr2001 committed
        fx_acc * fy_acc * oc_acc * fx_par * fy_par * oc_par // (layer.wstd * layer.hstd)
sgauthamr2001's avatar
sgauthamr2001 committed
    )
sgauthamr2001's avatar
sgauthamr2001 committed
def get_of_access(level, point, layer, mac_capacity=1):
    """
    Get per element # of access of Output at current level

    For output:
    Relevant terms [OX, OY, OC, ON]
    irrelevant terms [FX, FY, IC]

    Calculating rule:
    At lowest mem level (directly talk to MAC), calculate per element access
    by timing all irrelevant terms [FX, FY, IC] together

    For the rest higher mem levels,
    firstly, check if there is stationary possibility
    (irrelevant loops for filter [FX, FY, IC] are at the innermost position of this level)
    if there is, exclude the irrelevant loop(s) from the current level's # of per element access computing
    because they have been taken into account in lower level's # of per element access computing

    secondly, calculate the current level's # of per element access
    by multiplying all the irrelevant terms from current level to the highest level
    including both temporal unrolling part and spatial unrolling part (parallelism).
sgauthamr2001's avatar
sgauthamr2001 committed
    """
sgauthamr2001's avatar
sgauthamr2001 committed
    if level == 0 and mac_capacity == 0:
        return layer.wfil * layer.hfil * layer.nifm

sgauthamr2001's avatar
sgauthamr2001 committed
    ex_order_index = min(
        point.loop_orders[le.OX][level],
        point.loop_orders[le.OY][level],
        point.loop_orders[le.OC][level],
sgauthamr2001's avatar
sgauthamr2001 committed
        point.loop_orders[le.ON][level],
    )

    fx_exclusive = point.loop_orders[le.FX][level] < ex_order_index
    fy_exclusive = point.loop_orders[le.FY][level] < ex_order_index
    ic_exclusive = point.loop_orders[le.IC][level] < ex_order_index

sgauthamr2001's avatar
sgauthamr2001 committed
    fx_acc = reduce(mul, point.loop_blockings[le.FX][level + fx_exclusive :], 1)
    fy_acc = reduce(mul, point.loop_blockings[le.FY][level + fy_exclusive :], 1)
    ic_acc = reduce(mul, point.loop_blockings[le.IC][level + ic_exclusive :], 1)

    fx_par = reduce(mul, point.loop_partitionings[le.FX][level:], 1)
    fy_par = reduce(mul, point.loop_partitionings[le.FY][level:], 1)
    ic_par = reduce(mul, point.loop_partitionings[le.IC][level:], 1)

    return fx_acc * fy_acc * ic_acc * fx_par * fy_par * ic_par


sgauthamr2001's avatar
sgauthamr2001 committed
def get_fl_access(level, point, layer, mac_capacity=1):
    """
    Get per element # of access of Weight at current level

    For filter:
    Relevant terms [FX, FY, IC, OC]
    irrelevant terms [OX, OY, ON]

    Calculating rule:
    At lowest mem level (directly talk to MAC), calculate per element access
    by timing all irrelevant terms [OX, OY, ON] together

    For the rest higher mem levels,
    firstly, check if there is stationary possibility
    (irrelevant loops for filter [OX, OY, ON] are at the innermost position of this level)
    if there is, exclude the irrelevant loop(s) from the current level's # of per element access computing
    because they have been taken into account in lower level's # of per element access computing

    secondly, calculate the current level's # of per element access
    by multiplying all the irrelevant terms from current level to the highest level
Loading
Loading full blame...