diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..dacf1626c3ad104344c761976d024a8b93646725 Binary files /dev/null and b/.DS_Store differ diff --git a/GB_GridSearchCVModel.py b/GB_GridSearchCVModel.py new file mode 100644 index 0000000000000000000000000000000000000000..9c50411b29d90637767fa826fd7cc9d6d37b0500 --- /dev/null +++ b/GB_GridSearchCVModel.py @@ -0,0 +1,197 @@ +# %% [code] +# standard library +import os +import sys + +# data packages +import numpy as np +import pandas as pd + +# lightgbm +import lightgbm as lgb + +# parallelization +from joblib import Parallel, delayed + +# custom tooling +sys.path.append("../input/utilities") +import models +import splitting +import preprocess + +######################## +### Global variables ### +######################## + +n_seeds = 1 +n_folds = 5 + +# %% [code] +import sklearn.base +import sklearn.model_selection + +# Create modified grid search +# This grid search is similar to that in `models.py` but it works with `OVRModels` and LightGBM. + +class GridSearch(sklearn.base.BaseEstimator): + """Custom analogue to sklearn's GridSearchCV with support for early stopping. + Parameters + ---------- + model : obj + param_grid : dict of str to list + Dictionary mapping each parameter to a list of candidate values to be searched. + loss_fn : callable + A function or callable loss object with signature `loss_fn(y_pred, y_true)`. + Attributes + ---------- + results_ : list of dict + List of dictionaries recording fold, parameters, and loss from the grid search. + results_df_ : pd.DataFrame + A dataframe wrapper around `self.results_`. + """ + + def __init__(self, model, param_grid, loss_fn): + self.model = model + self.param_grid = param_grid + self.loss_fn = loss_fn + + def fit(self, X, y=None, splits=None, **fit_params): + + """Split data into folds and train/evaluate parameter combinations on each fold. + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The training features. + y : array-like, shape (n_samples, n_labels), optional + The training labels, if required. + splits : list of tuple + The fold splits to be used in training, in the form of a list of + `(idx_train, idx_test)` tuples. + **fit_params + Additional fit parameters to pass to the model. + Returns + ------- + self : obj + Returns the estimator itself. + Notes + ----- + This estimator does not save the trained models or implement a `best_model_` + attribute or `predict` method. It is intended only for obtaining optimal + parameter combos, most easily accessed in the `results_df_` attribtue. 
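+        For example, after fitting, the mean loss per parameter combination can be
+        summarized with
+        `gs.results_df_.groupby(list(gs.param_grid), as_index=False)["loss"].mean()`
+        (assuming the default single loss function, recorded under the column "loss").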
+ """ + + loss_fn = ( + self.loss_fn if isinstance(self.loss_fn, dict) else {"loss": self.loss_fn} + ) + + # create parameter grid (result is list of parameter dicts) + pg = sklearn.model_selection.ParameterGrid(self.param_grid) + + # we'll record results for every param combo and fold + self.results_ = [] + for params in pg: + self.model.model = sklearn.base.clone(self.model.model).set_params(**params) + # use our custom CV implementation to pass oos fold as val set + cv = models.CVModel(model) + cv.fit(X=X, y=y, splits=splits, **fit_params) + # get oof error for every fold + for i, m in enumerate(cv.models_): + oof_indices = splits[i][1] + y_pred = m.predict(models._subset_rows(X, oof_indices)) + y_true = models._subset_rows(y, oof_indices) + loss = {name: f(y_pred, y_true).item() for name, f in loss_fn.items()} + # extract the name for network class so output is more readable + params_copy = params.copy() + self.results_.append({"fold": i, **params_copy, **loss}) + + # add dataframe version to easily read results + self.results_df_ = pd.DataFrame(self.results_) + + return self + +# %% [code] +################### +### Import Data ### +################### + +X = pd.read_csv("../input/lish-moa/train_features.csv") +y = pd.read_csv("../input/lish-moa/train_targets_scored.csv") +drugs = pd.read_csv("../input/lish-moa/train_drug.csv") + +# %% [code] +# Add drugs for k-fold splitting +X = X.merge(drugs, on = 'sig_id', how = 'inner') + +# create k-fold splits +groups = splitting.create_splitting_groups( + X, + strat_vars=["cp_dose", "cp_time"], + split_var="drug_id", + random_state=2021 +) + +splits = splitting.split_on_group(n_splits=n_folds, groups=groups, random_state=2021) + +X.drop('drug_id', axis=1, inplace=True) + +# %% [code] +########################## +### Data Preprocessing ### +########################## + +transformer = preprocess.Preprocessor() +transformer.fit(X) +X = transformer.transform(X) +y = y.drop(["sig_id"], axis = 1).values.astype("float32") + +# %% [code] +# NOTE: need to specify params, but these are overwritten +# by param_grid in grid search. 
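+# Only the keys that also appear in param_grid below (num_leaves, max_depth,
+# learning_rate) are replaced when each candidate model is cloned; objective,
+# n_estimators, and random_state stay fixed for every combination.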
+lgb_params = { + "num_leaves": 10, + "learning_rate": 0.05, + "n_estimators": 100, + "objective": "binary", + "random_state": 2021 +} + +param_grid = { + "num_leaves": [10, 50, 1000], + "max_depth": [3, 4, 10], + "learning_rate": [0.01, 0.001], +} + +# %% [code] +def multi_log_loss(y_pred, y_true): + losses = -y_true * np.log(y_pred + 1e-15) - (1 - y_true) * np.log(1 - y_pred + 1e-15) + return np.mean(losses) + +# %% [code] +model = models.OVRModel(lgb.LGBMClassifier(**lgb_params), n_jobs=-2) + +gs = GridSearch( + model=model, + param_grid=param_grid, + loss_fn=multi_log_loss +) + +gs.fit( + X=X, + y=y, + splits=splits +) + +# %% [code] +# Get loss for each param combo +df_results = gs.results_df_ + +# Average over folds and seeds +param_cols = [col for col in list(df_results.columns) if col not in ["fold", "seed", "loss"]] +df_results_params = ( + df_results + .groupby(param_cols, as_index=False)["loss"] + .mean() + .sort_values(by=["loss"]) # order by lowest to highest loss +) + +df_results_params.head(10) diff --git a/GB_Model.py b/GB_Model.py new file mode 100644 index 0000000000000000000000000000000000000000..39bdfb25a9e44418c831c636b8a5f68962ab68a7 --- /dev/null +++ b/GB_Model.py @@ -0,0 +1,77 @@ +# %% [code] +# standard library +import os +import sys + +# data packages +import numpy as np +import pandas as pd + +# lightgbm +import lightgbm as lgb + +# parallelization +from joblib import Parallel, delayed + +# custom tooling +sys.path.append("../input/utilities") +import models +import splitting +import preprocess + +# %% [code] +################### +### Import Data ### +################### + +X = pd.read_csv("../input/lish-moa/train_features.csv") +y = pd.read_csv("../input/lish-moa/train_targets_scored.csv") +X_test = pd.read_csv("../input/lish-moa/test_features.csv") +y_test = pd.read_csv("../input/test-targets/solution.csv") +y_test = y_test[y_test['Usage'] == 'Public'] +# Remove 'Usage' column +y_test.drop('Usage', axis=1, inplace=True) +drugs = pd.read_csv("../input/lish-moa/train_drug.csv") + +# %% [code] +########################## +### Data Preprocessing ### +########################## + +transformer = preprocess.Preprocessor() +transformer.fit(X) +X = transformer.transform(X) +X_test = transformer.transform(X_test) +y = y.drop(["sig_id"], axis = 1).values.astype("float32") +y_test = y_test.drop(["sig_id"], axis = 1).values.astype("float32") + +# %% [code] +lgb_params = { + "num_leaves": 50, + "max_depth": 4, + "learning_rate": 0.01, + "n_estimators": 100, + "min_child_weight": 0.02, + "objective": "binary", + "random_state": 2021 +} + +# %% [code] +# fit 206 models (one for each label) +model = models.OVRModel(lgb.LGBMClassifier(**lgb_params), n_jobs=-2) + +model.fit(X=pd.DataFrame(X), y=pd.DataFrame(y)) + +# %% [code] +preds = model.predict_proba(X) +test_probs = model.predict_proba(X_test) + +# %% [code] +def multi_log_loss(y_pred, y_true): + losses = -y_true * np.log(y_pred + 1e-15) - (1 - y_true) * np.log(1 - y_pred + 1e-15) + return np.mean(losses) + +print("Train loss: ", multi_log_loss(preds, y)) +print("Test loss: ", multi_log_loss(test_probs, y_test)) + +pd.DataFrame(test_probs).to_csv("GB_test_probs.csv", index=False) diff --git a/NN_GridSearchCVModel.py b/NN_GridSearchCVModel.py new file mode 100644 index 0000000000000000000000000000000000000000..6809a2b6ecd88c0fd9873dc2a79822e919b6a3c0 --- /dev/null +++ b/NN_GridSearchCVModel.py @@ -0,0 +1,193 @@ +# %% [code] +####################### +### Library imports ### +####################### + +# standard library 
+import os +import sys +import pickle +import copy + +# data packages +import numpy as np +import pandas as pd + +# pytorch +import torch +import torch.nn as nn + +import joblib +import itertools + +# custom tooling +sys.path.append("../input/utilities") +import models +import splitting +import preprocess + +######################## +### Global variables ### +######################## +label_smoothing = True +smoothing = 0.001 +n_seeds = 1 +n_folds = 5 +device = ("cuda" if torch.cuda.is_available() else "cpu") +os.environ["CUDA_LAUNCH_BLOCKING"] = "1" + +# %% [code] +# creates nets for grid search +def create_nets(n_input, n_output, nodes_list, dropout_list, batch_norm_list): + nets = [] + for nodes, dropout, batch_norm in itertools.product(nodes_list, dropout_list, batch_norm_list): + if batch_norm: + net_obj = models.Sequential( + nn.Linear(n_input, nodes), + nn.BatchNorm1d(nodes), + nn.LeakyReLU(), + nn.Dropout(dropout), + nn.Linear(nodes, nodes), + nn.BatchNorm1d(nodes), + nn.LeakyReLU(), + nn.Dropout(dropout), + nn.Linear(nodes, n_output) + ) + else: + net_obj = models.Sequential( + nn.Linear(n_input, nodes), + nn.LeakyReLU(), + nn.Dropout(dropout), + nn.Linear(nodes, nodes), + nn.LeakyReLU(), + nn.Dropout(dropout), + nn.Linear(nodes, n_output) + ) + net_obj.name = f"Nodes: {nodes}, Dropout: {dropout}, Batch Norm: {batch_norm}" + nets.append(net_obj) + + return nets + +# %% [code] +################### +### Import Data ### +################### + +X = pd.read_csv("../input/lish-moa/train_features.csv") +y = pd.read_csv("../input/lish-moa/train_targets_scored.csv") +drugs = pd.read_csv("../input/lish-moa/train_drug.csv") + +# %% [code] +# Add drugs for k-fold splitting +X = X.merge(drugs, on = 'sig_id', how = 'inner') + +# create k-fold splits +groups = splitting.create_splitting_groups( + X, + strat_vars=["cp_dose", "cp_time"], + split_var="drug_id", + random_state=2021 +) + +splits = splitting.split_on_group(n_splits=n_folds, groups=groups, random_state=2021) + +X.drop('drug_id', axis=1, inplace=True) + +# %% [code] +########################## +### Data Preprocessing ### +########################## + +transformer = preprocess.Preprocessor() +transformer.fit(X) +X = transformer.transform(X) +y = y.drop(["sig_id"], axis = 1).values.astype("float32") + +# %% [code] +# Define network architecture + +n_input = X.shape[1] +n_output = y.shape[1] +# NOTE: need to specify params, but these are overwritten +# by param_grid in grid search. +hidden_units = 640 +dropout = 0.2 + +net_obj = models.Sequential( + nn.Linear(n_input, hidden_units), + nn.BatchNorm1d(hidden_units), + nn.LeakyReLU(), + nn.Dropout(dropout), + nn.Linear(hidden_units, hidden_units), + nn.BatchNorm1d(hidden_units), + nn.LeakyReLU(), + nn.Dropout(dropout), + nn.Linear(hidden_units, n_output) +) + +# %% [code] +log_loss = nn.BCEWithLogitsLoss() +smoothed_log_loss = models.SmoothCrossEntropyLoss(smoothing=smoothing, device=device) + +if label_smoothing: + loss = smoothed_log_loss +else: + loss = log_loss + +# Initialize network +# NOTE: need to specify params, but these are overwritten +# by param_grid in grid search. 
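+# The grid below supplies net_obj, seed, lr, and weight_decay for each candidate;
+# max_epochs, batch_size, loss_fn, and the ReduceLROnPlateau scheduler set here
+# are held fixed across the whole search.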
+net = models.Network( + net_obj=net_obj, + max_epochs=20, + batch_size=128, + device=device, + loss_fn=loss, + lr=0.01, + weight_decay=1e-6, + lr_scheduler="ReduceLROnPlateau" +) + +seeds = range(2021, 2021 + n_seeds) + +param_grid = { + "net_obj": create_nets( + n_input=X.shape[1], + n_output=y.shape[1], + nodes_list=[128, 256, 512], + dropout_list=[0.0, 0.2, 0.3, 0.4], + batch_norm_list=[True, False] + ), + "seed": seeds, + "lr": [0.01, 0.001], + "weight_decay": [1e-4, 1e-5, 1e-6] +} + +gs = models.GridSearch( + model=net, + param_grid=param_grid, + loss_fn=loss +) + +gs.fit( + X=X, + y=y, + splits=splits, + eval_metric=[log_loss], + verbose=10 +) + +# %% [code] +# Get loss for each param combo +df_results = gs.results_df_ + +# Average over folds and seeds +param_cols = [col for col in list(df_results.columns) if col not in ["fold", "seed", "loss"]] +df_results_params = ( + df_results + .groupby(param_cols, as_index=False)["loss"] + .mean() + .sort_values(by=["loss"]) +) + +df_results_params.head(10) diff --git a/NN_Model.py b/NN_Model.py new file mode 100644 index 0000000000000000000000000000000000000000..d5fac8c4c2b5d575247f98357b585af2ec907d6c --- /dev/null +++ b/NN_Model.py @@ -0,0 +1,131 @@ +# %% [code] +####################### +### Library imports ### +####################### + +# standard library +import os +import sys +import pickle +import copy + +# data packages +import numpy as np +import pandas as pd + +# pytorch +import torch +import torch.nn as nn + +# sklearn +import sklearn.metrics + +# custom tooling +sys.path.append("../input/utilities") +import models +import splitting +import preprocess + +######################## +### Global variables ### +######################## +label_smoothing = True +smoothing = 0.001 +device = ("cuda" if torch.cuda.is_available() else "cpu") +os.environ["CUDA_LAUNCH_BLOCKING"] = "1" + +# %% [code] +################### +### Import Data ### +################### + +train_drug = pd.read_csv("../input/lish-moa/train_drug.csv") +X = pd.read_csv("../input/lish-moa/train_features.csv") +y = pd.read_csv("../input/lish-moa/train_targets_scored.csv") +X_test = pd.read_csv("../input/lish-moa/test_features.csv") +y_test = pd.read_csv("../input/test-targets/solution.csv") +y_test = y_test[y_test['Usage'] == 'Public'] +# Remove 'Usage' column +y_test.drop('Usage', axis=1, inplace=True) +drugs = pd.read_csv("../input/lish-moa/train_drug.csv") + +# %% [code] +########################## +### Data Preprocessing ### +########################## + +transformer = preprocess.Preprocessor() +transformer.fit(X) +X = transformer.transform(X) +X_test = transformer.transform(X_test) +y = y.drop(["sig_id"], axis = 1).values.astype("float32") +y_test = y_test.drop(["sig_id"], axis = 1).values.astype("float32") + +# %% [code] +# Define network architecture + +n_input = X.shape[1] +n_output = y.shape[1] +hidden_units = 256 +dropout = 0.4 + +net_obj = models.Sequential( + nn.Linear(n_input, hidden_units), + nn.BatchNorm1d(hidden_units), + nn.LeakyReLU(), + nn.Dropout(dropout), + nn.Linear(hidden_units, hidden_units), + nn.BatchNorm1d(hidden_units), + nn.LeakyReLU(), + nn.Dropout(dropout), + nn.Linear(hidden_units, n_output) +) + +# %% [code] +log_loss = nn.BCEWithLogitsLoss() +smoothed_log_loss = models.SmoothCrossEntropyLoss(smoothing=smoothing, device=device) + +if label_smoothing: + loss = smoothed_log_loss +else: + loss = log_loss + +# Initialize network +net = models.Network( + net_obj=net_obj, + max_epochs=20, + batch_size=128, + device=device, + loss_fn=loss, + lr=0.01, 
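+    # 0.01 is only the starting learning rate; the ReduceLROnPlateau scheduler
+    # specified below lowers it when the first eval-set metric stops improving.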
+ weight_decay=0.00001, + lr_scheduler="ReduceLROnPlateau", + seed=2021 +) + +net.fit( + X, + y, + eval_set=[(X_test, y_test)], + eval_names=['test'], + eval_metric=[log_loss], + verbose=1 +) + +# %% [code] +net.metric_history_df_.tail(4) + +# %% [code] +# Evaluation + +def multi_log_loss(y_pred, y_true): + losses = -y_true * np.log(y_pred + 1e-15) - (1 - y_true) * np.log(1 - y_pred + 1e-15) + return np.mean(losses) + +preds = net.predict_proba(X) +test_probs = net.predict_proba(X_test) + +print("Train loss: ", multi_log_loss(preds, y)) +print("Test loss: ", multi_log_loss(test_probs, y_test)) + +pd.DataFrame(test_probs).to_csv("NN_test_probs.csv", index=False) diff --git a/Sequential_NN_GridSearchCVModel.py b/Sequential_NN_GridSearchCVModel.py new file mode 100644 index 0000000000000000000000000000000000000000..745bd063d06c68369e8c36f2a71e3c176b7ed9dd --- /dev/null +++ b/Sequential_NN_GridSearchCVModel.py @@ -0,0 +1,327 @@ +# %% [code] +####################### +### Library imports ### +####################### + +# standard library +import os +import sys +import pickle +import copy + +# data packages +import numpy as np +import pandas as pd + +# pytorch +import torch +import torch.nn as nn + +import joblib +import itertools + +# custom tooling +sys.path.append("../input/utilities") +import models +import splitting +import preprocess + +######################## +### Global variables ### +######################## + +label_smoothing = True +smoothing = 0.001 +n_seeds = 1 +n_folds = 5 +device = ("cuda" if torch.cuda.is_available() else "cpu") +os.environ["CUDA_LAUNCH_BLOCKING"] = "1" + +# %% [code] +# creates nets for grid search +def create_nets(n_input, n_output, nodes_list, dropout_list, batch_norm_list): + nets = [] + for nodes, dropout, batch_norm in itertools.product(nodes_list, dropout_list, batch_norm_list): + if batch_norm: + net_obj = models.Sequential( + nn.Linear(n_input, nodes), + nn.BatchNorm1d(nodes), + nn.LeakyReLU(), + nn.Dropout(dropout), + nn.Linear(nodes, nodes), + nn.BatchNorm1d(nodes), + nn.LeakyReLU(), + nn.Dropout(dropout), + nn.Linear(nodes, n_output) + ) + else: + net_obj = models.Sequential( + nn.Linear(n_input, nodes), + nn.LeakyReLU(), + nn.Dropout(dropout), + nn.Linear(nodes, nodes), + nn.LeakyReLU(), + nn.Dropout(dropout), + nn.Linear(nodes, n_output) + ) + net_obj.name = f"Nodes: {nodes}, Dropout: {dropout}, Batch Norm: {batch_norm}" + nets.append(net_obj) + + return nets + +# %% [code] +################### +### Import Data ### +################### + +X = pd.read_csv("../input/lish-moa/train_features.csv") +y = pd.read_csv("../input/lish-moa/train_targets_scored.csv") +drugs = pd.read_csv("../input/lish-moa/train_drug.csv") + +# %% [code] +def create_y_type(y): + y = y.rename(columns={col: "val_" + col for col in y.columns if col not in ["sig_id"]}) + y_long = pd.wide_to_long( + y, stubnames="val", i="sig_id", j="moa", sep="_", suffix=".*" + ).reset_index() + # get suffix. This is the interaction (e.g. agonist, inhibitor, etc.) 
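+    # e.g. "adenosine_receptor_agonist".rsplit("_", 1)[-1] -> "agonist"
+    # (illustrative target name; every scored MoA column splits the same way)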
+ y_long["interactions"] = [x.rsplit("_", 1)[-1] for x in y_long["moa"]] + y_long.drop("moa", axis=1, inplace=True) + # keep all interactions that appear more than once, all others set to "other" + interactions = y_long["interactions"].value_counts()[:6].index.tolist() + y_long.loc[~y_long["interactions"].isin(interactions), "interactions"] = "other" + # if any enzyme or protein has a given interaction, set interaction to 1, else 0 + y_long = y_long.groupby(["sig_id", "interactions"]).max().reset_index() + # pivot wider + y_type = pd.pivot( + y_long, index = "sig_id", columns = "interactions", values = "val" + ).reset_index() + return y_type + +y_type = create_y_type(y) + +# %% [code] +# Add drugs for k-fold splitting +X = X.merge(drugs, on = 'sig_id', how = 'inner') + +# create k-fold splits +groups = splitting.create_splitting_groups( + X, + strat_vars=["cp_dose", "cp_time"], + split_var="drug_id", + random_state=2021 +) + +splits = splitting.split_on_group(n_splits=n_folds, groups=groups, random_state=2021) + +X.drop('drug_id', axis=1, inplace=True) + +# %% [code] +########################## +### Data Preprocessing ### +########################## + +transformer = preprocess.Preprocessor() +transformer.fit(X) +X = transformer.transform(X) +y = y.drop(["sig_id"], axis = 1).values.astype("float32") +y_type = y_type.drop(["sig_id"], axis = 1).values.astype("float32") + +# %% [code] +######################### +### First-Stage Model ### +######################### + +log_loss = nn.BCEWithLogitsLoss() +smoothed_log_loss = models.SmoothCrossEntropyLoss(smoothing=smoothing, device=device) + +if label_smoothing: + loss = smoothed_log_loss +else: + loss = log_loss + +# Define network architecture + +n_input = X.shape[1] +n_output = y_type.shape[1] +# NOTE: need to specify params, but these are overwritten +# by param_grid in grid search. +hidden_units = 256 +dropout = 0.4 + +net_obj = models.Sequential( + nn.Linear(n_input, hidden_units), + nn.BatchNorm1d(hidden_units), + nn.LeakyReLU(), + nn.Dropout(dropout), + nn.Linear(hidden_units, hidden_units), + nn.BatchNorm1d(hidden_units), + nn.LeakyReLU(), + nn.Dropout(dropout), + nn.Linear(hidden_units, n_output) +) + +# Initialize network +# NOTE: need to specify params, but these are overwritten +# by param_grid in grid search. 
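+# The first-stage net predicts the small set of interaction-type labels in y_type
+# (the most frequent interaction suffixes plus "other"), not the full set of scored
+# MoA targets; the grid below then searches net_obj, seed, lr, and weight_decay.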
+net = models.Network( + net_obj=net_obj, + max_epochs=20, + batch_size=128, + device=device, + loss_fn=loss, + lr=0.01, + weight_decay=1e-6, + lr_scheduler="ReduceLROnPlateau" +) + +seeds = range(2021, 2021 + n_seeds) + +param_grid = { + "net_obj": create_nets( + n_input=X.shape[1], + n_output=y_type.shape[1], + nodes_list=[128, 256, 512], + dropout_list=[0, 0.2, 0.3, 0.4], + batch_norm_list=[True] + ), + "seed": seeds, + "lr": [0.01, 0.001], + "weight_decay": [1e-4, 1e-5, 1e-6] +} + +gs = models.GridSearch( + model=net, + param_grid=param_grid, + loss_fn=loss +) + +gs.fit( + X=X, + y=y_type, + splits=splits, + eval_metric=[log_loss], + verbose=10 +) + +# Get loss for each param combo +df_results = gs.results_df_ + +# Average over folds and seeds +param_cols = [col for col in list(df_results.columns) if col not in ["fold", "seed", "loss"]] +df_results_params = ( + df_results + .groupby(param_cols, as_index=False)["loss"] + .mean() + .sort_values(by=["loss"]) +) + +# Get parameters for best model +best_model = df_results_params.iloc[0] +best_net_obj = dict((x.strip(), y.strip()) + for x, y in (element.split(':') + for element in best_model["net_obj"].split(', '))) + +df_results_params.head(10) + +# %% [code] +# Re-fit best model to use as inputs in grid search for second-stage model + +hidden_units = int(best_net_obj["Nodes"]) +dropout = float(best_net_obj["Dropout"]) + +net_obj = models.Sequential( + nn.Linear(n_input, hidden_units), + nn.BatchNorm1d(hidden_units), + nn.LeakyReLU(), + nn.Dropout(dropout), + nn.Linear(hidden_units, hidden_units), + nn.BatchNorm1d(hidden_units), + nn.LeakyReLU(), + nn.Dropout(dropout), + nn.Linear(hidden_units, n_output) +) + +# Initialize network +net = models.Network( + net_obj=net_obj, + max_epochs=20, + batch_size=128, + device=device, + loss_fn=loss, + lr=best_model["lr"], + weight_decay=best_model["weight_decay"], + lr_scheduler="ReduceLROnPlateau" +) + +net.fit( + X, + y_type, + eval_metric=[log_loss], + verbose=10 +) + +y_type_preds = net.predict_proba(X) + +# %% [code] +########################## +### Second-Stage Model ### +########################## + +X = np.column_stack((X, y_type_preds)) + +# Initialize network +# NOTE: need to specify params, but these are overwritten +# by param_grid in grid search. 
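+# At this point X has been augmented (via np.column_stack above) with one
+# predicted-probability column per interaction type from the first-stage model,
+# so n_input for the second-stage candidates is X.shape[1] after the stack.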
+net = models.Network( + net_obj=net_obj, + max_epochs=20, + batch_size=128, + device=device, + loss_fn=loss, + lr=0.01, + weight_decay=1e-6, + lr_scheduler="ReduceLROnPlateau" +) + +param_grid = { + "net_obj": create_nets( + n_input=X.shape[1], + n_output=y.shape[1], + nodes_list=[128, 256, 512], + dropout_list=[0, 0.2, 0.3, 0.4], + batch_norm_list=[True, False] + ), + "seed": seeds, + "lr": [0.01, 0.001], + "weight_decay": [1e-4, 1e-5, 1e-6] +} + +gs = models.GridSearch( + model=net, + param_grid=param_grid, + loss_fn=loss +) + +gs.fit( + X=X, + y=y, + splits=splits, + eval_metric=[loss], + verbose=10 +) + +# %% [code] +# Get loss for each param combo +df_results = gs.results_df_ + +# Average over folds and seeds +param_cols = [col for col in list(df_results.columns) if col not in ["fold", "seed", "loss"]] +df_results_params = ( + df_results + .groupby(param_cols, as_index=False)["loss"] + .mean() + .sort_values(by=["loss"]) +) + +df_results_params.head(10) diff --git a/Sequential_NN_Model.py b/Sequential_NN_Model.py new file mode 100644 index 0000000000000000000000000000000000000000..139799a6acbfe393532b87b5ac47a84fbf4f4422 --- /dev/null +++ b/Sequential_NN_Model.py @@ -0,0 +1,218 @@ +# %% [code] +####################### +### Library imports ### +####################### + +# standard library +import os +import sys +import pickle +import copy + +# data packages +import numpy as np +import pandas as pd + +# pytorch +import torch +import torch.nn as nn + +# sklearn +import sklearn.metrics + +# custom tooling +sys.path.append("../input/utilities") +import models +import splitting +import preprocess + +######################## +### Global variables ### +######################## +label_smoothing = False +smoothing = 0.0001 +device = ("cuda" if torch.cuda.is_available() else "cpu") +os.environ["CUDA_LAUNCH_BLOCKING"] = "1" + +# %% [code] +################### +### Import Data ### +################### + +train_drug = pd.read_csv("../input/lish-moa/train_drug.csv") +X = pd.read_csv("../input/lish-moa/train_features.csv") +y = pd.read_csv("../input/lish-moa/train_targets_scored.csv") +X_test = pd.read_csv("../input/lish-moa/test_features.csv") +y_test = pd.read_csv("../input/test-targets/solution.csv") +y_test = y_test[y_test['Usage'] == 'Public'] +# Remove 'Usage' column +y_test.drop('Usage', axis=1, inplace=True) +drugs = pd.read_csv("../input/lish-moa/train_drug.csv") + +# %% [code] +def create_y_type(y): + y = y.rename(columns={col: "val_" + col for col in y.columns if col not in ["sig_id"]}) + y_long = pd.wide_to_long( + y, stubnames="val", i="sig_id", j="moa", sep="_", suffix=".*" + ).reset_index() + # get suffix. This is the interaction (e.g. agonist, inhibitor, etc.) 
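+    # rsplit("_", 1) keeps only the last underscore-delimited token,
+    # e.g. "..._antagonist" -> "antagonist" (the interaction type)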
+ y_long["interactions"] = [x.rsplit("_", 1)[-1] for x in y_long["moa"]] + y_long.drop("moa", axis=1, inplace=True) + # keep all interactions that appear more than once, all others set to "other" + interactions = y_long["interactions"].value_counts()[:6].index.tolist() + y_long.loc[~y_long["interactions"].isin(interactions), "interactions"] = "other" + # if any enzyme or protein has a given interaction, set interaction to 1, else 0 + y_long = y_long.groupby(["sig_id", "interactions"]).max().reset_index() + # pivot wider + y_type = pd.pivot( + y_long, index = "sig_id", columns = "interactions", values = "val" + ).reset_index() + return y_type + +y_type = create_y_type(y) +y_type_test = create_y_type(y_test) + +# %% [code] +########################## +### Data Preprocessing ### +########################## + +transformer = preprocess.Preprocessor() +transformer.fit(X) +X = transformer.transform(X) +X_test = transformer.transform(X_test) +y = y.drop(["sig_id"], axis = 1).values.astype("float32") +y_test = y_test.drop(["sig_id"], axis = 1).values.astype("float32") +y_type = y_type.drop(["sig_id"], axis = 1).values.astype("float32") +y_type_test = y_type_test.drop(["sig_id"], axis = 1).values.astype("float32") + +# %% [code] +######################### +### First-Stage Model ### +######################### + +# Define network architecture + +n_input = X.shape[1] +n_output = y_type.shape[1] +hidden_units = 128 +dropout = 0.2 + +net_obj = models.Sequential( + nn.Linear(n_input, hidden_units), + nn.BatchNorm1d(hidden_units), + nn.LeakyReLU(), + nn.Dropout(dropout), + nn.Linear(hidden_units, hidden_units), + nn.BatchNorm1d(hidden_units), + nn.LeakyReLU(), + nn.Dropout(dropout), + nn.Linear(hidden_units, n_output) +) + +log_loss = nn.BCEWithLogitsLoss() +smoothed_log_loss = models.SmoothCrossEntropyLoss(smoothing=smoothing, device=device) + +if label_smoothing: + loss = smoothed_log_loss +else: + loss = log_loss + + +# Initialize network +net = models.Network( + net_obj=net_obj, + max_epochs=20, + batch_size=128, + device=device, + loss_fn=loss, + lr=0.01, + weight_decay=0.0001, + lr_scheduler="ReduceLROnPlateau", + seed=2021 +) + +net.fit( + X, + y_type, + eval_set=[(X_test, y_type_test)], + eval_names=['test'], + eval_metric=[log_loss], + verbose=1 +) + +y_type_preds = net.predict_proba(X) +y_type_test_preds = net.predict_proba(X_test) + +# %% [code] +def multi_log_loss(y_pred, y_true): + losses = -y_true * np.log(y_pred + 1e-15) - (1 - y_true) * np.log(1 - y_pred + 1e-15) + return np.mean(losses) + +print("Train loss: ", multi_log_loss(y_type_preds, y_type)) +print("Test loss: ", multi_log_loss(y_type_test_preds, y_type_test)) + +# %% [code] +########################## +### Second-Stage Model ### +######################### + +X = np.column_stack((X, y_type_preds)) +X_test = np.column_stack((X_test, y_type_test_preds)) + +# Define network architecture + +n_input = X.shape[1] +n_output = y.shape[1] +hidden_units = 256 +dropout = 0.2 + +net_obj = models.Sequential( + nn.Linear(n_input, hidden_units), + nn.BatchNorm1d(hidden_units), + nn.LeakyReLU(), + nn.Dropout(dropout), + nn.Linear(hidden_units, hidden_units), + nn.BatchNorm1d(hidden_units), + nn.LeakyReLU(), + nn.Dropout(dropout), + nn.Linear(hidden_units, n_output) +) + +# %% [code] +# Initialize network +net = models.Network( + net_obj=net_obj, + max_epochs=20, + batch_size=128, + device=device, + loss_fn=loss, + lr=0.01, + weight_decay=0.00001, + lr_scheduler="ReduceLROnPlateau", + seed=2021 +) + +net.fit( + X, + y, + eval_set=[(X_test, y_test)], 
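+    # no patience/min_delta is passed, so training runs all max_epochs; the eval
+    # set is used for metric logging and for stepping ReduceLROnPlateau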
+ eval_names=['test'], + eval_metric=[log_loss], + verbose=1 +) + +# %% [code] +net.metric_history_df_.tail(4) + +# %% [code] +# Evaluation + +preds = net.predict_proba(X) +test_probs = net.predict_proba(X_test) + +print("Train loss: ", multi_log_loss(preds, y)) +print("Test loss: ", multi_log_loss(test_probs, y_test)) + + +pd.DataFrame(test_probs).to_csv("Sequential_NN_test_probs.csv", index=False) diff --git a/baseline_model.py b/baseline_model.py deleted file mode 100644 index ddb018682964853961d636b3c2ab121d0424e3bf..0000000000000000000000000000000000000000 --- a/baseline_model.py +++ /dev/null @@ -1,598 +0,0 @@ -# %% [code] -####################### -### Library imports ### -####################### - -# standard library -import os -import sys -import pickle -import copy - -# data packages -import numpy as np -import pandas as pd - -# pytorch -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim - -# sklearn -import sklearn.base - -import joblib - -from sklearn.base import TransformerMixin, BaseEstimator -from sklearn.preprocessing import OneHotEncoder, StandardScaler -from sklearn.decomposition import PCA -from sklearn.pipeline import make_union, make_pipeline -from sklearn.compose import make_column_transformer -from sklearn.model_selection import train_test_split -from sklearn.metrics import log_loss - -######################## -### Global variables ### -######################## -device = ("cuda" if torch.cuda.is_available() else "cpu") -os.environ["CUDA_LAUNCH_BLOCKING"] = "1" - -# %% [code] -class Preprocessor(TransformerMixin): - def __init__( - self, - variance_threshold = 0.7, - num_pc_genes = 80, - num_pc_cells = 10, - seed = 2021 - ): - - self.variance_threshold = variance_threshold, - self.num_pc_genes = num_pc_genes - self.num_pc_cells = num_pc_cells - self.seed = seed - - def fit(self, X, y = None, X_test = None): - if X_test is not None: - X = pd.concat([X, X_test], axis = 0, ignore_index = True) - - gene_feats = [col for col in X.columns if col.startswith('g-')] - cell_feats = [col for col in X.columns if col.startswith('c-')] - numeric_feats = gene_feats + cell_feats - categorical_feats = ['cp_time', 'cp_dose'] - - self._transformer = make_column_transformer( - (OneHotEncoder(), categorical_feats), - (StandardScaler(), numeric_feats) - ) - self._transformer.fit(X) - return self - - - def transform(self, X): - X_new = self._transformer.transform(X).astype("float32") - return X_new - -# %% [code] -# Sub-class nn.Sequential to add reset_parameters method -class Sequential(nn.Sequential): - def reset_parameters(self): - for layer in self.children(): - if hasattr(layer, "reset_parameters"): - layer.reset_parameters() - -# %% [code] -class Network(sklearn.base.BaseEstimator): - """An sklearn-compatible wrapper for pytorch estimators. - Wraps pytorch training and prediction in sklearn-compatible estimator with `fit` and - `predict` methods and limited support for commonly-tuned net parameters. Supports - early stopping and `eval_set` similarly to the LightGBM sklearn implementation. - Parameters - ---------- - net_obj : obj - The instantiated pytorch network object to be used in training. Should have type - that is a subclass of nn.Module. - seed : int, optional - Seed to be used for randomness in network initalization for reproducibility. - optimizer : type, optional, default=torch.optim.Adam - The optimizer class to be used in training. Should be a subclass of - torch.optim.Optimizer. 
- loss_fn : callable, optional, default=nn.BCEWithLogitsLoss() - A function or callable loss object with signature `f(y_pred, y_true)`. - device : {"cpu", "cuda"}, optional, default="cpu" - The device used in training. - lr : float, optional, default=0.001 - The learning rate. Ignored if `lr_scheduler` is provided. - weight_decay : float, optional, default=0 - Weight decay parameter used for network weight regularization. - batch_size : int, optional, default=128 - Batch sized used in training. - max_epochs : int, optional, default=10 - Maximum number of epochs used in training. Actual number of epochs used may be - lower if early stopping is enabled. - lr_scheduler : type, optional - Learning rate scheduler for training, e.g. a class from - torch.optim.lr_scheduler. - lr_scheduler_params : dict, optional - The parameters used to initialize the `lr_scheduler`. - Attributes - ---------- - self.metric_history_ : list of dict - A list of dictionaries recording the values for each metric, eval_set, and - epoch. - self.early_stopping_history_ : list of float - The list of values for only the first metric and eval_set, used for early - stopping if specified. - self.early_stopping_epoch_ : int or None - The optimal epoch chosen by early stopping. - self.net_ : obj - The trained `net_obj`, which is used for prediction. - self.metric_history_df_ : pd.DataFrame - A dataframe wrapper around `self.metric_history_`. - """ - - def __init__( - self, - net_obj, - seed=None, - optimizer=torch.optim.Adam, - loss_fn=None, - device="cpu", - lr=0.001, - weight_decay=0, - batch_size=128, - max_epochs=10, - lr_scheduler=None, - lr_scheduler_params=None, - ): - - self.net_obj = net_obj - self.seed = seed - self.optimizer = optimizer - self.loss_fn = loss_fn - self.device = device - self.lr = lr - self.weight_decay = weight_decay - self.batch_size = batch_size - self.max_epochs = max_epochs - self.lr_scheduler = lr_scheduler - self.lr_scheduler_params = lr_scheduler_params - - def fit( - self, - X, - y, - eval_set=None, - eval_names=None, - eval_metric=None, - patience=None, - min_delta=None, - verbose=False, - ): - """Trains the network, with support for early stopping. - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - The training features. - y : array-like, shape (n_samples, n_labels) - The training labels. - eval_set : list of tuple, optional - A list of (X, y) tuples to be used for computing eval loss. At least one - dataset is required for early stopping, in which case the first tuple in the - list is used for early stopping evaluation. - eval_names : list, optional - An optionl list of the same length as `eval_set`, specifying the name of - each dataset. - eval_metric : list of callable, optional - A list of metric functions to be used in evaluation. Loss will be recorded - for all metrics, but only the first metric provided will be used for early - stopping, if enabled. Should have signature `f(y_pred, y_true)`. - patience : int, optional - The number of epochs of increasing loss tested before stopping early. The - number to be tested is reset after every epoch with a decrease in loss. This - parameter may be overridden if `min_delta` is also set. - min_delta : float, optional - The minimum decrease in loss required to continue training. If loss doss not - decrease by more than this value, training will be stopped early and the - stopping epoch will be recorded in the `early_stopping_epoch_` attribute. 
- verbose : int or bool, optional, default=False - If False, no loss is printed during training. Otherwise, results are printed - after every `verbose` epochs. - Returns - ------- - self : obj - Returns the estimator itself, in keeping with sklearn requirements. - """ - - # initialize device for training - device = torch.device(self.device) - - # set seed for network weight initialization - if self.seed is not None: - torch.manual_seed(self.seed) - # this should be redundant with torch.manual_seed - torch.cuda.manual_seed_all(self.seed) - # send pytorch network to specified device - net = self.net_obj.to(device) - # reset initial parameters for net - net.reset_parameters() - # initialize loss and optimizer - if self.loss_fn is None: - loss_fn = nn.BCEWithLogitsLoss() - else: - loss_fn = self.loss_fn - optimizer = self.optimizer( - net.parameters(), lr=self.lr, weight_decay=self.weight_decay - ) - - # helper for converting to tensor - def to_tensor(a): - return torch.tensor(a, dtype=torch.float32).to(device) - - - X_nn = to_tensor(X) - y_nn = to_tensor(y) - - # Add extra dimension if y is single dimensional - if len(y_nn.shape) == 1: - y_nn = y_nn.unsqueeze(1) - - # set up for evaluation on train/val data - if eval_set is None: - eval_set = [] - if eval_metric is None: - eval_metric = [] - if eval_names is None: - eval_names = [f"eval_{i}" for i in range(len(eval_set))] - - # add train data as an eval set - eval_set.append((X, y)) - eval_names.append("train") - - # convert eval sets to tensors - eval_set = [(to_tensor(tup[0]), to_tensor(tup[1])) for tup in eval_set] - - eval_metric = [ - m if isinstance(m, tuple) else (f"metric_{i}", m) - for i, m in enumerate(eval_metric) - ] - eval_metric.append(("objective", loss_fn)) - - # set up dataloader for batches - dataset = torch.utils.data.TensorDataset(X_nn, y_nn) - dataloader = torch.utils.data.DataLoader( - dataset, batch_size=self.batch_size, shuffle=True, drop_last=True - ) - - lr_scheduler = self.lr_scheduler - if self.lr_scheduler_params is None: - lr_scheduler_params = {} - else: - lr_scheduler_params = self.lr_scheduler_params - - if lr_scheduler == "OneCycleLR": - one_cycle_lr = optim.lr_scheduler.OneCycleLR( - optimizer=optimizer, - epochs=self.max_epochs, - steps_per_epoch=len(dataloader), - **lr_scheduler_params, - ) - if lr_scheduler == "ReduceLROnPlateau": - reduce_lr_on_plateau = optim.lr_scheduler.ReduceLROnPlateau( - optimizer=optimizer, **lr_scheduler_params - ) - - # track number of epochs with increasing loss for early stopping - self.metric_history_ = [] - self.early_stopping_history_ = [] - self.early_stopping_epoch_ = None - min_valmetric = np.inf - increases = 0 - best_params = copy.deepcopy(net.state_dict()) - for epoch in range(self.max_epochs): - - # construct batches - for batch_x, batch_y in dataloader: - - net.train() - - # zero gradients to start - optimizer.zero_grad() - - output = net.forward(batch_x) - - loss = loss_fn(output, batch_y) - - loss.backward() - optimizer.step() - if lr_scheduler == "OneCycleLR": - one_cycle_lr.step() - - net.eval() - - # record metrics for each eval set - for i in range(len(eval_set)): - X_val, y_val = eval_set[i] - # Add extra dimension if y is single dimensional - if len(y_val.shape) == 1: - y_val = y_val.unsqueeze(1) - - set_name = eval_names[i] - with torch.no_grad(): - preds = net(X_val) - for j in range(len(eval_metric)): - metric_name, metric_fn = eval_metric[j] - metric_val = metric_fn(preds, y_val) - if isinstance(metric_val, torch.Tensor): - metric_val = metric_val.item() 
- row = { - "epoch": epoch, - "data": set_name, - "metric": metric_name, - "value": metric_val, - } - self.metric_history_.append(row) - # use first val set and first metric for early stopping - if i == 0 and j == 0: - self.early_stopping_history_.append(metric_val) - - if verbose: - verbose = int(verbose) - if epoch % verbose == 0: - for d in self.metric_history_[-len(eval_set) * len(eval_metric) :]: - print(d) - - # if val set is present, record history and follow early stopping parameters - if len(eval_set) > 1: - val_metric = self.early_stopping_history_[-1] - if lr_scheduler == "ReduceLROnPlateau": - reduce_lr_on_plateau.step(val_metric) - - # early stopping based on minimum decrease in loss - if min_delta is not None and epoch > 0: - if self.early_stopping_history_[-2] - val_metric < min_delta: - print("Early stopping at epoch ", epoch) - self.early_stopping_epoch_ = epoch - break - - # early stopping based on number of epochs with increasing loss - if val_metric < min_valmetric: - min_valmetric = val_metric - increases = 0 - # save model paramaters for current best epoch - best_params = copy.deepcopy(net.state_dict()) - elif patience is not None: - increases += 1 - if increases > patience: - print("Early stopping at epoch ", epoch) - self.early_stopping_epoch_ = epoch - break - - # if using early stopping, reload net with best params - if patience is not None or min_delta is not None: - # load model paramaters from best epoch - net.load_state_dict(best_params) - net.eval() - - # store network for prediction - self.net_ = net - - self.metric_history_df_ = pd.DataFrame(self.metric_history_) - - return self - - def predict(self, X): - """Predicts using the trained network. - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - An array of the same shape as the one used in training, containing the data - to be used for predictions. - Returns - ------- - np.array - Model predictions in an array of shape (n_samples, n_labels). - """ - - # cast to tensor and move to device - device = torch.device(self.device) - X_nn = torch.tensor(X, dtype=torch.float32).to(device) - - # forward pass through network for predictions - # return predictions as numpy array - with torch.no_grad(): - return self.net_(X_nn).cpu().detach().numpy().astype("float32") - - def predict_proba(self, X): - """Return predictions on probability scale for classification network. - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - An array of the same shape as the one used in training, containing the data - to be used for predictions. - Returns - ------- - np.array - Model predictions in an array of shape (n_samples, n_labels), with sigmoid - transformation applied (i.e. predicted probabilities). - """ - - preds = self.predict(X) - preds_proba = 1 / (1 + np.exp(-preds)) - return preds_proba.astype("float32") - -# %% [code] -# FOR FUTURE USE: if use label smoothing -class SmoothCrossEntropyLoss(nn.modules.loss._WeightedLoss): - """ - Computes smoothed cross entropy (log) loss. - Label smoothing works by clipping the true label values based on a - specified smoothing parameter, e.g., with smoothing == 0.001 and n_classes == 2, - [0, 1] --> [0.005, 0.995]. - The formula is given by label smoothed y = y * (1 - smoothing) + smoothing / n_classes - This method can help prevent models from becoming over-confident. 
- See paper: https://papers.nips.cc/paper/2019/file/f1748d6b0fd9d439f71450117eba2725-Paper.pdf - """ - def __init__(self, weight=None, reduction="mean", smoothing=0.001, device="cpu"): - super().__init__(weight=weight, reduction=reduction) - self.smoothing = smoothing - self.weight = weight - self.device = device - - @staticmethod - def _smooth(targets, n_classes, smoothing, device): - """Helper for computing smoothed label values.""" - assert 0 <= smoothing <= 1 - with torch.no_grad(): - targets = ( - targets * (1 - smoothing) - + torch.ones_like(targets).to(device) * smoothing / n_classes - ) - return targets - - def forward(self, inputs, targets, sample_weight=None): - # smooth targets - targets = SmoothCrossEntropyLoss()._smooth( - targets, 2, self.smoothing, self.device - ) - # weight class predictions - if self.weight is not None: - inputs = inputs * self.weight.unsqueeze(0) - - if sample_weight is None: - # binary_cross_entropy_with_logits returns mean log loss - loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="mean") - else: - # binary_cross_entropy_with_logits returns - # [# obs., # classes] tensor of log losses - loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") - assert loss.size(0) == sample_weight.size(0) - # compute weighted mean for each target - loss = torch.sum(loss * sample_weight, dim=0) / torch.sum(sample_weight) - # compute column-wise mean - loss = torch.mean(loss) - - return loss - -# FOR FUTURE USE: if use clipping -class ClippedCrossEntropyLoss(nn.modules.loss._WeightedLoss): - """ - Computes clipped cross entropy (log) loss. - Clipped log loss clips the predicted probabilities based on a specified smoothing - parameter, e.g., with smoothing == 0.001, the predicted probabilities [.000013, .99992] - --> [0.005, 0.995]. - This method can help prevent models from becoming over-confident. 
- """ - def __init__(self, weight=None, reduction="mean", smoothing=0.001): - super().__init__(weight=weight, reduction=reduction) - self.smoothing = smoothing - self.weight = weight - - def forward(self, y_pred, y_true, sample_weight=None): - # clip predictions - y_pred_clipped = torch.clamp( - torch.sigmoid(y_pred), self.smoothing, 1 - self.smoothing - ) - - # weight class predictions - if self.weight is not None: - y_pred = y_pred * self.weight.unsqueeze(0) - - if sample_weight is None: - # binary_cross_entropy returns mean log loss - loss = F.binary_cross_entropy(y_pred_clipped, y_true, reduction="mean") - else: - # binary_cross_entropy returns [# obs., # classes] tensor of log losses - loss = F.binary_cross_entropy(y_pred_clipped, y_true, reduction="none") - assert loss.size(0) == sample_weight.size(0) - # compute weighted mean for each target - loss = torch.sum(loss * sample_weight, dim=0) / torch.sum(sample_weight) - # compute mean across targets - loss = torch.mean(loss) - - return loss - -# %% [code] -################### -### Import Data ### -################### - -train_drug = pd.read_csv("../input/lish-moa/train_drug.csv") -X = pd.read_csv("../input/lish-moa/train_features.csv") -y = pd.read_csv("../input/lish-moa/train_targets_scored.csv") - -# Remove control observations -y = y.loc[X["cp_type"]=="trt_cp"].reset_index(drop=True) -X = X.loc[X["cp_type"]=="trt_cp"].reset_index(drop=True) - -# %% [code] -########################## -### Data Preprocessing ### -########################## - -X, X_test, y, y_test = train_test_split( - X, y, test_size=0.2, random_state=1998 -) - -transformer = Preprocessor() -transformer.fit(X) -X = transformer.transform(X) -X_test = transformer.transform(X_test) -y = y.drop(["sig_id"], axis = 1).values.astype("float32") -y_test = y_test.drop(["sig_id"], axis = 1).values.astype("float32") - -# %% [code] -# Define network architecture - -n_input = X.shape[1] -n_output = y.shape[1] -hidden_units = 640 -dropout = 0.2 - -net_obj = Sequential( - nn.Linear(n_input, hidden_units), - nn.BatchNorm1d(hidden_units), - nn.ReLU(), - nn.Dropout(dropout), - nn.Linear(hidden_units, hidden_units), - nn.BatchNorm1d(hidden_units), - nn.ReLU(), - nn.Dropout(dropout), - nn.Linear(hidden_units, n_output) -) - -# %% [code] -loss = nn.BCEWithLogitsLoss() - -# Initialize network -net = Network( - net_obj=net_obj, - max_epochs=20, - batch_size=128, - device=device, - loss_fn=loss, - lr=0.01, - weight_decay=1e-6, - lr_scheduler="ReduceLROnPlateau" -) - - -net.fit( - X=X, - y=y, - eval_set=[(X_test, y_test)], - eval_metric=[loss], - verbose=2 -) - -# %% [code] -def multi_log_loss(y_pred, y_true): - losses = -y_true * np.log(y_pred + 1e-15) - (1 - y_true) * np.log(1 - y_pred + 1e-15) - return np.mean(losses) - -preds = net.predict_proba(X) -print("Train loss: ", multi_log_loss(preds, y)) - -test_preds = net.predict_proba(X_test) -print("Test loss: ", multi_log_loss(test_preds, y_test)) diff --git a/input/.DS_Store b/input/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..a09d0969db0bc57eed573652963ac4bfb05cbcbc Binary files /dev/null and b/input/.DS_Store differ diff --git a/input/utilities/models.py b/input/utilities/models.py new file mode 100644 index 0000000000000000000000000000000000000000..a5de42f846a8c6b3bd3296d0412eb0e449502a5a --- /dev/null +++ b/input/utilities/models.py @@ -0,0 +1,900 @@ +####################### +### Library imports ### +####################### + +# standard library +import copy +import os +import sys + +# data packages 
+import numpy as np +import pandas as pd + +# pytorch +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim + +# sklearn +import sklearn.base +import sklearn.linear_model +import sklearn.model_selection +import sklearn.pipeline + +# parallelization +from joblib import Parallel, delayed + + +######################## +### Helper functions ### +######################## + +# index rows for both dataframes and arrays +def _subset_rows(x, idx): + if isinstance(x, pd.DataFrame): + return x.iloc[idx] + else: + return x[idx] + + +# index columns for both dataframes and arrays +def _subset_cols(x, idx): + if isinstance(x, pd.DataFrame): + return x.iloc[:, idx] + else: + return x[:, idx] + + +# alternative to sklearn's _fit_binary +# omits _ConstantPredictor and supports fit_params +def _fit_binary(estimator, X, y=None, **fit_params): + estimator = sklearn.base.clone(estimator) + estimator.fit(X=X, y=y, **fit_params) + return estimator + + +class SmoothCrossEntropyLoss(nn.modules.loss._WeightedLoss): + """ + Computes smoothed cross entropy (log) loss. + Label smoothing works by clipping the true label values based on a + specified smoothing parameter, e.g., with smoothing == 0.001 and n_classes == 2, + [0, 1] --> [0.005, 0.995]. + The formula is given by label smoothed y = y * (1 - smoothing) + smoothing / n_classes + This method can help prevent models from becoming over-confident. + See paper: https://papers.nips.cc/paper/2019/file/f1748d6b0fd9d439f71450117eba2725-Paper.pdf + """ + def __init__(self, weight=None, reduction="mean", smoothing=0.001, device="cpu"): + super().__init__(weight=weight, reduction=reduction) + self.smoothing = smoothing + self.weight = weight + self.device = device + + @staticmethod + def _smooth(targets, n_classes, smoothing, device): + """Helper for computing smoothed label values.""" + assert 0 <= smoothing <= 1 + with torch.no_grad(): + targets = ( + targets * (1 - smoothing) + + torch.ones_like(targets).to(device) * smoothing / n_classes + ) + return targets + + def forward(self, inputs, targets, sample_weight=None): + # smooth targets + targets = SmoothCrossEntropyLoss()._smooth( + targets, 2, self.smoothing, self.device + ) + # weight class predictions + if self.weight is not None: + inputs = inputs * self.weight.unsqueeze(0) + + if sample_weight is None: + # binary_cross_entropy_with_logits returns mean log loss + loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="mean") + else: + # binary_cross_entropy_with_logits returns + # [# obs., # classes] tensor of log losses + loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + assert loss.size(0) == sample_weight.size(0) + # compute weighted mean for each target + loss = torch.sum(loss * sample_weight, dim=0) / torch.sum(sample_weight) + # compute column-wise mean + loss = torch.mean(loss) + + return loss + + +############## +### Models ### +############## + +# Sub-class nn.Sequential to add reset_parameters method +class Sequential(nn.Sequential): + def reset_parameters(self): + for layer in self.children(): + if hasattr(layer, "reset_parameters"): + layer.reset_parameters() + + + +class Network(sklearn.base.BaseEstimator): + """An sklearn-compatible wrapper for pytorch estimators. + Wraps pytorch training and prediction in sklearn-compatible estimator with `fit` and + `predict` methods and limited support for commonly-tuned net parameters. Supports + early stopping and `eval_set` similarly to the LightGBM sklearn implementation. 
+ Parameters + ---------- + net_obj : obj + The instantiated pytorch network object to be used in training. Should have type + that is a subclass of nn.Module. + seed : int, optional + Seed to be used for randomness in network initalization for reproducibility. + optimizer : type, optional, default=torch.optim.Adam + The optimizer class to be used in training. Should be a subclass of + torch.optim.Optimizer. + loss_fn : callable, optional, default=nn.BCEWithLogitsLoss() + A function or callable loss object with signature `f(y_pred, y_true)`. If + training with sample weights, should also accept a `sample_weight` argument. + device : {"cpu", "cuda"}, optional, default="cpu" + The device used in training. + lr : float, optional, default=0.001 + The learning rate. Ignored if `lr_scheduler` is provided. + weight_decay : float, optional, default=0 + Weight decay parameter used for network weight regularization. + batch_size : int, optional, default=128 + Batch sized used in training. + max_epochs : int, optional, default=10 + Maximum number of epochs used in training. Actual number of epochs used may be + lower if early stopping is enabled. + lr_scheduler : type, optional + Learning rate scheduler for training, e.g. a class from + torch.optim.lr_scheduler. + lr_scheduler_params : dict, optional + The parameters used to initialize the `lr_scheduler`. + Attributes + ---------- + self.metric_history_ : list of dict + A list of dictionaries recording the values for each metric, eval_set, and + epoch. + self.early_stopping_history_ : list of float + The list of values for only the first metric and eval_set, used for early + stopping if specified. + self.early_stopping_epoch_ : int or None + The optimal epoch chosen by early stopping. + self.net_ : obj + The trained `net_obj`, which is used for prediction. + self.metric_history_df_ : pd.DataFrame + A dataframe wrapper around `self.metric_history_`. + """ + + def __init__( + self, + net_obj, + seed=None, + optimizer=torch.optim.Adam, + loss_fn=None, + device="cpu", + lr=0.001, + weight_decay=0, + batch_size=128, + max_epochs=10, + lr_scheduler=None, + lr_scheduler_params=None, + ): + + self.net_obj = net_obj + self.seed = seed + self.optimizer = optimizer + self.loss_fn = loss_fn + self.device = device + self.lr = lr + self.weight_decay = weight_decay + self.batch_size = batch_size + self.max_epochs = max_epochs + self.lr_scheduler = lr_scheduler + self.lr_scheduler_params = lr_scheduler_params + + def fit( + self, + X, + y, + sample_weight=None, + eval_set=None, + eval_names=None, + eval_sample_weight=None, + eval_metric=None, + patience=None, + min_delta=None, + verbose=False, + ): + """Trains the network, with support for early stopping. + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The training features. + y : array-like, shape (n_samples, n_labels) + The training labels. + sample_weight : array-like, shape (n_samples, ), optional + An optional array of sample weights to be used in computing loss, if + desired, in which case `self.loss_fn` should accept a `sample_weight` + argument. + eval_set : list of tuple, optional + A list of (X, y) tuples to be used for computing eval loss. At least one + dataset is required for early stopping, in which case the first tuple in the + list is used for early stopping evaluation. + eval_names : list, optional + An optionl list of the same length as `eval_set`, specifying the name of + each dataset. 
+ eval_sample_weight : list of array-like, optional + An optional list of the same length as `eval_set`, containing the sample + weight array for each eval set (if `sample_weight` is in use). + eval_metric : list of callable, optional + A list of metric functions to be used in evaluation. Loss will be recorded + for all metrics, but only the first metric provided will be used for early + stopping, if enabled. Should have signature `f(y_pred, y_true), with a + `sample_weight` parameter if weights are being used. + patience : int, optional + The number of epochs of increasing loss tested before stopping early. The + number to be tested is reset after every epoch with a decrease in loss. This + parameter may be overridden if `min_delta` is also set. + min_delta : float, optional + The minimum decrease in loss required to continue training. If loss doss not + decrease by more than this value, training will be stopped early and the + stopping epoch will be recorded in the `early_stopping_epoch_` attribute. + verbose : int or bool, optional, default=False + If False, no loss is printed during training. Otherwise, results are printed + after every `verbose` epochs. + Returns + ------- + self : obj + Returns the estimator itself, in keeping with sklearn requirements. + """ + + # initialize device for training + device = torch.device(self.device) + + # set seed for network weight initialization + if self.seed is not None: + torch.manual_seed(self.seed) + # this should be redundant with torch.manual_seed + torch.cuda.manual_seed_all(self.seed) + # send pytorch network to specified device + net = self.net_obj.to(device) + # reset initial parameters for net + net.reset_parameters() + # initialize loss and optimizer + if self.loss_fn is None: + loss_fn = nn.BCEWithLogitsLoss() + else: + loss_fn = self.loss_fn + optimizer = self.optimizer( + net.parameters(), lr=self.lr, weight_decay=self.weight_decay + ) + + # helper for converting to tensor + def to_tensor(a): + return torch.tensor(a, dtype=torch.float32).to(device) + + if sample_weight is not None: + # add sample_weight as column in X to pass through the DataLoader for + # creating batches + X_w_weights = np.append(X, sample_weight, 1) + X_nn = to_tensor(X_w_weights) + else: + X_nn = to_tensor(X) + + y_nn = to_tensor(y) + + # Add extra dimension if y is single dimensional + if len(y_nn.shape) == 1: + y_nn = y_nn.unsqueeze(1) + + # set up for evaluation on train/val data + if eval_set is None: + eval_set = [] + if eval_metric is None: + eval_metric = [] + if eval_names is None: + eval_names = [f"eval_{i}" for i in range(len(eval_set))] + if eval_sample_weight is None: + eval_sample_weight = [None] * len(eval_set) + + # add train data as an eval set + eval_set.append((X, y)) + eval_sample_weight.append(sample_weight) + eval_names.append("train") + + # convert eval sets to tensors + eval_set = [(to_tensor(tup[0]), to_tensor(tup[1])) for tup in eval_set] + # convert eval weights to tensors + eval_sample_weight = [ + to_tensor(weight) if weight is not None else weight + for weight in eval_sample_weight + ] + + eval_metric = [ + m if isinstance(m, tuple) else (f"metric_{i}", m) + for i, m in enumerate(eval_metric) + ] + eval_metric.append(("objective", loss_fn)) + + # set up dataloader for batches + dataset = torch.utils.data.TensorDataset(X_nn, y_nn) + dataloader = torch.utils.data.DataLoader( + dataset, batch_size=self.batch_size, shuffle=True, drop_last=True + ) + + lr_scheduler = self.lr_scheduler + if self.lr_scheduler_params is None: + 
lr_scheduler_params = {} + else: + lr_scheduler_params = self.lr_scheduler_params + + if lr_scheduler == "OneCycleLR": + one_cycle_lr = optim.lr_scheduler.OneCycleLR( + optimizer=optimizer, + epochs=self.max_epochs, + steps_per_epoch=len(dataloader), + **lr_scheduler_params, + ) + if lr_scheduler == "ReduceLROnPlateau": + reduce_lr_on_plateau = optim.lr_scheduler.ReduceLROnPlateau( + optimizer=optimizer, **lr_scheduler_params + ) + + # track number of epochs with increasing loss for early stopping + self.metric_history_ = [] + self.early_stopping_history_ = [] + self.early_stopping_epoch_ = None + min_valmetric = np.inf + increases = 0 + best_params = copy.deepcopy(net.state_dict()) + for epoch in range(self.max_epochs): + + # construct batches + for batch_x, batch_y in dataloader: + if sample_weight is not None: + # extract last column of batch_x (the sample weights) + batch_sample_weight = batch_x[:, -1].reshape(-1, 1) + batch_x = batch_x[:, :-1] + else: + batch_sample_weight = None + + net.train() + + # zero gradients to start + optimizer.zero_grad() + + output = net.forward(batch_x) + if sample_weight is not None: + loss = loss_fn(output, batch_y, sample_weight=batch_sample_weight) + else: + loss = loss_fn(output, batch_y) + + loss.backward() + optimizer.step() + if lr_scheduler == "OneCycleLR": + one_cycle_lr.step() + + net.eval() + + # record metrics for each eval set + for i in range(len(eval_set)): + X_val, y_val = eval_set[i] + # Add extra dimension if y is single dimensional + if len(y_val.shape) == 1: + y_val = y_val.unsqueeze(1) + + eval_sample_weight_ = eval_sample_weight[i] + set_name = eval_names[i] + with torch.no_grad(): + preds = net(X_val) + for j in range(len(eval_metric)): + metric_name, metric_fn = eval_metric[j] + if sample_weight is not None: + metric_val = metric_fn( + preds, y_val, sample_weight=eval_sample_weight_ + ) + else: + metric_val = metric_fn(preds, y_val) + if isinstance(metric_val, torch.Tensor): + metric_val = metric_val.item() + row = { + "epoch": epoch, + "data": set_name, + "metric": metric_name, + "value": metric_val, + } + self.metric_history_.append(row) + # use first val set and first metric for early stopping + if i == 0 and j == 0: + self.early_stopping_history_.append(metric_val) + + if verbose: + verbose = int(verbose) + if epoch % verbose == 0: + for d in self.metric_history_[-len(eval_set) * len(eval_metric) :]: + print(d) + + # if val set is present, record history and follow early stopping parameters + if len(eval_set) > 1: + val_metric = self.early_stopping_history_[-1] + if lr_scheduler == "ReduceLROnPlateau": + reduce_lr_on_plateau.step(val_metric) + + # early stopping based on minimum decrease in loss + if min_delta is not None and epoch > 0: + if self.early_stopping_history_[-2] - val_metric < min_delta: + print("Early stopping at epoch ", epoch) + self.early_stopping_epoch_ = epoch + break + + # early stopping based on number of epochs with increasing loss + if val_metric < min_valmetric: + min_valmetric = val_metric + increases = 0 + # save model paramaters for current best epoch + best_params = copy.deepcopy(net.state_dict()) + elif patience is not None: + increases += 1 + if increases > patience: + print("Early stopping at epoch ", epoch) + self.early_stopping_epoch_ = epoch + break + + # if using early stopping, reload net with best params + if patience is not None or min_delta is not None: + # load model paramaters from best epoch + net.load_state_dict(best_params) + net.eval() + + # store network for prediction + self.net_ = 
net + + self.metric_history_df_ = pd.DataFrame(self.metric_history_) + + return self + + def predict(self, X): + """Predicts using the trained network. + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + An array of the same shape as the one used in training, containing the data + to be used for predictions. + Returns + ------- + np.array + Model predictions in an array of shape (n_samples, n_labels). + """ + + # cast to tensor and move to device + device = torch.device(self.device) + X_nn = torch.tensor(X, dtype=torch.float32).to(device) + + # forward pass through network for predictions + # return predictions as numpy array + with torch.no_grad(): + return self.net_(X_nn).cpu().detach().numpy().astype("float32") + + def predict_proba(self, X): + """Return predictions on probability scale for classification network. + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + An array of the same shape as the one used in training, containing the data + to be used for predictions. + Returns + ------- + np.array + Model predictions in an array of shape (n_samples, n_labels), with sigmoid + transformation applied (i.e. predicted probabilities). + """ + + preds = self.predict(X) + preds_proba = 1 / (1 + np.exp(-preds)) + return preds_proba.astype("float32") + + +class CVModel(sklearn.base.BaseEstimator): + """Wraps an sklearn estimator to train in folds using cross validation. + In addition to training separate models for each fold, this estimator allows for + predictions on new data or out-of-fold predictions on the training data. This makes + it a convenient way to get out-of-sample predictions for every observation in a + train set + Parameters + ---------- + model : obj + Sklearn-compatible estimator to be used in training. To work with the CV + procedure, the estimator must accept an `eval_set` parameter, although the + implementation may be to just accept this parameter and do nothing with it. + Attributes + ---------- + splits_ : list of tuple + The CV splits used in training, in the form of a list of `(idx_train, idx_test)` + tuples. + models_ : list of obj + A list of trained models of same length as `self.splits_`, with each model cloned + from `self.model` and trained on the corresponding split in `self.splits_`. + """ + + def __init__(self, model): + self.model = model + + def fit( + self, + X, + y=None, + splits=None, + sample_weight=None, + eval_weights=False, + **fit_params, + ): + """Split data into folds and train separate model for each split. + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The features for the training data, which will be split into folds for + separate model fitting. + y : array-like, shape (n_samples, n_labels), optional + The training labels, if required. + splits : list of tuple + The CV splits to be used in training, in the form of a list of `(idx_train, + idx_test)` tuples. + sample_weight : array-like, shape (n_samples, ) + Sample weights for each observation in the training data. If provided, these + will be split along with training features/labels and passed to each fold's + model as a fit parameter. + eval_weights : bool + If True, pass the `sample_weight` to each model for eval sets also, as an + `eval_sample_weight` fit parameter. + Returns + ------- + self : obj + Returns the estimator itself, in keeping with sklearn requirements. 
+ """ + + if splits is None: + splits = [None] + + self.splits_ = splits + + # ensure indexing is same as rows if inputs are dataframes + if isinstance(X, pd.DataFrame): + X = X.reset_index(drop=True) + if isinstance(y, pd.DataFrame): + y = y.reset_index(drop=True) + + # for each fold, split data, train model, and store it + self.models_ = [] + for i, (idx_train, idx_val) in enumerate(splits): + if 'verbose' in fit_params and fit_params['verbose']: + print(f"Fitting split {i+1}/{len(splits)}") + + m = sklearn.base.clone(self.model) + + # split sample weights if they are included + if sample_weight is not None: + fit_params["sample_weight"] = _subset_rows(sample_weight, idx_train) + if eval_weights: + fit_params["eval_sample_weight"] = _subset_rows(sample_weight, idx_val) + + m.fit( + _subset_rows(X, idx_train), + _subset_rows(y, idx_train), + eval_set=[(_subset_rows(X, idx_val), _subset_rows(y, idx_val))], + **fit_params, + ) + self.models_.append(m) + + return self + + def _predict(self, X, use_splits=False, predict_proba=True, **pred_params): + """Private method for implementing `use_splits` and `predict_proba`.""" + + # use training splits to prediction out-of-fold + if use_splits: + order = [] + preds_shuffled = [] + for m, split_indices in zip(self.models_, self.splits_): + oof_indices = split_indices[1] + # get raw or transformed preds, as specified + if predict_proba: + preds_oof = m.predict_proba(_subset_rows(X, oof_indices), **pred_params) + else: + preds_oof = m.predict(_subset_rows(X, oof_indices), **pred_params) + # record the prediction and original index for each observation + order.append(oof_indices) + preds_shuffled.append(preds_oof) + # fix the order to match the input data + order = np.concatenate(order) + preds_shuffled = np.concatenate(preds_shuffled) + preds = np.empty_like(preds_shuffled) + preds[order] = preds_shuffled + # predict using average of all k models + else: + preds = [] + for m in self.models_: + if predict_proba: + pred = m.predict_proba(X, **pred_params) + else: + pred = m.predict(X, **pred_params) + preds.append(pred) + preds = np.mean(preds, axis=0) + + return preds + + def predict(self, X, use_splits=False, **pred_params): + """Predict on input data using trained models. + By setting `use_splits` to `True`, one can immediately obtain an out-of-sample + prediction for each training observation by using only the model for which the + observation was out-of-fold. Otherwise, predictions are averaged across the + models from each fold. + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + An array of the same shape as the one used in training, containing the data + to be used for predictions. + use_splits : bool + Whether or not to use training splits to get out-of-fold predictions. Only + applicable if `X` is the original data used in training. + **pred_params + Additional prediction parameters to be passed to the models. + Returns + ------- + array-like + Model predictions in an array of shape (n_samples, n_labels). + """ + + return self._predict( + X, use_splits=use_splits, predict_proba=False, **pred_params + ) + + def predict_proba(self, X, use_splits=False, **pred_params): + """Get predictions on probability-scale using trained models. + By setting `use_splits` to `True`, one can immediately obtain an out-of-sample + prediction for each training observation by using only the model for which the + observation was out-of-fold. Otherwise, predictions are averaged across the + models from each fold. 
+ Parameters
+ ----------
+ X : array-like, shape (n_samples, n_features)
+ An array of the same shape as the one used in training, containing the data
+ to be used for predictions.
+ use_splits : bool
+ Whether or not to use training splits to get out-of-fold predictions. Only
+ applicable if `X` is the original data used in training.
+ **pred_params
+ Additional prediction parameters to be passed to the models.
+ Returns
+ -------
+ array-like
+ Model predictions in an array of shape (n_samples, n_labels), with sigmoid
+ transformation applied (i.e. predicted probabilities).
+ """
+
+ return self._predict(
+ X, use_splits=use_splits, predict_proba=True, **pred_params
+ )
+
+
+class SeedCVModel(sklearn.base.BaseEstimator):
+ """
+ Runs `CVModel` multiple times with different seeds and averages
+ the predictions. Each seed is used to simultaneously create CV splits
+ and set any non-deterministic values for the model (meaning that
+ model configurations may differ across the CV runs).
+ """
+ def __init__(self, model, seeds):
+ self.model = model
+ self.seeds = seeds
+
+ def fit(self, X, y, split_method, n_folds, groups, **fit_params):
+ """
+ Loops over seeds, creating CV splits and fitting a `CVModel` for each
+ seed.
+ `split_method` is one of {"grouped", "target stratified"}.
+ """
+ models = []
+ for seed in self.seeds:
+ # copy model with new seed
+ model = sklearn.base.clone(self.model).set_params(seed=seed)
+ cv = CVModel(model)
+ # make splits based on seed
+ splits = kfold_splits(
+ split_method, # one of {"grouped", "target stratified"}
+ n_folds=n_folds,
+ groups=groups,
+ random_state=seed,
+ X=X,
+ y=y,
+ )
+ cv.fit(
+ X=X,
+ y=y,
+ splits=splits,
+ **fit_params,
+ )
+ models.append(cv)
+
+ self.models = models
+
+ return self
+
+ def predict_proba(self, X, use_splits=False, **pred_params):
+ """
+ Predicts probabilities by averaging the probability
+ predictions from models trained with different seeds.
+ When `use_splits==True`, for a given `CVModel`, preds for each obs.
+ are made with the model trained with that obs. in the out-of-fold sample.
+ When `use_splits==False`, for a given `CVModel`, preds are averaged over
+ the models for each fold.
+ """
+ preds = []
+ # get preds for each model
+ for m in self.models:
+ pred = m.predict(X, use_splits=use_splits, **pred_params)
+ pred = 1 / (1 + np.exp(-pred))
+ preds.append(pred)
+
+ self.preds = preds
+
+ return np.mean(preds, axis=0)
+
+
+class GridSearch(sklearn.base.BaseEstimator):
+ """Custom analogue to sklearn's GridSearchCV with support for early stopping.
+ Parameters
+ ----------
+ model : obj
+ param_grid : dict of str to list
+ Dictionary mapping each parameter to a list of candidate values to be searched.
+ loss_fn : callable
+ A function or callable loss object with signature `loss_fn(y_pred, y_true)`.
+ Attributes
+ ----------
+ results_ : list of dict
+ List of dictionaries recording fold, parameters, and loss from the grid search.
+ results_df_ : pd.DataFrame
+ A dataframe wrapper around `self.results_`.
+ """
+
+ def __init__(self, model, param_grid, loss_fn):
+ self.model = model
+ self.param_grid = param_grid
+ self.loss_fn = loss_fn
+
+ def fit(self, X, y=None, splits=None, **fit_params):
+
+ """Split data into folds and train/evaluate parameter combinations on each fold.
+ Parameters
+ ----------
+ X : array-like, shape (n_samples, n_features)
+ The training features.
+ y : array-like, shape (n_samples, n_labels), optional
+ The training labels, if required.
+ splits : list of tuple
+ The fold splits to be used in training, in the form of a list of
+ `(idx_train, idx_test)` tuples.
+ **fit_params
+ Additional fit parameters to pass to the model.
+ Returns
+ -------
+ self : obj
+ Returns the estimator itself.
+ Notes
+ -----
+ This estimator does not save the trained models or implement a `best_model_`
+ attribute or `predict` method. It is intended only for obtaining optimal
+ parameter combos, most easily accessed in the `results_df_` attribute.
+ """
+
+ loss_fn = (
+ self.loss_fn if isinstance(self.loss_fn, dict) else {"loss": self.loss_fn}
+ )
+
+ # create parameter grid (result is list of parameter dicts)
+ pg = sklearn.model_selection.ParameterGrid(self.param_grid)
+
+ def to_tensor(a):
+ return torch.tensor(a, dtype=torch.float32).to(self.model.device)
+
+ # we'll record results for every param combo and fold
+ self.results_ = []
+ for params in pg:
+ model = sklearn.base.clone(self.model).set_params(**params)
+ # use our custom CV implementation to pass oos fold as val set
+ cv = CVModel(model)
+ cv.fit(X=X, y=y, splits=splits, **fit_params)
+ # get oof error for every fold
+ for i, m in enumerate(cv.models_):
+ oof_indices = splits[i][1]
+ y_pred = to_tensor(m.predict(_subset_rows(X, oof_indices)))
+ y_true = to_tensor(_subset_rows(y, oof_indices))
+ loss = {name: f(y_pred, y_true).item() for name, f in loss_fn.items()}
+ # extract the name for network class so output is more readable
+ params_copy = params.copy()
+ if "net_obj" in params:
+ params_copy["net_obj"] = params_copy["net_obj"].name
+ self.results_.append({"fold": i, **params_copy, **loss})
+
+ # add dataframe version to easily read results
+ self.results_df_ = pd.DataFrame(self.results_)
+
+ return self
+
+
+class OVRModel(sklearn.base.BaseEstimator):
+ """Custom analogue to sklearn's OneVsRestClassifier with support for fit params.
+ This is a very simple implementation that optionally parallelizes across outcomes
+ and supports use of fit parameters. Prediction is not parallelized.
+ Parameters
+ ----------
+ model : obj
+ The estimator to be used for predicting each outcome. Currently there is no
+ support for using different models for different outcomes.
+ n_jobs : int, optional
+ Number of joblib jobs to use in parallelization. The usual joblib rules apply,
+ i.e. None -> 1 job, -1 -> all available.
+ Attributes
+ ----------
+ models_ : list of obj
+ The trained single-outcome models, which are used in prediction.
+ """ + + def __init__(self, model=None, n_jobs=None): + self.model = model + self.n_jobs = n_jobs + + def fit(self, X, y=None, **fit_params): + """Fit a separate model for each outcome in y.""" + def parse_eval(fit_params, i): + params = dict(fit_params) + if "eval_set" in params: + params["eval_set"] = [ + (X_val, _subset_cols(y_val, i)) + for (X_val, y_val) in params["eval_set"] + ] + return params + + self.models_ = Parallel(n_jobs=self.n_jobs)( + delayed(_fit_binary)( + self.model, X, _subset_cols(y, i), **parse_eval(fit_params, i) + ) + for i in range(y.shape[1]) + ) + + def predict(self, X): + """Predict from each separate model and recombine.""" + + y_pred = [] + + for m in self.models_: + # coerce to 1 dimension for models that return (n,1) + p = m.predict(X).squeeze() + y_pred.append(p) + + y_pred = np.array(y_pred).T + + return y_pred + + def predict_proba(self, X): + """Get predicted probability from each separate model and recombine.""" + + y_pred = [] + + for m in self.models_: + p = m.predict_proba(X) + # take only p(1) when predict_proba returns 2 outcomes (sklearn, LGBM) + if p.shape[1] == 2: + p = p[:, 1] + else: + p = p.squeeze() + y_pred.append(p) + + y_pred = np.array(y_pred).T + + return y_pred diff --git a/input/utilities/preprocess.py b/input/utilities/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..f1c5259984b0405004d502a7e6242be571791b3a --- /dev/null +++ b/input/utilities/preprocess.py @@ -0,0 +1,45 @@ +####################### +### Library imports ### +####################### + +# standard library +import os +import sys + +# data packages +import numpy as np +import pandas as pd + +# sklearn +from sklearn.base import TransformerMixin +from sklearn.preprocessing import OneHotEncoder, StandardScaler +from sklearn.compose import make_column_transformer + + +class Preprocessor(TransformerMixin): + def __init__( + self, + seed = 2021 + ): + self.seed = seed + + def fit(self, X, y = None, X_test = None): + if X_test is not None: + X = pd.concat([X, X_test], axis = 0, ignore_index = True) + + gene_feats = [col for col in X.columns if col.startswith('g-')] + cell_feats = [col for col in X.columns if col.startswith('c-')] + numeric_feats = gene_feats + cell_feats + categorical_feats = ['cp_time', 'cp_dose'] + + self._transformer = make_column_transformer( + (OneHotEncoder(), categorical_feats), + (StandardScaler(), numeric_feats) + ) + self._transformer.fit(X) + return self + + + def transform(self, X): + X_new = self._transformer.transform(X).astype("float32") + return X_new diff --git a/input/utilities/splitting.py b/input/utilities/splitting.py new file mode 100644 index 0000000000000000000000000000000000000000..365791b80764ee4c300b53ff5f9b4b25ad94e4f5 --- /dev/null +++ b/input/utilities/splitting.py @@ -0,0 +1,79 @@ +####################### +### Library imports ### +####################### + +# standard library +import copy +import os + +# data packages +import numpy as np +import pandas as pd + + + +def split_on_group(n_splits, groups, random_state=None): + """Custom splitting on groups. + Parameters + ---------- + n_splits : int + Number of splits (folds) to make. + groups : array-like + The group labels for every observation. + random_state : int, optional + Random state for reproducibility. + Returns + ------- + list of tuple + The split indices as a list of tuples of `(idx_train, idx_test)`. + Notes + ----- + Adapted from https://stackoverflow.com/a/54254277. 
+ """ + + groups = pd.Series(groups) + ix = np.arange(len(groups)) + unique = np.unique(groups) + np.random.RandomState(random_state).shuffle(unique) + result = [] + for split in np.array_split(unique, n_splits): + mask = groups.isin(split) + train, test = ix[~mask], ix[mask] + result.append((train, test)) + + return result + + +def create_splitting_groups(df, strat_vars, split_var, random_state=None): + """Create custom MoA-based groups for CV splitting. + Parameters + ---------- + df : pd.DataFrame + Input data, containing all columns in `strat_vars` and `split_var`. + strat_vars : list of str + Variables to use for stratification, e.g. ["cp_time", "cp_dose"]. + split_var : str + Variable used for demarcating splits, e.g. "drug_id". + random_state : int + Random state for reproducibiity. + Returns + ------- + pd.Series + The constructed groups. + """ + + df_groups = df.assign( + # add random noise to do random ranking + random=lambda x: np.random.RandomState(random_state).rand(x.shape[0]), + # randomly rank (i.e. assign ids) within group + rank=lambda x: x.groupby(strat_vars + [split_var])["random"] + # ties shouldn't occur, but make sure distinct ranks are assigned if they do + .rank(method="first").astype(int), + # then group is based on this rank and the splitting variable (str concat) + group=lambda x: x[[split_var, "rank"]] + .astype(str) + .apply(lambda y: "_".join(y), axis=1), + ) + + # return the pd.Series of the new groups (with original df index) + return df_groups["group"]