CS 230 Project, commit e36b81ea ("Initial commit"), authored 4 years ago by Toren Lev Fronsdal
baseline_model.py (new file, 597 additions, 0 deletions)

# %% [code]
#######################
### Library imports ###
#######################

# standard library
import os
import sys
import pickle
import copy

# data packages
import numpy as np
import pandas as pd

# pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# sklearn
import sklearn.base
import joblib
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import make_union, make_pipeline
from sklearn.compose import make_column_transformer
from scipy.stats import kurtosis, skew

########################
### Global variables ###
########################

weight_method = None  # one of {"inverse freq", "square root"}
max_weight = None
n_seeds = 2
n_folds = 2
# if holdout_set == True, then a holdout set of the train data
# is used as the test set instead of the leaderboard data
holdout_set = False
split_method = "grouped"  # method: one of {"grouped", "target stratified"}

device = "cuda" if torch.cuda.is_available() else "cpu"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# %% [code]
class
Preprocessor
(
TransformerMixin
):
def
__init__
(
self
,
variance_threshold
=
0.7
,
num_pc_genes
=
80
,
num_pc_cells
=
10
,
seed
=
2021
):
self
.
variance_threshold
=
variance_threshold
,
self
.
num_pc_genes
=
num_pc_genes
self
.
num_pc_cells
=
num_pc_cells
self
.
seed
=
seed
def
fit
(
self
,
X
,
y
=
None
,
X_test
=
None
):
if
X_test
is
not
None
:
X
=
pd
.
concat
([
X
,
X_test
],
axis
=
0
,
ignore_index
=
True
)
gene_feats
=
[
col
for
col
in
X
.
columns
if
col
.
startswith
(
'
g-
'
)]
cell_feats
=
[
col
for
col
in
X
.
columns
if
col
.
startswith
(
'
c-
'
)]
numeric_feats
=
gene_feats
+
cell_feats
categorical_feats
=
[
'
cp_time
'
,
'
cp_dose
'
]
self
.
_transformer
=
make_column_transformer
(
(
OneHotEncoder
(),
categorical_feats
)
)
self
.
_transformer
.
fit
(
X
)
return
self
def
transform
(
self
,
X
):
X_new
=
self
.
_transformer
.
transform
(
X
).
astype
(
"
float32
"
)
return
X_new
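
# --- Added illustration (not part of the original pipeline). On a toy frame with the
# --- MoA-style column names assumed here, Preprocessor one-hot encodes cp_time and
# --- cp_dose and drops the remaining columns (the make_column_transformer default).
_toy = pd.DataFrame({
    "cp_time": [24, 48, 72],
    "cp_dose": ["D1", "D2", "D1"],
    "g-0": [0.1, -0.2, 0.3],
    "c-0": [1.0, 0.5, -0.5],
})
print(Preprocessor().fit(_toy).transform(_toy))  # (3, 5) array: 3 cp_time levels + 2 cp_dose levels
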
# %% [code]
# Sub-class nn.Sequential to add a reset_parameters method
class Sequential(nn.Sequential):
    def reset_parameters(self):
        for layer in self.children():
            if hasattr(layer, "reset_parameters"):
                layer.reset_parameters()
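
# --- Added illustration (not part of the original pipeline): reset_parameters
# --- re-initializes every child layer that defines its own reset_parameters
# --- (e.g. Linear, BatchNorm1d); layers such as ReLU are skipped.
_tmp = Sequential(nn.Linear(4, 2))
_before = _tmp[0].weight.clone()
_tmp.reset_parameters()
print(torch.equal(_before, _tmp[0].weight))  # almost surely False after re-initialization
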
# %% [code]
class Network(sklearn.base.BaseEstimator):
    """
    An sklearn-compatible wrapper for pytorch estimators.

    Wraps pytorch training and prediction in an sklearn-compatible estimator with `fit`
    and `predict` methods and limited support for commonly-tuned net parameters. Supports
    early stopping and `eval_set` similarly to the LightGBM sklearn implementation.

    Parameters
    ----------
    net_obj : obj
        The instantiated pytorch network object to be used in training. Should be an
        instance of a subclass of nn.Module.
    seed : int, optional
        Seed used for randomness in network initialization, for reproducibility.
    optimizer : type, optional, default=torch.optim.Adam
        The optimizer class to be used in training. Should be a subclass of
        torch.optim.Optimizer.
    loss_fn : callable, optional, default=nn.BCEWithLogitsLoss()
        A function or callable loss object with signature `f(y_pred, y_true)`.
    device : {"cpu", "cuda"}, optional, default="cpu"
        The device used in training.
    lr : float, optional, default=0.001
        The learning rate passed to the optimizer. May be adjusted during training if
        `lr_scheduler` is provided.
    weight_decay : float, optional, default=0
        Weight decay parameter used for network weight regularization.
    batch_size : int, optional, default=128
        Batch size used in training.
    max_epochs : int, optional, default=10
        Maximum number of epochs used in training. The actual number of epochs may be
        lower if early stopping is enabled.
    lr_scheduler : {"OneCycleLR", "ReduceLROnPlateau"}, optional
        Name of the torch.optim.lr_scheduler class to use during training.
    lr_scheduler_params : dict, optional
        The parameters used to initialize the `lr_scheduler`.

    Attributes
    ----------
    metric_history_ : list of dict
        A list of dictionaries recording the value of each metric, for every eval_set
        and epoch.
    early_stopping_history_ : list of float
        The list of values for only the first metric and eval_set, used for early
        stopping if specified.
    early_stopping_epoch_ : int or None
        The optimal epoch chosen by early stopping.
    net_ : obj
        The trained `net_obj`, which is used for prediction.
    metric_history_df_ : pd.DataFrame
        A dataframe wrapper around `metric_history_`.
    """
    def __init__(
        self,
        net_obj,
        seed=None,
        optimizer=torch.optim.Adam,
        loss_fn=None,
        device="cpu",
        lr=0.001,
        weight_decay=0,
        batch_size=128,
        max_epochs=10,
        lr_scheduler=None,
        lr_scheduler_params=None,
    ):
        self.net_obj = net_obj
        self.seed = seed
        self.optimizer = optimizer
        self.loss_fn = loss_fn
        self.device = device
        self.lr = lr
        self.weight_decay = weight_decay
        self.batch_size = batch_size
        self.max_epochs = max_epochs
        self.lr_scheduler = lr_scheduler
        self.lr_scheduler_params = lr_scheduler_params

    def fit(
        self,
        X,
        y,
        eval_set=None,
        eval_names=None,
        eval_metric=None,
        patience=None,
        min_delta=None,
        verbose=False,
    ):
        """
        Trains the network, with support for early stopping.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The training features.
        y : array-like, shape (n_samples, n_labels)
            The training labels.
        eval_set : list of tuple, optional
            A list of (X, y) tuples to be used for computing eval loss. At least one
            dataset is required for early stopping, in which case the first tuple in the
            list is used for early stopping evaluation.
        eval_names : list, optional
            An optional list of the same length as `eval_set`, specifying the name of
            each dataset.
        eval_metric : list of callable, optional
            A list of metric functions to be used in evaluation. Loss will be recorded
            for all metrics, but only the first metric provided will be used for early
            stopping, if enabled. Each should have signature `f(y_pred, y_true)`.
        patience : int, optional
            The number of epochs without a new minimum loss allowed before stopping
            early. The count is reset whenever the loss reaches a new minimum. This
            parameter may be overridden if `min_delta` is also set.
        min_delta : float, optional
            The minimum decrease in loss required to continue training. If loss does not
            decrease by more than this value, training is stopped early and the
            stopping epoch is recorded in the `early_stopping_epoch_` attribute.
        verbose : int or bool, optional, default=False
            If False, no loss is printed during training. Otherwise, results are printed
            after every `verbose` epochs.

        Returns
        -------
        self : obj
            Returns the estimator itself, in keeping with sklearn requirements.
        """
        # initialize device for training
        device = torch.device(self.device)

        # set seed for network weight initialization
        if self.seed is not None:
            torch.manual_seed(self.seed)
            # this should be redundant with torch.manual_seed
            torch.cuda.manual_seed_all(self.seed)

        # send pytorch network to specified device
        net = self.net_obj.to(device)
        # reset initial parameters for net
        net.reset_parameters()

        # initialize loss and optimizer
        if self.loss_fn is None:
            loss_fn = nn.BCEWithLogitsLoss()
        else:
            loss_fn = self.loss_fn
        optimizer = self.optimizer(
            net.parameters(), lr=self.lr, weight_decay=self.weight_decay
        )

        # helper for converting to tensor
        def to_tensor(a):
            return torch.tensor(a, dtype=torch.float32).to(device)

        X_nn = to_tensor(X)
        y_nn = to_tensor(y)
        # Add extra dimension if y is single dimensional
        if len(y_nn.shape) == 1:
            y_nn = y_nn.unsqueeze(1)

        # set up for evaluation on train/val data
        if eval_set is None:
            eval_set = []
        if eval_metric is None:
            eval_metric = []
        if eval_names is None:
            eval_names = [f"eval_{i}" for i in range(len(eval_set))]
        # add train data as an eval set
        eval_set.append((X, y))
        eval_names.append("train")
        # convert eval sets to tensors
        eval_set = [(to_tensor(tup[0]), to_tensor(tup[1])) for tup in eval_set]
        eval_metric = [
            m if isinstance(m, tuple) else (f"metric_{i}", m)
            for i, m in enumerate(eval_metric)
        ]
        eval_metric.append(("objective", loss_fn))

        # set up dataloader for batches
        dataset = torch.utils.data.TensorDataset(X_nn, y_nn)
        dataloader = torch.utils.data.DataLoader(
            dataset, batch_size=self.batch_size, shuffle=True, drop_last=True
        )
        lr_scheduler = self.lr_scheduler
        if self.lr_scheduler_params is None:
            lr_scheduler_params = {}
        else:
            lr_scheduler_params = self.lr_scheduler_params
        if lr_scheduler == "OneCycleLR":
            one_cycle_lr = optim.lr_scheduler.OneCycleLR(
                optimizer=optimizer,
                epochs=self.max_epochs,
                steps_per_epoch=len(dataloader),
                **lr_scheduler_params,
            )
        if lr_scheduler == "ReduceLROnPlateau":
            reduce_lr_on_plateau = optim.lr_scheduler.ReduceLROnPlateau(
                optimizer=optimizer, **lr_scheduler_params
            )
        # track number of epochs with increasing loss for early stopping
        self.metric_history_ = []
        self.early_stopping_history_ = []
        self.early_stopping_epoch_ = None
        min_valmetric = np.inf
        increases = 0
        best_params = copy.deepcopy(net.state_dict())

        for epoch in range(self.max_epochs):
            # construct batches
            for batch_x, batch_y in dataloader:
                net.train()
                # zero gradients to start
                optimizer.zero_grad()
                output = net.forward(batch_x)
                loss = loss_fn(output, batch_y)
                loss.backward()
                optimizer.step()
                if lr_scheduler == "OneCycleLR":
                    one_cycle_lr.step()

            net.eval()
            # record metrics for each eval set
            for i in range(len(eval_set)):
                X_val, y_val = eval_set[i]
                # Add extra dimension if y is single dimensional
                if len(y_val.shape) == 1:
                    y_val = y_val.unsqueeze(1)
                set_name = eval_names[i]
                with torch.no_grad():
                    preds = net(X_val)
                for j in range(len(eval_metric)):
                    metric_name, metric_fn = eval_metric[j]
                    metric_val = metric_fn(preds, y_val)
                    if isinstance(metric_val, torch.Tensor):
                        metric_val = metric_val.item()
                    row = {
                        "epoch": epoch,
                        "data": set_name,
                        "metric": metric_name,
                        "value": metric_val,
                    }
                    self.metric_history_.append(row)
                    # use first val set and first metric for early stopping
                    if i == 0 and j == 0:
                        self.early_stopping_history_.append(metric_val)

            if verbose:
                verbose = int(verbose)
                if epoch % verbose == 0:
                    for d in self.metric_history_[-len(eval_set) * len(eval_metric):]:
                        print(d)

            # if val set is present, record history and follow early stopping parameters
            if len(eval_set) > 1:
                val_metric = self.early_stopping_history_[-1]
                if lr_scheduler == "ReduceLROnPlateau":
                    reduce_lr_on_plateau.step(val_metric)

                # early stopping based on minimum decrease in loss
                if min_delta is not None and epoch > 0:
                    if self.early_stopping_history_[-2] - val_metric < min_delta:
                        print("Early stopping at epoch", epoch)
                        self.early_stopping_epoch_ = epoch
                        break

                # early stopping based on number of epochs with increasing loss
                if val_metric < min_valmetric:
                    min_valmetric = val_metric
                    increases = 0
                    # save model parameters for current best epoch
                    best_params = copy.deepcopy(net.state_dict())
                elif patience is not None:
                    increases += 1
                    if increases > patience:
                        print("Early stopping at epoch", epoch)
                        self.early_stopping_epoch_ = epoch
                        break

        # if using early stopping, reload net with best params
        if patience is not None or min_delta is not None:
            # load model parameters from best epoch
            net.load_state_dict(best_params)
        net.eval()

        # store network for prediction
        self.net_ = net
        self.metric_history_df_ = pd.DataFrame(self.metric_history_)
        return self

    def predict(self, X):
        """
        Predicts using the trained network.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            An array with the same features as the one used in training, containing the
            data to be used for predictions.

        Returns
        -------
        np.array
            Model predictions in an array of shape (n_samples, n_labels).
        """
        # cast to tensor and move to device
        device = torch.device(self.device)
        X_nn = torch.tensor(X, dtype=torch.float32).to(device)
        # forward pass through network for predictions;
        # return predictions as a numpy array
        with torch.no_grad():
            return self.net_(X_nn).cpu().detach().numpy().astype("float32")

    def predict_proba(self, X):
        """
        Returns predictions on the probability scale for a classification network.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            An array with the same features as the one used in training, containing the
            data to be used for predictions.

        Returns
        -------
        np.array
            Model predictions in an array of shape (n_samples, n_labels), with the
            sigmoid transformation applied (i.e. predicted probabilities).
        """
        preds = self.predict(X)
        preds_proba = 1 / (1 + np.exp(-preds))
        return preds_proba.astype("float32")
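
# --- Added illustration (not part of the original pipeline): a minimal sketch of how the
# --- Network wrapper is meant to be used with an eval_set and patience-based early
# --- stopping. The synthetic data and the `_demo_*` names are assumptions made purely
# --- for demonstration; the real training run is at the bottom of this file.
_demo_rng = np.random.RandomState(0)
_demo_X = _demo_rng.rand(256, 8).astype("float32")
_demo_y = (_demo_rng.rand(256, 3) > 0.5).astype("float32")
_demo_net = Network(
    net_obj=Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 3)),
    seed=0,
    max_epochs=3,
    batch_size=64,
    device="cpu",
)
_demo_net.fit(
    _demo_X[:200],
    _demo_y[:200],
    eval_set=[(_demo_X[200:], _demo_y[200:])],
    patience=2,
)
print(_demo_net.metric_history_df_.head())
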
# %% [code]
class SmoothCrossEntropyLoss(nn.modules.loss._WeightedLoss):
    """
    Computes smoothed cross entropy (log) loss.

    Label smoothing shrinks the true label values toward a uniform distribution based on
    a specified smoothing parameter, e.g., with smoothing == 0.001 and n_classes == 2,
    [0, 1] --> [0.0005, 0.9995].
    The formula is: smoothed y = y * (1 - smoothing) + smoothing / n_classes.
    This method can help prevent models from becoming over-confident.
    See paper: https://papers.nips.cc/paper/2019/file/f1748d6b0fd9d439f71450117eba2725-Paper.pdf
    """

    def __init__(self, weight=None, reduction="mean", smoothing=0.001, device="cpu"):
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.weight = weight
        self.device = device

    @staticmethod
    def _smooth(targets, n_classes, smoothing, device):
        """
        Helper for computing smoothed label values.
        """
        assert 0 <= smoothing <= 1
        with torch.no_grad():
            targets = (
                targets * (1 - smoothing)
                + torch.ones_like(targets).to(device) * smoothing / n_classes
            )
        return targets

    def forward(self, inputs, targets, sample_weight=None):
        # smooth targets
        targets = self._smooth(targets, 2, self.smoothing, self.device)
        # weight class predictions
        if self.weight is not None:
            inputs = inputs * self.weight.unsqueeze(0)
        if sample_weight is None:
            # binary_cross_entropy_with_logits returns mean log loss
            loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="mean")
        else:
            # binary_cross_entropy_with_logits returns
            # [# obs., # classes] tensor of log losses
            loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
            assert loss.size(0) == sample_weight.size(0)
            # compute weighted mean for each target
            loss = torch.sum(loss * sample_weight, dim=0) / torch.sum(sample_weight)
            # compute column-wise mean
            loss = torch.mean(loss)
        return loss
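
# --- Added numeric check (not in the original): with smoothing = 0.001 and n_classes = 2,
# --- the smoothing formula maps the hard labels [0, 1] to
# --- [0 * 0.999 + 0.0005, 1 * 0.999 + 0.0005] = [0.0005, 0.9995].
print(SmoothCrossEntropyLoss._smooth(
    torch.tensor([0.0, 1.0]), n_classes=2, smoothing=0.001, device="cpu"
))  # ≈ tensor([0.0005, 0.9995])
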
class ClippedCrossEntropyLoss(nn.modules.loss._WeightedLoss):
    """
    Computes clipped cross entropy (log) loss.

    Clipped log loss clips the predicted probabilities to [smoothing, 1 - smoothing],
    e.g., with smoothing == 0.001, the predicted probabilities [.000013, .99992]
    --> [0.001, 0.999].
    This method can help prevent models from becoming over-confident.
    """

    def __init__(self, weight=None, reduction="mean", smoothing=0.001):
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.weight = weight

    def forward(self, y_pred, y_true, sample_weight=None):
        # weight class predictions (applied to the logits, before clipping)
        if self.weight is not None:
            y_pred = y_pred * self.weight.unsqueeze(0)
        # clip predictions
        y_pred_clipped = torch.clamp(torch.sigmoid(y_pred), self.smoothing, 1 - self.smoothing)
        if sample_weight is None:
            # binary_cross_entropy returns mean log loss
            loss = F.binary_cross_entropy(y_pred_clipped, y_true, reduction="mean")
        else:
            # binary_cross_entropy returns [# obs., # classes] tensor of log losses
            loss = F.binary_cross_entropy(y_pred_clipped, y_true, reduction="none")
            assert loss.size(0) == sample_weight.size(0)
            # compute weighted mean for each target
            loss = torch.sum(loss * sample_weight, dim=0) / torch.sum(sample_weight)
            # compute mean across targets
            loss = torch.mean(loss)
        return loss
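
# --- Added numeric check (not in the original): with smoothing = 0.001, predicted
# --- probabilities are clamped into [0.001, 0.999] before the log loss is taken.
print(torch.clamp(torch.tensor([0.000013, 0.99992]), 0.001, 1 - 0.001))  # ≈ tensor([0.0010, 0.9990])
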
# %% [code]
###################
### Import Data ###
###################

train_drug = pd.read_csv("../input/lish-moa/train_drug.csv")
X = pd.read_csv("../input/lish-moa/train_features.csv")
y = pd.read_csv("../input/lish-moa/train_targets_scored.csv")
X_test = pd.read_csv("../input/lish-moa/test_features.csv")
submission = pd.read_csv("../input/lish-moa/sample_submission.csv")

# Remove control observations
y = y.loc[X["cp_type"] == "trt_cp"].reset_index(drop=True)
X = X.loc[X["cp_type"] == "trt_cp"].reset_index(drop=True)

# used to set control obs. to zero for preds
X_test_copy = X_test.copy()

# %% [code]
transformer = Preprocessor()
transformer.fit(X)
X = transformer.transform(X)
y = y.drop(["sig_id"], axis=1).values.astype("float32")

# %% [code]
n_input = X.shape[1]
n_output = y.shape[1]
hidden_units = 640
dropout = 0.2

net_obj = Sequential(
    nn.BatchNorm1d(n_input),
    nn.Dropout(dropout),
    nn.Linear(n_input, hidden_units),
    nn.ReLU(),
    nn.BatchNorm1d(hidden_units),
    nn.Dropout(dropout),
    nn.Linear(hidden_units, hidden_units),
    nn.ReLU(),
    nn.BatchNorm1d(hidden_units),
    nn.Dropout(dropout),
    nn.Linear(hidden_units, n_output),
)

# %% [code]
# zero the submission preds
submission.iloc[:, 1:207] = 0

net = Network(
    net_obj=net_obj,
    max_epochs=6,
    batch_size=128,
    device=device,
    loss_fn=SmoothCrossEntropyLoss(smoothing=0.001, device=device),
    lr=0.001,
    weight_decay=1e-6,
    lr_scheduler="ReduceLROnPlateau",
)
clipped_log_loss = ClippedCrossEntropyLoss(smoothing=0.001)

net.fit(
    X=X,
    y=y,
    eval_metric=[clipped_log_loss],
    patience=7,
    verbose=2,
)
net.predict_proba(X)

# %% [code]