diff --git a/setup.py b/setup.py
index 036cf83..c0392b7 100644
--- a/setup.py
+++ b/setup.py
@@ -81,14 +81,13 @@ def run(self):
         "Operating System :: Unix",
         "Operating System :: MacOS",
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.6",
-        "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
     ],
     keywords=["Deep Learning", "PyTorch", "Ensemble Learning"],
     packages=find_packages(),
     cmdclass=cmdclass,
-    python_requires=">=3.6",
+    python_requires=">=3.8",
     install_requires=install_requires,
 )
diff --git a/torchensemble/_base.py b/torchensemble/_base.py
index b101311..c0627e1 100644
--- a/torchensemble/_base.py
+++ b/torchensemble/_base.py
@@ -6,6 +6,8 @@
 import numpy as np
 import torch.nn as nn
 
+from typing import Dict, List, Optional, Union
+
 from . import _constants as const
 from .utils.io import split_data_target
 from .utils.logging import get_tb_logger
@@ -60,11 +62,11 @@ class BaseModule(nn.Module):
 
     def __init__(
         self,
-        estimator,
-        n_estimators,
-        estimator_args=None,
-        cuda=True,
-        n_jobs=None,
+        estimator: Union[type, nn.Module],
+        n_estimators: int,
+        estimator_args: Optional[Dict] = None,
+        device: Union[str, List[str]] = "cuda",
+        n_jobs: Optional[int] = None,
     ):
         super(BaseModule, self).__init__()
         self.base_estimator_ = estimator
@@ -78,12 +80,33 @@ def __init__(
             )
             warnings.warn(msg, RuntimeWarning)
 
-        self.device = torch.device("cuda" if cuda else "cpu")
+        # Set up loggers first so they are available to the checks below.
+        self.logger = logging.getLogger()
+        self.tb_logger = get_tb_logger()
+
+        # Specify the running device of each base estimator. Internally,
+        # `self.device` is always a list with one entry per estimator.
+        if isinstance(device, str):
+            self.device = [torch.device(device)] * n_estimators
+        elif isinstance(device, list):
+            if len(device) != n_estimators:
+                msg = (
+                    "The length of the `device` list should equal"
+                    " `n_estimators`."
+                )
+                self.logger.error(msg)
+                raise ValueError(msg)
+            self.device = [torch.device(d) for d in device]
+        else:
+            msg = (
+                "The argument `device` should be a string or a list of"
+                " strings, got {} instead."
+            )
+            self.logger.error(msg.format(type(device)))
+            raise ValueError(msg.format(type(device)))
+
         self.n_jobs = n_jobs
-        self.logger = logging.getLogger()
-        self.tb_logger = get_tb_logger()
 
         self.estimators_ = nn.ModuleList()
+
+        # Placeholders filled in later by `set_criterion`, `set_optimizer`,
+        # and `set_scheduler`.
+        self._criterion = None
+
+        self.optimizer_name = None
+        self.optimizer_args = None
+
+        self.scheduler_name = None
+        self.scheduler_args = None
         self.use_scheduler_ = False
 
     def __len__(self):
@@ -102,7 +125,7 @@ def __getitem__(self, index):
     def _decide_n_outputs(self, train_loader):
         """Decide the number of outputs according to the `train_loader`."""
 
-    def _make_estimator(self):
+    def _make_estimator(self, idx):
         """Make and configure a copy of `self.base_estimator_`."""
 
         # Call `deepcopy` to make a base estimator
@@ -117,7 +140,7 @@ def _make_estimator(self):
         else:
             estimator = self.base_estimator_(**self.estimator_args)
 
-        return estimator.to(self.device)
+        return estimator.to(self.device[idx])
 
     def _validate_parameters(self, epochs, log_interval):
         """Validate hyper-parameters on training the ensemble."""
@@ -185,9 +208,9 @@ def predict(self, *x):
         x_device = []
         for data in x:
             if isinstance(data, torch.Tensor):
-                x_device.append(data.to(self.device))
+                x_device.append(data.to("cpu"))
             elif isinstance(data, np.ndarray):
-                x_device.append(torch.Tensor(data).to(self.device))
+                x_device.append(torch.Tensor(data).to("cpu"))
             else:
                 msg = (
                     "The type of input X should be one of {{torch.Tensor,"
@@ -206,23 +229,13 @@ def __init__(
         n_estimators=10,
         depth=5,
         lamda=1e-3,
-        cuda=False,
+        device="cuda",
         n_jobs=None,
     ):
-        super(BaseModule, self).__init__()
-        self.base_estimator_ = BaseTree
-        self.n_estimators = n_estimators
+        super().__init__(BaseTree, n_estimators, {}, device, n_jobs)
         self.depth = depth
         self.lamda = lamda
-        self.device = torch.device("cuda" if cuda else "cpu")
-        self.n_jobs = n_jobs
-        self.logger = logging.getLogger()
-        self.tb_logger = get_tb_logger()
-
-        self.estimators_ = nn.ModuleList()
-        self.use_scheduler_ = False
 
     def _decidce_n_inputs(self, train_loader):
         """Decide the input dimension according to the `train_loader`."""
         for _, elem in enumerate(train_loader):
             data = elem[0]
             n_samples = data.size(0)
@@ -231,7 +244,7 @@ def _decidce_n_inputs(self, train_loader):
             data = data.view(n_samples, -1)
             return data.size(1)
 
-    def _make_estimator(self):
+    def _make_estimator(self, idx):
         """Make and configure a soft decision tree."""
         estimator = BaseTree(
             input_dim=self.n_inputs,
@@ -241,7 +254,7 @@ def _make_estimator(self):
-            cuda=self.device == torch.device("cuda"),
+            cuda=self.device[idx].type == "cuda",
         )
 
-        return estimator.to(self.device)
+        return estimator.to(self.device[idx])
 
 
 class BaseClassifier(BaseModule):
@@ -263,7 +276,7 @@ def _decide_n_outputs(self, train_loader):
         else:
             labels = []
             for _, elem in enumerate(train_loader):
-                _, target = split_data_target(elem, self.device)
+                _, target = split_data_target(elem, "cpu")
                 labels.append(target)
             labels = torch.unique(torch.cat(labels))
             n_outputs = labels.size(0)
@@ -279,7 +292,7 @@ def evaluate(self, test_loader, return_loss=False):
         loss = 0.0
 
         for _, elem in enumerate(test_loader):
-            data, target = split_data_target(elem, self.device)
+            data, target = split_data_target(elem, "cpu")
 
             output = self.forward(*data)
@@ -371,28 +384,28 @@ def __init__(self, input_dim, output_dim, depth=5, lamda=1e-3, cuda=False):
             self.leaf_node_num_, self.output_dim, bias=False
         )
 
-    def forward(self, X, is_training_data=False):
-        _mu, _penalty = self._forward(X)
+    def forward(self, x, is_training_data=False):
+        _mu, _penalty = self._forward(x)
 
         y_pred = self.leaf_nodes(_mu)
 
-        # When `X` is the training data, the model also returns the penalty
+        # When `x` is the training data, the model also returns the penalty
         # to compute the training loss.
         if is_training_data:
             return y_pred, _penalty
         else:
             return y_pred
 
-    def _forward(self, X):
+    def _forward(self, x):
         """Implementation on the data forwarding process."""
-        batch_size = X.size()[0]
-        X = self._data_augment(X)
+        batch_size = x.size(0)
+        x = self._data_augment(x)
 
-        path_prob = self.inner_nodes(X)
+        path_prob = self.inner_nodes(x)
 
         path_prob = torch.unsqueeze(path_prob, dim=2)
         path_prob = torch.cat((path_prob, 1 - path_prob), dim=2)
 
-        _mu = X.data.new(batch_size, 1, 1).fill_(1.0)
+        _mu = x.data.new(batch_size, 1, 1).fill_(1.0)
         _penalty = torch.tensor(0.0).to(self.device)
 
         # Iterate through internal nodes in each layer to compute the final path
@@ -437,14 +450,14 @@ def _cal_penalty(self, layer_idx, _mu, _path_prob):
 
         return penalty
 
-    def _data_augment(self, X):
+    def _data_augment(self, x):
         """Add a constant input `1` onto the front of each sample."""
-        batch_size = X.size()[0]
-        X = X.view(batch_size, -1)
+        batch_size = x.size(0)
+        x = x.view(batch_size, -1)
         bias = torch.ones(batch_size, 1).to(self.device)
-        X = torch.cat((bias, X), 1)
+        x = torch.cat((bias, x), 1)
 
-        return X
+        return x
 
     def _validate_parameters(self):
diff --git a/torchensemble/_constants.py b/torchensemble/_constants.py
index b7c362a..69d1954 100644
--- a/torchensemble/_constants.py
+++ b/torchensemble/_constants.py
@@ -13,10 +13,10 @@
         The dictionary of hyper-parameters used to instantiate base
         estimators. This parameter will have no effect if ``estimator`` is
         a base estimator object after instantiation.
-    cuda : bool, default=True
+    device : string or list of strings, default='cuda'
 
-        - If ``True``, use GPU to train and evaluate the ensemble.
-        - If ``False``, use CPU to train and evaluate the ensemble.
+        - If :obj:`string`, all base estimators run on the device
+          specified by the string.
+        - If :obj:`list`, the base estimator of index ``i`` runs on the
+          device ``device[i]``.
     n_jobs : int, default=None
         The number of workers for training the ensemble. This input
         argument is used for parallel ensemble methods such as
@@ -46,10 +46,10 @@
         The dictionary of hyper-parameters used to instantiate base
         estimators. This parameter will have no effect if ``estimator`` is
         a base estimator object after instantiation.
-    cuda : bool, default=True
+    device : string or list of strings, default='cuda'
 
-        - If ``True``, use GPU to train and evaluate the ensemble.
-        - If ``False``, use CPU to train and evaluate the ensemble.
+        - If :obj:`string`, all base estimators run on the device
+          specified by the string.
+        - If :obj:`list`, the base estimator of index ``i`` runs on the
+          device ``device[i]``.
 
     Attributes
     ----------
@@ -70,10 +70,10 @@
         The coefficient of the regularization term when training neural
         trees, proposed in the paper: `Distilling a neural network into a
         soft decision tree <https://arxiv.org/abs/1711.09784>`_.
-    cuda : bool, default=True
+    device : string or list of strings, default='cuda'
 
-        - If ``True``, use GPU to train and evaluate the ensemble.
-        - If ``False``, use CPU to train and evaluate the ensemble.
+        - If :obj:`string`, all base estimators run on the device
+          specified by the string.
+        - If :obj:`list`, the base estimator of index ``i`` runs on the
+          device ``device[i]``.
     n_jobs : int, default=None
         The number of workers for training the ensemble.
         This input argument is used for parallel ensemble methods such as
diff --git a/torchensemble/utils/io.py b/torchensemble/utils/io.py
index 9b402a7..5a42b51 100644
--- a/torchensemble/utils/io.py
+++ b/torchensemble/utils/io.py
@@ -76,12 +76,12 @@ def load(model, save_dir="./", map_location=None, logger=None):
         model.n_inputs = state["n_inputs"]
 
     # Pre-allocate and load all base estimators
-    for _ in range(n_estimators):
-        model.estimators_.append(model._make_estimator())
+    for idx in range(n_estimators):
+        model.estimators_.append(model._make_estimator(idx))
 
     model.load_state_dict(model_params)
 
 
-def split_data_target(element, device, logger=None):
+def split_data_target(element, device="cpu", logger=None):
     """Split elements in dataloader according to pre-defined rules."""
     if not (isinstance(element, list) or isinstance(element, tuple)):
         msg = (
@@ -98,8 +98,8 @@ def split_data_target(element, device, logger=None):
     elif len(element) > 2:
         # Dataloader with multiple inputs and one target
         data, target = element[:-1], element[-1]
-        data_device = [tensor.to(device) for tensor in data]
-        return data_device, target.to(device)
+        data = [tensor.to(device) for tensor in data]
+        return data, target.to(device)
     else:
         # Dataloader with invalid input
         msg = (
diff --git a/torchensemble/voting.py b/torchensemble/voting.py
index 241f2d9..937bc00 100644
--- a/torchensemble/voting.py
+++ b/torchensemble/voting.py
@@ -98,7 +98,7 @@ def __init__(self, voting_strategy="soft", **kwargs):
 
         implemented_strategies = {"soft", "hard"}
         if voting_strategy not in implemented_strategies:
             msg = (
-                "Voting strategy {} is not implemented, "
+                "Voting strategy `{}` is not implemented, "
                 "please choose from {}."
             )
             raise ValueError(
@@ -112,11 +112,10 @@ def __init__(self, voting_strategy="soft", **kwargs):
         "classifier_forward",
     )
     def forward(self, *x):
-
-        outputs = [
-            F.softmax(op.unsqueeze_tensor(estimator(*x)), dim=1)
-            for estimator in self.estimators_
-        ]
+        # Run each base estimator on its own device, then aggregate the
+        # per-estimator probabilities on CPU. Note that `.to(device)`
+        # already copies the input when it crosses devices, so no explicit
+        # `copy.deepcopy` is needed here.
+        outputs = []
+        for estimator, device in zip(self.estimators_, self.device):
+            per_estimator_x = tuple(tensor.to(device) for tensor in x)
+            output = op.unsqueeze_tensor(estimator(*per_estimator_x))
+            outputs.append(F.softmax(output, dim=1).to("cpu"))
 
         if self.voting_strategy == "soft":
             proba = op.average(outputs)
@@ -164,8 +164,8 @@ def fit(
 
         # Instantiate a pool of base estimators, optimizers, and schedulers.
         estimators = []
-        for _ in range(self.n_estimators):
-            estimators.append(self._make_estimator())
+        for idx in range(self.n_estimators):
+            estimators.append(self._make_estimator(idx))
 
         optimizers = []
         for i in range(self.n_estimators):
@@ -180,18 +180,19 @@ def fit(
                 optimizers[0], self.scheduler_name, **self.scheduler_args
             )
 
-        # Check the training criterion
-        if not hasattr(self, "_criterion"):
+        # Check the training criterion; use cross-entropy loss by default
+        if not hasattr(self, "_criterion") or self._criterion is None:
             self._criterion = nn.CrossEntropyLoss()
 
         # Utils
         best_acc = 0.0
 
         # Internal helper function on pseudo forward
-        def _forward(estimators, *x):
-            outputs = [
-                F.softmax(estimator(*x), dim=1) for estimator in estimators
-            ]
+        def _forward(estimators, devices, *data):
+            # Evaluate each estimator on its own device and aggregate the
+            # per-estimator probabilities on CPU.
+            outputs = []
+            for estimator, device in zip(estimators, devices):
+                per_estimator_x = tuple(tensor.to(device) for tensor in data)
+                outputs.append(
+                    F.softmax(estimator(*per_estimator_x), dim=1).to("cpu")
+                )
 
             if self.voting_strategy == "soft":
                 proba = op.average(outputs)
@@ -230,7 +231,7 @@ def _forward(estimators, *x):
                         idx,
                         epoch,
                         log_interval,
-                        self.device,
+                        self.device[idx],
                         True,
                     )
                     for idx, (estimator, optimizer) in enumerate(
@@ -251,10 +252,8 @@ def _forward(estimators, *x):
                 correct = 0
                 total = 0
                 for _, elem in enumerate(test_loader):
-                    data, target = io.split_data_target(
-                        elem, self.device
-                    )
-                    output = _forward(estimators, *data)
+                    data, target = io.split_data_target(elem, "cpu")
+                    output = _forward(estimators, self.device, *data)
                     _, predicted = torch.max(output.data, 1)
                     correct += (predicted == target).sum().item()
                     total += target.size(0)
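
Usage sketch for the new `device` argument (a minimal, illustrative
example; the `MLP` base estimator and the random dataset below are
placeholders, not part of this diff):

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

from torchensemble.voting import VotingClassifier


class MLP(nn.Module):
    """A toy base estimator; any `nn.Module` subclass works."""

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(20, 3)

    def forward(self, x):
        return self.linear(x)


# Random data with 20 features and 3 classes, only for demonstration.
train_loader = DataLoader(
    TensorDataset(torch.randn(64, 20), torch.randint(0, 3, (64,))),
    batch_size=16,
)

# All base estimators run on the same device ...
model = VotingClassifier(estimator=MLP, n_estimators=4, device="cpu")

# ... or each estimator runs on its own device, e.g. four estimators
# spread over two GPUs (assumes `cuda:0` and `cuda:1` are available):
# model = VotingClassifier(
#     estimator=MLP,
#     n_estimators=4,
#     device=["cuda:0", "cuda:0", "cuda:1", "cuda:1"],
# )

model.set_optimizer("Adam", lr=1e-3)
model.fit(train_loader, epochs=1, save_model=False)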