
Commit 4adaaf7

Merge pull request #432 from datamol-io/caching
Caching logic improvement
2 parents beaf954 + cc91bfa commit 4adaaf7


43 files changed: +361 additions, −220 deletions

README.md

Lines changed: 17 additions & 0 deletions
@@ -97,6 +97,23 @@ graphium-train --config-path [PATH] --config-name [CONFIG]
 ```
 Thanks to the modular nature of `hydra` you can reuse many of our config settings for your own experiments with Graphium.
 
+## Preparing the data in advance
+The data preparation including the featurization (e.g., of molecules from smiles to pyg-compatible format) is embedded in the pipeline and will be performed when executing `graphium-train [...]`.
+
+However, when working with larger datasets, it is recommended to perform data preparation in advance using a machine with sufficient allocated memory (e.g., ~400GB in the case of `LargeMix`). Preparing data in advance is also beneficial when running lots of concurrent jobs with identical molecular featurization, so that resources aren't wasted and processes don't conflict reading/writing in the same directory.
+
+The following command-line will prepare the data and cache it, then use it to train a model.
+```bash
+# First prepare the data and cache it in `path_to_cached_data`
+graphium-prepare-data datamodule.args.processed_graph_data_path=[path_to_cached_data]
+
+# Then train the model on the prepared data
+graphium-train [...] datamodule.args.processed_graph_data_path=[path_to_cached_data]
+```
+
+**Note** that `datamodule.args.processed_graph_data_path` can also be specified at `expts/hydra_configs/`.
+
+**Note** that, every time the configs of `datamodule.args.featurization` changes, you will need to run a new data preparation, which will automatically be saved in a separate directory that uses a hash unique to the configs.
 
 ## License
 
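The README note above says the cache path can also be set directly in the hydra configs instead of on the command line. Below is a minimal sketch of what such a block might look like; the key names `processed_graph_data_path` and `dataloading_from` are taken from the config diffs later in this commit, the nesting under `datamodule.args` is inferred from the command-line override path, and the directory value is a placeholder.

```yaml
# Hypothetical excerpt of a config under expts/hydra-configs/ (a sketch, not a
# verbatim file from the repo): key names come from the diffs in this commit,
# and the `args` nesting is inferred from the `datamodule.args.*` override path.
datamodule:
  args:
    # Where graphium-prepare-data writes the featurized graphs and where
    # graphium-train reads them back from; placeholder path.
    processed_graph_data_path: "../datacache/my_dataset/"
    # Key added in this commit alongside the cache path; `ram` is the value
    # used throughout the updated configs.
    dataloading_from: ram
```

With this in place, `graphium-prepare-data` and `graphium-train` can be run without repeating the cache path on the command line.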
docs/tutorials/feature_processing/choosing_parallelization.ipynb

Lines changed: 11 additions & 11 deletions
@@ -14,7 +14,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 1,
+ "execution_count": 3,
 "id": "b5df2ac6-2ded-4597-a445-f2b5fb106330",
 "metadata": {
 "tags": []
@@ -24,8 +24,8 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
- "INFO: Pandarallel will run on 240 workers.\n",
- "INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.\n"
+ "The autoreload extension is already loaded. To reload it, use:\n",
+ " %reload_ext autoreload\n"
 ]
 }
 ],
@@ -39,9 +39,9 @@
 "import datamol as dm\n",
 "import pandas as pd\n",
 "\n",
- "from pandarallel import pandarallel\n",
+ "# from pandarallel import pandarallel\n",
 "\n",
- "pandarallel.initialize(progress_bar=True, nb_workers=joblib.cpu_count())"
+ "# pandarallel.initialize(progress_bar=True, nb_workers=joblib.cpu_count())"
 ]
 },
 {
@@ -54,7 +54,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 2,
+ "execution_count": 4,
 "id": "0f31e18d-bdd9-4d9b-8ba5-81e5887b857e",
 "metadata": {
 "tags": []
@@ -70,7 +70,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 3,
+ "execution_count": 7,
 "id": "a1197c31-7dbc-4fd7-a69a-5215e1a96b8e",
 "metadata": {
 "tags": []
@@ -109,7 +109,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 4,
+ "execution_count": 10,
 "id": "2f8ce5c3-4232-4279-8ea3-7a74832303be",
 "metadata": {
 "tags": []
@@ -129,7 +129,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 5,
+ "execution_count": 11,
 "id": "a246cdcf-b5ea-4c9e-9ccc-dd3c544587bb",
 "metadata": {
 "tags": []
@@ -138,7 +138,7 @@
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
- "model_id": "3e939cd3a24742038b804bbfd961377d",
+ "model_id": "cc396220c7144c8d8b195fb87694bbfe",
 "version_major": 2,
 "version_minor": 0
 },
@@ -489,7 +489,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
- "version": "3.8.10"
+ "version": "3.10.12"
 },
 "widgets": {
 "application/vnd.jupyter.widget-state+json": {

expts/configs/config_gps_10M_pcqm4m.yaml

Lines changed: 0 additions & 1 deletion
@@ -112,7 +112,6 @@ datamodule:
 pos_type: rw_return_probs
 ksteps: 16
 
-# cache_data_path: .
 num_workers: 0 # -1 to use all
 persistent_workers: False # if use persistent worker at the start of each epoch.
 # Using persistent_workers false might make the start of each epoch very long.

expts/configs/config_gps_10M_pcqm4m_mod.yaml

Lines changed: 0 additions & 1 deletion
@@ -81,7 +81,6 @@ datamodule:
 # Data handling-related
 batch_size_training: 64
 batch_size_inference: 16
-# cache_data_path: .
 num_workers: 0 # -1 to use all
 persistent_workers: False # if use persistent worker at the start of each epoch.
 # Using persistent_workers false might make the start of each epoch very long.

expts/configs/config_mpnn_10M_b3lyp.yaml

Lines changed: 1 addition & 1 deletion
@@ -93,6 +93,7 @@ datamodule:
 featurization_progress: True
 featurization_backend: "loky"
 processed_graph_data_path: "../datacache/b3lyp/"
+dataloading_from: ram
 featurization:
 # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence),
 # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring',
@@ -123,7 +124,6 @@ datamodule:
 pos_type: rw_return_probs
 ksteps: 16
 
-# cache_data_path: .
 num_workers: 0 # -1 to use all
 persistent_workers: False # if use persistent worker at the start of each epoch.
 # Using persistent_workers false might make the start of each epoch very long.

expts/configs/config_mpnn_pcqm4m.yaml

Lines changed: 1 addition & 2 deletions
@@ -30,8 +30,8 @@ datamodule:
 featurization_n_jobs: 20
 featurization_progress: True
 featurization_backend: "loky"
-cache_data_path: "./datacache"
 processed_graph_data_path: "graphium/data/PCQM4Mv2/"
+dataloading_from: ram
 featurization:
 # OGB: ['atomic_num', 'degree', 'possible_formal_charge', 'possible_numH' (total-valence),
 # 'possible_number_radical_e', 'possible_is_aromatic', 'possible_is_in_ring',
@@ -58,7 +58,6 @@ datamodule:
 # Data handling-related
 batch_size_training: 64
 batch_size_inference: 16
-# cache_data_path: .
 num_workers: 40 # -1 to use all
 persistent_workers: False # if use persistent worker at the start of each epoch.
 # Using persistent_workers false might make the start of each epoch very long.
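A similar cleanup repeats across the remaining config files in this commit: leftover `cache_data_path` entries are removed, and where a `processed_graph_data_path` is set, `dataloading_from` is added next to it. A before/after sketch of just the affected keys, with values copied from this file's diff:

```yaml
# Before this commit (two separate cache-related keys):
# cache_data_path: "./datacache"
# processed_graph_data_path: "graphium/data/PCQM4Mv2/"

# After this commit (the remaining cache directory key, plus the new loading mode):
processed_graph_data_path: "graphium/data/PCQM4Mv2/"
dataloading_from: ram
```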

expts/hydra-configs/architecture/toymix.yaml

Lines changed: 1 addition & 0 deletions
@@ -79,6 +79,7 @@ datamodule:
 featurization_progress: True
 featurization_backend: "loky"
 processed_graph_data_path: "../datacache/neurips2023-small/"
+dataloading_from: ram
 num_workers: 30 # -1 to use all
 persistent_workers: False
 featurization:

expts/neurips2023_configs/base_config/large.yaml

Lines changed: 0 additions & 1 deletion
@@ -168,7 +168,6 @@ datamodule:
 pos_type: rw_return_probs
 ksteps: 16
 
-# cache_data_path: .
 num_workers: 32 # -1 to use all
 persistent_workers: True # if use persistent worker at the start of each epoch.
 # Using persistent_workers false might make the start of each epoch very long.

expts/neurips2023_configs/base_config/small.yaml

Lines changed: 0 additions & 1 deletion
@@ -132,7 +132,6 @@ datamodule:
 pos_type: rw_return_probs
 ksteps: 16
 
-# cache_data_path: .
 num_workers: 30 # -1 to use all
 persistent_workers: False # if use persistent worker at the start of each epoch.
 # Using persistent_workers false might make the start of each epoch very long.

expts/neurips2023_configs/baseline/config_small_gcn_baseline.yaml

Lines changed: 0 additions & 1 deletion
@@ -131,7 +131,6 @@ datamodule:
 pos_type: rw_return_probs
 ksteps: 16
 
-# cache_data_path: .
 num_workers: 30 # -1 to use all
 persistent_workers: False # if use persistent worker at the start of each epoch.
 # Using persistent_workers false might make the start of each epoch very long.
