test #29

6 changes: 6 additions & 0 deletions README.md
@@ -5,6 +5,12 @@ on the price of similar properties. Your company receives new data in bulk every
to be retrained with the same cadence, necessitating an end-to-end pipeline that can be reused.

In this project you will build such a pipeline.
+## W&B project links
+Report: https://wandb.ai/tlemes-western-governors-university/nyc_airbnb/reports/NYC-Airbnb-Dataset-project-Report--VmlldzoxMzU5MDg2MQ?accessToken=e6uedivrkyvwbjrr701otecnc3kyuakdywl7g9qciv0xnd80mkbefexwiubg6ji0
+Project: https://wandb.ai/tlemes-western-governors-university/nyc_airbnb?nw=nwusertlemes
+
+## GitHub
+https://github.com/ThaisLemes/Project-Build-an-ML-Pipeline-Starter

## Table of contents

33 changes: 33 additions & 0 deletions Untitled.ipynb
@@ -0,0 +1,33 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "44e8a294-4db1-429d-ae7a-15c1a8571af5",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
Empty file added conda
4 changes: 2 additions & 2 deletions config.yaml
@@ -26,8 +26,8 @@ modeling:
  # NOTE: you can put here any parameter that is accepted by the constructor of
  # RandomForestRegressor. This is a subsample, but more could be added:
  random_forest:
-    n_estimators: 100
-    max_depth: 15
+    n_estimators: 200
+    max_depth: 50
    min_samples_split: 4
    min_samples_leaf: 3
    # Here -1 means all available cores
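
How these values reach the model: in a region of main.py that is collapsed in the diff below, the starter code serializes this block to a JSON file and passes its path to the training step as `rf_config`. A minimal sketch under that assumption (`config` is the Hydra `DictConfig` received by `go()`):

```python
# Sketch: how main.py typically materializes the random_forest block as the
# rf_config JSON handed to the training step; `config` is the Hydra DictConfig.
import json
import os

rf_config = os.path.abspath("rf_config.json")
with open(rf_config, "w+") as fp:
    # OmegaConf mapping -> plain dict -> JSON file on disk
    json.dump(dict(config["modeling"]["random_forest"].items()), fp)
```

Since any `RandomForestRegressor` constructor argument is accepted here, raising `n_estimators` to 200 and `max_depth` to 50 is purely a config change; no step code needs to be touched.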
86 changes: 62 additions & 24 deletions main.py
@@ -43,30 +43,54 @@ def go(config: DictConfig):
            version='main',
            env_manager="conda",
            parameters={
-                "sample": config["etl"]["sample"],
+                "sample": config['etl']['sample'],
                "artifact_name": "sample.csv",
                "artifact_type": "raw_data",
                "artifact_description": "Raw file as downloaded"
            },
        )

if "basic_cleaning" in active_steps:
##################
# Implement here #
##################
pass

_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "basic_cleaning"),
"main",
parameters={
"input_artifact":"sample.csv:latest",
"output_artifact": "clean_sample.csv",
"output_type": "clean_sample",
"output_description":"Data with outliers and null values removed",
"min_price": config['etl']['min_price'],
"max_price": config['etl']['max_price'],
},
)
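
The `src/basic_cleaning` step itself is not part of this diff. Below is a hedged sketch of what a step taking these parameters usually looks like; the argument names mirror the `parameters` dict above, the `wandb` and `pandas` calls are standard API, and everything else is assumption rather than the PR's actual code.

```python
# Hypothetical sketch of src/basic_cleaning/run.py (not included in this diff).
import argparse

import pandas as pd
import wandb


def go(args):
    run = wandb.init(job_type="basic_cleaning")

    # Fetch the raw sample logged by the download step
    local_path = run.use_artifact(args.input_artifact).file()
    df = pd.read_csv(local_path)

    # Drop price outliers outside the configured range and fix dtypes
    df = df[df["price"].between(args.min_price, args.max_price)].copy()
    df["last_review"] = pd.to_datetime(df["last_review"])

    # Save and log the cleaned dataset under the requested artifact name
    df.to_csv(args.output_artifact, index=False)
    artifact = wandb.Artifact(
        args.output_artifact,
        type=args.output_type,
        description=args.output_description,
    )
    artifact.add_file(args.output_artifact)
    run.log_artifact(artifact)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Basic cleaning of the raw sample")
    parser.add_argument("--input_artifact", type=str, required=True)
    parser.add_argument("--output_artifact", type=str, required=True)
    parser.add_argument("--output_type", type=str, required=True)
    parser.add_argument("--output_description", type=str, required=True)
    parser.add_argument("--min_price", type=float, required=True)
    parser.add_argument("--max_price", type=float, required=True)
    go(parser.parse_args())
```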


if "data_check" in active_steps:
##################
# Implement here #
##################
pass

_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "data_check"),
"main",
parameters={
"csv": "clean_sample.csv:latest",
"ref": "clean_sample.csv:reference",
"kl_threshold": config["data_check"]["kl_threshold"],
"min_price": config['etl']['min_price'],
"max_price": config['etl']['max_price'],
},
)
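
The `src/data_check` step (also outside this diff) is conventionally a pytest suite that compares the newest `clean_sample.csv` against the version pinned with the `reference` alias. A sketch of two plausible tests, assuming a `conftest.py` provides `data`, `ref_data`, `kl_threshold`, `min_price`, and `max_price` fixtures wired to the parameters above:

```python
# Hypothetical sketch of tests in src/data_check/test_data.py; the fixture
# wiring via conftest.py is assumed, not shown in this PR.
import pandas as pd
import scipy.stats


def test_neighborhood_distribution(data: pd.DataFrame, ref_data: pd.DataFrame, kl_threshold: float):
    # Distribution drift check: KL divergence between the new sample's and the
    # reference's neighbourhood_group counts must stay under the threshold.
    dist1 = data["neighbourhood_group"].value_counts().sort_index()
    dist2 = ref_data["neighbourhood_group"].value_counts().sort_index()
    assert scipy.stats.entropy(dist1, dist2, base=2) < kl_threshold


def test_price_range(data: pd.DataFrame, min_price: float, max_price: float):
    # Every row of the cleaned data must respect the configured price boundaries
    assert data["price"].between(min_price, max_price).all()
```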


if "data_split" in active_steps:
##################
# Implement here #
##################
pass
_ = mlflow.run(
f"{config['main']['components_repository']}/train_val_test_split",
'main',
parameters = {
"input":"clean_sample.csv:latest",
"test_size": str(config['modeling']['test_size']),
"random_seed": str(config['modeling']['random_seed']),
"stratify_by": config['modeling']['stratify_by'],
},
)
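
The `train_val_test_split` component comes from the remote `components_repository`, so its source is not visible here. Its core logic amounts to an optionally stratified two-way split that produces the `trainval_data.csv` and `test_data.csv` artifacts consumed by the next two steps; an illustrative sketch, not the component's actual code:

```python
# Illustrative sketch of the split performed by the remote component.
import pandas as pd
from sklearn.model_selection import train_test_split


def split(df: pd.DataFrame, test_size: float, random_seed: int, stratify_by: str):
    # "none" disables stratification; any other value names a column to stratify on
    stratify = df[stratify_by] if stratify_by != "none" else None
    trainval, test = train_test_split(
        df, test_size=test_size, random_state=random_seed, stratify=stratify
    )
    # These filenames match the artifacts referenced by the later steps
    trainval.to_csv("trainval_data.csv", index=False)
    test.to_csv("test_data.csv", index=False)
```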


if "train_random_forest" in active_steps:

Expand All @@ -77,20 +101,34 @@ def go(config: DictConfig):

# NOTE: use the rf_config we just created as the rf_config parameter for the train_random_forest
# step
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "train_random_forest"),
"main",
parameters = {
"trainval_artifact":"trainval_data.csv:latest",
"val_size": config['modeling']['val_size'],
"random_seed": config['modeling']['random_seed'],
"stratify_by": config['modeling']['stratify_by'],
"rf_config": rf_config,
"max_tfidf_features": config['modeling']['max_tfidf_features'],
"output_artifact": "random_forest_export",
},
)

##################
# Implement here #
##################

pass
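
A hedged sketch of how `src/train_random_forest` (not shown in the diff) plausibly consumes `rf_config` and `max_tfidf_features`: the JSON written earlier is read back into a dict, the free-text `name` column is TF-IDF vectorized, and a `RandomForestRegressor` built from the config sits at the end of the pipeline. The real step may differ in its preprocessing details:

```python
# Hypothetical sketch of the inference pipeline inside src/train_random_forest.
import json

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline


def get_inference_pipeline(rf_config_path: str, max_tfidf_features: int) -> Pipeline:
    # rf_config is the JSON serialization of config.yaml's random_forest block
    with open(rf_config_path) as fp:
        rf_config = json.load(fp)

    preprocessor = ColumnTransformer(
        transformers=[
            # Vectorize the listing's free-text name; pass other columns through
            ("name_tfidf", TfidfVectorizer(max_features=max_tfidf_features), "name"),
        ],
        remainder="passthrough",
    )
    return Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("random_forest", RandomForestRegressor(**rf_config)),
        ]
    )
```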


if "test_regression_model" in active_steps:

_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "components", "test_regression_model"),
'main',
parameters = {
"mlflow_model": "random_forest_export:prod",
"test_dataset": "test_data.csv:latest",
},
)

##################
# Implement here #
##################

pass
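
`components/test_regression_model` scores the promoted model against the held-out test split. Note that `random_forest_export:prod` only resolves after someone attaches the `prod` alias to a model version in the W&B UI. A sketch of the core logic; the exact metrics logged are an assumption:

```python
# Hypothetical sketch of the test step's core logic.
import mlflow.sklearn
import pandas as pd
import wandb
from sklearn.metrics import mean_absolute_error

run = wandb.init(job_type="test_model")

# Resolve the promoted model and the held-out test split from W&B
model_path = run.use_artifact("random_forest_export:prod").download()
test_path = run.use_artifact("test_data.csv:latest").file()

df = pd.read_csv(test_path)
y_true = df.pop("price")  # target column; the remaining columns are features

pipeline = mlflow.sklearn.load_model(model_path)
y_pred = pipeline.predict(df)

run.summary["mae"] = mean_absolute_error(y_true, y_pred)
```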


if __name__ == "__main__":