6 changes: 6 additions & 0 deletions .gitignore
@@ -145,3 +145,9 @@ mlruns/
outputs/
random_forest_dir/
multirun/

mlruns/
wandb/
outputs/
**/.hydra/

36 changes: 36 additions & 0 deletions Makefile
@@ -0,0 +1,36 @@
SHELL := /bin/bash
PORT ?= 5000
WANDB_ENTITY ?= yannicknkongolo7-wgu
WANDB_PROJECT ?= nyc_airbnb

.PHONY: train
train:
# Train using the artifact from your nyc_airbnb project
conda run -n components \
env WANDB_ENTITY=$(WANDB_ENTITY) WANDB_PROJECT=$(WANDB_PROJECT) \
mlflow run -e main src/train_random_forest \
-P trainval_artifact='yannicknkongolo7-wgu/nyc_airbnb/clean_sample.csv:latest' \
-P val_size=0.2 \
-P rf_config='rf_config.json' \
-P max_tfidf_features=500 \
-P output_artifact='rf_model' \
-P random_seed=42 \
-P stratify_by='neighbourhood_group' \
--env-manager=local
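
The `train` target passes `-P rf_config='rf_config.json'`, but that file does not appear in this diff. A plausible shape for it, mirroring the `random_forest` block in `config.yaml` (the exact contents are an assumption):

```json
{
  "n_estimators": 100,
  "max_depth": 15,
  "min_samples_split": 4,
  "min_samples_leaf": 3,
  "n_jobs": -1,
  "criterion": "squared_error",
  "max_features": 0.5,
  "oob_score": true
}
```

Any key accepted by `sklearn.ensemble.RandomForestRegressor`'s constructor can be added here.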

.PHONY: serve
serve:
conda run -n components mlflow models serve \
-m src/train_random_forest/random_forest_dir \
--env-manager=local -p $(PORT)

.PHONY: stop
stop:
- kill $$(lsof -ti :$(PORT)) || true

.PHONY: predict_http
predict_http:
conda run -n components python scripts/predict_http.py \
--csv src/basic_cleaning/clean_sample.csv \
--n_rows 3 \
--url http://127.0.0.1:$(PORT)/invocations
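
The `predict_http` target calls `scripts/predict_http.py`, which is not shown in this diff. A minimal sketch of such a script: the `dataframe_split` payload shape is what MLflow's `/invocations` endpoint expects, while the CSV handling here is an illustrative assumption, not the project's actual code.

```python
import argparse
import csv
import json
import urllib.request


def build_payload(rows, columns):
    """Build the MLflow 'dataframe_split' JSON payload from a list of row dicts."""
    return {
        "dataframe_split": {
            "columns": columns,
            "data": [[row.get(c) for c in columns] for row in rows],
        }
    }


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--csv", required=True)
    parser.add_argument("--n_rows", type=int, default=3)
    parser.add_argument("--url", default="http://127.0.0.1:5000/invocations")
    args = parser.parse_args()

    # Read the first n_rows rows of the CSV as dicts
    with open(args.csv, newline="") as f:
        reader = csv.DictReader(f)
        rows = [row for _, row in zip(range(args.n_rows), reader)]

    payload = build_payload(rows, reader.fieldnames)
    req = urllib.request.Request(
        args.url,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        print(resp.read().decode())


if __name__ == "__main__":
    main()
```

Note that all values arrive as strings when read via `csv`; a served model with a strict signature may require casting numeric columns first.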
87 changes: 70 additions & 17 deletions README.md
@@ -1,10 +1,7 @@
# Build an ML Pipeline for Short-Term Rental Prices in NYC
You work for a property management company renting rooms and properties for short periods of time on various platforms. You need to estimate the typical price for a given property based on similar listings. New data arrives weekly, so the model must be retrained on the same cadence, requiring a reusable, end-to-end pipeline.

This repo implements that pipeline with MLflow, Hydra, scikit-learn, and Weights & Biases (W&B). It also includes local and HTTP prediction helpers and convenient make targets.

## Table of contents

@@ -16,8 +13,9 @@
* [Running the entire pipeline or just a selection of steps](#Running-the-entire-pipeline-or-just-a-selection-of-steps)
* [Pre-existing components](#pre-existing-components)

* [Preliminary steps](#preliminary-steps)

## Preliminary steps
### Supported Operating Systems

This project is compatible with the following operating systems:
@@ -122,18 +120,28 @@ re-usable components. While you have a copy in your fork, you will be using them
repository by accessing them through their GitHub link, like:

```python
# Example snippet
import mlflow

config = {
    "main": {
        "components_repository": "https://github.com/udacity/Project-Build-an-ML-Pipeline-Starter/tree/main/components"
    },
    "etl": {"sample": 1.0},
}

_ = mlflow.run(
    f"{config['main']['components_repository']}/get_data",
    "main",
    version="main",
    env_manager="conda",
    parameters={
        "sample": config["etl"]["sample"],
        "artifact_name": "sample.csv",
        "artifact_type": "raw_data",
        "artifact_description": "Raw file as downloaded",
    },
)
```
where `config['main']['components_repository']` is set to
[https://github.com/udacity/Project-Build-an-ML-Pipeline-Starter/tree/main/components](https://github.com/udacity/Project-Build-an-ML-Pipeline-Starter/tree/main/components).
@@ -172,6 +180,51 @@ If you see any error while running the command:
```
> mlflow run .
```
## Reproducibility Proof — Screenshots & Links

This section shows exactly how to reproduce the results and includes screenshots captured from my local run.

### Environment

![Environment versions](images/01_env_versions.png)

### Repo

![Repo layout](images/02_repo_layout.png)

### Data sanity (first rows + columns)

![Data head](images/03_data_head.png)

### Train the model (with W&B)


![Training console](images/04_train_console.png)

![Training console (continued)](images/04_train_console_02.png)

![W&B run](images/05_wandb_run.png)

### Confirm the exported model exists

![Model directory](images/06_model_dir.png)

### Local prediction (no server)

![Local prediction](images/07_predict_local.png)
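
The no-server prediction above can be reproduced with `mlflow.pyfunc`. A minimal sketch, assuming the model directory from the `serve` target and the cleaned CSV from the `predict_http` target; the `first_rows` helper is illustrative, not the project's actual code.

```python
import csv

MODEL_DIR = "src/train_random_forest/random_forest_dir"
CSV_PATH = "src/basic_cleaning/clean_sample.csv"


def first_rows(path, n=3):
    """Read the first n rows of a CSV as a list of dicts."""
    with open(path, newline="") as f:
        reader = csv.DictReader(f)
        return [row for _, row in zip(range(n), reader)]


if __name__ == "__main__":
    # Heavy imports kept here so the helper above stays stdlib-only
    import mlflow.pyfunc
    import pandas as pd

    model = mlflow.pyfunc.load_model(MODEL_DIR)
    df = pd.DataFrame(first_rows(CSV_PATH))
    print(model.predict(df))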

### Serve the model (Terminal 1)

![Server listening](images/08_server_listening.png)

### HTTP health & prediction (Terminal 2)

![Ping returns 200](images/09_ping_200.png)
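
The health check can be scripted as well: MLflow's scoring server exposes a `/ping` endpoint that returns HTTP 200 once the model is loaded. A small stdlib sketch, assuming the default port from the Makefile:

```python
import urllib.error
import urllib.request


def is_healthy(base_url="http://127.0.0.1:5000"):
    """Return True if the MLflow scoring server answers /ping with HTTP 200."""
    try:
        with urllib.request.urlopen(f"{base_url}/ping", timeout=2) as resp:
            return resp.status == 200
    except (urllib.error.URLError, OSError):
        return False


if __name__ == "__main__":
    print("healthy" if is_healthy() else "not reachable")
```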

### Stop the server

![Stop the server](images/10_predict_http.png)


Please make sure all steps use **the same** Python version and that you have **conda** installed. Additionally, the *mlflow* and *wandb* packages are crucial and must be the same version across steps.

37 changes: 20 additions & 17 deletions config.yaml
@@ -1,38 +1,41 @@
main:
  entity: "yannicknkongolo7-wgu"
  project_name: nyc_airbnb
  experiment_name: development
  steps: all
  components_repository: "https://github.com/udacity/Project-Build-an-ML-Pipeline-Starter.git#components"

etl:
  sample: "sample1.csv"
  min_price: 10   # dollars
  max_price: 350  # dollars

data_check:
  kl_threshold: 0.2
  # Added for the pytest-based checks
  min_rows: 10
  max_rows: 500000

modeling:
  # Fraction of data to use for test (the remaining will be used for train+val)
  test_size: 0.2
  # Fraction of remaining data to use for validation
  val_size: 0.2
  # Fix this for reproducibility
  random_seed: 42
  # Column for stratification (use "none" for no stratification)
  stratify_by: "neighbourhood_group"
  # Keep this aligned with your training runs
  max_tfidf_features: 500

  # RandomForest hyperparameters (extend as needed)
  random_forest:
    n_estimators: 100
    max_depth: 15
    min_samples_split: 4
    min_samples_leaf: 3
    n_jobs: -1  # -1 means all cores
    criterion: squared_error
    max_features: 0.5
    oob_score: true
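
The `test_size` / `val_size` / `stratify_by` settings above imply a two-stage split: first carve out the test set, then split the remainder into train and validation. A sketch of how these values typically feed into scikit-learn; the `two_stage_split` function name is illustrative, not this project's actual code.

```python
import pandas as pd
from sklearn.model_selection import train_test_split


def two_stage_split(df, test_size, val_size, random_seed, stratify_by="none"):
    """Split df into train/val/test as described in the modeling config."""
    strat = df[stratify_by] if stratify_by != "none" else None
    trainval, test = train_test_split(
        df, test_size=test_size, random_state=random_seed, stratify=strat
    )
    strat = trainval[stratify_by] if stratify_by != "none" else None
    train, val = train_test_split(
        trainval, test_size=val_size, random_state=random_seed, stratify=strat
    )
    return train, val, test


if __name__ == "__main__":
    df = pd.DataFrame({"price": range(100), "neighbourhood_group": ["A", "B"] * 50})
    train, val, test = two_stage_split(df, 0.2, 0.2, 42, "neighbourhood_group")
    print(len(train), len(val), len(test))
```

With 100 rows this yields a 64/16/20 split: 20% held out for test, then 20% of the remaining 80 rows for validation.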
Binary file added images/01_env_versions.png
Binary file added images/02_repo_layout.png
Binary file added images/03_data_head.png
Binary file added images/04_train_console.png
Binary file added images/04_train_console_02.png
Binary file added images/04_train_console_04.png
Binary file added images/04_train_console_2.png
Binary file added images/05_wandb_run.png
Binary file added images/06_model_dir.png
Binary file added images/07_predict_local.png
Binary file added images/08_server_listening.png
Binary file added images/09_ping_200.png
Binary file added images/10_predict_http.png
Empty file added logs/train_console.txt