test #29

6 changes: 6 additions & 0 deletions README.md
@@ -5,6 +5,12 @@ on the price of similar properties. Your company receives new data in bulk every
to be retrained with the same cadence, necessitating an end-to-end pipeline that can be reused.

In this project you will build such a pipeline.
+## W&B project links
+Report: https://wandb.ai/tlemes-western-governors-university/nyc_airbnb/reports/NYC-Airbnb-Dataset-project-Report--VmlldzoxMzU5MDg2MQ?accessToken=e6uedivrkyvwbjrr701otecnc3kyuakdywl7g9qciv0xnd80mkbefexwiubg6ji0
+Project: https://wandb.ai/tlemes-western-governors-university/nyc_airbnb?nw=nwusertlemes
+
+## GitHub
+https://github.com/ThaisLemes/Project-Build-an-ML-Pipeline-Starter

## Table of contents

33 changes: 33 additions & 0 deletions Untitled.ipynb
@@ -0,0 +1,33 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "44e8a294-4db1-429d-ae7a-15c1a8571af5",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
Empty file added conda
4 changes: 2 additions & 2 deletions config.yaml
@@ -26,8 +26,8 @@ modeling:
  # NOTE: you can put here any parameter that is accepted by the constructor of
  # RandomForestRegressor. This is a subsample, but more could be added:
  random_forest:
-    n_estimators: 100
-    max_depth: 15
+    n_estimators: 200
+    max_depth: 50
    min_samples_split: 4
    min_samples_leaf: 3
    # Here -1 means all available cores
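
How these values reach the model: in a region of main.py that is collapsed in the diff below, the starter code serializes this block to a JSON file and passes its path to the training step as `rf_config`. A minimal sketch under that assumption (`config` is the Hydra `DictConfig` received by `go()`):

```python
# Sketch: how main.py typically materializes the random_forest block as the
# rf_config JSON handed to the training step; `config` is the Hydra DictConfig.
import json
import os

rf_config = os.path.abspath("rf_config.json")
with open(rf_config, "w+") as fp:
    # OmegaConf mapping -> plain dict -> JSON file on disk
    json.dump(dict(config["modeling"]["random_forest"].items()), fp)
```

Since any `RandomForestRegressor` constructor argument is accepted here, raising `n_estimators` to 200 and `max_depth` to 50 is purely a config change; no step code needs to be touched.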
86 changes: 62 additions & 24 deletions main.py
@@ -43,30 +43,54 @@ def go(config: DictConfig):
            version='main',
            env_manager="conda",
            parameters={
-                "sample": config["etl"]["sample"],
+                "sample": config['etl']['sample'],
                "artifact_name": "sample.csv",
                "artifact_type": "raw_data",
                "artifact_description": "Raw file as downloaded"
            },
        )

if "basic_cleaning" in active_steps:
##################
# Implement here #
##################
pass

_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "basic_cleaning"),
"main",
parameters={
"input_artifact":"sample.csv:latest",
"output_artifact": "clean_sample.csv",
"output_type": "clean_sample",
"output_description":"Data with outliers and null values removed",
"min_price": config['etl']['min_price'],
"max_price": config['etl']['max_price'],
},
)
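
The `src/basic_cleaning` step itself is not part of this diff. Below is a hedged sketch of what a step taking these parameters usually looks like; the argument names mirror the `parameters` dict above, the `wandb` and `pandas` calls are standard API, and everything else is assumption rather than the PR's actual code.

```python
# Hypothetical sketch of src/basic_cleaning/run.py (not included in this diff).
import argparse

import pandas as pd
import wandb


def go(args):
    run = wandb.init(job_type="basic_cleaning")

    # Fetch the raw sample logged by the download step
    local_path = run.use_artifact(args.input_artifact).file()
    df = pd.read_csv(local_path)

    # Drop price outliers outside the configured range and fix dtypes
    df = df[df["price"].between(args.min_price, args.max_price)].copy()
    df["last_review"] = pd.to_datetime(df["last_review"])

    # Save and log the cleaned dataset under the requested artifact name
    df.to_csv(args.output_artifact, index=False)
    artifact = wandb.Artifact(
        args.output_artifact,
        type=args.output_type,
        description=args.output_description,
    )
    artifact.add_file(args.output_artifact)
    run.log_artifact(artifact)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Basic cleaning of the raw sample")
    parser.add_argument("--input_artifact", type=str, required=True)
    parser.add_argument("--output_artifact", type=str, required=True)
    parser.add_argument("--output_type", type=str, required=True)
    parser.add_argument("--output_description", type=str, required=True)
    parser.add_argument("--min_price", type=float, required=True)
    parser.add_argument("--max_price", type=float, required=True)
    go(parser.parse_args())
```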


if "data_check" in active_steps:
##################
# Implement here #
##################
pass

_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "data_check"),
"main",
parameters={
"csv": "clean_sample.csv:latest",
"ref": "clean_sample.csv:reference",
"kl_threshold": config["data_check"]["kl_threshold"],
"min_price": config['etl']['min_price'],
"max_price": config['etl']['max_price'],
},
)
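
The `src/data_check` step (also outside this diff) is conventionally a pytest suite that compares the newest `clean_sample.csv` against the version pinned with the `reference` alias. A sketch of two plausible tests, assuming a `conftest.py` provides `data`, `ref_data`, `kl_threshold`, `min_price`, and `max_price` fixtures wired to the parameters above:

```python
# Hypothetical sketch of tests in src/data_check/test_data.py; the fixture
# wiring via conftest.py is assumed, not shown in this PR.
import pandas as pd
import scipy.stats


def test_neighborhood_distribution(data: pd.DataFrame, ref_data: pd.DataFrame, kl_threshold: float):
    # Distribution drift check: KL divergence between the new sample's and the
    # reference's neighbourhood_group counts must stay under the threshold.
    dist1 = data["neighbourhood_group"].value_counts().sort_index()
    dist2 = ref_data["neighbourhood_group"].value_counts().sort_index()
    assert scipy.stats.entropy(dist1, dist2, base=2) < kl_threshold


def test_price_range(data: pd.DataFrame, min_price: float, max_price: float):
    # Every row of the cleaned data must respect the configured price boundaries
    assert data["price"].between(min_price, max_price).all()
```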


if "data_split" in active_steps:
##################
# Implement here #
##################
pass
_ = mlflow.run(
f"{config['main']['components_repository']}/train_val_test_split",
'main',
parameters = {
"input":"clean_sample.csv:latest",
"test_size": str(config['modeling']['test_size']),
"random_seed": str(config['modeling']['random_seed']),
"stratify_by": config['modeling']['stratify_by'],
},
)
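
The `train_val_test_split` component comes from the remote `components_repository`, so its source is not visible here. Its core logic amounts to an optionally stratified two-way split that produces the `trainval_data.csv` and `test_data.csv` artifacts consumed by the next two steps; an illustrative sketch, not the component's actual code:

```python
# Illustrative sketch of the split performed by the remote component.
import pandas as pd
from sklearn.model_selection import train_test_split


def split(df: pd.DataFrame, test_size: float, random_seed: int, stratify_by: str):
    # "none" disables stratification; any other value names a column to stratify on
    stratify = df[stratify_by] if stratify_by != "none" else None
    trainval, test = train_test_split(
        df, test_size=test_size, random_state=random_seed, stratify=stratify
    )
    # These filenames match the artifacts referenced by the later steps
    trainval.to_csv("trainval_data.csv", index=False)
    test.to_csv("test_data.csv", index=False)
```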


if "train_random_forest" in active_steps:

Expand All @@ -77,20 +101,34 @@ def go(config: DictConfig):

# NOTE: use the rf_config we just created as the rf_config parameter for the train_random_forest
# step
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "train_random_forest"),
"main",
parameters = {
"trainval_artifact":"trainval_data.csv:latest",
"val_size": config['modeling']['val_size'],
"random_seed": config['modeling']['random_seed'],
"stratify_by": config['modeling']['stratify_by'],
"rf_config": rf_config,
"max_tfidf_features": config['modeling']['max_tfidf_features'],
"output_artifact": "random_forest_export",
},
)

##################
# Implement here #
##################

pass
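
A hedged sketch of how `src/train_random_forest` (not shown in the diff) plausibly consumes `rf_config` and `max_tfidf_features`: the JSON written earlier is read back into a dict, the free-text `name` column is TF-IDF vectorized, and a `RandomForestRegressor` built from the config sits at the end of the pipeline. The real step may differ in its preprocessing details:

```python
# Hypothetical sketch of the inference pipeline inside src/train_random_forest.
import json

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline


def get_inference_pipeline(rf_config_path: str, max_tfidf_features: int) -> Pipeline:
    # rf_config is the JSON serialization of config.yaml's random_forest block
    with open(rf_config_path) as fp:
        rf_config = json.load(fp)

    preprocessor = ColumnTransformer(
        transformers=[
            # Vectorize the listing's free-text name; pass other columns through
            ("name_tfidf", TfidfVectorizer(max_features=max_tfidf_features), "name"),
        ],
        remainder="passthrough",
    )
    return Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("random_forest", RandomForestRegressor(**rf_config)),
        ]
    )
```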


if "test_regression_model" in active_steps:

_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "components", "test_regression_model"),
'main',
parameters = {
"mlflow_model": "random_forest_export:prod",
"test_dataset": "test_data.csv:latest",
},
)

##################
# Implement here #
##################

pass
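
`components/test_regression_model` scores the promoted model against the held-out test split. Note that `random_forest_export:prod` only resolves after someone attaches the `prod` alias to a model version in the W&B UI. A sketch of the core logic; the exact metrics logged are an assumption:

```python
# Hypothetical sketch of the test step's core logic.
import mlflow.sklearn
import pandas as pd
import wandb
from sklearn.metrics import mean_absolute_error

run = wandb.init(job_type="test_model")

# Resolve the promoted model and the held-out test split from W&B
model_path = run.use_artifact("random_forest_export:prod").download()
test_path = run.use_artifact("test_data.csv:latest").file()

df = pd.read_csv(test_path)
y_true = df.pop("price")  # target column; the remaining columns are features

pipeline = mlflow.sklearn.load_model(model_path)
y_pred = pipeline.predict(df)

run.summary["mae"] = mean_absolute_error(y_true, y_pred)
```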


if __name__ == "__main__":