Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions src/HousePricePrediction/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# House Price Prediction (Jarvis)

This is a small, self-contained house price prediction example (Level 2). It includes:
- A reproducible training & evaluation script (`housePricePrediction.py`).
- Optional Streamlit demo (`housePricePrediction_streamlit.py`).
- A tiny example dataset in `examples/` for quick smoke tests.

**Dataset:** Use the included sample for quick smoke tests. For full training, download the full **Ames Housing dataset** (see "Dataset links" below).

## How to run (quick)
1. Create and activate a virtual environment:
```bash
python -m venv .venv
source .venv/bin/activate
```
2. Install dependencies:
```bash
pip install -r requirements.txt
```
3. Run training + evaluation:
```bash
python housePricePrediction.py
```
4. Optional Streamlit demo:
```bash
streamlit run housePricePrediction_streamlit.py
```
## Dataset links
- Ames Housing dataset on Kaggle (recommended): https://www.kaggle.com/c/house-prices-advanced-regression-techniques
*(NOTE: Do not commit large model files (like `house_price_model_joblib.pkl`) or the full dataset.)*
6 changes: 6 additions & 0 deletions src/HousePricePrediction/examples/sample_house_data.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
LotArea,OverallQual,YearBuilt,TotalBsmtSF,GrLivArea,FullBath,BedroomAbvGr,SalePrice
8450,7,2003,856,1710,2,3,208500
9600,6,1976,1262,1262,2,3,181500
11250,7,2001,920,1786,2,3,223500
9550,7,2017,700,1050,2,3,150000
14260,8,2000,1145,2198,2,4,250000
10084,8,2004,1686,1694,2,3,307000
147 changes: 147 additions & 0 deletions src/HousePricePrediction/housePricePrediction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
"""
housePricePrediction.py

"""

import os
import sys

Check failure on line 7 in src/HousePricePrediction/housePricePrediction.py

View workflow job for this annotation

GitHub Actions / lint-format

Ruff (F401)

src/HousePricePrediction/housePricePrediction.py:7:8: F401 `sys` imported but unused
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

Check failure on line 18 in src/HousePricePrediction/housePricePrediction.py

View workflow job for this annotation

GitHub Actions / lint-format

Ruff (I001)

src/HousePricePrediction/housePricePrediction.py:6:1: I001 Import block is un-sorted or un-formatted

# Default dataset location: the tiny sample CSV in the "examples" folder next
# to this script. The path is built from this file's own directory, so it does
# not depend on the current working directory.
DATA_PATH = os.path.join(os.path.dirname(__file__), "examples", "sample_house_data.csv")


def load_data(path=DATA_PATH):
    """Read the CSV at ``path`` into a DataFrame.

    Returns the loaded DataFrame, or ``None`` after printing an ``[ERROR]``
    line when the file does not exist, cannot be read, or contains no data.
    """
    if os.path.exists(path):
        try:
            frame = pd.read_csv(path)
        except Exception as exc:
            # Any read/parse failure is reported and turned into a None result.
            print(f"[ERROR] Could not read CSV: {exc}")
            return None
        if frame.empty:
            print("[ERROR] CSV loaded but it's empty.")
            return None
        return frame

    print(f"[ERROR] Dataset not found at {path}. Include a small 'examples/sample_house_data.csv' (see README).")
    return None


def infer_target_and_features(df):
    """
    Split a DataFrame into features (X) and target (y) defensively.

    Target selection:
    - If a 'SalePrice' column exists, it is the target.
    - Otherwise the last numeric column is used as the target.

    Rows whose target value is missing are dropped, because a regressor
    cannot be fitted on NaN targets. The input frame is never modified.

    Returns:
        ``(X, y)`` on success, or ``(None, None)`` after printing an
        ``[ERROR]`` line when there is no numeric target candidate, no row
        with a usable target value, or no feature column left.
    """
    if "SalePrice" in df.columns:
        y_name = "SalePrice"
    else:
        # Fallback: pick the last numeric column as the target.
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        if not numeric_cols:
            print("[ERROR] No numeric columns found in dataset to use as target.")
            return None, None
        y_name = numeric_cols[-1]

    # Boolean indexing returns a new frame, so the caller's data stays intact
    # without an up-front copy of the whole DataFrame.
    labelled = df[df[y_name].notna()]
    if labelled.empty:
        print(f"[ERROR] Target column {y_name} has no non-missing values.")
        return None, None

    X = labelled.drop(columns=[y_name])
    y = labelled[y_name]
    if X.shape[1] == 0:
        print("[ERROR] No feature columns available after dropping target.")
        return None, None
    return X, y


def build_pipeline(numeric_cols, categorical_cols):
    """Build the preprocessing ColumnTransformer for the given column groups.

    Numeric columns are median-imputed and standardised; categorical columns
    are imputed with the most frequent value and one-hot encoded (dense
    output, unknown categories ignored at transform time).

    Args:
        numeric_cols: names of the numeric feature columns.
        categorical_cols: names of the categorical feature columns.

    Returns:
        An unfitted ``ColumnTransformer``.
    """
    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])

    # The dense-output keyword was renamed from ``sparse`` to ``sparse_output``
    # in scikit-learn 1.2 and the old name was later removed. Try the current
    # keyword first and fall back only for older installations.
    try:
        encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", encoder),
    ])

    preprocessor = ColumnTransformer([
        ("num", numeric_pipeline, numeric_cols),
        ("cat", categorical_pipeline, categorical_cols),
    ])
    return preprocessor


def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Args:
        model: fitted estimator or pipeline exposing ``predict``.
        X_test: feature rows to predict on.
        y_test: true target values for ``X_test``.

    Returns:
        Dict with keys ``"MAE"``, ``"RMSE"`` and ``"R2"``.
    """
    preds = model.predict(X_test)
    mae = mean_absolute_error(y_test, preds)
    # RMSE is derived from plain MSE instead of passing ``squared=False``:
    # that keyword is deprecated and rejected by newer scikit-learn releases,
    # while sqrt(MSE) works on every version.
    rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
    r2 = r2_score(y_test, preds)
    return {"MAE": mae, "RMSE": rmse, "R2": r2}


def housePricePrediction():
    """Main entrypoint: train and evaluate two regressors on the sample data.

    Steps:
    1. Load the dataset and split it into features and target.
    2. Hold out 20% of the rows for evaluation (fixed random seed).
    3. Fit a LinearRegression baseline and a RandomForestRegressor, each in
       its own preprocessing pipeline, and print MAE / RMSE / R2 for both.
    4. Save the fitted random-forest pipeline to
       ``house_price_model_joblib.pkl`` in the current working directory.

    Every failure is reported with a printed message; the function returns
    ``None`` in all cases. The function name matches the file name per the
    contributing rules.
    """
    df = load_data()
    if df is None:
        return

    X, y = infer_target_and_features(df)
    if X is None or y is None:
        return

    # Simple heuristic: numeric dtypes are numeric features, the rest are
    # treated as categorical.
    numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = [c for c in X.columns if c not in numeric_cols]

    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
    except Exception as e:
        print(f"[ERROR] Could not split data: {e}")
        return

    # Each pipeline gets its OWN preprocessor. Pipeline fits its steps in
    # place, so sharing one ColumnTransformer would let the second fit
    # silently re-fit the transformer inside the first pipeline.

    # Model 1: Linear Regression (baseline)
    try:
        lr_pipeline = Pipeline([
            ("pre", build_pipeline(numeric_cols, categorical_cols)),
            ("model", LinearRegression()),
        ])
        lr_pipeline.fit(X_train, y_train)
        lr_metrics = evaluate_model(lr_pipeline, X_test, y_test)
        print("LinearRegression metrics:", lr_metrics)
    except Exception as e:
        # Best effort: a failing baseline must not prevent the second model.
        print(f"[WARN] LinearRegression failed: {e}")

    # Model 2: Random Forest
    try:
        rf_pipeline = Pipeline([
            ("pre", build_pipeline(numeric_cols, categorical_cols)),
            ("model", RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)),
        ])
        rf_pipeline.fit(X_train, y_train)
        rf_metrics = evaluate_model(rf_pipeline, X_test, y_test)
        print("RandomForest metrics:", rf_metrics)
        # Save the model locally (do NOT commit this file). PR should NOT include this file.
        joblib.dump(rf_pipeline, "house_price_model_joblib.pkl")
        print("Saved model to house_price_model_joblib.pkl (do NOT commit).")
    except Exception as e:
        print(f"[WARN] RandomForest failed: {e}")


# Run the train/evaluate workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    housePricePrediction()
5 changes: 5 additions & 0 deletions src/HousePricePrediction/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pandas>=1.5
numpy>=1.24
scikit-learn>=1.2
streamlit>=1.20 # optional, if including Streamlit demo
joblib>=1.3
Loading