Code-A2Z · sravanioleti · Oct 23, 2025 · Oct 23, 2025 · Oct 23, 2025 · Oct 23, 2025
diff --git a/src/HousePricePrediction/README.md b/src/HousePricePrediction/README.md
@@ -0,0 +1,30 @@
+# House Price Prediction (Jarvis)
+
+This is a small, self-contained house price prediction example (Level 2). It includes:
+- A reproducible training & evaluation script (`housePricePrediction.py`).
+- Optional Streamlit demo (`housePricePrediction_streamlit.py`).
+- A tiny example dataset in `examples/` for quick smoke tests.
+
+**Dataset:** Use the included sample for quick tests. For full training, the maintainer should download the full **Ames Housing dataset** (link provided below).
+
+## How to run (quick)
+1. Create and activate a virtual environment (on Windows, activate with `.venv\Scripts\activate` instead):
+   ```bash
+   python -m venv .venv
+   source .venv/bin/activate
+   ```
+2. Install dependencies (run this and the following commands from `src/HousePricePrediction/`):
+   ```bash
+   pip install -r requirements.txt
+   ```
+3. Run training + evaluation:
+   ```bash
+   python housePricePrediction.py
+   ```
+4. Optional Streamlit demo:
+   ```bash
+   streamlit run housePricePrediction_streamlit.py
+   ```
+## Dataset links
+- Ames Housing dataset on Kaggle (recommended): https://www.kaggle.com/c/house-prices-advanced-regression-techniques
+- *(NOTE: Do not commit large model files (like `house_price_model_joblib.pkl`) or the full dataset.)*
diff --git a/src/HousePricePrediction/examples/sample_house_data.csv b/src/HousePricePrediction/examples/sample_house_data.csv
@@ -0,0 +1,6 @@
+LotArea,OverallQual,YearBuilt,TotalBsmtSF,GrLivArea,FullBath,BedroomAbvGr,SalePrice
+8450,7,2003,856,1710,2,3,208500
+9600,6,1976,1262,1262,2,3,181500
+11250,7,2001,920,1786,2,3,223500
+9550,7,2017,700,1050,2,3,150000
+14260,8,2000,1145,2198,2,4,250000
diff --git a/src/HousePricePrediction/housePricePrediction.py b/src/HousePricePrediction/housePricePrediction.py
@@ -0,0 +1,147 @@
+"""
+housePricePrediction.py
+
+"""
+
+import os
+import sys
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import OneHotEncoder, StandardScaler
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
+from sklearn.linear_model import LinearRegression
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
+import joblib
+
+# Default dataset: the tiny sample CSV in examples/ next to this script (commit only a very small sample).
+DATA_PATH = os.path.join(os.path.dirname(__file__), "examples", "sample_house_data.csv")
+
+
+def load_data(path=DATA_PATH):
+    """Load the CSV at ``path`` into a DataFrame; print an [ERROR] line and return None if that fails."""
+    if not os.path.exists(path):
+        print(f"[ERROR] Dataset not found at {path}. Include a small 'examples/sample_house_data.csv' (see README).")
+        return None
+    try:
+        frame = pd.read_csv(path)
+    except Exception as exc:
+        print(f"[ERROR] Could not read CSV: {exc}")
+        return None
+    if frame.empty:
+        print("[ERROR] CSV loaded but it's empty.")
+    return None if frame.empty else frame
+
+
+def infer_target_and_features(df):
+    """
+    Split ``df`` into features ``X`` and target ``y``.
+
+    The target is the 'SalePrice' column when present; otherwise the last
+    numeric column (in column order) is used as a fallback.
+
+    Returns ``(X, y)`` on success. Prints an [ERROR] message and returns
+    ``(None, None)`` when there is no numeric column to fall back on, or
+    when no feature columns remain once the target is removed.
+    """
+    df = df.copy()  # work on a copy so X and y never alias the caller's frame
+    if "SalePrice" in df.columns:
+        y_name = "SalePrice"
+    else:
+        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+        if not numeric_cols:
+            print("[ERROR] No numeric columns found in dataset to use as target.")
+            return None, None
+        y_name = numeric_cols[-1]
+
+    X = df.drop(columns=[y_name])
+    y = df[y_name]
+    if X.shape[1] == 0:
+        print("[ERROR] No feature columns available after dropping target.")
+        return None, None
+    return X, y
+
+
+def build_pipeline(numeric_cols, categorical_cols):
+    """Return an unfitted ColumnTransformer for the given numeric and categorical columns."""
+    numeric_pipeline = Pipeline([
+        ("imputer", SimpleImputer(strategy="median")),
+        ("scaler", StandardScaler()),
+    ])
+    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4;
+    # requirements.txt demands scikit-learn>=1.2, so the new keyword is always available.
+    categorical_pipeline = Pipeline([
+        ("imputer", SimpleImputer(strategy="most_frequent")),
+        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
+    ])
+    return ColumnTransformer([
+        ("num", numeric_pipeline, numeric_cols),
+        ("cat", categorical_pipeline, categorical_cols),
+    ])
+
+
+def evaluate_model(model, X_test, y_test):
+    """Score ``model`` on the test split; returns a dict with keys "MAE", "RMSE" and "R2"."""
+    preds = model.predict(X_test)
+    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6; take the root explicitly.
+    rmse = np.sqrt(mean_squared_error(y_test, preds))
+    return {"MAE": mean_absolute_error(y_test, preds), "RMSE": rmse, "R2": r2_score(y_test, preds)}
+
+
+def housePricePrediction():
+    """Main entrypoint — name matches file per contributing rules.
+
+    Trains and scores a LinearRegression baseline and a RandomForestRegressor on an
+    80/20 split, then saves the forest pipeline to 'house_price_model_joblib.pkl'
+    in the current working directory. Expected failures are printed, not raised.
+    """
+    df = load_data()
+    if df is None:
+        return
+
+    X, y = infer_target_and_features(df)
+    if X is None or y is None:
+        return
+
+    # Simple heuristic: numeric dtypes are numeric features, everything else is categorical.
+    numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
+    categorical_cols = [c for c in X.columns if c not in numeric_cols]
+
+    try:
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+    except Exception as e:
+        print(f"[ERROR] Could not split data: {e}")
+        return
+
+    # Model 1: Linear Regression (baseline). Each pipeline builds its own preprocessor:
+    # Pipeline does not clone its steps, so a shared instance would be refit by model 2.
+    try:
+        lr_pipeline = Pipeline([
+            ("pre", build_pipeline(numeric_cols, categorical_cols)),
+            ("model", LinearRegression()),
+        ])
+        lr_pipeline.fit(X_train, y_train)
+        print("LinearRegression metrics:", evaluate_model(lr_pipeline, X_test, y_test))
+    except Exception as e:
+        print(f"[WARN] LinearRegression failed: {e}")
+
+    # Model 2: Random Forest
+    try:
+        rf_pipeline = Pipeline([
+            ("pre", build_pipeline(numeric_cols, categorical_cols)),
+            ("model", RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)),
+        ])
+        rf_pipeline.fit(X_train, y_train)
+        print("RandomForest metrics:", evaluate_model(rf_pipeline, X_test, y_test))
+        # Save the model locally (do NOT commit this file). PR should NOT include this file.
+        joblib.dump(rf_pipeline, "house_price_model_joblib.pkl")
+        print("Saved model to house_price_model_joblib.pkl (do NOT commit).")
+    except Exception as e:
+        print(f"[WARN] RandomForest failed: {e}")
+
+
+if __name__ == "__main__":  # run the training flow only when executed as a script, not on import
+    housePricePrediction()
diff --git a/src/HousePricePrediction/requirements.txt b/src/HousePricePrediction/requirements.txt
@@ -0,0 +1,5 @@
+pandas>=1.5
+numpy>=1.24
+scikit-learn>=1.2
+streamlit>=1.20   # optional, if including Streamlit demo
+joblib>=1.3