udacity · liyun0016 · Jun 21, 2025 · Jun 21, 2025 · Jun 21, 2025 · Jun 21, 2025
@@ -0,0 +1,3 @@
+/config.local
+/tmp
+/cache
@@ -0,0 +1,3 @@
+# Add patterns of files dvc should ignore, which could improve
+# the performance. Learn more at
+# https://dvc.org/doc/user-guide/dvcignore
@@ -0,0 +1,42 @@
+# Lint and test the project on every push to, and pull request against, main.
+name: Python CI
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  lint-and-test:
+    runs-on: ubuntu-latest
+
+    strategy:
+      matrix:
+        # Quoted so YAML keeps the version a string; an unquoted 3.10 would be read as 3.1.
+        python-version: ["3.8"]
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+
+    # Exported through GITHUB_ENV for all later steps; the pytest step below
+    # overrides it inline with the repository root.
+    - name: Set PYTHONPATH
+      run: echo "PYTHONPATH=$GITHUB_WORKSPACE/starter" >> $GITHUB_ENV
+
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install flake8 pytest
+        pip install -r requirements.txt
+
+    # First pass fails the job on the selected error classes (E9, F63, F7, F82).
+    # Second pass runs with --exit-zero, so it only reports and never fails the job.
+    - name: Run flake8 (fail if linting fails)
+      run: |
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+      continue-on-error: false
+
+    - name: Run pytest (fail if tests fail)
+      run: PYTHONPATH=$GITHUB_WORKSPACE pytest starter/
@@ -0,0 +1 @@
+3.11.9
@@ -0,0 +1,5 @@
+{
+    "python.analysis.extraPaths": [
+        "./starter/starter"
+    ]
+}
@@ -0,0 +1,27 @@
+import requests
+
+# Replace with your actual deployed URL
+url = "https://nd0821-c3-starter-code-w71g.onrender.com/inference"
+
+# Example payload (match your Pydantic model structure and alias fields)
+payload = {
+    "age": 37,
+    "workclass": "Private",
+    "fnlgt": 284582,
+    "education": "Bachelors",
+    "education-num": 13,
+    "marital-status": "Never-married",
+    "occupation": "Exec-managerial",
+    "relationship": "Not-in-family",
+    "race": "White",
+    "sex": "Male",
+    "capital-gain": 0,
+    "capital-loss": 0,
+    "hours-per-week": 40,
+    "native-country": "United-States"
+}
+
+response = requests.post(url, json=payload)
+
+print("Status code:", response.status_code)
+print("Response JSON:", response.json())
@@ -0,0 +1,3 @@
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
@@ -0,0 +1,6 @@
+services:
+  - type: web
+    name: starter-service
+    env: python
+    buildCommand: pip install -r requirements.txt
+    # Bind to the port the platform supplies in $PORT instead of a fixed number.
+    startCommand: uvicorn starter.main:app --host=0.0.0.0 --port=$PORT
@@ -0,0 +1,14 @@
+# Build dependencies first
+setuptools>=65.0
+wheel>=0.37.0
+pip>=22.0
+
+# Your application dependencies
+numpy==1.24.3
+pandas
+scikit-learn==1.3.2
+# Imported directly by the API module, so it is declared explicitly
+# rather than relying on scikit-learn to pull it in.
+joblib
+pytest
+requests
+fastapi==0.63.0
+uvicorn
+gunicorn
@@ -0,0 +1,35 @@
+marital-status = Never-married
+  Precision: 0.8627
+  Recall:    0.4074
+  F1 Score:  0.5535
+
+marital-status = Divorced
+  Precision: 0.7381
+  Recall:    0.3100
+  F1 Score:  0.4366
+
+marital-status = Married-civ-spouse
+  Precision: 0.7033
+  Recall:    0.6841
+  F1 Score:  0.6935
+
+marital-status = Widowed
+  Precision: 1.0000
+  Recall:    0.2500
+  F1 Score:  0.4000
+
+marital-status = Separated
+  Precision: 1.0000
+  Recall:    0.2143
+  F1 Score:  0.3529
+
+marital-status = Married-spouse-absent
+  Precision: 1.0000
+  Recall:    0.5000
+  F1 Score:  0.6667
+
+marital-status = Married-AF-spouse
+  Precision: 1.0000
+  Recall:    0.0000
+  F1 Score:  0.0000
+
@@ -1 +1,2 @@
 
+/census.csv
@@ -0,0 +1,4 @@
+outs:
+- md5: 12c208530a5680c15ae19b34152286dd
+  size: 3518606
+  path: census.csv
@@ -1 +1,85 @@
 # Put the code for your API here.
+from fastapi import FastAPI
+from pydantic import BaseModel, Field
+import joblib
+import numpy as np
+import pandas as pd
+from typing import Literal
+
+from .starter.ml.data import process_data
+from .starter.ml.model import inference
+
+app = FastAPI()
+
+@app.get("/")
+def read_root():
+    return {"message": "Welcome to the Income Prediction API!"}
+
+# Define Pydantic model for request body
+class InferenceInput(BaseModel):
+    age: int
+    workclass: str
+    fnlgt: int
+    education: str
+    education_num: int = Field(..., alias="education-num")
+    marital_status: str = Field(..., alias="marital-status")
+    occupation: str
+    relationship: str
+    race: str
+    sex: str
+    capital_gain: int = Field(..., alias="capital-gain")
+    capital_loss: int = Field(..., alias="capital-loss")
+    hours_per_week: int = Field(..., alias="hours-per-week")
+    native_country: str = Field(..., alias="native-country")
+
+    class Config:
+        populate_by_name = True
+        json_schema_extra = {
+            "examples": [
+                {
+                    "age": 37,
+                    "workclass": "Self-emp-not-inc",
+                    "fnlgt": 284582,
+                    "education": "Bachelors",
+                    "education-num": 13,
+                    "marital-status": "Never-married",
+                    "occupation": "Exec-managerial",
+                    "relationship": "Not-in-family",
+                    "race": "White",
+                    "sex": "Male",
+                    "capital-gain": 0,
+                    "capital-loss": 0,
+                    "hours-per-week": 40,
+                    "native-country": "United-States"
+                }
+            ]
+        }
+
+
+# Load model and encoders
+# These run once at import time. The paths are relative, so they resolve
+# against the process's working directory rather than this file's location.
+# NOTE(review): joblib.load unpickles its input; only load trusted artifacts.
+model = joblib.load("starter/model/model.pkl")
+encoder = joblib.load("starter/model/encoder.pkl")
+lb = joblib.load("starter/model/label_binarizer.pkl")
+
+@app.post("/inference")
+def predict(input_data: InferenceInput):
+    print("RECEIVED INPUT:", input_data.dict(by_alias=True))
+    input_dict = input_data.dict(by_alias=True)
+    data_df = pd.DataFrame([input_dict])
+
+    X, _, _, _ = process_data(
+        data_df,
+        categorical_features=[
+            "workclass", "education", "marital-status", "occupation",
+            "relationship", "race", "sex", "native-country"
+        ],
+        label=None,
+        training=False,
+        encoder=encoder,
+        lb=lb
+    )
+
+    pred = inference(model, X)
+    prediction_label = lb.inverse_transform(pred)[0]
+
+    return {"prediction": prediction_label}
@@ -4,15 +4,72 @@ For additional information see the Model Card paper: https://arxiv.org/pdf/1810.
 
 ## Model Details
 
+- **Developer**: Created as part of the ND0821 ML DevOps Engineering Nanodegree program.
+- **Model Date**: June 2025
+- **Model Version**: v1.0
+- **Model Type**: RandomForestClassifier
+- **Algorithms/Parameters**:
+  - Ensemble method using decision trees
+  - Default hyperparameters except `random_state=42`
+- **License**: For educational use
+
+
 ## Intended Use
 
+- **Primary Use Case**: Predicting whether an individual's income exceeds \$50K based on demographic attributes (from census data)
+- **Primary Users**: Students, educators, or ML practitioners learning deployment and testing practices
+- **Out-of-Scope Use Cases**:
+  - Real-world income prediction in sensitive applications (e.g., hiring, credit scoring)
+  - Use in production environments without fairness audits
+
+
 ## Training Data
 
+- **Source**: UCI Adult Census Income dataset
+- **Features**: Age, workclass, education, marital-status, race, sex, and others
+- **Target**: Income bracket (<=50K or >50K)
+
+
 ## Evaluation Data
 
+- **Split**: 20% holdout from original dataset
+- **Preprocessing**:
+  - One-hot encoding for categorical features
+  - Label binarization for the target
+  - Sliced evaluation by the `marital-status` feature
+
+
 ## Metrics
-_Please include the metrics used and your model's performance on those metrics._
+
+### Global Performance
+- **Precision**: 0.7152
+- **Recall**: 0.6110
+- **F1 Score**: 0.6590
+
+### Sliced Performance by `marital-status`
+
+| Marital Status           | Precision | Recall | F1 Score |
+|--------------------------|-----------|--------|----------|
+| Married-civ-spouse       | 0.7149    | 0.6475 | 0.6795   |
+| Never-married            | 0.7692    | 0.4255 | 0.5479   |
+| Married-spouse-absent    | 1.0000    | 0.1429 | 0.2500   |
+| Divorced                 | 0.6531    | 0.3902 | 0.4885   |
+| Separated                | 1.0000    | 0.3636 | 0.5333   |
+| Widowed                  | 0.5000    | 0.1429 | 0.2222   |
+| Married-AF-spouse        | 1.0000    | 0.0000 | 0.0000   |
+
 
 ## Ethical Considerations
 
+- Disparities in F1 scores across demographic slices may reflect model bias.
+- The model was not audited for fairness, bias, or societal impacts.
+- Use in decision-making without fairness evaluation could lead to discriminatory outcomes.
+
+
+
 ## Caveats and Recommendations
+
+- Performance varies significantly by subgroup — further fairness testing is advised.
+- Consider collecting more balanced training data for underrepresented categories.
+- **Do not deploy** this model without bias analysis, stakeholder review, and fairness auditing.
+
@@ -54,7 +54,7 @@ def process_data(
     X_continuous = X.drop(*[categorical_features], axis=1)
 
     if training is True:
-        encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")
+        encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
         lb = LabelBinarizer()
         X_categorical = encoder.fit_transform(X_categorical)
         y = lb.fit_transform(y.values).ravel()