9 changes: 9 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,9 @@
+repos:
+  - repo: https://github.com/kynan/nbstripout
+    rev: 0.8.1
+    hooks:
+      - id: nbstripout
+  - repo: https://github.com/psf/black
+    rev: 25.1.0
+    hooks:
+      - id: black-jupyter
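For context, this is the kind of rewrite the new `black` hook enforces on every commit. A minimal sketch using black's Python API (the sample input string is invented for illustration; `black.format_str` and `black.Mode` are the documented library entry points):

```python
import black

# A line in the pre-PR style: single quotes, everything on one line.
src = "colors = ListedColormap([f'C{i}' for i in range(4)])\n"

# Default Mode() matches what the black-jupyter hook applies to notebook cells.
formatted = black.format_str(src, mode=black.Mode())
print(formatted)  # colors = ListedColormap([f"C{i}" for i in range(4)])
```

The `nbstripout` hook is complementary: it removes notebook outputs before they reach the repository, keeping diffs like the ones below readable.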
9 changes: 7 additions & 2 deletions README.md
@@ -1,4 +1,4 @@
-# A machine learning lecture <a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc/4.0/88x31.png" /></a> [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/tudo-astroparticlephysics/machine-learning-lecture/main)
+# A machine learning lecture <a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc/4.0/88x31.png" /></a> [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/tudo-astroparticlephysics/machine-learning-lecture/main)
 
 This collection of notebooks was started for a lecture on machine learning at the Universitat Autònoma de Barcelona.
 It has since grown into a large part of the statistical methods lecture (SMD) at the Physics department at TU Dortmund University.
@@ -29,14 +29,14 @@ The lecture material (e.g. jupyter notebooks) are shared under the Creative Comm
 
 
 ## Install `conda`
 
 To make sure all needed packages are installed in an environment for these lectures, we use
 `conda`.
 
 Download and install [Anaconda](https://www.anaconda.com/products/individual#Downloads) for a large collection of packages or [Miniconda](https://docs.conda.io/en/latest/miniconda.html) for a minimal starting point.
 
 ## Set up the environment
 
 
 After installing conda, run
 
 ```
@@ -52,6 +52,11 @@ $ conda activate ml
 ```
 every time before you start working on these lectures.
 
+Before committing any changes, make sure the pre-commit hooks are installed:
+```
+$ pre-commit install
+```
+
 From time to time, we will update the `environment.yml` with new versions or
 additional packages. To update your environment afterwards, run:
 ```
37 changes: 25 additions & 12 deletions ani_kmeans.py
@@ -9,16 +9,19 @@
 
 k = 4
 n_iters = 25
-discrete_cmap = ListedColormap([f'C{i}' for i in range(k)])
+discrete_cmap = ListedColormap([f"C{i}" for i in range(k)])
 fps = 25
 interval = 1000 / fps
 time_per_iter = 1
 frames = n_iters * time_per_iter * fps
 
 # choose initial cluster centers
 X, y = make_blobs(
-    n_samples=500, centers=k, center_box=(-2, 2),
-    cluster_std=0.5, random_state=1,
+    n_samples=500,
+    centers=k,
+    center_box=(-2, 2),
+    cluster_std=0.5,
+    random_state=1,
 )
 
 fig = plt.figure(figsize=(12.8, 7.2), dpi=100)
@@ -34,25 +37,26 @@
 
 
 center_lines = [ax.plot([], [])[0] for _ in range(k)]
-points = ax.scatter(X[:, 0], X[:, 1], c='k', cmap=discrete_cmap, alpha=0.5)
+points = ax.scatter(X[:, 0], X[:, 1], c="k", cmap=discrete_cmap, alpha=0.5)
 center_plot = ax.scatter(
     init_centers[:, 0],
     init_centers[:, 1],
     c=np.arange(k),
     cmap=discrete_cmap,
-    marker='h',
-    edgecolor='k',
+    marker="h",
+    edgecolor="k",
     s=400,
-    label='cluster center',
+    label="cluster center",
 )
 
-ax.legend(loc='upper right')
+ax.legend(loc="upper right")
 
+
 def init():
-    t = ax.set_title('iteration 0')
+    t = ax.set_title("iteration 0")
     return *center_lines, points, t
 
 
 def update(frame, bar=None):
     if bar is not None:
         bar.update(1)
@@ -68,15 +72,24 @@ def update(frame, bar=None):
         center_history[i] = init_centers
 
     for j, line in enumerate(center_lines):
-        line.set_data(center_history[:i + 1, j, 0], center_history[:i + 1, j, 1])
+        line.set_data(center_history[: i + 1, j, 0], center_history[: i + 1, j, 1])
 
     points.set_cmap(discrete_cmap)
-    t = ax.set_title('iteration {}'.format(i + 1))
+    t = ax.set_title("iteration {}".format(i + 1))
 
     return *center_lines, points, t
 
+
 bar = tqdm(total=frames)
-ani = FuncAnimation(fig, update, blit=True, init_func=init, frames=frames, fargs=(bar,), interval=interval)
+ani = FuncAnimation(
+    fig,
+    update,
+    blit=True,
+    init_func=init,
+    frames=frames,
+    fargs=(bar,),
+    interval=interval,
+)
 ani.save("kmeans_clustering.mp4")
 ani.pause()
 plt.close(fig)
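The collapsed hunks above hide the actual clustering code; for orientation, here is a minimal sketch of the Lloyd iteration such an animation typically steps through (`kmeans_step` is a hypothetical helper, not a function from this file; it assumes Euclidean distance and arrays shaped like `X` and `init_centers` above):

```python
import numpy as np


def kmeans_step(X, centers):
    """One Lloyd iteration: assign each point to its nearest center,
    then move every center to the mean of its assigned points."""
    # pairwise distances, shape (n_samples, k)
    distances = np.linalg.norm(X[:, None, :] - centers[None, :, :], axis=2)
    labels = distances.argmin(axis=1)
    new_centers = np.array([
        # keep the old center if a cluster ends up empty
        X[labels == j].mean(axis=0) if np.any(labels == j) else centers[j]
        for j in range(len(centers))
    ])
    return labels, new_centers
```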
16 changes: 10 additions & 6 deletions ml/plots.py
@@ -12,13 +12,14 @@
 rng = np.random.default_rng(0)
 
 
-colors = ['xkcd:sky', 'xkcd:grass']
+colors = ["xkcd:sky", "xkcd:grass"]
 cmap = ListedColormap(colors)
 
+
 def create_discrete_colormap(n_classes):
     if n_classes == 2:
         return cmap.copy()
-    return ListedColormap([f'C{i}' for i in range(n_classes)])
+    return ListedColormap([f"C{i}" for i in range(n_classes)])
 
 
 def set_plot_style():
@@ -34,7 +35,7 @@ def set_plot_style():
 
 def twospirals(n_samples, noise=0.5, rng=rng):
     """
-    Returns the two spirals dataset. 
+    Returns the two spirals dataset.
     """
     n = np.sqrt(rng.uniform(size=(n_samples, 1))) * 360 * (2 * np.pi) / 360
     d1x = -np.cos(n) * n + rng.uniform(size=(n_samples, 1)) * noise
@@ -67,7 +68,9 @@ def draw_linear_regression_function(reg, ax=None, **kwargs):
 def plot_3d_views(X, y, cmap=cmap):
     from mpl_toolkits.mplot3d import Axes3D  # noqa
 
-    fig, axs = plt.subplots(2, 2, subplot_kw={'projection': '3d'}, constrained_layout=False)
+    fig, axs = plt.subplots(
+        2, 2, subplot_kw={"projection": "3d"}, constrained_layout=False
+    )
 
     for ax in axs.ravel():
         ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y, cmap=cmap, lw=0)
@@ -83,6 +86,7 @@ def plot_3d_views(X, y, cmap=cmap):
     axs[1, 1].view_init(90, 0)
     fig.subplots_adjust(wspace=0.005, hspace=0.005)
 
+
 def draw_tree(clf):
     import pydotplus
 
@@ -176,7 +180,7 @@ def plot_bars_and_confusion(
     axes=None,
     vmin=None,
     vmax=None,
-    cmap='inferno',
+    cmap="inferno",
     title=None,
     bar_color=None,
 ):
@@ -189,7 +193,7 @@ def plot_bars_and_confusion(
     if not isinstance(prediction, pd.Series):
         prediction = pd.Series(prediction)
 
-    correct = pd.Series(np.where(truth.values == prediction.values, 'Correct', 'Wrong'))
+    correct = pd.Series(np.where(truth.values == prediction.values, "Correct", "Wrong"))
 
     truth.sort_index(inplace=True)
     prediction.sort_index(inplace=True)
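As a usage note for the two helpers visible in this diff, a minimal sketch (it assumes `ml.plots` is importable from the repository root and that `twospirals` follows the common two-spirals recipe, returning a point array and a 0/1 label array):

```python
import matplotlib.pyplot as plt
from ml.plots import create_discrete_colormap, twospirals

X, y = twospirals(500, noise=0.75)

# with two classes, the predefined sky/grass colormap is reused
cmap = create_discrete_colormap(2)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap, s=10)
plt.show()
```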
15 changes: 8 additions & 7 deletions ml/solutions/exercise_1.py
@@ -1,6 +1,7 @@
 import matplotlib.pyplot as plt
 import numpy as np
 from sklearn import linear_model
+
 np.random.seed(1234)
 # create two gaussians
 A = np.random.multivariate_normal(mean=[1, 1], cov=[[2, 1], [1, 2]], size=200)
@@ -21,16 +22,16 @@
 x2s = (0.5 - b_0 - b_1 * x1s) / b_2
 
 
-plt.scatter(A[:, 0], A[:, 1], s=25, color='dodgerblue', label='True class A')
-plt.scatter(B[:, 0], B[:, 1], s=25, color='limegreen', label='True class B')
+plt.scatter(A[:, 0], A[:, 1], s=25, color="dodgerblue", label="True class A")
+plt.scatter(B[:, 0], B[:, 1], s=25, color="limegreen", label="True class B")
 
-plt.plot(x1s, x2s, color='gray', linestyle='--')
+plt.plot(x1s, x2s, color="gray", linestyle="--")
 
-plt.fill_between(x1s, x2s, 10, color='dodgerblue', alpha=0.07)
-plt.fill_between(x1s, x2s, -10, color='limegreen', alpha=0.07)
+plt.fill_between(x1s, x2s, 10, color="dodgerblue", alpha=0.07)
+plt.fill_between(x1s, x2s, -10, color="limegreen", alpha=0.07)
 plt.grid()
-plt.xlabel('X1')
-plt.ylabel('X2')
+plt.xlabel("X1")
+plt.ylabel("X2")
 plt.margins(x=0, y=0)
 plt.xlim([-8, 8])
 plt.ylim([-8, 8])
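For orientation on the `x2s` line in the visible hunk: the script fits a linear model to the two classes (presumably encoded as 0 and 1 in the collapsed lines) and draws the boundary where the fitted value crosses 1/2. In the script's coefficient names:

```latex
\hat{y}(x_1, x_2) = b_0 + b_1 x_1 + b_2 x_2 = \tfrac{1}{2}
\quad\Longrightarrow\quad
x_2 = \frac{\tfrac{1}{2} - b_0 - b_1 x_1}{b_2}
```

which is exactly the expression assigned to `x2s`.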
20 changes: 12 additions & 8 deletions ml/solutions/exercise_2.py
@@ -6,27 +6,31 @@
 np.random.seed(1234)
 data = read_titanic()
 
-X = data[['Sex_Code', 'Pclass_Code', 'Fare', 'Age']]
-y = data['Survived_Code']
+X = data[["Sex_Code", "Pclass_Code", "Fare", "Age"]]
+y = data["Survived_Code"]
 
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
 
 
 # Use linear kernel
-reg = SVC(kernel='linear')
+reg = SVC(kernel="linear")
 reg.fit(X_train, y_train)
 prediction_linear = reg.predict(X_test)
 
 # Use the rbf kernel
-reg_rbf = SVC(kernel='rbf')
+reg_rbf = SVC(kernel="rbf")
 reg_rbf.fit(X_train, y_train)
 prediction_rbf = reg_rbf.predict(X_test)
 
 fig, ([ax1, ax2], [ax3, ax4]) = plt.subplots(2, 2, figsize=(10, 10))
-plots.plot_bars_and_confusion(truth=y_test, prediction=prediction_linear, axes=[ax1, ax2], vmin=0, vmax=182)
-plots.plot_bars_and_confusion(truth=y_test, prediction=prediction_rbf, axes=[ax3, ax4], vmin=0, vmax=182)
-ax1.set_title('Linear Kernel')
-ax3.set_title('Radial Kernel')
+plots.plot_bars_and_confusion(
+    truth=y_test, prediction=prediction_linear, axes=[ax1, ax2], vmin=0, vmax=182
+)
+plots.plot_bars_and_confusion(
+    truth=y_test, prediction=prediction_rbf, axes=[ax3, ax4], vmin=0, vmax=182
+)
+ax1.set_title("Linear Kernel")
+ax3.set_title("Radial Kernel")
 ax1.set_xlim([0, 300])
 ax3.set_xlim([0, 300])
 None
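To put a single number next to each of the two confusion plots, a short sketch (not part of the solution file; it reuses the names defined above and scikit-learn's `accuracy_score`):

```python
from sklearn.metrics import accuracy_score

for name, pred in [("linear", prediction_linear), ("rbf", prediction_rbf)]:
    print(f"{name} kernel: accuracy = {accuracy_score(y_test, pred):.3f}")
```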
18 changes: 11 additions & 7 deletions ml/solutions/exercise_3.py
@@ -4,23 +4,27 @@
 import seaborn as sns
 import pandas as pd
 import numpy as np
+
 np.random.seed(1235)
 
 data = read_titanic()
 
-X = data[['Sex_Code', 'Pclass_Code', 'Fare', 'Age']]
-y = data['Survived']
+X = data[["Sex_Code", "Pclass_Code", "Fare", "Age"]]
+y = data["Survived"]
 
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
 
 df = pd.DataFrame()
-ps = ParameterGrid({'max_depth':range(1, 20), 'criterion':['entropy', 'gini']})
+ps = ParameterGrid({"max_depth": range(1, 20), "criterion": ["entropy", "gini"]})
 for d in ps:
-    clf = DecisionTreeClassifier(max_depth=d['max_depth'], criterion=d['criterion'])
+    clf = DecisionTreeClassifier(max_depth=d["max_depth"], criterion=d["criterion"])
     clf.fit(X_train, y_train)
     acc = accuracy_score(y_test, clf.predict(X_test))
-    df = df.append({'max_depth': d['max_depth'], 'criterion': d['criterion'], 'accuracy': acc}, ignore_index=True)
+    df = df.append(
+        {"max_depth": d["max_depth"], "criterion": d["criterion"], "accuracy": acc},
+        ignore_index=True,
+    )
 
-df = df.pivot('max_depth', 'criterion', 'accuracy')
-sns.heatmap(df, cmap='YlOrRd', annot=True, fmt='.3f')
+df = df.pivot("max_depth", "criterion", "accuracy")
+sns.heatmap(df, cmap="YlOrRd", annot=True, fmt=".3f")
 None
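One caveat: `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0, and `DataFrame.pivot` takes keyword-only arguments in current pandas, so this solution only runs on older versions. A version-proof sketch of the same loop, collecting rows in a list instead (same variable names as above, assuming the rest of the script):

```python
records = []
for d in ps:
    clf = DecisionTreeClassifier(max_depth=d["max_depth"], criterion=d["criterion"])
    clf.fit(X_train, y_train)
    acc = accuracy_score(y_test, clf.predict(X_test))
    records.append(
        {"max_depth": d["max_depth"], "criterion": d["criterion"], "accuracy": acc}
    )

df = pd.DataFrame(records)
df = df.pivot(index="max_depth", columns="criterion", values="accuracy")
sns.heatmap(df, cmap="YlOrRd", annot=True, fmt=".3f")
```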
24 changes: 13 additions & 11 deletions ml/solutions/exercise_5.py
@@ -6,25 +6,27 @@
 np.random.seed(1234)
 data = read_titanic()
 
-X = data[['Sex_Code', 'Pclass_Code', 'Fare', 'Age']]
-y = data['Survived_Code']
+X = data[["Sex_Code", "Pclass_Code", "Fare", "Age"]]
+y = data["Survived_Code"]
 
-svc = SVC(kernel='linear')
+svc = SVC(kernel="linear")
 knn = KNeighborsClassifier(n_neighbors=5)
 tree = DecisionTreeClassifier(max_depth=5)
 
 results = []
-for clf, name in zip([svc, knn, tree], ['SVM', 'kNN', 'tree']):
-    r = cross_validate(clf, X=X, y=y, cv=5, scoring=['accuracy', 'precision', 'recall', 'f1'])
+for clf, name in zip([svc, knn, tree], ["SVM", "kNN", "tree"]):
+    r = cross_validate(
+        clf, X=X, y=y, cv=5, scoring=["accuracy", "precision", "recall", "f1"]
+    )
     df = pd.DataFrame().from_dict(r)
-    df['classifier'] = name
+    df["classifier"] = name
     results.append(df)
 
-df = pd.concat(results).drop(['fit_time', 'score_time'], axis='columns')
+df = pd.concat(results).drop(["fit_time", "score_time"], axis="columns")
 
-means = df.groupby('classifier').mean()
-deviations = df.groupby('classifier').std()
+means = df.groupby("classifier").mean()
+deviations = df.groupby("classifier").std()
 
 fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 4))
-sns.heatmap(means, cmap='viridis', annot=True, ax=ax1, vmin=0, vmax=1)
-sns.heatmap(deviations, cmap='viridis', annot=True, ax=ax2, vmin=0, vmax=1)
+sns.heatmap(means, cmap="viridis", annot=True, ax=ax1, vmin=0, vmax=1)
+sns.heatmap(deviations, cmap="viridis", annot=True, ax=ax2, vmin=0, vmax=1)
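For readers new to `cross_validate`: with a list of scorers it returns a plain dict of per-fold arrays, which is why the loop above can wrap the result directly in a DataFrame. A minimal sketch of that shape, reusing the `svc` defined above (key names follow scikit-learn's documented `test_<scorer>` convention):

```python
from sklearn.model_selection import cross_validate

r = cross_validate(svc, X=X, y=y, cv=5, scoring=["accuracy", "f1"])

print(sorted(r))
# ['fit_time', 'score_time', 'test_accuracy', 'test_f1']
print(r["test_accuracy"])  # five values, one per CV fold
```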