6 changes: 6 additions & 0 deletions docs/src/GeometricMachineLearning.bib
@@ -1164,3 +1164,9 @@ @article{burby2020fast
year={2020},
publisher={IOP Publishing}
}

@misc{miller2023attention,
title = {{Attention Is Off By One}},
howpublished = {\url{https://www.evanmiller.org/attention-is-off-by-one.html}},
note = {Accessed: 2025-10-13}
}
19 changes: 12 additions & 7 deletions docs/src/layers/symplectic_attention.md
@@ -15,7 +15,7 @@ C = Z^TAZ \implies C_{mn} = \sum_{k\ell}Z_{km}A_{k\ell}Z_{\ell{}n}.
Its element-wise derivative is:

```math
-\frac{\partial{}C_{mn}}{\partial{}Z_{ij}} = \sum_{\ell}(\delta_{jm}A_{i\ell}Z_{\ell{}n} + \delta_{jn}X_{\ell{}m}A_{\ell{}i}).
+\frac{\partial{}C_{mn}}{\partial{}Z_{ij}} = \sum_{\ell}(\delta_{jm}A_{i\ell}Z_{\ell{}n} + \delta_{jn}Z_{\ell{}m}A_{\ell{}i}).
```

## Matrix Softmax
@@ -29,19 +29,19 @@ Here we take as a starting point the expression:
Its gradient (with respect to ``Z``) is:

```math
-\frac{\partial\Sigma(Z)}{\partial{}Z_{ij}} = \frac{1}{1 + \sum{m, n}\exp(C_{mn})}\sum_{m'n'}\exp(C_{m'n'})\sum_{\ell}(\delta_{jm'}A_{i\ell}Z_{\ell{}n'} + \delta_{jn'}X_{\ell{}m'}A_{\ell{}i}) = \frac{1}{1 + \sum_{m,n}\exp(C_{mn})}\{[AX\exp.(C)^T]_{ij} + [A^TX\exp.(C)]_{ij}\}.
+\frac{\partial\Sigma(Z)}{\partial{}Z_{ij}} = \frac{1}{1 + \sum_{m,n}\exp(C_{mn})}\sum_{m'n'}\exp(C_{m'n'})\sum_{\ell}(\delta_{jm'}A_{i\ell}Z_{\ell{}n'} + \delta_{jn'}Z_{\ell{}m'}A_{\ell{}i}) = \frac{1}{1 + \sum_{m,n}\exp(C_{mn})}\{[AZ\exp.(C)^T]_{ij} + [A^TZ\exp.(C)]_{ij}\}.
```

Note that if `A` is a [`SymmetricMatrix`](@ref) the expression then simplifies to:

```math
-\frac{\partial\Sigma(Z)}{\partial{}Z_{ij}} = 2\frac{1}{1 + \sum_{m,n}\exp(C_{mn})}[AX\exp.(C)^T]_{ij},
+\frac{\partial\Sigma(Z)}{\partial{}Z_{ij}} = 2\frac{1}{1 + \sum_{m,n}\exp(C_{mn})}[AZ\exp.(C)^T]_{ij},
```

or written in matrix notation:

```math
-\nabla_Z\Sigma(Z) = 2\frac{1}{1 + \sum_{m,n}\exp(C_{mn})}AX\exp.(C).
+\nabla_Z\Sigma(Z) = 2\frac{1}{1 + \sum_{m,n}\exp(C_{mn})}AZ\exp.(C).
```

Whether we use a [`SymmetricMatrix`](@ref) for ``A`` or not can be set with the keyword `symmetric` in [`SymplecticAttention`](@ref).
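
This formula can be checked numerically. Below is a minimal sketch; since the defining expression for ``\Sigma`` lies outside this hunk, we assume ``\Sigma(Z) = \log(1 + \sum_{m,n}\exp(C_{mn}))``, which is consistent with the gradients above, and compare the matrix-notation gradient with central finite differences:

```julia
n, time_steps = 4, 3
S = 0.1 * randn(n, n)
A = S + S'                                    # symmetric A, so the simplified formula applies
Z = 0.1 * randn(n, time_steps)

Σ(Z) = log(1 + sum(exp.(Z' * A * Z)))         # assumed potential, see the text above

C = Z' * A * Z
∇Σ = 2 / (1 + sum(exp.(C))) * A * Z * exp.(C) # matrix-notation gradient

# reference: central finite differences
ε = 1e-6
∇Σ_fd = similar(Z)
for i in axes(Z, 1), j in axes(Z, 2)
    Zp = copy(Z); Zp[i, j] += ε
    Zm = copy(Z); Zm[i, j] -= ε
    ∇Σ_fd[i, j] = (Σ(Zp) - Σ(Zm)) / (2ε)
end

@assert isapprox(∇Σ, ∇Σ_fd; rtol = 1e-5)
```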
@@ -63,21 +63,26 @@ We then get:
The second term in this expression is equivalent to a *standard attention step*:

```math
-\mathrm{TermII:}\qquad A^TZ\mathrm{softmax}(C).
+\mathrm{TermII:}\qquad A^TZ\mathrm{softmax}_1(C).
```

The first term is equivalent to:

```math
-\mathrm{TermI:}\qquad \sum_n [AZ]_{in}[\mathrm{softmax}(C)^T]_{nj} \equiv AZ(\mathrm{softmax}(C))^T.
+\mathrm{TermI:}\qquad \sum_n [AZ]_{in}[\mathrm{softmax}_1(C)^T]_{nj} \equiv AZ(\mathrm{softmax}_1(C))^T.
```

If we again assume that the matrix `A` is a [`SymmetricMatrix`](@ref) then the expression simplifies to:

```math
-\nabla_Z\Sigma(Z) = AZ\mathrm{softmax}(C).
+\nabla_Z\Sigma(Z) = AZ\mathrm{softmax}_1(C).
```

!!! info
    Note that we used the "one softmax" here instead of the standard softmax. The one softmax has been shown to be preferable to the standard softmax in some cases.

    A discussion of the one softmax can be found in [miller2023attention](@cite).
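
As defined in that reference, the one softmax adds a one to the denominator of the usual normalization; this is also where the ``1 + \sum_{m,n}\exp(C_{mn})`` terms in the gradients above come from:

```math
(\mathrm{softmax}_1(x))_i = \frac{\exp(x_i)}{1 + \sum_j\exp(x_j)}.
```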

## Library Functions

```@docs
126 changes: 126 additions & 0 deletions scripts/ForcedHamiltonianSystem.jl
@@ -0,0 +1,126 @@
using HDF5
using GeometricMachineLearning
using GeometricMachineLearning: QPT, QPT2
using CairoMakie
using NNlib: relu

# PARAMETERS
omega = 1.0 # natural frequency of the harmonic oscillator
Omega = 3.5 # frequency of the external sinusoidal forcing
F = .9 # amplitude of the external sinusoidal forcing
ni_dim = 10 # number of initial conditions per dimension (so ni_dim^2 total)
T = 2π # final time
nt = 500 # number of time steps
dt = T/nt # time step

# Generating the initial condition array
IC = vec( [(q=q0, p=p0) for q0 in range(-1, 1, ni_dim), p0 in range(-1, 1, ni_dim)] )

# Generating the solution array
ni = ni_dim^2
q = zeros(Float64, ni, nt+1)
p = zeros(Float64, ni, nt+1)
t = collect(dt * range(0, nt, step=1))

# analytic solution of the driven oscillator q̈ + ω²q = F sin(Ωt)
for i in 1:nt+1
    for j in 1:ni
        q[j,i] = ( IC[j].p - Omega*F/(omega^2-Omega^2) )/omega * sin(omega*t[i]) + IC[j].q*cos(omega*t[i]) + F/(omega^2-Omega^2)*sin(Omega*t[i])
        # p = dq/dt, so the q₀ term carries a single factor of omega
        p[j,i] = -omega*IC[j].q*sin(omega*t[i]) + ( IC[j].p - Omega*F/(omega^2-Omega^2) )*cos(omega*t[i]) + Omega*F/(omega^2-Omega^2)*cos(Omega*t[i])
        # q[j,i] = ( IC[j].p - Omega*F/(omega^2-Omega^2) )/omega * exp(-omega*t[i]) - IC[j].q*exp(-omega*t[i]) + F/(omega^2-Omega^2)*exp(-Omega*t[i])
        # p[j,i] = -omega^2*IC[j].q*exp(-omega*t[i]) + ( IC[j].p + Omega*F/(omega^2-Omega^2) )*exp(-omega*t[i]) - Omega*F/(omega^2-Omega^2)*exp(-Omega*t[i])
    end
end
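
# Optional sanity check: p equals dq/dt for this system, so it should match a
# central finite-difference derivative of q up to O(dt^2):
dq = (q[:, 3:end] .- q[:, 1:end-2]) ./ (2dt)
@assert maximum(abs.(dq .- p[:, 2:end-1])) < 1e-2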

@doc raw"""
Turn a `NamedTuple` of ``(q,p)`` data into two tensors of the correct format.

This is the tricky part as the structure of the input array(s) needs to conform with the structure of the parameters.

Here the data are rearranged in an array of size ``(n, 2, t_f - 1)`` where ``[t_0, t_1, \ldots, t_f]`` is the vector storing the time steps.

If we deal with different initial conditions as well, we still put everything into the third (parameter) axis.

# Example

```jldoctest
using GeometricMachineLearning

q = [1. 2. 3.; 4. 5. 6.]
p = [1.5 2.5 3.5; 4.5 5.5 6.5]
qp = (q = q, p = p)
turn_q_p_data_into_correct_format(qp)

# output

(q = [1.0 2.0;;; 2.0 3.0;;; 4.0 5.0;;; 5.0 6.0], p = [1.5 2.5;;; 2.5 3.5;;; 4.5 5.5;;; 5.5 6.5])
```
"""
function turn_q_p_data_into_correct_format(qp::QPT2{T, 2}) where {T}
    number_of_time_steps = size(qp.q, 2) - 1 # not counting t₀
    number_of_initial_conditions = size(qp.q, 1)
    q_array = zeros(T, 1, 2, number_of_time_steps * number_of_initial_conditions)
    p_array = zeros(T, 1, 2, number_of_time_steps * number_of_initial_conditions)
    for initial_condition_index ∈ 0:(number_of_initial_conditions - 1)
        for time_index ∈ 1:number_of_time_steps
            q_array[:, 1, initial_condition_index * number_of_time_steps + time_index] .= qp.q[initial_condition_index + 1, time_index]
            q_array[:, 2, initial_condition_index * number_of_time_steps + time_index] .= qp.q[initial_condition_index + 1, time_index + 1]
            p_array[:, 1, initial_condition_index * number_of_time_steps + time_index] .= qp.p[initial_condition_index + 1, time_index]
            p_array[:, 2, initial_condition_index * number_of_time_steps + time_index] .= qp.p[initial_condition_index + 1, time_index + 1]
        end
    end
    (q = q_array, p = p_array)
end

# SAVING TO FILE

# h5 = h5open(path, "w")
# write(h5, "q", q)
# write(h5, "p", p)
# write(h5, "t", t)
#
# attrs(h5)["ni"] = ni
# attrs(h5)["nt"] = nt
# attrs(h5)["dt"] = dt
#
# close(h5)

# This sets up the data loader
dl = DataLoader(turn_q_p_data_into_correct_format((q = q, p = p)))

# This sets up the neural network
width::Int = 5
nhidden::Int = 4
activation = relu
arch = ForcedSympNet(2; upscaling_dimension = width,
                        n_layers = nhidden,
                        activation = activation)
nn = NeuralNetwork(arch)

# This is where training starts
function train_network(batch_size::Integer=5000, method=AdamOptimizer(), n_epochs=1000)
    batch = Batch(batch_size)
    o = Optimizer(method, nn)
    o(nn, dl, batch, n_epochs)
end
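
# The defaults can be overridden, e.g. for a quicker experiment:
# loss_array = train_network(1000, AdamOptimizer(), 100)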

loss_array = train_network()

trajectory_number = 20

# Testing the network
initial_conditions = (q = q[trajectory_number, 1], p = p[trajectory_number, 1])
n_steps = nt
trajectory = (q = zeros(1, n_steps), p = zeros(1, n_steps))
trajectory.q[:, 1] .= initial_conditions.q
trajectory.p[:, 1] .= initial_conditions.p
for t_step ∈ 0:(n_steps-2)
    qp_temporary = nn.model((q = [trajectory.q[1, t_step+1]], p = [trajectory.p[1, t_step+1]]), nn.params)
    trajectory.q[:, t_step+2] .= qp_temporary.q
    trajectory.p[:, t_step+2] .= qp_temporary.p
end

fig = Figure()
ax = Axis(fig[1,1])
lines!(ax, trajectory.q[1,:]; label="nn")
lines!(ax, q[trajectory_number,:]; label="analytic")
axislegend(ax)
fig
6 changes: 6 additions & 0 deletions scripts/Project.toml
@@ -1,7 +1,9 @@
[deps]
AbstractNeuralNetworks = "60874f82-5ada-4c70-bd1c-fa6be7711c8a"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
GeometricEquations = "c85262ba-a08a-430a-b926-d29770767bf2"
GeometricIntegrators = "dcce2d33-59f6-5b8d-9047-0defad88ae06"
GeometricMachineLearning = "194d25b2-d3f5-49f0-af24-c124f4aa80cc"
@@ -12,7 +14,11 @@ JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
LaTeXStrings = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
ProfileView = "c46f51b8-102a-5cf2-8d2c-8597cb0e0da7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SymbolicNeuralNetworks = "aed23131-dcd0-47ca-8090-d21e605652e3"
Symbolics = "0c5d862f-8b57-4792-8d23-62f2024744c7"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
151 changes: 151 additions & 0 deletions scripts/TimeDependentHarmonicOscillator_Analytic.jl
@@ -0,0 +1,151 @@
using HDF5
using GeometricMachineLearning
using GeometricMachineLearning: QPT, QPT2, Activation, ParametricLoss, SymbolicNeuralNetwork, SymbolicPullback
using CairoMakie
using NNlib: relu

# PARAMETERS
omega = 1.0 # natural frequency of the harmonic oscillator
Omega = 3.5 # frequency of the external sinusoidal forcing
F = .9 # amplitude of the external sinusoidal forcing
ni_dim = 10 # number of initial conditions per dimension (so ni_dim^2 total)
T = 2π * 10 # final time
nt = 2000 # number of time steps
dt = T/nt # time step

# Generating the initial condition array
IC = vec( [(q=q0, p=p0) for q0 in range(-1, 1, ni_dim), p0 in range(-1, 1, ni_dim)] )

# Generating the solution array
ni = ni_dim^2
q = zeros(Float64, ni, nt+1)
p = zeros(Float64, ni, nt+1)
t = collect(dt * range(0, nt, step=1))

"""
Turn a vector of numbers into a vector of `NamedTuple`s to be used by `ParametricDataLoader`.
"""
function turn_parameters_into_correct_format(t::AbstractVector, IC::AbstractVector{<:NamedTuple})
    vec_of_params = NamedTuple[]
    for time_step ∈ t
        time_step == t[end] || push!(vec_of_params, (t = time_step, ))
    end
    vcat((vec_of_params for _ in axes(IC, 1))...)
end
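
# For example (toy values, for illustration): the final time step is dropped and
# the remaining ones are repeated once per initial condition:
# julia> turn_parameters_into_correct_format([0.0, 0.5, 1.0], [(q = 0.0, p = 0.0), (q = 1.0, p = 1.0)])
# 4-element Vector{NamedTuple}:
#  (t = 0.0,)
#  (t = 0.5,)
#  (t = 0.0,)
#  (t = 0.5,)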

# analytic solution of the driven oscillator q̈ + ω²q = F sin(Ωt)
for i in 1:nt+1
    for j in 1:ni
        q[j,i] = ( IC[j].p - Omega*F/(omega^2-Omega^2) )/omega * sin(omega*t[i]) + IC[j].q*cos(omega*t[i]) + F/(omega^2-Omega^2)*sin(Omega*t[i])
        # p = dq/dt, so the q₀ term carries a single factor of omega
        p[j,i] = -omega*IC[j].q*sin(omega*t[i]) + ( IC[j].p - Omega*F/(omega^2-Omega^2) )*cos(omega*t[i]) + Omega*F/(omega^2-Omega^2)*cos(Omega*t[i])
        # q[j,i] = ( IC[j].p - Omega*F/(omega^2-Omega^2) )/omega * exp(-omega*t[i]) - IC[j].q*exp(-omega*t[i]) + F/(omega^2-Omega^2)*exp(-Omega*t[i])
        # p[j,i] = -omega^2*IC[j].q*exp(-omega*t[i]) + ( IC[j].p + Omega*F/(omega^2-Omega^2) )*exp(-omega*t[i]) - Omega*F/(omega^2-Omega^2)*exp(-Omega*t[i])
    end
end

@doc raw"""
Turn a `NamedTuple` of ``(q,p)`` data into two tensors of the correct format.

This is the tricky part as the structure of the input array(s) needs to conform with the structure of the parameters.

Here the data are rearranged in an array of size ``(n, 2, t_f - 1)`` where ``[t_0, t_1, \ldots, t_f]`` is the vector storing the time steps.

If we deal with different initial conditions as well, we still put everything into the third (parameter) axis.

# Example

```jldoctest
using GeometricMachineLearning

q = [1. 2. 3.; 4. 5. 6.]
p = [1.5 2.5 3.5; 4.5 5.5 6.5]
qp = (q = q, p = p)
turn_q_p_data_into_correct_format(qp)

# output

(q = [1.0 2.0;;; 2.0 3.0;;; 4.0 5.0;;; 5.0 6.0], p = [1.5 2.5;;; 2.5 3.5;;; 4.5 5.5;;; 5.5 6.5])
```
"""
function turn_q_p_data_into_correct_format(qp::QPT2{T, 2}) where {T}
    number_of_time_steps = size(qp.q, 2) - 1 # not counting t₀
    number_of_initial_conditions = size(qp.q, 1)
    q_array = zeros(T, 1, 2, number_of_time_steps * number_of_initial_conditions)
    p_array = zeros(T, 1, 2, number_of_time_steps * number_of_initial_conditions)
    for initial_condition_index ∈ 0:(number_of_initial_conditions - 1)
        for time_index ∈ 1:number_of_time_steps
            q_array[:, 1, initial_condition_index * number_of_time_steps + time_index] .= qp.q[initial_condition_index + 1, time_index]
            q_array[:, 2, initial_condition_index * number_of_time_steps + time_index] .= qp.q[initial_condition_index + 1, time_index + 1]
            p_array[:, 1, initial_condition_index * number_of_time_steps + time_index] .= qp.p[initial_condition_index + 1, time_index]
            p_array[:, 2, initial_condition_index * number_of_time_steps + time_index] .= qp.p[initial_condition_index + 1, time_index + 1]
        end
    end
    (q = q_array, p = p_array)
end

# SAVING TO FILE

# h5 = h5open(path, "w")
# write(h5, "q", q)
# write(h5, "p", p)
# write(h5, "t", t)
#
# attrs(h5)["ni"] = ni
# attrs(h5)["nt"] = nt
# attrs(h5)["dt"] = dt
#
# close(h5)

"""
This takes time as a single additional parameter (third axis).
"""
function load_time_dependent_harmonic_oscillator_with_parametric_data_loader(qp::QPT{T}, t::AbstractVector{T}, IC::AbstractVector) where {T}
    qp_reformatted = turn_q_p_data_into_correct_format(qp)
    t_reformatted = turn_parameters_into_correct_format(t, IC)
    ParametricDataLoader(qp_reformatted, t_reformatted)
end

# This sets up the data loader
dl = load_time_dependent_harmonic_oscillator_with_parametric_data_loader((q = q, p = p), t, IC)

# This sets up the neural network
width::Int = 1
nhidden::Int = 1
n_integrators::Int = 2
# sigmoid_linear_unit(x::T) where {T<:Number} = x / (T(1) + exp(-x))
arch = GeneralizedHamiltonianArchitecture(2; activation = tanh,
                                             width = width,
                                             nhidden = nhidden,
                                             n_integrators = n_integrators,
                                             parameters = turn_parameters_into_correct_format(t, IC)[1])
nn = NeuralNetwork(arch)

# This is where training starts
batch_size = 32
n_epochs = 200
batch = Batch(batch_size)
o = Optimizer(AdamOptimizer(), nn)
loss = ParametricLoss()
_pb = SymbolicPullback(nn, loss, turn_parameters_into_correct_format(t, IC)[1]);

function train_network()
    o(nn, dl, batch, n_epochs, loss, _pb)
end

loss_array = train_network()

trajectory_number = 20

# Testing the network
initial_conditions = (q = q[trajectory_number, 1], p = p[trajectory_number, 1])
n_steps = nt
trajectory = (q = zeros(1, n_steps), p = zeros(1, n_steps))
trajectory.q[:, 1] .= initial_conditions.q
trajectory.p[:, 1] .= initial_conditions.p
# note that we have to supply the parameters as a named tuple as well here:
for t_step ∈ 0:(n_steps-2)
    qp_temporary = nn.model((q = [trajectory.q[1, t_step+1]], p = [trajectory.p[1, t_step+1]]), (t = t[t_step+1],), nn.params)
    trajectory.q[:, t_step+2] .= qp_temporary.q
    trajectory.p[:, t_step+2] .= qp_temporary.p
end

fig = Figure()
ax = Axis(fig[1,1])
lines!(ax, trajectory.q[1,:]; label="nn")
lines!(ax, q[trajectory_number,:]; label="analytic")
axislegend(ax)
fig