From b5b6ed3d0ba2dab352c6671608693075a8c1ad56 Mon Sep 17 00:00:00 2001 From: pzimbrod Date: Tue, 2 Nov 2021 12:18:10 +0100 Subject: [PATCH 01/27] replace OMEinsum with batched_mul --- src/FourierLayer.jl | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/FourierLayer.jl b/src/FourierLayer.jl index 6606b3d..28fdb6c 100644 --- a/src/FourierLayer.jl +++ b/src/FourierLayer.jl @@ -97,9 +97,10 @@ function (a::FourierLayer)(x::AbstractArray) # The linear path # x -> Wl - @ein linear[dim_out, batchsize, dim_grid] := Wl[dim_out, dim_in] * - x[dim_in, batchsize, dim_grid] - linear += bl + linear = Wl ⊠ x .+ bl + #@ein linear[dim_out, batchsize, dim_grid] := Wl[dim_out, dim_in] * + # x[dim_in, batchsize, dim_grid] + #linear += bl # The convolution path # x -> 𝔉 -> Wf -> i𝔉 @@ -107,9 +108,10 @@ function (a::FourierLayer)(x::AbstractArray) fourier = 𝔉 * x # Multiply the weight matrix with the input using the Einstein convention - @ein fourier[dim_out, batchsize, dim_grid] := Wf[dim_in, dim_out, dim_grid] * - fourier[dim_in, batchsize, dim_grid] - fourier += bf + fourier = Wf ⊠ fourier .+ bf + #@ein fourier[dim_out, batchsize, dim_grid] := Wf[dim_in, dim_out, dim_grid] * + # fourier[dim_in, batchsize, dim_grid] + #fourier += bf # Do the inverse transform fourier = i𝔉 * fourier From 62d5d8e3bb192ae6de4869e4b6f6bdd55fe1e593 Mon Sep 17 00:00:00 2001 From: pzimbrod Date: Tue, 2 Nov 2021 13:59:08 +0100 Subject: [PATCH 02/27] =?UTF-8?q?=F0=9F=9A=AE=20get=20rid=20of=20bias=20pa?= =?UTF-8?q?rams?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/FourierLayer.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/FourierLayer.jl b/src/FourierLayer.jl index 28fdb6c..29e6271 100644 --- a/src/FourierLayer.jl +++ b/src/FourierLayer.jl @@ -66,8 +66,8 @@ function FourierLayer(in::Integer, out::Integer, batch::Integer, grid::Integer, # Initialize Linear weight matrix Wl = initl(out, in) - bf = Flux.create_bias(Wf, bias_fourier, out, batch, floor(Int, grid/2 + 1)) - bl = Flux.create_bias(Wl, bias_linear, out, batch, grid) + bf = Flux.create_bias(Wf, bias_fourier, out, 1, floor(Int, grid/2 + 1)) + bl = Flux.create_bias(Wl, bias_linear, out, 1, grid) # Pass the modes for output Ξ» = modes From 1b3986ef03f590dd92c67c553d5c102eb40a4059 Mon Sep 17 00:00:00 2001 From: pzimbrod <76100656+pzimbrod@users.noreply.github.com> Date: Tue, 2 Nov 2021 14:32:17 +0100 Subject: [PATCH 03/27] remove old code --- src/FourierLayer.jl | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/FourierLayer.jl b/src/FourierLayer.jl index 29e6271..3cc637b 100644 --- a/src/FourierLayer.jl +++ b/src/FourierLayer.jl @@ -98,9 +98,6 @@ function (a::FourierLayer)(x::AbstractArray) # The linear path # x -> Wl linear = Wl ⊠ x .+ bl - #@ein linear[dim_out, batchsize, dim_grid] := Wl[dim_out, dim_in] * - # x[dim_in, batchsize, dim_grid] - #linear += bl # The convolution path # x -> 𝔉 -> Wf -> i𝔉 @@ -109,9 +106,7 @@ function (a::FourierLayer)(x::AbstractArray) # Multiply the weight matrix with the input using the Einstein convention fourier = Wf ⊠ fourier .+ bf - #@ein fourier[dim_out, batchsize, dim_grid] := Wf[dim_in, dim_out, dim_grid] * - # fourier[dim_in, batchsize, dim_grid] - #fourier += bf + # Do the inverse transform fourier = i𝔉 * fourier From 8cbd84cfc7d259b6293cb282839bf8673c467ff0 Mon Sep 17 00:00:00 2001 From: pzimbrod <76100656+pzimbrod@users.noreply.github.com> Date: Tue, 2 Nov 2021 15:13:09 +0100 Subject: [PATCH 04/27] 
remove batch labels in burgers script --- test/burgers.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/burgers.jl b/test/burgers.jl index df531d8..b4b1882 100644 --- a/test/burgers.jl +++ b/test/burgers.jl @@ -47,8 +47,8 @@ xtrain, xtest = permutedims(xtrain,(3,1,2)), permutedims(xtest,(3,1,2)) ytrain, ytest = permutedims(ytrain,(3,1,2)), permutedims(ytest,(3,1,2)) # Pass the data to the Flux DataLoader and give it a batch of 20 -train_loader = Flux.Data.DataLoader((data=xtrain, label=ytrain), batchsize=20) -test_loader = Flux.Data.DataLoader((data=xtest, label=ytest), batchsize=20) +train_loader = Flux.Data.DataLoader((xtrain, ytrain), batchsize=20) +test_loader = Flux.Data.DataLoader((xtest, ytest), batchsize=20) # Set up the Fourier Layer # 128 in- and outputs, batch size 20 as given above, grid size 1024 From d4421e116deb9423ae8c42451154aab42f6feac1 Mon Sep 17 00:00:00 2001 From: pzimbrod Date: Thu, 4 Nov 2021 11:15:05 +0100 Subject: [PATCH 05/27] add custom batched_mul! methods --- Project.toml | 2 -- src/FourierLayer.jl | 2 +- src/NeuralOperator.jl | 4 +++- src/batched.jl | 25 +++++++++++++++++++++++++ 4 files changed, 29 insertions(+), 4 deletions(-) create mode 100644 src/batched.jl diff --git a/Project.toml b/Project.toml index 217dcb7..f0a85f4 100644 --- a/Project.toml +++ b/Project.toml @@ -8,7 +8,6 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" MAT = "23992714-dd62-5051-b70f-ba57cb901cac" -OMEinsum = "ebe7aa44-baf0-506c-a96f-8464559b3922" PkgTemplates = "14b8a8f1-9102-5b29-a752-f990bacb7fe1" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Revise = "295af30f-e4ad-537b-8983-00126c2a3abe" @@ -19,6 +18,5 @@ CUDA = "3" FFTW = "1" Flux = "0.12" MAT = "0.10" -OMEinsum = "0.4, 0.6" PkgTemplates = "0.7" Revise = "3" diff --git a/src/FourierLayer.jl b/src/FourierLayer.jl index 3cc637b..57cb406 100644 --- a/src/FourierLayer.jl +++ b/src/FourierLayer.jl @@ -104,7 +104,7 @@ function (a::FourierLayer)(x::AbstractArray) # Do the Fourier transform (FFT) along the last axis of the input fourier = 𝔉 * x - # Multiply the weight matrix with the input using the Einstein convention + # Multiply the weight matrix with the input using batched multiplication fourier = Wf ⊠ fourier .+ bf # Do the inverse transform diff --git a/src/NeuralOperator.jl b/src/NeuralOperator.jl index 0a5542f..e1ecc7b 100644 --- a/src/NeuralOperator.jl +++ b/src/NeuralOperator.jl @@ -4,14 +4,16 @@ using Base: Integer, ident_cmp, Float32 using CUDA using Flux using FFTW +using FFTW: assert_applicable, unsafe_execute!, FORWARD, BACKWARD, rFFTWPlan using Random using Random: AbstractRNG using Flux: nfan, glorot_uniform, batch -using OMEinsum export FourierLayer +export batched_mul! include("FourierLayer.jl") include("ComplexWeights.jl") +include("batched.jl") end # module diff --git a/src/batched.jl b/src/batched.jl new file mode 100644 index 0000000..5b20ffe --- /dev/null +++ b/src/batched.jl @@ -0,0 +1,25 @@ +""" +Function overloads for `batched_mul` provided by `NNlib` so that FFTW Plans can be handled. + +For `mul!`, `FFTW.jl` already provides an implementation. However, we need to loop over the batch dimension. 
+""" + +for (Tr,Tc) in ((:Float32,:(Complex{Float32})),(:Float64,:(Complex{Float64}))) + # Note: use $FORWARD and $BACKWARD below because of issue #9775 + @eval begin + function batched_mul!(y::StridedArray{$Tc}, p::rFFTWPlan{$Tr,$FORWARD}, x::StridedArray{$Tr}) + assert_applicable(p, x[:,:,1], y[:,:,1]) # no need to check every batch dim + @inbounds for k ∈ 1:size(y,3) + @views unsafe_execute!(p, x[:,:,k], y[:,:,k]) + end + return y + end + function batched_mul!(y::StridedArray{$Tr}, p::rFFTWPlan{$Tc,$BACKWARD}, x::StridedArray{$Tc}) + assert_applicable(p, x[:,:,1], y[:,:,1]) # no need to check every batch dim + @inbounds for k ∈ 1:size(y,3) + @views unsafe_execute!(p, x[:,:,k], y[:,:,k]) + end + return y + end + end +end \ No newline at end of file From 4303e7bf651aba92521aa522c6c7f4249d10c7f1 Mon Sep 17 00:00:00 2001 From: pzimbrod Date: Thu, 4 Nov 2021 13:07:53 +0100 Subject: [PATCH 06/27] fix CUDA detection for FFT plan --- src/FourierLayer.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/FourierLayer.jl b/src/FourierLayer.jl index 57cb406..3e606eb 100644 --- a/src/FourierLayer.jl +++ b/src/FourierLayer.jl @@ -77,9 +77,9 @@ function FourierLayer(in::Integer, out::Integer, batch::Integer, grid::Integer, # First, an ugly workaround: FFTW.jl passes keywords that cuFFT complains about when the # constructor is wrapped with |> gpu. Instead, you have to pass a CuArray as input to plan_rfft # Ugh. - template𝔉 = Flux.use_cuda[] != Nothing ? Array{Float32}(undef,in,batch,grid) : + template𝔉 = Flux.use_cuda[] == false ? Array{Float32}(undef,in,batch,grid) : CuArray{Float32}(undef,in,batch,grid) - templatei𝔉 = Flux.use_cuda[] != Nothing ? Array{Complex{Float32}}(undef,out,batch,floor(Int, grid/2 + 1)) : + templatei𝔉 = Flux.use_cuda[] == false ? Array{Complex{Float32}}(undef,out,batch,floor(Int, grid/2 + 1)) : CuArray{Complex{Float32}}(undef,out,batch,floor(Int, grid/2 + 1)) 𝔉 = plan_rfft(template𝔉,3) From f764a358c2099034ae6ae29b37df0ff07f92c150 Mon Sep 17 00:00:00 2001 From: pzimbrod <76100656+pzimbrod@users.noreply.github.com> Date: Thu, 4 Nov 2021 15:10:03 +0100 Subject: [PATCH 07/27] add CUDA implementation of batched_mul! --- Project.toml | 1 + src/FourierLayer.jl | 8 ++++---- src/NeuralOperator.jl | 1 - src/batched.jl | 18 ++++++++++++++++-- 4 files changed, 21 insertions(+), 7 deletions(-) diff --git a/Project.toml b/Project.toml index f0a85f4..10a8f41 100644 --- a/Project.toml +++ b/Project.toml @@ -4,6 +4,7 @@ authors = ["Patrick Zimbrod <76100656+pzimbrod@users.noreply.github.com>"] version = "0.1.0" [deps] +AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" diff --git a/src/FourierLayer.jl b/src/FourierLayer.jl index 3e606eb..e37f56d 100644 --- a/src/FourierLayer.jl +++ b/src/FourierLayer.jl @@ -77,10 +77,10 @@ function FourierLayer(in::Integer, out::Integer, batch::Integer, grid::Integer, # First, an ugly workaround: FFTW.jl passes keywords that cuFFT complains about when the # constructor is wrapped with |> gpu. Instead, you have to pass a CuArray as input to plan_rfft # Ugh. - template𝔉 = Flux.use_cuda[] == false ? Array{Float32}(undef,in,batch,grid) : - CuArray{Float32}(undef,in,batch,grid) - templatei𝔉 = Flux.use_cuda[] == false ? Array{Complex{Float32}}(undef,out,batch,floor(Int, grid/2 + 1)) : - CuArray{Complex{Float32}}(undef,out,batch,floor(Int, grid/2 + 1)) + template𝔉 = Flux.use_cuda[] == true ? 
CuArray{Float32}(undef,in,batch,grid) : + Array{Float32}(undef,in,batch,grid) + templatei𝔉 = Flux.use_cuda[] == true ? CuArray{Complex{Float32}}(undef,out,batch,floor(Int, grid/2 + 1)) : + Array{Complex{Float32}}(undef,out,batch,floor(Int, grid/2 + 1)) 𝔉 = plan_rfft(template𝔉,3) i𝔉 = plan_irfft(templatei𝔉,grid, 3) diff --git a/src/NeuralOperator.jl b/src/NeuralOperator.jl index e1ecc7b..8dc72ad 100644 --- a/src/NeuralOperator.jl +++ b/src/NeuralOperator.jl @@ -10,7 +10,6 @@ using Random: AbstractRNG using Flux: nfan, glorot_uniform, batch export FourierLayer -export batched_mul! include("FourierLayer.jl") include("ComplexWeights.jl") diff --git a/src/batched.jl b/src/batched.jl index 5b20ffe..87bfa91 100644 --- a/src/batched.jl +++ b/src/batched.jl @@ -1,20 +1,24 @@ +using CUDA: DenseCuArray +using CUDA.CUFFT: CuFFTPlan + """ Function overloads for `batched_mul` provided by `NNlib` so that FFTW Plans can be handled. For `mul!`, `FFTW.jl` already provides an implementation. However, we need to loop over the batch dimension. """ +# Extension for CPU Arrays for (Tr,Tc) in ((:Float32,:(Complex{Float32})),(:Float64,:(Complex{Float64}))) # Note: use $FORWARD and $BACKWARD below because of issue #9775 @eval begin - function batched_mul!(y::StridedArray{$Tc}, p::rFFTWPlan{$Tr,$FORWARD}, x::StridedArray{$Tr}) + function NNlib.batched_mul!(y::StridedArray{$Tc}, p::rFFTWPlan{$Tr,$FORWARD}, x::StridedArray{$Tr}) assert_applicable(p, x[:,:,1], y[:,:,1]) # no need to check every batch dim @inbounds for k ∈ 1:size(y,3) @views unsafe_execute!(p, x[:,:,k], y[:,:,k]) end return y end - function batched_mul!(y::StridedArray{$Tr}, p::rFFTWPlan{$Tc,$BACKWARD}, x::StridedArray{$Tc}) + function NNlib.batched_mul!(y::StridedArray{$Tr}, p::rFFTWPlan{$Tc,$BACKWARD}, x::StridedArray{$Tc}) assert_applicable(p, x[:,:,1], y[:,:,1]) # no need to check every batch dim @inbounds for k ∈ 1:size(y,3) @views unsafe_execute!(p, x[:,:,k], y[:,:,k]) @@ -22,4 +26,14 @@ for (Tr,Tc) in ((:Float32,:(Complex{Float32})),(:Float64,:(Complex{Float64}))) return y end end +end + +# Methods for GPU Arrays, borrowed from CUDA.jl -> fft.jl #490 +function NNlib.batched_mul!(y::DenseCuArray{Ty}, p::CuFFTPlan{T,K,false}, x::DenseCuArray{T} + ) where {Ty,T,K} + assert_applicable(p,x,y) + @inbounds for k ∈ 1:size(y,3) + @views unsafe_execute!(p, x[:,:,k] ,y[:,:,k]) + end + return y end \ No newline at end of file From 096bab9d8cfad3a4115a35bae885f1e60a307553 Mon Sep 17 00:00:00 2001 From: pzimbrod <76100656+pzimbrod@users.noreply.github.com> Date: Thu, 4 Nov 2021 15:43:51 +0100 Subject: [PATCH 08/27] add proper (but slow) CUDA version of batched_mul! 
--- Project.toml | 1 - src/batched.jl | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/Project.toml b/Project.toml index 10a8f41..f0a85f4 100644 --- a/Project.toml +++ b/Project.toml @@ -4,7 +4,6 @@ authors = ["Patrick Zimbrod <76100656+pzimbrod@users.noreply.github.com>"] version = "0.1.0" [deps] -AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" diff --git a/src/batched.jl b/src/batched.jl index 87bfa91..e20c848 100644 --- a/src/batched.jl +++ b/src/batched.jl @@ -31,9 +31,9 @@ end # Methods for GPU Arrays, borrowed from CUDA.jl -> fft.jl #490 function NNlib.batched_mul!(y::DenseCuArray{Ty}, p::CuFFTPlan{T,K,false}, x::DenseCuArray{T} ) where {Ty,T,K} - assert_applicable(p,x,y) + CUFFT.assert_applicable(p, x[:,:,1], y[:,:,1]) @inbounds for k ∈ 1:size(y,3) - @views unsafe_execute!(p, x[:,:,k] ,y[:,:,k]) + @views CUFFT.unsafe_execute!(p, x[:,:,k] ,y[:,:,k]) end return y end \ No newline at end of file From dbc19642f9a6cf255f0761a1fedb79085ba095ca Mon Sep 17 00:00:00 2001 From: pzimbrod Date: Fri, 5 Nov 2021 08:26:10 +0100 Subject: [PATCH 09/27] stick with non-planned FFT for now --- src/FourierLayer.jl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/FourierLayer.jl b/src/FourierLayer.jl index e37f56d..161c386 100644 --- a/src/FourierLayer.jl +++ b/src/FourierLayer.jl @@ -94,6 +94,7 @@ Flux.@functor FourierLayer function (a::FourierLayer)(x::AbstractArray) # Assign the parameters Wf, Wl, bf, bl, Οƒ, 𝔉, i𝔉 = a.weight_f, a.weight_l, a.bias_f, a.bias_l, a.Οƒ, a.𝔉, a.i𝔉 + grid = size(x,3) # The linear path # x -> Wl @@ -102,13 +103,15 @@ function (a::FourierLayer)(x::AbstractArray) # The convolution path # x -> 𝔉 -> Wf -> i𝔉 # Do the Fourier transform (FFT) along the last axis of the input - fourier = 𝔉 * x + # fourier = 𝔉 * x + fourier = rfft(x,3) # Multiply the weight matrix with the input using batched multiplication fourier = Wf ⊠ fourier .+ bf # Do the inverse transform - fourier = i𝔉 * fourier + # fourier = i𝔉 * fourier + fourier = irfft(fourier, grid, 3) # Return the activated sum return Οƒ.(linear + fourier) From ccb0e4332fa623d3a70758afc932c14011b20a6c Mon Sep 17 00:00:00 2001 From: pzimbrod Date: Fri, 5 Nov 2021 10:46:11 +0100 Subject: [PATCH 10/27] extend training script --- test/burgers.jl | 45 ++++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/test/burgers.jl b/test/burgers.jl index b4b1882..a3955cf 100644 --- a/test/burgers.jl +++ b/test/burgers.jl @@ -1,26 +1,28 @@ -using Flux: length, reshape +using Flux: length, reshape, train!, @epochs using NeuralOperator, Flux, MAT +device = gpu; + # Read the data from MAT file and store it in a dict -vars = matread("burgers_data_R10.mat") +vars = matread("burgers_data_R10.mat") |> device # For trial purposes, we might want to train with different resolutions # So we sample only every n-th element subsample = 2^3; # create the x training array, according to our desired grid size -xtrain = vars["a"][1:1000, 1:subsample:end]; +xtrain = vars["a"][1:1000, 1:subsample:end] |> device; # create the x test array -xtest = vars["a"][end-99:end, 1:subsample:end]; +xtest = vars["a"][end-99:end, 1:subsample:end] |> device; # Create the y training array -ytrain = vars["u"][1:1000, 1:subsample:end]; +ytrain = vars["u"][1:1000, 1:subsample:end] |> device; # Create the y test array -ytest = vars["u"][end-99:end, 
1:subsample:end]; +ytest = vars["u"][end-99:end, 1:subsample:end] |> device; # The data is missing grid data, so we create it # `collect` converts data type `range` into an array -grid = collect(range(0, 1, length=length(xtrain[1,:]))) +grid = collect(range(0, 1, length=length(xtrain[1,:]))) |> device # Merge the created grid with the data # Output has the dims: batch x grid points x 2 (a(x), x) @@ -29,40 +31,41 @@ grid = collect(range(0, 1, length=length(xtrain[1,:]))) # and concatenate them along the newly created 3rd dim xtrain = cat(reshape(xtrain,(1000,1024,1)), reshape(repeat(grid,1000),(1000,1024,1)); - dims=3) + dims=3) |> device ytrain = cat(reshape(ytrain,(1000,1024,1)), reshape(repeat(grid,1000),(1000,1024,1)); - dims=3) + dims=3) |> device # Same treatment with the test data xtest = cat(reshape(xtest,(100,1024,1)), reshape(repeat(grid,100),(100,1024,1)); - dims=3) + dims=3) |> device ytest = cat(reshape(ytest,(100,1024,1)), reshape(repeat(grid,100),(100,1024,1)); - dims=3) + dims=3) |> device # Our net wants the input in the form (2,batch,grid), though, # So we permute -xtrain, xtest = permutedims(xtrain,(3,1,2)), permutedims(xtest,(3,1,2)) -ytrain, ytest = permutedims(ytrain,(3,1,2)), permutedims(ytest,(3,1,2)) +xtrain, xtest = permutedims(xtrain,(3,1,2)), permutedims(xtest,(3,1,2)) |> device +ytrain, ytest = permutedims(ytrain,(3,1,2)), permutedims(ytest,(3,1,2)) |> device # Pass the data to the Flux DataLoader and give it a batch of 20 -train_loader = Flux.Data.DataLoader((xtrain, ytrain), batchsize=20) -test_loader = Flux.Data.DataLoader((xtest, ytest), batchsize=20) +train_loader = Flux.Data.DataLoader((xtrain, ytrain), batchsize=20) |> device +test_loader = Flux.Data.DataLoader((xtest, ytest), batchsize=20) |> device # Set up the Fourier Layer # 128 in- and outputs, batch size 20 as given above, grid size 1024 # 16 modes to keep, Οƒ activation on the gpu -layer = FourierLayer(128,128,20,1024,16,Οƒ) +layer = FourierLayer(128,128,20,1024,16,Οƒ) |> device # The whole architecture # linear transform into the latent space, 4 Fourier Layers, # then transform it back model = Chain(Dense(2,128;bias=false), layer, layer, layer, layer, - Dense(128,2;bias=false)) + Dense(128,2;bias=false)) |> device # We use the ADAM optimizer for training -opt = ADAM() +learning_rate = 0.001 +opt = ADAM(learning_rate) # Specify the model parameters parameters = params(model) @@ -71,4 +74,8 @@ parameters = params(model) loss(x,y) = Flux.Losses.mse(model(x),y) # Define a callback function that gives some output during training -evalcb() = @show(loss(x,y)) \ No newline at end of file +data = [(xtrain, ytrain)] |> device; +evalcb() = @show(loss(xtrain,ytrain)) + +# Do the training loop +Flux.@epochs 500 train!(loss, parameters, data, opt, cb = evalcb) \ No newline at end of file From 6aa630ba116938f579bf246577676745e4529dde Mon Sep 17 00:00:00 2001 From: pzimbrod Date: Fri, 5 Nov 2021 11:48:24 +0100 Subject: [PATCH 11/27] pre-allocate fourier path --- src/FourierLayer.jl | 47 +++++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/src/FourierLayer.jl b/src/FourierLayer.jl index 161c386..bcba15f 100644 --- a/src/FourierLayer.jl +++ b/src/FourierLayer.jl @@ -28,7 +28,7 @@ The output would be the diffused variable at a later time, which makes the outpu `2 x 200 x 64` as well. 
""" struct FourierLayer{F, Mf<:AbstractArray, Ml<:AbstractArray, Bf<:AbstractArray, - Bl<:AbstractArray, fplan, ifplan, + Bl<:AbstractArray, fplan<:AbstractArray, ifplan<:AbstractArray, Modes<:Int} weight_f::Mf weight_l::Ml @@ -36,14 +36,15 @@ struct FourierLayer{F, Mf<:AbstractArray, Ml<:AbstractArray, Bf<:AbstractArray, bias_l::Bl 𝔉::fplan i𝔉::ifplan + linear::ifplan Οƒ::F Ξ»::Modes # Constructor for the entire fourier layer - function FourierLayer(Wf::Mf, Wl::Ml, bf::Bf, bl::Bl, 𝔉::fplan, i𝔉::ifplan, + function FourierLayer(Wf::Mf, Wl::Ml, bf::Bf, bl::Bl, 𝔉::fplan, i𝔉::ifplan, linear::ifplan, Οƒ::F = identity, Ξ»::Modes = 12) where {Mf<:AbstractArray, Ml<:AbstractArray, - Bf<:AbstractArray, Bl<:AbstractArray, fplan, - ifplan, F, Modes<:Int} - new{F,Mf,Ml,Bf,Bl,fplan,ifplan,Modes}(Wf, Wl, bf, bl, 𝔉, i𝔉, Οƒ, Ξ») + Bf<:AbstractArray, Bl<:AbstractArray, fplan<:AbstractArray, + ifplan<:AbstractArray, F, Modes<:Int} + new{F,Mf,Ml,Bf,Bl,fplan,ifplan,Modes}(Wf, Wl, bf, bl, 𝔉, i𝔉, linear, Οƒ, Ξ») end end @@ -77,23 +78,30 @@ function FourierLayer(in::Integer, out::Integer, batch::Integer, grid::Integer, # First, an ugly workaround: FFTW.jl passes keywords that cuFFT complains about when the # constructor is wrapped with |> gpu. Instead, you have to pass a CuArray as input to plan_rfft # Ugh. - template𝔉 = Flux.use_cuda[] == true ? CuArray{Float32}(undef,in,batch,grid) : - Array{Float32}(undef,in,batch,grid) - templatei𝔉 = Flux.use_cuda[] == true ? CuArray{Complex{Float32}}(undef,out,batch,floor(Int, grid/2 + 1)) : - Array{Complex{Float32}}(undef,out,batch,floor(Int, grid/2 + 1)) - - 𝔉 = plan_rfft(template𝔉,3) - i𝔉 = plan_irfft(templatei𝔉,grid, 3) - - return FourierLayer(Wf, Wl, bf, bl, 𝔉, i𝔉, Οƒ, Ξ») + #template𝔉 = Flux.use_cuda[] == true ? CuArray{Float32}(undef,in,batch,grid) : + # Array{Float32}(undef,in,batch,grid) + #templatei𝔉 = Flux.use_cuda[] == true ? 
CuArray{Complex{Float32}}(undef,out,batch,floor(Int, grid/2 + 1)) : + # Array{Complex{Float32}}(undef,out,batch,floor(Int, grid/2 + 1)) + + #𝔉 = plan_rfft(template𝔉,3) + #i𝔉 = plan_irfft(templatei𝔉,grid, 3) + 𝔉 = similar(Wf, out, batch, floor(Int, grid/2 + 1)) + i𝔉 = similar(Wf, out, batch, grid) + linear = similar(i𝔉) + + return FourierLayer(Wf, Wl, bf, bl, 𝔉, i𝔉, linear, Οƒ, Ξ») end +# Only train the weight array with non-zero modes Flux.@functor FourierLayer +#Flux.trainable(a::FourierLayer) = (a.weight_f[:,:,1:a.Ξ»], a.weight_l, a.bias_f, a.bias_l) # The actual layer that does stuff function (a::FourierLayer)(x::AbstractArray) # Assign the parameters - Wf, Wl, bf, bl, Οƒ, 𝔉, i𝔉 = a.weight_f, a.weight_l, a.bias_f, a.bias_l, a.Οƒ, a.𝔉, a.i𝔉 + Wf, Wl, bf, bl, Οƒ, = a.weight_f, a.weight_l, a.bias_f, a.bias_l, a.Οƒ + 𝔉, i𝔉 = a.𝔉, a.i𝔉 + linear = a.linear grid = size(x,3) # The linear path @@ -104,17 +112,18 @@ function (a::FourierLayer)(x::AbstractArray) # x -> 𝔉 -> Wf -> i𝔉 # Do the Fourier transform (FFT) along the last axis of the input # fourier = 𝔉 * x - fourier = rfft(x,3) + 𝔉 = rfft(x,3) # Multiply the weight matrix with the input using batched multiplication - fourier = Wf ⊠ fourier .+ bf + 𝔉 = Wf ⊠ 𝔉 .+ bf # Do the inverse transform # fourier = i𝔉 * fourier - fourier = irfft(fourier, grid, 3) + i𝔉 = irfft(𝔉, grid, 3) # Return the activated sum - return Οƒ.(linear + fourier) + # return Οƒ.((Wl ⊠ x .+ bl) + irfft((Wf ⊠ rfft(x,3) .+ bf),grid,3)) + return Οƒ.(linear + i𝔉) end # Overload function to deal with higher-dimensional input arrays From b85e6930bce833dc66a929e8e68e89671c724569 Mon Sep 17 00:00:00 2001 From: pzimbrod Date: Mon, 8 Nov 2021 08:55:45 +0100 Subject: [PATCH 12/27] make only fourier modes trainable --- src/FourierLayer.jl | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/FourierLayer.jl b/src/FourierLayer.jl index bcba15f..a2cc98c 100644 --- a/src/FourierLayer.jl +++ b/src/FourierLayer.jl @@ -30,10 +30,10 @@ The output would be the diffused variable at a later time, which makes the outpu struct FourierLayer{F, Mf<:AbstractArray, Ml<:AbstractArray, Bf<:AbstractArray, Bl<:AbstractArray, fplan<:AbstractArray, ifplan<:AbstractArray, Modes<:Int} - weight_f::Mf - weight_l::Ml - bias_f::Bf - bias_l::Bl + Wf::Mf + Wl::Ml + bf::Bf + bl::Bl 𝔉::fplan i𝔉::ifplan linear::ifplan @@ -86,20 +86,21 @@ function FourierLayer(in::Integer, out::Integer, batch::Integer, grid::Integer, #𝔉 = plan_rfft(template𝔉,3) #i𝔉 = plan_irfft(templatei𝔉,grid, 3) 𝔉 = similar(Wf, out, batch, floor(Int, grid/2 + 1)) - i𝔉 = similar(Wf, out, batch, grid) + i𝔉 = similar(Wl, out, batch, grid) linear = similar(i𝔉) return FourierLayer(Wf, Wl, bf, bl, 𝔉, i𝔉, linear, Οƒ, Ξ») end # Only train the weight array with non-zero modes -Flux.@functor FourierLayer -#Flux.trainable(a::FourierLayer) = (a.weight_f[:,:,1:a.Ξ»], a.weight_l, a.bias_f, a.bias_l) +Flux.@functor FourierLayer (Wf, Wl, bf, bl) +Flux.trainable(a::FourierLayer) = (a.weight_f[:,:,1:a.Ξ»], a.weight_l, + a.bias_f[:,:,1:a.Ξ»], a.bias_l) # The actual layer that does stuff function (a::FourierLayer)(x::AbstractArray) # Assign the parameters - Wf, Wl, bf, bl, Οƒ, = a.weight_f, a.weight_l, a.bias_f, a.bias_l, a.Οƒ + Wf, Wl, bf, bl, Οƒ, = a.Wf, a.Wl, a.bf, a.bl, a.Οƒ 𝔉, i𝔉 = a.𝔉, a.i𝔉 linear = a.linear grid = size(x,3) @@ -131,11 +132,10 @@ end # Print nicely function Base.show(io::IO, l::FourierLayer) - print(io, "FourierLayer with\nConvolution path: (", size(l.weight_f, 2), ", ", - size(l.weight_f, 1), ", ", 
size(l.weight_f, 3)) + print(io, "FourierLayer with\nConvolution path: (", size(l.Wf, 2), ", ", + size(l.Wf, 1), ", ", size(l.Wf, 3)) print(io, ")\n") - print(io, "Linear path: (", size(l.weight_l, 2), ", ", size(l.weight_l, 1), ", ", - size(l.weight_l, 3)) + print(io, "Linear path: (", size(l.Wl, 2), ", ", size(l.Wl, 1)) print(io, ")\n") print(io, "Fourier modes: ", l.Ξ») print(io, "\n") From d66d0b9caec27325856cd2af8884dc1b0c107c7e Mon Sep 17 00:00:00 2001 From: pzimbrod Date: Mon, 8 Nov 2021 09:00:57 +0100 Subject: [PATCH 13/27] update test syntax --- test/fourierlayer.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/fourierlayer.jl b/test/fourierlayer.jl index 0b48159..d2acaa6 100644 --- a/test/fourierlayer.jl +++ b/test/fourierlayer.jl @@ -2,8 +2,8 @@ using Test, Random, Flux @testset "FourierLayer" begin # Test the proper construction - @test size(FourierLayer(128, 64, 200, 100, 20).weight_f) == (128, 64, 51) - @test size(FourierLayer(128, 64, 200, 100, 20).weight_l) == (64, 128) + @test size(FourierLayer(128, 64, 200, 100, 20).Wf) == (128, 64, 51) + @test size(FourierLayer(128, 64, 200, 100, 20).Wl) == (64, 128) #@test size(FourierLayer(10, 100).bias_f) == (51,) #@test size(FourierLayer(10, 100).bias_l) == (100,) From 353dcfa938ce0dd0f5c4f178089a8ee9ca744ba9 Mon Sep 17 00:00:00 2001 From: pzimbrod <76100656+pzimbrod@users.noreply.github.com> Date: Mon, 8 Nov 2021 09:31:59 +0100 Subject: [PATCH 14/27] correct trainable parameter syntax --- src/batched.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/batched.jl b/src/batched.jl index e20c848..1aff86d 100644 --- a/src/batched.jl +++ b/src/batched.jl @@ -36,4 +36,4 @@ function NNlib.batched_mul!(y::DenseCuArray{Ty}, p::CuFFTPlan{T,K,false}, x::Den @views CUFFT.unsafe_execute!(p, x[:,:,k] ,y[:,:,k]) end return y -end \ No newline at end of file +end From f4deb179b44c354929ee08db38de1884921feba1 Mon Sep 17 00:00:00 2001 From: pzimbrod <76100656+pzimbrod@users.noreply.github.com> Date: Mon, 8 Nov 2021 09:32:12 +0100 Subject: [PATCH 15/27] correct trainable parameter syntax --- src/FourierLayer.jl | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/FourierLayer.jl b/src/FourierLayer.jl index a2cc98c..01c8caa 100644 --- a/src/FourierLayer.jl +++ b/src/FourierLayer.jl @@ -28,8 +28,7 @@ The output would be the diffused variable at a later time, which makes the outpu `2 x 200 x 64` as well. 
""" struct FourierLayer{F, Mf<:AbstractArray, Ml<:AbstractArray, Bf<:AbstractArray, - Bl<:AbstractArray, fplan<:AbstractArray, ifplan<:AbstractArray, - Modes<:Int} + Bl<:AbstractArray, fplan<:AbstractArray, ifplan<:AbstractArray} Wf::Mf Wl::Ml bf::Bf @@ -38,13 +37,13 @@ struct FourierLayer{F, Mf<:AbstractArray, Ml<:AbstractArray, Bf<:AbstractArray, i𝔉::ifplan linear::ifplan Οƒ::F - Ξ»::Modes + Ξ»::Int # Constructor for the entire fourier layer function FourierLayer(Wf::Mf, Wl::Ml, bf::Bf, bl::Bl, 𝔉::fplan, i𝔉::ifplan, linear::ifplan, - Οƒ::F = identity, Ξ»::Modes = 12) where {Mf<:AbstractArray, Ml<:AbstractArray, + Οƒ::F = identity, Ξ»::Int = 12) where {Mf<:AbstractArray, Ml<:AbstractArray, Bf<:AbstractArray, Bl<:AbstractArray, fplan<:AbstractArray, - ifplan<:AbstractArray, F, Modes<:Int} - new{F,Mf,Ml,Bf,Bl,fplan,ifplan,Modes}(Wf, Wl, bf, bl, 𝔉, i𝔉, linear, Οƒ, Ξ») + ifplan<:AbstractArray, F} + new{F,Mf,Ml,Bf,Bl,fplan,ifplan}(Wf, Wl, bf, bl, 𝔉, i𝔉, linear, Οƒ, Ξ») end end @@ -94,8 +93,7 @@ end # Only train the weight array with non-zero modes Flux.@functor FourierLayer (Wf, Wl, bf, bl) -Flux.trainable(a::FourierLayer) = (a.weight_f[:,:,1:a.Ξ»], a.weight_l, - a.bias_f[:,:,1:a.Ξ»], a.bias_l) +Flux.trainable(a::FourierLayer) = (a.Wf[:,:,1:a.Ξ»], a.Wl, a.bf[:,:,1:a.Ξ»], a.bl) # The actual layer that does stuff function (a::FourierLayer)(x::AbstractArray) From a925df7075216833753818ddc3d47933ebaf0d5a Mon Sep 17 00:00:00 2001 From: pzimbrod <76100656+pzimbrod@users.noreply.github.com> Date: Mon, 8 Nov 2021 15:24:10 +0100 Subject: [PATCH 16/27] =?UTF-8?q?=F0=9F=91=A8=E2=80=8D=F0=9F=8F=AB=20intro?= =?UTF-8?q?duce=20strict=20typing=20for=20struct?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/FourierLayer.jl | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/src/FourierLayer.jl b/src/FourierLayer.jl index 01c8caa..0689c3d 100644 --- a/src/FourierLayer.jl +++ b/src/FourierLayer.jl @@ -27,23 +27,24 @@ So the input takes the dimension `2 x 200 x 64`. The output would be the diffused variable at a later time, which makes the output of the form `2 x 200 x 64` as well. 
""" -struct FourierLayer{F, Mf<:AbstractArray, Ml<:AbstractArray, Bf<:AbstractArray, - Bl<:AbstractArray, fplan<:AbstractArray, ifplan<:AbstractArray} - Wf::Mf - Wl::Ml - bf::Bf - bl::Bl - 𝔉::fplan - i𝔉::ifplan - linear::ifplan +struct FourierLayer{F,Tc<:Complex{<:AbstractFloat},Tr<:AbstractFloat} + # F: Activation, Tc/Tr: Complex/Real eltype + Wf::AbstractArray{Tc,3} + Wl::AbstractMatrix{Tr} + bf::AbstractArray{Tc,3} + bl::AbstractArray{Tr,3} + 𝔉::AbstractArray{Tc,3} + i𝔉::AbstractArray{Tr,3} + linear::AbstractArray{Tr,3} Οƒ::F Ξ»::Int # Constructor for the entire fourier layer - function FourierLayer(Wf::Mf, Wl::Ml, bf::Bf, bl::Bl, 𝔉::fplan, i𝔉::ifplan, linear::ifplan, - Οƒ::F = identity, Ξ»::Int = 12) where {Mf<:AbstractArray, Ml<:AbstractArray, - Bf<:AbstractArray, Bl<:AbstractArray, fplan<:AbstractArray, - ifplan<:AbstractArray, F} - new{F,Mf,Ml,Bf,Bl,fplan,ifplan}(Wf, Wl, bf, bl, 𝔉, i𝔉, linear, Οƒ, Ξ») + function FourierLayer( + Wf::AbstractArray{Tc,3}, Wl::AbstractMatrix{Tr}, bf::AbstractArray{Tc,3}, + bl::AbstractArray{Tr,3}, 𝔉::AbstractArray{Tc,3}, i𝔉::AbstractArray{Tr,3}, + linear::AbstractArray{Tr,3}, Οƒ::F = identity, Ξ»::Int = 12 + ) where {F,Tc<:Complex{<:AbstractFloat},Tr<:AbstractFloat} + new{F,Tc,Tr}(Wf, Wl, bf, bl, 𝔉, i𝔉, linear, Οƒ, Ξ») end end From adcbd4c6c8968842fa5bc8328166fd148d410f60 Mon Sep 17 00:00:00 2001 From: pzimbrod <76100656+pzimbrod@users.noreply.github.com> Date: Mon, 8 Nov 2021 15:29:47 +0100 Subject: [PATCH 17/27] use the allocated arrays --- src/FourierLayer.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/FourierLayer.jl b/src/FourierLayer.jl index 0689c3d..98c79df 100644 --- a/src/FourierLayer.jl +++ b/src/FourierLayer.jl @@ -106,20 +106,20 @@ function (a::FourierLayer)(x::AbstractArray) # The linear path # x -> Wl - linear = Wl ⊠ x .+ bl + linear .= Wl ⊠ x .+ bl # The convolution path # x -> 𝔉 -> Wf -> i𝔉 # Do the Fourier transform (FFT) along the last axis of the input # fourier = 𝔉 * x - 𝔉 = rfft(x,3) + 𝔉 .= rfft(x,3) # Multiply the weight matrix with the input using batched multiplication - 𝔉 = Wf ⊠ 𝔉 .+ bf + 𝔉 .= Wf ⊠ 𝔉 .+ bf # Do the inverse transform # fourier = i𝔉 * fourier - i𝔉 = irfft(𝔉, grid, 3) + i𝔉 .= irfft(𝔉, grid, 3) # Return the activated sum # return Οƒ.((Wl ⊠ x .+ bl) + irfft((Wf ⊠ rfft(x,3) .+ bf),grid,3)) From 269fd9b6a386c906c94702142640cec539ec18ba Mon Sep 17 00:00:00 2001 From: pzimbrod <76100656+pzimbrod@users.noreply.github.com> Date: Mon, 8 Nov 2021 15:49:53 +0100 Subject: [PATCH 18/27] revert dot operations as this breaks CUDA support --- src/FourierLayer.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/FourierLayer.jl b/src/FourierLayer.jl index 98c79df..0689c3d 100644 --- a/src/FourierLayer.jl +++ b/src/FourierLayer.jl @@ -106,20 +106,20 @@ function (a::FourierLayer)(x::AbstractArray) # The linear path # x -> Wl - linear .= Wl ⊠ x .+ bl + linear = Wl ⊠ x .+ bl # The convolution path # x -> 𝔉 -> Wf -> i𝔉 # Do the Fourier transform (FFT) along the last axis of the input # fourier = 𝔉 * x - 𝔉 .= rfft(x,3) + 𝔉 = rfft(x,3) # Multiply the weight matrix with the input using batched multiplication - 𝔉 .= Wf ⊠ 𝔉 .+ bf + 𝔉 = Wf ⊠ 𝔉 .+ bf # Do the inverse transform # fourier = i𝔉 * fourier - i𝔉 .= irfft(𝔉, grid, 3) + i𝔉 = irfft(𝔉, grid, 3) # Return the activated sum # return Οƒ.((Wl ⊠ x .+ bl) + irfft((Wf ⊠ rfft(x,3) .+ bf),grid,3)) From 6a9bbcbb3ea4e1473c7a9433f8cae9111a680297 Mon Sep 17 00:00:00 2001 From: pzimbrod Date: Tue, 9 Nov 2021 07:37:31 +0100 Subject: [PATCH 
19/27] some cleanups, fix # of output dims in burgers.jl --- src/FourierLayer.jl | 17 ++--------------- test/burgers.jl | 2 +- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/src/FourierLayer.jl b/src/FourierLayer.jl index 0689c3d..cbb7faf 100644 --- a/src/FourierLayer.jl +++ b/src/FourierLayer.jl @@ -67,24 +67,14 @@ function FourierLayer(in::Integer, out::Integer, batch::Integer, grid::Integer, # Initialize Linear weight matrix Wl = initl(out, in) + # create the biases with one singleton dimension bf = Flux.create_bias(Wf, bias_fourier, out, 1, floor(Int, grid/2 + 1)) bl = Flux.create_bias(Wl, bias_linear, out, 1, grid) # Pass the modes for output Ξ» = modes - # Create linear operators for the FFT and IFFT for efficiency - # So that it has to be only pre-allocated once - # First, an ugly workaround: FFTW.jl passes keywords that cuFFT complains about when the - # constructor is wrapped with |> gpu. Instead, you have to pass a CuArray as input to plan_rfft - # Ugh. - #template𝔉 = Flux.use_cuda[] == true ? CuArray{Float32}(undef,in,batch,grid) : - # Array{Float32}(undef,in,batch,grid) - #templatei𝔉 = Flux.use_cuda[] == true ? CuArray{Complex{Float32}}(undef,out,batch,floor(Int, grid/2 + 1)) : - # Array{Complex{Float32}}(undef,out,batch,floor(Int, grid/2 + 1)) - - #𝔉 = plan_rfft(template𝔉,3) - #i𝔉 = plan_irfft(templatei𝔉,grid, 3) + # Pre-allocate the interim arrays for the forward pass 𝔉 = similar(Wf, out, batch, floor(Int, grid/2 + 1)) i𝔉 = similar(Wl, out, batch, grid) linear = similar(i𝔉) @@ -111,18 +101,15 @@ function (a::FourierLayer)(x::AbstractArray) # The convolution path # x -> 𝔉 -> Wf -> i𝔉 # Do the Fourier transform (FFT) along the last axis of the input - # fourier = 𝔉 * x 𝔉 = rfft(x,3) # Multiply the weight matrix with the input using batched multiplication 𝔉 = Wf ⊠ 𝔉 .+ bf # Do the inverse transform - # fourier = i𝔉 * fourier i𝔉 = irfft(𝔉, grid, 3) # Return the activated sum - # return Οƒ.((Wl ⊠ x .+ bl) + irfft((Wf ⊠ rfft(x,3) .+ bf),grid,3)) return Οƒ.(linear + i𝔉) end diff --git a/test/burgers.jl b/test/burgers.jl index a3955cf..0e826bc 100644 --- a/test/burgers.jl +++ b/test/burgers.jl @@ -61,7 +61,7 @@ layer = FourierLayer(128,128,20,1024,16,Οƒ) |> device # linear transform into the latent space, 4 Fourier Layers, # then transform it back model = Chain(Dense(2,128;bias=false), layer, layer, layer, layer, - Dense(128,2;bias=false)) |> device + Dense(128,1;bias=false)) |> device # We use the ADAM optimizer for training learning_rate = 0.001 From 1383209d532e8a3bfeac8848063804a5b6f86981 Mon Sep 17 00:00:00 2001 From: pzimbrod Date: Tue, 9 Nov 2021 09:17:38 +0100 Subject: [PATCH 20/27] use allocation in batch multiplication, fix gpu transfer of arrays --- Project.toml | 1 + src/FourierLayer.jl | 19 +++++++++---------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Project.toml b/Project.toml index f0a85f4..83e7b6d 100644 --- a/Project.toml +++ b/Project.toml @@ -8,6 +8,7 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" MAT = "23992714-dd62-5051-b70f-ba57cb901cac" +NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" PkgTemplates = "14b8a8f1-9102-5b29-a752-f990bacb7fe1" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Revise = "295af30f-e4ad-537b-8983-00126c2a3abe" diff --git a/src/FourierLayer.jl b/src/FourierLayer.jl index cbb7faf..a2faa29 100644 --- a/src/FourierLayer.jl +++ b/src/FourierLayer.jl @@ -30,7 +30,7 @@ The output would be the diffused variable at a 
later time, which makes the outpu struct FourierLayer{F,Tc<:Complex{<:AbstractFloat},Tr<:AbstractFloat} # F: Activation, Tc/Tr: Complex/Real eltype Wf::AbstractArray{Tc,3} - Wl::AbstractMatrix{Tr} + Wl::AbstractArray{Tr,3} bf::AbstractArray{Tc,3} bl::AbstractArray{Tr,3} 𝔉::AbstractArray{Tc,3} @@ -40,7 +40,7 @@ struct FourierLayer{F,Tc<:Complex{<:AbstractFloat},Tr<:AbstractFloat} Ξ»::Int # Constructor for the entire fourier layer function FourierLayer( - Wf::AbstractArray{Tc,3}, Wl::AbstractMatrix{Tr}, bf::AbstractArray{Tc,3}, + Wf::AbstractArray{Tc,3}, Wl::AbstractArray{Tr,3}, bf::AbstractArray{Tc,3}, bl::AbstractArray{Tr,3}, 𝔉::AbstractArray{Tc,3}, i𝔉::AbstractArray{Tr,3}, linear::AbstractArray{Tr,3}, Οƒ::F = identity, Ξ»::Int = 12 ) where {F,Tc<:Complex{<:AbstractFloat},Tr<:AbstractFloat} @@ -65,7 +65,7 @@ function FourierLayer(in::Integer, out::Integer, batch::Integer, grid::Integer, Wf = pad_zeros(Wf, (0, floor(Int, grid/2 + 1) - modes), dims=3) # Initialize Linear weight matrix - Wl = initl(out, in) + Wl = initl(out, in, 1) # create the biases with one singleton dimension bf = Flux.create_bias(Wf, bias_fourier, out, 1, floor(Int, grid/2 + 1)) @@ -73,17 +73,16 @@ function FourierLayer(in::Integer, out::Integer, batch::Integer, grid::Integer, # Pass the modes for output Ξ» = modes - - # Pre-allocate the interim arrays for the forward pass - 𝔉 = similar(Wf, out, batch, floor(Int, grid/2 + 1)) - i𝔉 = similar(Wl, out, batch, grid) +# Pre-allocate the interim arrays for the forward pass + 𝔉 = Array{ComplexF32}(undef, out, batch, floor(Int, grid/2 + 1)) + i𝔉 = Array{Float32}(undef, out, batch, grid) linear = similar(i𝔉) return FourierLayer(Wf, Wl, bf, bl, 𝔉, i𝔉, linear, Οƒ, Ξ») end # Only train the weight array with non-zero modes -Flux.@functor FourierLayer (Wf, Wl, bf, bl) +Flux.@functor FourierLayer Flux.trainable(a::FourierLayer) = (a.Wf[:,:,1:a.Ξ»], a.Wl, a.bf[:,:,1:a.Ξ»], a.bl) # The actual layer that does stuff @@ -96,7 +95,7 @@ function (a::FourierLayer)(x::AbstractArray) # The linear path # x -> Wl - linear = Wl ⊠ x .+ bl + linear .= batched_mul!(linear, Wl, x) .+ bl # The convolution path # x -> 𝔉 -> Wf -> i𝔉 @@ -104,7 +103,7 @@ function (a::FourierLayer)(x::AbstractArray) 𝔉 = rfft(x,3) # Multiply the weight matrix with the input using batched multiplication - 𝔉 = Wf ⊠ 𝔉 .+ bf + 𝔉 .= batched_mul!(𝔉, Wf, 𝔉) .+ bf # Do the inverse transform i𝔉 = irfft(𝔉, grid, 3) From 59ba985fdd4a1d9ce4752adf052aa7f9e8a7b354 Mon Sep 17 00:00:00 2001 From: pzimbrod Date: Tue, 9 Nov 2021 09:23:12 +0100 Subject: [PATCH 21/27] fix test dim of Wl --- test/fourierlayer.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/fourierlayer.jl b/test/fourierlayer.jl index d2acaa6..a6c5d01 100644 --- a/test/fourierlayer.jl +++ b/test/fourierlayer.jl @@ -3,7 +3,7 @@ using Test, Random, Flux @testset "FourierLayer" begin # Test the proper construction @test size(FourierLayer(128, 64, 200, 100, 20).Wf) == (128, 64, 51) - @test size(FourierLayer(128, 64, 200, 100, 20).Wl) == (64, 128) + @test size(FourierLayer(128, 64, 200, 100, 20).Wl) == (64, 128, 1) #@test size(FourierLayer(10, 100).bias_f) == (51,) #@test size(FourierLayer(10, 100).bias_l) == (100,) From 6e0c24dbd5b016ae2c6b44588d24a8f1ffafee5f Mon Sep 17 00:00:00 2001 From: pzimbrod Date: Tue, 9 Nov 2021 11:18:32 +0100 Subject: [PATCH 22/27] swap batch dim to ensure compatibility with Flux.DataLoader --- benchmarks/benchFourierLayer.jl | 2 +- src/FourierLayer.jl | 23 +++++++++++------------ test/burgers.jl | 11 +++++------ 3 files changed, 17 
insertions(+), 19 deletions(-) diff --git a/benchmarks/benchFourierLayer.jl b/benchmarks/benchFourierLayer.jl index 6460026..71e6e31 100644 --- a/benchmarks/benchFourierLayer.jl +++ b/benchmarks/benchFourierLayer.jl @@ -1,7 +1,7 @@ # Stolen from Flux as well for n in [2, 20, 200, 2000] - x = randn(Float32, n, 2000, n) + x = randn(Float32, n, 2000, 100) model = FourierLayer(n, n, 2000, 100, 16) println("CPU n=$n") run_benchmark(model, x, cuda=false) diff --git a/src/FourierLayer.jl b/src/FourierLayer.jl index a2faa29..9641b63 100644 --- a/src/FourierLayer.jl +++ b/src/FourierLayer.jl @@ -12,9 +12,9 @@ The output though only contains the relevant Fourier modes with the rest padded in the last axis as a result of the filtering. The input `x` should be a 3D tensor of shape -(num parameters (`in`) x batch size (`batch`) x num grid points (`grid`)) +(num parameters (`in`) x num grid points (`grid`) x batch size (`batch`)) The output `y` will be a 3D tensor of shape -(`out` x batch size (`batch`) x num grid points (`grid`)) +(`out` x num grid points (`grid`) x batch size (`batch`)) You can specify biases for the paths as you like, though the convolutional path is originally not intended to perform an affine transformation. @@ -23,7 +23,7 @@ originally not intended to perform an affine transformation. Say you're considering a 1D diffusion problem on a 64 point grid. The input is comprised of the grid points as well as the IC at this point. The data consists of 200 instances of the solution. -So the input takes the dimension `2 x 200 x 64`. +So the input takes the dimension `2 x 64 x 200`. The output would be the diffused variable at a later time, which makes the output of the form `2 x 200 x 64` as well. """ @@ -69,13 +69,13 @@ function FourierLayer(in::Integer, out::Integer, batch::Integer, grid::Integer, # create the biases with one singleton dimension bf = Flux.create_bias(Wf, bias_fourier, out, 1, floor(Int, grid/2 + 1)) - bl = Flux.create_bias(Wl, bias_linear, out, 1, grid) + bl = Flux.create_bias(Wl, bias_linear, out, grid, 1) # Pass the modes for output Ξ» = modes -# Pre-allocate the interim arrays for the forward pass + # Pre-allocate the interim arrays for the forward pass 𝔉 = Array{ComplexF32}(undef, out, batch, floor(Int, grid/2 + 1)) - i𝔉 = Array{Float32}(undef, out, batch, grid) + i𝔉 = Array{Float32}(undef, out, grid, batch) linear = similar(i𝔉) return FourierLayer(Wf, Wl, bf, bl, 𝔉, i𝔉, linear, Οƒ, Ξ») @@ -91,7 +91,6 @@ function (a::FourierLayer)(x::AbstractArray) Wf, Wl, bf, bl, Οƒ, = a.Wf, a.Wl, a.bf, a.bl, a.Οƒ 𝔉, i𝔉 = a.𝔉, a.i𝔉 linear = a.linear - grid = size(x,3) # The linear path # x -> Wl @@ -99,14 +98,14 @@ function (a::FourierLayer)(x::AbstractArray) # The convolution path # x -> 𝔉 -> Wf -> i𝔉 - # Do the Fourier transform (FFT) along the last axis of the input - 𝔉 = rfft(x,3) - + # Do the Fourier transform (FFT) along the grid dimension of the input and # Multiply the weight matrix with the input using batched multiplication - 𝔉 .= batched_mul!(𝔉, Wf, 𝔉) .+ bf + # We need to permute the input, otherwise batching won't work + 𝔉 .= batched_mul!(𝔉, Wf, rfft(permutedims(x, [1,3,2]),3)) .+ bf # Do the inverse transform - i𝔉 = irfft(𝔉, grid, 3) + # We need to permute back to match the shape of the linear path + i𝔉 = irfft(permutedims(𝔉, [1,3,2]), size(x,2), 2) # Return the activated sum return Οƒ.(linear + i𝔉) diff --git a/test/burgers.jl b/test/burgers.jl index 0e826bc..4c70725 100644 --- a/test/burgers.jl +++ b/test/burgers.jl @@ -49,13 +49,13 @@ xtrain, xtest = 
permutedims(xtrain,(3,1,2)), permutedims(xtest,(3,1,2)) |> devic ytrain, ytest = permutedims(ytrain,(3,1,2)), permutedims(ytest,(3,1,2)) |> device # Pass the data to the Flux DataLoader and give it a batch of 20 -train_loader = Flux.Data.DataLoader((xtrain, ytrain), batchsize=20) |> device -test_loader = Flux.Data.DataLoader((xtest, ytest), batchsize=20) |> device +train_loader = Flux.Data.DataLoader((xtrain, ytrain), batchsize=20, shuffle=true) |> device +test_loader = Flux.Data.DataLoader((xtest, ytest), batchsize=20, shuffle=true) |> device # Set up the Fourier Layer # 128 in- and outputs, batch size 20 as given above, grid size 1024 # 16 modes to keep, Οƒ activation on the gpu -layer = FourierLayer(128,128,20,1024,16,Οƒ) |> device +layer = FourierLayer(128,128,20,1024,16,Οƒ,bias_fourier=false) |> device # The whole architecture # linear transform into the latent space, 4 Fourier Layers, @@ -74,8 +74,7 @@ parameters = params(model) loss(x,y) = Flux.Losses.mse(model(x),y) # Define a callback function that gives some output during training -data = [(xtrain, ytrain)] |> device; -evalcb() = @show(loss(xtrain,ytrain)) +evalcb() = @show(loss(x,y)) # Do the training loop -Flux.@epochs 500 train!(loss, parameters, data, opt, cb = evalcb) \ No newline at end of file +Flux.@epochs 500 train!(loss, parameters, train_loader, opt, cb = evalcb) \ No newline at end of file From 1b84fc15d2f3cec6425fd62cefed02ae862fa94b Mon Sep 17 00:00:00 2001 From: pzimbrod Date: Tue, 9 Nov 2021 11:42:59 +0100 Subject: [PATCH 23/27] fix bias creation --- src/FourierLayer.jl | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/FourierLayer.jl b/src/FourierLayer.jl index 9641b63..22af049 100644 --- a/src/FourierLayer.jl +++ b/src/FourierLayer.jl @@ -27,24 +27,28 @@ So the input takes the dimension `2 x 64 x 200`. The output would be the diffused variable at a later time, which makes the output of the form `2 x 200 x 64` as well. 
""" -struct FourierLayer{F,Tc<:Complex{<:AbstractFloat},Tr<:AbstractFloat} +struct FourierLayer{F,Tc<:Complex{<:AbstractFloat},Tr<:AbstractFloat,Bf,Bl} # F: Activation, Tc/Tr: Complex/Real eltype Wf::AbstractArray{Tc,3} Wl::AbstractArray{Tr,3} - bf::AbstractArray{Tc,3} - bl::AbstractArray{Tr,3} 𝔉::AbstractArray{Tc,3} i𝔉::AbstractArray{Tr,3} linear::AbstractArray{Tr,3} Οƒ::F Ξ»::Int + bf::Bf + bl::Bl # Constructor for the entire fourier layer function FourierLayer( - Wf::AbstractArray{Tc,3}, Wl::AbstractArray{Tr,3}, bf::AbstractArray{Tc,3}, - bl::AbstractArray{Tr,3}, 𝔉::AbstractArray{Tc,3}, i𝔉::AbstractArray{Tr,3}, - linear::AbstractArray{Tr,3}, Οƒ::F = identity, Ξ»::Int = 12 - ) where {F,Tc<:Complex{<:AbstractFloat},Tr<:AbstractFloat} - new{F,Tc,Tr}(Wf, Wl, bf, bl, 𝔉, i𝔉, linear, Οƒ, Ξ») + Wf::AbstractArray{Tc,3}, Wl::AbstractArray{Tr,3}, 𝔉::AbstractArray{Tc,3}, + i𝔉::AbstractArray{Tr,3}, linear::AbstractArray{Tr,3}, Οƒ::F = identity, + Ξ»::Int = 12, bf = true, bl = true) where + {F,Tc<:Complex{<:AbstractFloat},Tr<:AbstractFloat} + + # create the biases with one singleton dimension + bf = Flux.create_bias(Wf, bf, size(Wf,2), 1, size(Wf,3)) + bl = Flux.create_bias(Wl, bl, size(Wl,1), size(Wf,3), 1) + new{F,Tc,Tr,typeof(bf),typeof(bl)}(Wf, Wl, 𝔉, i𝔉, linear, Οƒ, Ξ», bf, bl) end end @@ -67,9 +71,9 @@ function FourierLayer(in::Integer, out::Integer, batch::Integer, grid::Integer, # Initialize Linear weight matrix Wl = initl(out, in, 1) - # create the biases with one singleton dimension - bf = Flux.create_bias(Wf, bias_fourier, out, 1, floor(Int, grid/2 + 1)) - bl = Flux.create_bias(Wl, bias_linear, out, grid, 1) + # Pass the bias bools + bf = bias_fourier + bl = bias_linear # Pass the modes for output Ξ» = modes @@ -78,7 +82,7 @@ function FourierLayer(in::Integer, out::Integer, batch::Integer, grid::Integer, i𝔉 = Array{Float32}(undef, out, grid, batch) linear = similar(i𝔉) - return FourierLayer(Wf, Wl, bf, bl, 𝔉, i𝔉, linear, Οƒ, Ξ») + return FourierLayer(Wf, Wl, 𝔉, i𝔉, linear, Οƒ, Ξ», bf, bl) end # Only train the weight array with non-zero modes From 47d1798701faa4dabf83c17fca12c3928391236c Mon Sep 17 00:00:00 2001 From: pzimbrod Date: Tue, 9 Nov 2021 11:51:54 +0100 Subject: [PATCH 24/27] fix params assignment of bias --- src/FourierLayer.jl | 4 +++- test/burgers.jl | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/FourierLayer.jl b/src/FourierLayer.jl index 22af049..31dbf2b 100644 --- a/src/FourierLayer.jl +++ b/src/FourierLayer.jl @@ -87,7 +87,9 @@ end # Only train the weight array with non-zero modes Flux.@functor FourierLayer -Flux.trainable(a::FourierLayer) = (a.Wf[:,:,1:a.Ξ»], a.Wl, a.bf[:,:,1:a.Ξ»], a.bl) +Flux.trainable(a::FourierLayer) = (a.Wf[:,:,1:a.Ξ»], a.Wl, + a.bf != Flux.zeros ? a.bf[:,:,1:a.Ξ»] : nothing, + a.bl != Flux.zeros ? 
a.bl : nothing) # The actual layer that does stuff function (a::FourierLayer)(x::AbstractArray) diff --git a/test/burgers.jl b/test/burgers.jl index 4c70725..7c7c835 100644 --- a/test/burgers.jl +++ b/test/burgers.jl @@ -55,7 +55,7 @@ test_loader = Flux.Data.DataLoader((xtest, ytest), batchsize=20, shuffle=true) | # Set up the Fourier Layer # 128 in- and outputs, batch size 20 as given above, grid size 1024 # 16 modes to keep, Οƒ activation on the gpu -layer = FourierLayer(128,128,20,1024,16,Οƒ,bias_fourier=false) |> device +layer = FourierLayer(128,128,20,1024,16,gelu,bias_fourier=false) |> device # The whole architecture # linear transform into the latent space, 4 Fourier Layers, From 6deedc109fb7d51c4adc963f81f079a7af1f8872 Mon Sep 17 00:00:00 2001 From: pzimbrod Date: Tue, 9 Nov 2021 11:58:34 +0100 Subject: [PATCH 25/27] fix params syntax error --- src/FourierLayer.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/FourierLayer.jl b/src/FourierLayer.jl index 31dbf2b..7de25b9 100644 --- a/src/FourierLayer.jl +++ b/src/FourierLayer.jl @@ -88,8 +88,8 @@ end # Only train the weight array with non-zero modes Flux.@functor FourierLayer Flux.trainable(a::FourierLayer) = (a.Wf[:,:,1:a.Ξ»], a.Wl, - a.bf != Flux.zeros ? a.bf[:,:,1:a.Ξ»] : nothing, - a.bl != Flux.zeros ? a.bl : nothing) + typeof(a.bf) != Flux.Zeros ? a.bf[:,:,1:a.Ξ»] : nothing, + typeof(a.bl) != Flux.Zeros ? a.bl : nothing) # The actual layer that does stuff function (a::FourierLayer)(x::AbstractArray) From f690d5262cc271e35ef63bcab219b03cec6a2dc7 Mon Sep 17 00:00:00 2001 From: pzimbrod Date: Tue, 9 Nov 2021 15:53:25 +0100 Subject: [PATCH 26/27] re-introduce OMEinsum, get rid of batch argument --- Project.toml | 1 + src/FourierLayer.jl | 45 ++++++++++++++++++++++--------------------- src/NeuralOperator.jl | 1 + test/burgers.jl | 8 ++++---- test/fourierlayer.jl | 12 ++++++------ 5 files changed, 35 insertions(+), 32 deletions(-) diff --git a/Project.toml b/Project.toml index 83e7b6d..069f047 100644 --- a/Project.toml +++ b/Project.toml @@ -9,6 +9,7 @@ FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" MAT = "23992714-dd62-5051-b70f-ba57cb901cac" NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" +OMEinsum = "ebe7aa44-baf0-506c-a96f-8464559b3922" PkgTemplates = "14b8a8f1-9102-5b29-a752-f990bacb7fe1" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Revise = "295af30f-e4ad-537b-8983-00126c2a3abe" diff --git a/src/FourierLayer.jl b/src/FourierLayer.jl index 7de25b9..311b9b9 100644 --- a/src/FourierLayer.jl +++ b/src/FourierLayer.jl @@ -30,25 +30,23 @@ The output would be the diffused variable at a later time, which makes the outpu struct FourierLayer{F,Tc<:Complex{<:AbstractFloat},Tr<:AbstractFloat,Bf,Bl} # F: Activation, Tc/Tr: Complex/Real eltype Wf::AbstractArray{Tc,3} - Wl::AbstractArray{Tr,3} - 𝔉::AbstractArray{Tc,3} - i𝔉::AbstractArray{Tr,3} - linear::AbstractArray{Tr,3} + Wl::AbstractMatrix{Tr} + grid::Int Οƒ::F Ξ»::Int bf::Bf bl::Bl # Constructor for the entire fourier layer function FourierLayer( - Wf::AbstractArray{Tc,3}, Wl::AbstractArray{Tr,3}, 𝔉::AbstractArray{Tc,3}, - i𝔉::AbstractArray{Tr,3}, linear::AbstractArray{Tr,3}, Οƒ::F = identity, + Wf::AbstractArray{Tc,3}, Wl::AbstractMatrix{Tr}, + grid::Int, Οƒ::F = identity, Ξ»::Int = 12, bf = true, bl = true) where {F,Tc<:Complex{<:AbstractFloat},Tr<:AbstractFloat} # create the biases with one singleton dimension - bf = Flux.create_bias(Wf, bf, size(Wf,2), 1, size(Wf,3)) - bl = Flux.create_bias(Wl, bl, size(Wl,1), 
size(Wf,3), 1) - new{F,Tc,Tr,typeof(bf),typeof(bl)}(Wf, Wl, 𝔉, i𝔉, linear, Οƒ, Ξ», bf, bl) + bf = Flux.create_bias(Wf, bf, 1, size(Wf,2), size(Wf,3)) + bl = Flux.create_bias(Wl, bl, 1, size(Wl,1), grid) + new{F,Tc,Tr,typeof(bf),typeof(bl)}(Wf, Wl, grid, Οƒ, Ξ», bf, bl) end end @@ -56,7 +54,7 @@ end # `in` and `out` refer to the dimensionality of the number of parameters # `modes` specifies the number of modes not to be filtered out # `grid` specifies the number of grid points in the data -function FourierLayer(in::Integer, out::Integer, batch::Integer, grid::Integer, modes = 12, +function FourierLayer(in::Integer, out::Integer, grid::Integer, modes = 12, Οƒ = identity; initf = cglorot_uniform, initl = Flux.glorot_uniform, bias_fourier=true, bias_linear=true) @@ -69,7 +67,7 @@ function FourierLayer(in::Integer, out::Integer, batch::Integer, grid::Integer, Wf = pad_zeros(Wf, (0, floor(Int, grid/2 + 1) - modes), dims=3) # Initialize Linear weight matrix - Wl = initl(out, in, 1) + Wl = initl(out, in) # Pass the bias bools bf = bias_fourier @@ -78,11 +76,8 @@ function FourierLayer(in::Integer, out::Integer, batch::Integer, grid::Integer, # Pass the modes for output Ξ» = modes # Pre-allocate the interim arrays for the forward pass - 𝔉 = Array{ComplexF32}(undef, out, batch, floor(Int, grid/2 + 1)) - i𝔉 = Array{Float32}(undef, out, grid, batch) - linear = similar(i𝔉) - return FourierLayer(Wf, Wl, 𝔉, i𝔉, linear, Οƒ, Ξ», bf, bl) + return FourierLayer(Wf, Wl, grid, Οƒ, Ξ», bf, bl) end # Only train the weight array with non-zero modes @@ -95,26 +90,32 @@ Flux.trainable(a::FourierLayer) = (a.Wf[:,:,1:a.Ξ»], a.Wl, function (a::FourierLayer)(x::AbstractArray) # Assign the parameters Wf, Wl, bf, bl, Οƒ, = a.Wf, a.Wl, a.bf, a.bl, a.Οƒ - 𝔉, i𝔉 = a.𝔉, a.i𝔉 - linear = a.linear + # Do a permutation: DataLoader requires batch to be the last dim + # for the rest, it's more convenient to have it in the first one + xp = permutedims(x, [3,1,2]) # The linear path # x -> Wl - linear .= batched_mul!(linear, Wl, x) .+ bl + # linear .= batched_mul!(linear, Wl, x) .+ bl + @ein linear[batch, out, grid] := Wl[out, in] * xp[batch, in, grid] + linear .+ bl # The convolution path # x -> 𝔉 -> Wf -> i𝔉 # Do the Fourier transform (FFT) along the grid dimension of the input and # Multiply the weight matrix with the input using batched multiplication - # We need to permute the input, otherwise batching won't work - 𝔉 .= batched_mul!(𝔉, Wf, rfft(permutedims(x, [1,3,2]),3)) .+ bf + # We need to permute the input to (channel,batch,grid), otherwise batching won't work + # 𝔉 .= batched_mul!(𝔉, Wf, rfft(permutedims(x, [1,3,2]),3)) .+ bf + @ein 𝔉[batch, out, grid] := Wf[in, out, grid] * rfft(xp, 3)[batch, in, grid] + 𝔉 .+ bf # Do the inverse transform # We need to permute back to match the shape of the linear path - i𝔉 = irfft(permutedims(𝔉, [1,3,2]), size(x,2), 2) + #i𝔉 = permutedims(irfft(𝔉, size(x,2), 3), [1,3,2]) + i𝔉 = irfft(𝔉, size(xp,3),3) # Return the activated sum - return Οƒ.(linear + i𝔉) + return permutedims(Οƒ.(linear + i𝔉), [2,3,1]) end # Overload function to deal with higher-dimensional input arrays diff --git a/src/NeuralOperator.jl b/src/NeuralOperator.jl index 8dc72ad..26f1ee2 100644 --- a/src/NeuralOperator.jl +++ b/src/NeuralOperator.jl @@ -8,6 +8,7 @@ using FFTW: assert_applicable, unsafe_execute!, FORWARD, BACKWARD, rFFTWPlan using Random using Random: AbstractRNG using Flux: nfan, glorot_uniform, batch +using OMEinsum export FourierLayer diff --git a/test/burgers.jl b/test/burgers.jl index 7c7c835..dcb8db0 100644 
--- a/test/burgers.jl +++ b/test/burgers.jl @@ -43,10 +43,10 @@ ytest = cat(reshape(ytest,(100,1024,1)), reshape(repeat(grid,100),(100,1024,1)); dims=3) |> device -# Our net wants the input in the form (2,batch,grid), though, +# Our net wants the input in the form (2,grid,batch), though, # So we permute -xtrain, xtest = permutedims(xtrain,(3,1,2)), permutedims(xtest,(3,1,2)) |> device -ytrain, ytest = permutedims(ytrain,(3,1,2)), permutedims(ytest,(3,1,2)) |> device +xtrain, xtest = permutedims(xtrain,(3,2,1)), permutedims(xtest,(3,2,1)) |> device +ytrain, ytest = permutedims(ytrain,(3,2,1)), permutedims(ytest,(3,2,1)) |> device # Pass the data to the Flux DataLoader and give it a batch of 20 train_loader = Flux.Data.DataLoader((xtrain, ytrain), batchsize=20, shuffle=true) |> device @@ -61,7 +61,7 @@ layer = FourierLayer(128,128,20,1024,16,gelu,bias_fourier=false) |> device # linear transform into the latent space, 4 Fourier Layers, # then transform it back model = Chain(Dense(2,128;bias=false), layer, layer, layer, layer, - Dense(128,1;bias=false)) |> device + Dense(128,2;bias=false)) |> device # We use the ADAM optimizer for training learning_rate = 0.001 diff --git a/test/fourierlayer.jl b/test/fourierlayer.jl index a6c5d01..22d8c1c 100644 --- a/test/fourierlayer.jl +++ b/test/fourierlayer.jl @@ -2,14 +2,14 @@ using Test, Random, Flux @testset "FourierLayer" begin # Test the proper construction - @test size(FourierLayer(128, 64, 200, 100, 20).Wf) == (128, 64, 51) - @test size(FourierLayer(128, 64, 200, 100, 20).Wl) == (64, 128, 1) + @test size(FourierLayer(128, 64, 100, 20).Wf) == (128, 64, 51) + @test size(FourierLayer(128, 64, 100, 20).Wl) == (64, 128, 1) #@test size(FourierLayer(10, 100).bias_f) == (51,) #@test size(FourierLayer(10, 100).bias_l) == (100,) # Accept only Int as architecture parameters - @test_throws MethodError FourierLayer(128.5, 64, 200, 100, 20) - @test_throws MethodError FourierLayer(128.5, 64, 200, 100, 20, tanh) - @test_throws AssertionError FourierLayer(100, 100, 100, 100, 60, Οƒ) - @test_throws AssertionError FourierLayer(100, 100, 100, 100, 60) + @test_throws MethodError FourierLayer(128.5, 64, 100, 20) + @test_throws MethodError FourierLayer(128.5, 64, 100, 20, tanh) + @test_throws AssertionError FourierLayer(100, 100, 100, 60, Οƒ) + @test_throws AssertionError FourierLayer(100, 100, 100, 60) end \ No newline at end of file From 75c095e1722eef904463e138aa53065f5e6057d6 Mon Sep 17 00:00:00 2001 From: pzimbrod Date: Tue, 9 Nov 2021 15:58:34 +0100 Subject: [PATCH 27/27] fix Wl dim in test & update constructor in burgers --- test/burgers.jl | 2 +- test/fourierlayer.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/burgers.jl b/test/burgers.jl index dcb8db0..f6c7794 100644 --- a/test/burgers.jl +++ b/test/burgers.jl @@ -55,7 +55,7 @@ test_loader = Flux.Data.DataLoader((xtest, ytest), batchsize=20, shuffle=true) | # Set up the Fourier Layer # 128 in- and outputs, batch size 20 as given above, grid size 1024 # 16 modes to keep, Οƒ activation on the gpu -layer = FourierLayer(128,128,20,1024,16,gelu,bias_fourier=false) |> device +layer = FourierLayer(128,128,1024,16,gelu,bias_fourier=false) |> device # The whole architecture # linear transform into the latent space, 4 Fourier Layers, diff --git a/test/fourierlayer.jl b/test/fourierlayer.jl index 22d8c1c..6b1773e 100644 --- a/test/fourierlayer.jl +++ b/test/fourierlayer.jl @@ -3,7 +3,7 @@ using Test, Random, Flux @testset "FourierLayer" begin # Test the proper construction @test 
size(FourierLayer(128, 64, 100, 20).Wf) == (128, 64, 51) - @test size(FourierLayer(128, 64, 100, 20).Wl) == (64, 128, 1) + @test size(FourierLayer(128, 64, 100, 20).Wl) == (64, 128) #@test size(FourierLayer(10, 100).bias_f) == (51,) #@test size(FourierLayer(10, 100).bias_l) == (100,)
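
A minimal usage sketch of the layer as it stands at the end of this series, assuming the
PATCH 26/27 constructor signature FourierLayer(in, out, grid, modes, σ; ...) and the
(channels x grid x batch) input layout described in the layer docstring; it mirrors the
model assembled in test/burgers.jl and is only an illustration, not part of the patches.

using Flux, NeuralOperator

grid, batch = 1024, 20
# two channels per grid point: the IC a(x) and the grid coordinate x, as in burgers.jl
x = randn(Float32, 2, grid, batch)

# 128 latent channels, 16 retained Fourier modes, gelu activation
layer = FourierLayer(128, 128, grid, 16, gelu)

# lift into the latent space, apply four Fourier layers, project back
model = Chain(Dense(2, 128; bias=false),
              layer, layer, layer, layer,
              Dense(128, 2; bias=false))

ŷ = model(x)                                  # same (2, grid, batch) layout as the input
loss(a, b) = Flux.Losses.mse(model(a), b)
opt = ADAM(0.001)
Flux.train!(loss, params(model), [(x, ŷ)], opt)   # a single dummy training step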