From 5e2d0dcafe3c5f82f14456b7d5490d952395f135 Mon Sep 17 00:00:00 2001
From: andreaskuster <mail@andreaskuster.ch>
Date: Fri, 8 Jan 2021 16:26:37 +0100
Subject: [PATCH 01/27] Add minimal bug example.

---
 bug_min.json | 174 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 174 insertions(+)
 create mode 100644 bug_min.json

diff --git a/bug_min.json b/bug_min.json
new file mode 100644
index 0000000..1c8278a
--- /dev/null
+++ b/bug_min.json
@@ -0,0 +1,174 @@
+{
+ "inputs": {
+  "crlato": {
+   "data": "crlato_128_float32.dat",
+   "data_type": "float32",
+   "input_dims": [
+    "i"
+   ]
+  }
+ },
+ "outputs": [
+  "out"
+ ],
+ "dimensions": [
+  128,
+  80,
+  128
+ ],
+ "vectorization": 1,
+ "program": {
+  "v_tmp": {
+   "data_type": "float32",
+   "computation_string": "\nv_tmp = crlato[i]\n",
+   "boundary_conditions": {
+    "crlato": {
+     "btype": "shrink",
+     "halo": [
+      "halo-2",
+      "halo-1",
+      0,
+      0,
+      "halo-1",
+      "halo-2"
+     ]
+    }
+   }
+  },
+  "u_tmp": {
+   "data_type": "float32",
+   "computation_string": "\nu_tmp = crlato[i]\n",
+   "boundary_conditions": {
+    "crlato": {
+     "btype": "shrink",
+     "halo": [
+      "halo-1",
+      "halo-2",
+      0,
+      0,
+      "halo-2",
+      "halo-1"
+     ]
+    }
+   }
+  },
+  "__tmp_T": {
+   "data_type": "float32",
+   "computation_string": "\n__tmp_T = u_tmp[(i, j, k)] + v_tmp[((i + 1), j, k)] + v_tmp[(i, j, k)]\n",
+   "boundary_conditions": {
+    "u_tmp": {
+     "btype": "shrink",
+     "halo": [
+      "halo-1",
+      "halo-1",
+      0,
+      0,
+      "halo-1",
+      "halo-1"
+     ]
+    },
+    "v_tmp": {
+     "btype": "shrink",
+     "halo": [
+      "halo-1",
+      "halo-1",
+      0,
+      0,
+      "halo-1",
+      "halo-1"
+     ]
+    }
+   }
+  },
+  "__tmp_S": {
+   "data_type": "float32",
+   "computation_string": "\n__tmp_S = v_tmp[(i, j, k)] + u_tmp[((i + 1), j, k)] + u_tmp[(i, j, k)]\n",
+   "boundary_conditions": {
+    "v_tmp": {
+     "btype": "shrink",
+     "halo": [
+      "halo-1",
+      "halo-1",
+      0,
+      0,
+      "halo-1",
+      "halo-1"
+     ]
+    },
+    "u_tmp": {
+     "btype": "shrink",
+     "halo": [
+      "halo-1",
+      "halo-1",
+      0,
+      0,
+      "halo-1",
+      "halo-1"
+     ]
+    }
+   }
+  },
+  "out": {
+   "data_type": "float32",
+   "computation_string": "\nout = __tmp_S[(i,j,k)] + __tmp_T[(i,j,k)]\n",
+   "boundary_conditions": {
+    "__tmp_S":{
+     "btype": "shrink",
+     "halo": [
+      "halo",
+      "halo",
+      0,
+      0,
+      "halo",
+      "halo"
+     ]
+    },
+    "__tmp_T": {
+     "btype": "shrink",
+     "halo": [
+      "halo",
+      "halo",
+      0,
+      0,
+      "halo",
+      "halo"
+     ]
+    }
+    }
+   }
+ },
+ "constants": {
+  "eddlat": {
+   "value": "5729.58",
+   "data_type": "float32"
+  },
+  "eddlon": {
+   "value": "5729.58",
+   "data_type": "float32"
+  },
+  "tau_smag": {
+   "value": "0.3",
+   "data_type": "float32"
+  },
+  "weight_smag": {
+   "value": "0.5",
+   "data_type": "float32"
+  },
+  "I": {
+   "value": "128",
+   "data_type": "int32"
+  },
+  "J": {
+   "value": "128",
+   "data_type": "int32"
+  },
+  "K": {
+   "value": "80",
+   "data_type": "int32"
+  },
+  "halo": {
+   "value": "1",
+   "data_type": "int32"
+  }
+ }
+}

From f4763fc83c5fbc071e7e854143e6588445fc7233 Mon Sep 17 00:00:00 2001
From: andreaskuster <mail@andreaskuster.ch>
Date: Fri, 8 Jan 2021 19:58:40 +0100
Subject: [PATCH 02/27] Further reduce minimal example.

---
 bug_min.json | 170 ++++++++++++---------------------------------------
 1 file changed, 40 insertions(+), 130 deletions(-)

diff --git a/bug_min.json b/bug_min.json
index 1c8278a..981e180 100644
--- a/bug_min.json
+++ b/bug_min.json
@@ -1,7 +1,7 @@
 {
  "inputs": {
-  "crlato": {
-   "data": "crlato_128_float32.dat",
+  "inA": {
+   "data": "inA_float32.dat",
    "data_type": "float32",
    "input_dims": [
     "i"
@@ -12,163 +12,73 @@
   "out"
  ],
  "dimensions": [
-  128,
-  80,
-  128
+  8,
+  8,
+  8
  ],
  "vectorization": 1,
  "program": {
-  "v_tmp": {
+  "k0": {
    "data_type": "float32",
-   "computation_string": "\nv_tmp = crlato[i]\n",
+   "computation_string": "k0 = inA[i]",
    "boundary_conditions": {
-    "crlato": {
-     "btype": "shrink",
-     "halo": [
-      "halo-2",
-      "halo-1",
-      0,
-      0,
-      "halo-1",
-      "halo-2"
-     ]
-    }
+      "inA": {
+          "type": "constant",
+          "value": 0.0
+      }
    }
   },
-  "u_tmp": {
+  "k1": {
    "data_type": "float32",
-   "computation_string": "\nu_tmp = crlato[i]\n",
+   "computation_string": "k1 = inA[i]",
    "boundary_conditions": {
-    "crlato": {
-     "btype": "shrink",
-     "halo": [
-      "halo-1",
-      "halo-2",
-      0,
-      0,
-      "halo-2",
-      "halo-1"
-     ]
-    }
+      "inA": {
+          "type": "constant",
+          "value": 0.0
+      }
    }
   },
-  "__tmp_T": {
+  "k2": {
    "data_type": "float32",
-   "computation_string": "\n__tmp_T = u_tmp[(i, j, k)] + v_tmp[((i + 1), j, k)] + v_tmp[(i, j, k)]\n",
+   "computation_string": "k2 = k1[i, j, k] + k0[i + 1, j, k] + k0[i, j, k]",
    "boundary_conditions": {
-    "u_tmp": {
-     "btype": "shrink",
-     "halo": [
-      "halo-1",
-      "halo-1",
-      0,
-      0,
-      "halo-1",
-      "halo-1"
-     ]
+    "k1": {
+       "type": "constant",
+       "value": 0.0
     },
-    "v_tmp": {
-     "btype": "shrink",
-     "halo": [
-      "halo-1",
-      "halo-1",
-      0,
-      0,
-      "halo-1",
-      "halo-1"
-     ]
+    "k0": {
+       "type": "constant",
+       "value": 0.0
     }
    }
   },
-  "__tmp_S": {
+  "k3": {
    "data_type": "float32",
-   "computation_string": "\n__tmp_S = v_tmp[(i, j, k)] + u_tmp[((i + 1), j, k)] + u_tmp[(i, j, k)]\n",
+   "computation_string": "k3 = k0[i, j, k] + k1[i + 1, j, k] + k1[i, j, k]",
    "boundary_conditions": {
-    "v_tmp": {
-     "btype": "shrink",
-     "halo": [
-      "halo-1",
-      "halo-1",
-      0,
-      0,
-      "halo-1",
-      "halo-1"
-     ]
+    "k0": {
+       "type": "constant",
+       "value": 0.0
     },
-    "u_tmp": {
-     "btype": "shrink",
-     "halo": [
-      "halo-1",
-      "halo-1",
-      0,
-      0,
-      "halo-1",
-      "halo-1"
-     ]
+    "k1": {
+       "type": "constant",
+       "value": 0.0
     }
    }
   },
   "out": {
    "data_type": "float32",
-   "computation_string": "\nout = __tmp_S[(i,j,k)] + __tmp_T[(i,j,k)]\n",
+   "computation_string": "out = k2[i,j,k] + k3[i,j,k]",
    "boundary_conditions": {
-    "__tmp_S":{
-     "btype": "shrink",
-     "halo": [
-      "halo",
-      "halo",
-      0,
-      0,
-      "halo",
-      "halo"
-     ]
+    "k2":{
+       "type": "constant",
+       "value": 0.0
     },
-    "__tmp_T": {
-     "btype": "shrink",
-     "halo": [
-      "halo",
-      "halo",
-      0,
-      0,
-      "halo",
-      "halo"
-     ]
+    "k3": {
+       "type": "constant",
+       "value": 0.0
     }
     }
    }
- },
- "constants": {
-  "eddlat": {
-   "value": "5729.58",
-   "data_type": "float32"
-  },
-  "eddlon": {
-   "value": "5729.58",
-   "data_type": "float32"
-  },
-  "tau_smag": {
-   "value": "0.3",
-   "data_type": "float32"
-  },
-  "weight_smag": {
-   "value": "0.5",
-   "data_type": "float32"
-  },
-  "I": {
-   "value": "128",
-   "data_type": "int32"
-  },
-  "J": {
-   "value": "128",
-   "data_type": "int32"
-  },
-  "K": {
-   "value": "80",
-   "data_type": "int32"
-  },
-  "halo": {
-   "value": "1",
-   "data_type": "int32"
-  }
  }
 }

From 89f87c3a9186f6c1e1e41302fbaaa59c209f069f Mon Sep 17 00:00:00 2001
From: andreaskuster <mail@andreaskuster.ch>
Date: Fri, 8 Jan 2021 19:59:07 +0100
Subject: [PATCH 03/27] Account for offset to center.

---
 stencilflow/kernel_chain_graph.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/stencilflow/kernel_chain_graph.py b/stencilflow/kernel_chain_graph.py
index ff54023..301827f 100644
--- a/stencilflow/kernel_chain_graph.py
+++ b/stencilflow/kernel_chain_graph.py
@@ -507,6 +507,11 @@ def compute_delay_buffer(self) -> None:
                         dimensions=self.dimensions,
                         index=stencilflow.list_subtract_cwise(
                             max_delay[:-1], entry[:-1]))
+
+                    if not isinstance(node, Output):
+                        max_offset = node.dist_to_center[max(node.dist_to_center, key=lambda x: node.dist_to_center[x])]
+                        max_size = max_offset - node.dist_to_center[entry[-1]]
+
                     node.delay_buffer[name] = BoundedQueue(name=name,
                                                            maxsize=max_size)
                     node.delay_buffer[name].import_data(

From e1caeb135ddb5b425fe243e11569638e40799014 Mon Sep 17 00:00:00 2001
From: Andreas Kuster <mail@andreaskuster.ch>
Date: Sat, 9 Jan 2021 15:18:23 +0100
Subject: [PATCH 04/27] Add fpga0 sdk env vars script

---
 vars.sh | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 vars.sh

diff --git a/vars.sh b/vars.sh
new file mode 100644
index 0000000..80989aa
--- /dev/null
+++ b/vars.sh
@@ -0,0 +1,16 @@
+# intel fpga
+export INTELFPGAOCLSDKROOT=/opt/intelFPGA_pro/19.1/hld
+export PATH=$INTELFPGAOCLSDKROOT/bin/:$PATH
+export AOCL_BOARD_PACKAGE_ROOT=$INTELFPGAOCLSDKROOT/board/bittware_pcie/s10
+# /opt/intelFPGA_pro/19.4/hld/board/bittware_pcie/s10/board_env.xml
+# /opt/intelFPGA_pro/19.4/hld/board/bittware_pcie/s10_hpc_default/board_env.xml
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$AOCL_BOARD_PACKAGE_ROOT/linux64/lib
+
+# xilinx fpga
+export PATH=/opt/Xilinx/Vitis/2019.2/bin:/opt/Xilinx/Vitis_HLS/2019.2/bin:/opt/Xilinx/Vivado/2019.2/bin:$PATH
+export XILINX_XRT=/opt/xilinx/xrt
+export PATH=$XILINX_XRT/bin:$PATH
+export LD_LIBRARY_PATH=$XILINX_XRT/lib:$LD_LIBRARY_PATH
+export XILINXD_LICENSE_FILE=2100@sgv-license-01
+export LIBRARY_PATH=/usr/lib/x86_64-linux-gnu
+

From 34389ae15307458bb0057a6f1fdb6a3aef30265b Mon Sep 17 00:00:00 2001
From: Andreas Kuster <mail@andreaskuster.ch>
Date: Sat, 9 Jan 2021 16:27:03 +0100
Subject: [PATCH 05/27] Add larger jacobi3d example

---
 test/stencils/jacobi3d_512x512x512.json | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 test/stencils/jacobi3d_512x512x512.json

diff --git a/test/stencils/jacobi3d_512x512x512.json b/test/stencils/jacobi3d_512x512x512.json
new file mode 100644
index 0000000..82db32d
--- /dev/null
+++ b/test/stencils/jacobi3d_512x512x512.json
@@ -0,0 +1,24 @@
+{
+    "inputs": {
+        "a": {
+            "data": "data/zeros_32x32x32_fp32.dat",
+            "data_type": "float32"
+        }
+    },
+    "outputs": ["b"],
+    "dimensions": [512, 512, 512],
+    "program": {
+        "b": {
+            "computation_string":
+            "b = 0.16666666 * (a[i-1,j,k] + a[i+1,j,k] + a[i,j-1,k] + a[i,j+1,k] + a[i,j,k-1] + a[i,j,k+1])",
+            "boundary_conditions": {
+                "a": {
+                    "type": "constant",
+                    "value": 1.0
+                }
+            },
+            "data_type":
+            "float32"
+        }
+    }
+}

From b1bac07114135dd328e158c417f10fe11ab9885f Mon Sep 17 00:00:00 2001
From: andreaskuster <mail@andreaskuster.ch>
Date: Sat, 9 Jan 2021 17:54:42 +0100
Subject: [PATCH 06/27] Add temporary fix: hard-code delay buffer sizes
 for known kernels.

---
 stencilflow/kernel_chain_graph.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/stencilflow/kernel_chain_graph.py b/stencilflow/kernel_chain_graph.py
index 301827f..2763b65 100644
--- a/stencilflow/kernel_chain_graph.py
+++ b/stencilflow/kernel_chain_graph.py
@@ -85,6 +85,29 @@ def __init__(self,
         if self.log_level >= LogLevel.MODERATE:
             print("Compute delay buffer sizes.")
         self.compute_delay_buffer()  # compute the delay buffer sizes
+
+        for node in self.graph.nodes():
+            if node.name == "__tmp_T" or node.name == "__tmp_T_sqr_s_1351":
+                name = "u_tmp"
+                max_size = self.dimensions[0]*self.dimensions[1]
+                node.delay_buffer[name] = BoundedQueue(name=name, maxsize=max_size)
+                node.delay_buffer[name].import_data([None] * node.delay_buffer[name].maxsize)
+            if node.name == "__tmp_S" or node.name == "__tmp_S_sqr_uv_1352":
+                name = "v_tmp"
+                max_size = self.dimensions[0] * self.dimensions[1]
+                node.delay_buffer[name] = BoundedQueue(name=name, maxsize=max_size)
+                node.delay_buffer[name].import_data([None] * node.delay_buffer[name].maxsize)
+            if node.name == "__tmp_T_sqr_s_1351":
+                name = "ms_sdfg_1330___local_frac_1_dx_1660"
+                max_size = self.dimensions[0]*self.dimensions[1]
+                node.delay_buffer[name] = BoundedQueue(name=name, maxsize=max_size)
+                node.delay_buffer[name].import_data([None] * node.delay_buffer[name].maxsize)
+            if node.name == "__tmp_S_sqr_uv_1352":
+                name = "ms_sdfg_1330___local_frac_1_dx_1660"
+                max_size = self.dimensions[0] * self.dimensions[1]
+                node.delay_buffer[name] = BoundedQueue(name=name, maxsize=max_size)
+                node.delay_buffer[name].import_data([None] * node.delay_buffer[name].maxsize)
+
         if self.log_level >= LogLevel.MODERATE:
             print("Add channels to the graph edges.")
         # plot kernel graphs if flag set to true

From e83e4e3d76c0048ddd8c16511e41cd8dc3e14ad1 Mon Sep 17 00:00:00 2001
From: Andreas Kuster <mail@andreaskuster.ch>
Date: Sat, 9 Jan 2021 19:19:57 +0100
Subject: [PATCH 07/27] Increase problem size.

---
 bug_min.json | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/bug_min.json b/bug_min.json
index 981e180..47815e5 100644
--- a/bug_min.json
+++ b/bug_min.json
@@ -12,9 +12,9 @@
   "out"
  ],
  "dimensions": [
-  8,
-  8,
-  8
+  256,
+  256,
+  256
  ],
  "vectorization": 1,
  "program": {

From 9da97a3d9c30fbcd02996e0adfa90f3031d62835 Mon Sep 17 00:00:00 2001
From: andreaskuster <mail@andreaskuster.ch>
Date: Mon, 11 Jan 2021 00:18:47 +0100
Subject: [PATCH 08/27] Add more complex example.

---
 bug_min_ext.json | 94 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 bug_min_ext.json

diff --git a/bug_min_ext.json b/bug_min_ext.json
new file mode 100644
index 0000000..adc7d4f
--- /dev/null
+++ b/bug_min_ext.json
@@ -0,0 +1,94 @@
+{
+ "inputs": {
+  "inA": {
+   "data": "inA_float32.dat",
+   "data_type": "float32",
+   "input_dims": [
+    "i"
+   ]
+  }
+ },
+ "outputs": [
+  "out"
+ ],
+ "dimensions": [
+  8,
+  8,
+  8
+ ],
+ "vectorization": 1,
+ "program": {
+  "k0": {
+   "data_type": "float32",
+   "computation_string": "k0 = inA[i]",
+   "boundary_conditions": {
+      "inA": {
+          "type": "constant",
+          "value": 0.0
+      }
+   }
+  },
+  "k1": {
+   "data_type": "float32",
+   "computation_string": "k1 = inA[i]",
+   "boundary_conditions": {
+      "inA": {
+          "type": "constant",
+          "value": 0.0
+      }
+   }
+  },
+  "k2": {
+   "data_type": "float32",
+   "computation_string": "k2 = k1[i, j, k] + k0[i + 1, j, k] + k0[i, j, k]",
+   "boundary_conditions": {
+    "k1": {
+       "type": "constant",
+       "value": 0.0
+    },
+    "k0": {
+       "type": "constant",
+       "value": 0.0
+    }
+   }
+  },
+  "k3": {
+   "data_type": "float32",
+   "computation_string": "k3 = k0[i, j, k] + k4[i + 1, j, k] + k4[i, j, k]",
+   "boundary_conditions": {
+    "k0": {
+       "type": "constant",
+       "value": 0.0
+    },
+    "k4": {
+       "type": "constant",
+       "value": 0.0
+    }
+   }
+  },
+   "k4": {
+   "data_type": "float32",
+   "computation_string": "k4 = k1[i, j, k] + k1[i+1, j, k]",
+   "boundary_conditions": {
+    "k1": {
+       "type": "constant",
+       "value": 0.0
+    }
+   }
+  },
+  "out": {
+   "data_type": "float32",
+   "computation_string": "out = k2[i,j,k] + k3[i,j,k]",
+   "boundary_conditions": {
+    "k2":{
+       "type": "constant",
+       "value": 0.0
+    },
+    "k3": {
+       "type": "constant",
+       "value": 0.0
+    }
+    }
+   }
+ }
+}

From 4000eca386f2e0beb51b4d30d57aef5f7b748f1a Mon Sep 17 00:00:00 2001
From: andreaskuster <mail@andreaskuster.ch>
Date: Fri, 8 Jan 2021 16:26:37 +0100
Subject: [PATCH 09/27] Add minimal bug example.

---
 bug_min.json | 174 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 174 insertions(+)
 create mode 100644 bug_min.json

diff --git a/bug_min.json b/bug_min.json
new file mode 100644
index 0000000..1c8278a
--- /dev/null
+++ b/bug_min.json
@@ -0,0 +1,174 @@
+{
+ "inputs": {
+  "crlato": {
+   "data": "crlato_128_float32.dat",
+   "data_type": "float32",
+   "input_dims": [
+    "i"
+   ]
+  }
+ },
+ "outputs": [
+  "out"
+ ],
+ "dimensions": [
+  128,
+  80,
+  128
+ ],
+ "vectorization": 1,
+ "program": {
+  "v_tmp": {
+   "data_type": "float32",
+   "computation_string": "\nv_tmp = crlato[i]\n",
+   "boundary_conditions": {
+    "crlato": {
+     "btype": "shrink",
+     "halo": [
+      "halo-2",
+      "halo-1",
+      0,
+      0,
+      "halo-1",
+      "halo-2"
+     ]
+    }
+   }
+  },
+  "u_tmp": {
+   "data_type": "float32",
+   "computation_string": "\nu_tmp = crlato[i]\n",
+   "boundary_conditions": {
+    "crlato": {
+     "btype": "shrink",
+     "halo": [
+      "halo-1",
+      "halo-2",
+      0,
+      0,
+      "halo-2",
+      "halo-1"
+     ]
+    }
+   }
+  },
+  "__tmp_T": {
+   "data_type": "float32",
+   "computation_string": "\n__tmp_T = u_tmp[(i, j, k)] + v_tmp[((i + 1), j, k)] + v_tmp[(i, j, k)]\n",
+   "boundary_conditions": {
+    "u_tmp": {
+     "btype": "shrink",
+     "halo": [
+      "halo-1",
+      "halo-1",
+      0,
+      0,
+      "halo-1",
+      "halo-1"
+     ]
+    },
+    "v_tmp": {
+     "btype": "shrink",
+     "halo": [
+      "halo-1",
+      "halo-1",
+      0,
+      0,
+      "halo-1",
+      "halo-1"
+     ]
+    }
+   }
+  },
+  "__tmp_S": {
+   "data_type": "float32",
+   "computation_string": "\n__tmp_S = v_tmp[(i, j, k)] + u_tmp[((i + 1), j, k)] + u_tmp[(i, j, k)]\n",
+   "boundary_conditions": {
+    "v_tmp": {
+     "btype": "shrink",
+     "halo": [
+      "halo-1",
+      "halo-1",
+      0,
+      0,
+      "halo-1",
+      "halo-1"
+     ]
+    },
+    "u_tmp": {
+     "btype": "shrink",
+     "halo": [
+      "halo-1",
+      "halo-1",
+      0,
+      0,
+      "halo-1",
+      "halo-1"
+     ]
+    }
+   }
+  },
+  "out": {
+   "data_type": "float32",
+   "computation_string": "\nout = __tmp_S[(i,j,k)] + __tmp_T[(i,j,k)]\n",
+   "boundary_conditions": {
+    "__tmp_S":{
+     "btype": "shrink",
+     "halo": [
+      "halo",
+      "halo",
+      0,
+      0,
+      "halo",
+      "halo"
+     ]
+    },
+    "__tmp_T": {
+     "btype": "shrink",
+     "halo": [
+      "halo",
+      "halo",
+      0,
+      0,
+      "halo",
+      "halo"
+     ]
+    }
+    }
+   }
+ },
+ "constants": {
+  "eddlat": {
+   "value": "5729.58",
+   "data_type": "float32"
+  },
+  "eddlon": {
+   "value": "5729.58",
+   "data_type": "float32"
+  },
+  "tau_smag": {
+   "value": "0.3",
+   "data_type": "float32"
+  },
+  "weight_smag": {
+   "value": "0.5",
+   "data_type": "float32"
+  },
+  "I": {
+   "value": "128",
+   "data_type": "int32"
+  },
+  "J": {
+   "value": "128",
+   "data_type": "int32"
+  },
+  "K": {
+   "value": "80",
+   "data_type": "int32"
+  },
+  "halo": {
+   "value": "1",
+   "data_type": "int32"
+  }
+ }
+}

From f0e2e3b8abbc09558f895565f33b30fb0a9f6b4c Mon Sep 17 00:00:00 2001
From: andreaskuster <mail@andreaskuster.ch>
Date: Fri, 8 Jan 2021 19:58:40 +0100
Subject: [PATCH 10/27] Further reduce minimal example.

---
 bug_min.json | 170 ++++++++++++---------------------------------------
 1 file changed, 40 insertions(+), 130 deletions(-)

diff --git a/bug_min.json b/bug_min.json
index 1c8278a..981e180 100644
--- a/bug_min.json
+++ b/bug_min.json
@@ -1,7 +1,7 @@
 {
  "inputs": {
-  "crlato": {
-   "data": "crlato_128_float32.dat",
+  "inA": {
+   "data": "inA_float32.dat",
    "data_type": "float32",
    "input_dims": [
     "i"
@@ -12,163 +12,73 @@
   "out"
  ],
  "dimensions": [
-  128,
-  80,
-  128
+  8,
+  8,
+  8
  ],
  "vectorization": 1,
  "program": {
-  "v_tmp": {
+  "k0": {
    "data_type": "float32",
-   "computation_string": "\nv_tmp = crlato[i]\n",
+   "computation_string": "k0 = inA[i]",
    "boundary_conditions": {
-    "crlato": {
-     "btype": "shrink",
-     "halo": [
-      "halo-2",
-      "halo-1",
-      0,
-      0,
-      "halo-1",
-      "halo-2"
-     ]
-    }
+      "inA": {
+          "type": "constant",
+          "value": 0.0
+      }
    }
   },
-  "u_tmp": {
+  "k1": {
    "data_type": "float32",
-   "computation_string": "\nu_tmp = crlato[i]\n",
+   "computation_string": "k1 = inA[i]",
    "boundary_conditions": {
-    "crlato": {
-     "btype": "shrink",
-     "halo": [
-      "halo-1",
-      "halo-2",
-      0,
-      0,
-      "halo-2",
-      "halo-1"
-     ]
-    }
+      "inA": {
+          "type": "constant",
+          "value": 0.0
+      }
    }
   },
-  "__tmp_T": {
+  "k2": {
    "data_type": "float32",
-   "computation_string": "\n__tmp_T = u_tmp[(i, j, k)] + v_tmp[((i + 1), j, k)] + v_tmp[(i, j, k)]\n",
+   "computation_string": "k2 = k1[i, j, k] + k0[i + 1, j, k] + k0[i, j, k]",
    "boundary_conditions": {
-    "u_tmp": {
-     "btype": "shrink",
-     "halo": [
-      "halo-1",
-      "halo-1",
-      0,
-      0,
-      "halo-1",
-      "halo-1"
-     ]
+    "k1": {
+       "type": "constant",
+       "value": 0.0
     },
-    "v_tmp": {
-     "btype": "shrink",
-     "halo": [
-      "halo-1",
-      "halo-1",
-      0,
-      0,
-      "halo-1",
-      "halo-1"
-     ]
+    "k0": {
+       "type": "constant",
+       "value": 0.0
     }
    }
   },
-  "__tmp_S": {
+  "k3": {
    "data_type": "float32",
-   "computation_string": "\n__tmp_S = v_tmp[(i, j, k)] + u_tmp[((i + 1), j, k)] + u_tmp[(i, j, k)]\n",
+   "computation_string": "k3 = k0[i, j, k] + k1[i + 1, j, k] + k1[i, j, k]",
    "boundary_conditions": {
-    "v_tmp": {
-     "btype": "shrink",
-     "halo": [
-      "halo-1",
-      "halo-1",
-      0,
-      0,
-      "halo-1",
-      "halo-1"
-     ]
+    "k0": {
+       "type": "constant",
+       "value": 0.0
     },
-    "u_tmp": {
-     "btype": "shrink",
-     "halo": [
-      "halo-1",
-      "halo-1",
-      0,
-      0,
-      "halo-1",
-      "halo-1"
-     ]
+    "k1": {
+       "type": "constant",
+       "value": 0.0
     }
    }
   },
   "out": {
    "data_type": "float32",
-   "computation_string": "\nout = __tmp_S[(i,j,k)] + __tmp_T[(i,j,k)]\n",
+   "computation_string": "out = k2[i,j,k] + k3[i,j,k]",
    "boundary_conditions": {
-    "__tmp_S":{
-     "btype": "shrink",
-     "halo": [
-      "halo",
-      "halo",
-      0,
-      0,
-      "halo",
-      "halo"
-     ]
+    "k2":{
+       "type": "constant",
+       "value": 0.0
     },
-    "__tmp_T": {
-     "btype": "shrink",
-     "halo": [
-      "halo",
-      "halo",
-      0,
-      0,
-      "halo",
-      "halo"
-     ]
+    "k3": {
+       "type": "constant",
+       "value": 0.0
     }
     }
    }
- },
- "constants": {
-  "eddlat": {
-   "value": "5729.58",
-   "data_type": "float32"
-  },
-  "eddlon": {
-   "value": "5729.58",
-   "data_type": "float32"
-  },
-  "tau_smag": {
-   "value": "0.3",
-   "data_type": "float32"
-  },
-  "weight_smag": {
-   "value": "0.5",
-   "data_type": "float32"
-  },
-  "I": {
-   "value": "128",
-   "data_type": "int32"
-  },
-  "J": {
-   "value": "128",
-   "data_type": "int32"
-  },
-  "K": {
-   "value": "80",
-   "data_type": "int32"
-  },
-  "halo": {
-   "value": "1",
-   "data_type": "int32"
-  }
  }
 }

From 8023480d5d49a3e67bc46bc9a1c65459da6ec8ad Mon Sep 17 00:00:00 2001
From: andreaskuster <mail@andreaskuster.ch>
Date: Fri, 8 Jan 2021 19:59:07 +0100
Subject: [PATCH 11/27] Account for offset to center.

---
 stencilflow/kernel_chain_graph.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/stencilflow/kernel_chain_graph.py b/stencilflow/kernel_chain_graph.py
index ff54023..301827f 100644
--- a/stencilflow/kernel_chain_graph.py
+++ b/stencilflow/kernel_chain_graph.py
@@ -507,6 +507,11 @@ def compute_delay_buffer(self) -> None:
                         dimensions=self.dimensions,
                         index=stencilflow.list_subtract_cwise(
                             max_delay[:-1], entry[:-1]))
+
+                    if not isinstance(node, Output):
+                        max_offset = node.dist_to_center[max(node.dist_to_center, key=lambda x: node.dist_to_center[x])]
+                        max_size = max_offset - node.dist_to_center[entry[-1]]
+
                     node.delay_buffer[name] = BoundedQueue(name=name,
                                                            maxsize=max_size)
                     node.delay_buffer[name].import_data(

From 0afabe0b7403aed827b47d9a2fbee07782abc76e Mon Sep 17 00:00:00 2001
From: andreaskuster <mail@andreaskuster.ch>
Date: Sat, 9 Jan 2021 17:54:42 +0100
Subject: [PATCH 12/27] Add temporary fix: hard-code delay buffer sizes
 for known kernels.

---
 stencilflow/kernel_chain_graph.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/stencilflow/kernel_chain_graph.py b/stencilflow/kernel_chain_graph.py
index 301827f..2763b65 100644
--- a/stencilflow/kernel_chain_graph.py
+++ b/stencilflow/kernel_chain_graph.py
@@ -85,6 +85,29 @@ def __init__(self,
         if self.log_level >= LogLevel.MODERATE:
             print("Compute delay buffer sizes.")
         self.compute_delay_buffer()  # compute the delay buffer sizes
+
+        for node in self.graph.nodes():
+            if node.name == "__tmp_T" or node.name == "__tmp_T_sqr_s_1351":
+                name = "u_tmp"
+                max_size = self.dimensions[0]*self.dimensions[1]
+                node.delay_buffer[name] = BoundedQueue(name=name, maxsize=max_size)
+                node.delay_buffer[name].import_data([None] * node.delay_buffer[name].maxsize)
+            if node.name == "__tmp_S" or node.name == "__tmp_S_sqr_uv_1352":
+                name = "v_tmp"
+                max_size = self.dimensions[0] * self.dimensions[1]
+                node.delay_buffer[name] = BoundedQueue(name=name, maxsize=max_size)
+                node.delay_buffer[name].import_data([None] * node.delay_buffer[name].maxsize)
+            if node.name == "__tmp_T_sqr_s_1351":
+                name = "ms_sdfg_1330___local_frac_1_dx_1660"
+                max_size = self.dimensions[0]*self.dimensions[1]
+                node.delay_buffer[name] = BoundedQueue(name=name, maxsize=max_size)
+                node.delay_buffer[name].import_data([None] * node.delay_buffer[name].maxsize)
+            if node.name == "__tmp_S_sqr_uv_1352":
+                name = "ms_sdfg_1330___local_frac_1_dx_1660"
+                max_size = self.dimensions[0] * self.dimensions[1]
+                node.delay_buffer[name] = BoundedQueue(name=name, maxsize=max_size)
+                node.delay_buffer[name].import_data([None] * node.delay_buffer[name].maxsize)
+
         if self.log_level >= LogLevel.MODERATE:
             print("Add channels to the graph edges.")
         # plot kernel graphs if flag set to true

From c4b83c5e5a8cfdb41ef94b895850eb750e21418f Mon Sep 17 00:00:00 2001
From: Andreas Kuster <mail@andreaskuster.ch>
Date: Sat, 9 Jan 2021 15:18:23 +0100
Subject: [PATCH 13/27] Add fpga0 sdk env vars script

---
 vars.sh | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 vars.sh

diff --git a/vars.sh b/vars.sh
new file mode 100644
index 0000000..80989aa
--- /dev/null
+++ b/vars.sh
@@ -0,0 +1,16 @@
+# intel fpga
+export INTELFPGAOCLSDKROOT=/opt/intelFPGA_pro/19.1/hld
+export PATH=$INTELFPGAOCLSDKROOT/bin/:$PATH
+export AOCL_BOARD_PACKAGE_ROOT=$INTELFPGAOCLSDKROOT/board/bittware_pcie/s10
+# /opt/intelFPGA_pro/19.4/hld/board/bittware_pcie/s10/board_env.xml
+# /opt/intelFPGA_pro/19.4/hld/board/bittware_pcie/s10_hpc_default/board_env.xml
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$AOCL_BOARD_PACKAGE_ROOT/linux64/lib
+
+# xilinx fpga
+export PATH=/opt/Xilinx/Vitis/2019.2/bin:/opt/Xilinx/Vitis_HLS/2019.2/bin:/opt/Xilinx/Vivado/2019.2/bin:$PATH
+export XILINX_XRT=/opt/xilinx/xrt
+export PATH=$XILINX_XRT/bin:$PATH
+export LD_LIBRARY_PATH=$XILINX_XRT/lib:$LD_LIBRARY_PATH
+export XILINXD_LICENSE_FILE=2100@sgv-license-01
+export LIBRARY_PATH=/usr/lib/x86_64-linux-gnu
+

From ab8c555961403a03fa7609030bbc08e5fe4765d4 Mon Sep 17 00:00:00 2001
From: Andreas Kuster <mail@andreaskuster.ch>
Date: Sat, 9 Jan 2021 16:27:03 +0100
Subject: [PATCH 14/27] Add larger jacobi3d example

---
 test/stencils/jacobi3d_512x512x512.json | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 test/stencils/jacobi3d_512x512x512.json

diff --git a/test/stencils/jacobi3d_512x512x512.json b/test/stencils/jacobi3d_512x512x512.json
new file mode 100644
index 0000000..82db32d
--- /dev/null
+++ b/test/stencils/jacobi3d_512x512x512.json
@@ -0,0 +1,24 @@
+{
+    "inputs": {
+        "a": {
+            "data": "data/zeros_32x32x32_fp32.dat",
+            "data_type": "float32"
+        }
+    },
+    "outputs": ["b"],
+    "dimensions": [512, 512, 512],
+    "program": {
+        "b": {
+            "computation_string":
+            "b = 0.16666666 * (a[i-1,j,k] + a[i+1,j,k] + a[i,j-1,k] + a[i,j+1,k] + a[i,j,k-1] + a[i,j,k+1])",
+            "boundary_conditions": {
+                "a": {
+                    "type": "constant",
+                    "value": 1.0
+                }
+            },
+            "data_type":
+            "float32"
+        }
+    }
+}

From 47cc6666b27c857e64ae2fb881ba33140008d318 Mon Sep 17 00:00:00 2001
From: andreaskuster <mail@andreaskuster.ch>
Date: Mon, 11 Jan 2021 00:18:47 +0100
Subject: [PATCH 15/27] Add more complex example.

---
 bug_min_ext.json | 94 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 bug_min_ext.json

diff --git a/bug_min_ext.json b/bug_min_ext.json
new file mode 100644
index 0000000..adc7d4f
--- /dev/null
+++ b/bug_min_ext.json
@@ -0,0 +1,94 @@
+{
+ "inputs": {
+  "inA": {
+   "data": "inA_float32.dat",
+   "data_type": "float32",
+   "input_dims": [
+    "i"
+   ]
+  }
+ },
+ "outputs": [
+  "out"
+ ],
+ "dimensions": [
+  8,
+  8,
+  8
+ ],
+ "vectorization": 1,
+ "program": {
+  "k0": {
+   "data_type": "float32",
+   "computation_string": "k0 = inA[i]",
+   "boundary_conditions": {
+      "inA": {
+          "type": "constant",
+          "value": 0.0
+      }
+   }
+  },
+  "k1": {
+   "data_type": "float32",
+   "computation_string": "k1 = inA[i]",
+   "boundary_conditions": {
+      "inA": {
+          "type": "constant",
+          "value": 0.0
+      }
+   }
+  },
+  "k2": {
+   "data_type": "float32",
+   "computation_string": "k2 = k1[i, j, k] + k0[i + 1, j, k] + k0[i, j, k]",
+   "boundary_conditions": {
+    "k1": {
+       "type": "constant",
+       "value": 0.0
+    },
+    "k0": {
+       "type": "constant",
+       "value": 0.0
+    }
+   }
+  },
+  "k3": {
+   "data_type": "float32",
+   "computation_string": "k3 = k0[i, j, k] + k4[i + 1, j, k] + k4[i, j, k]",
+   "boundary_conditions": {
+    "k0": {
+       "type": "constant",
+       "value": 0.0
+    },
+    "k4": {
+       "type": "constant",
+       "value": 0.0
+    }
+   }
+  },
+   "k4": {
+   "data_type": "float32",
+   "computation_string": "k4 = k1[i, j, k] + k1[i+1, j, k]",
+   "boundary_conditions": {
+    "k1": {
+       "type": "constant",
+       "value": 0.0
+    }
+   }
+  },
+  "out": {
+   "data_type": "float32",
+   "computation_string": "out = k2[i,j,k] + k3[i,j,k]",
+   "boundary_conditions": {
+    "k2":{
+       "type": "constant",
+       "value": 0.0
+    },
+    "k3": {
+       "type": "constant",
+       "value": 0.0
+    }
+    }
+   }
+ }
+}

From ed1dcb8fc6e9953292bda2f4b45be161e429a1de Mon Sep 17 00:00:00 2001
From: Andreas Kuster <mail@andreaskuster.ch>
Date: Sat, 9 Jan 2021 19:19:57 +0100
Subject: [PATCH 16/27] Increase problem size.

---
 bug_min.json | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/bug_min.json b/bug_min.json
index 981e180..47815e5 100644
--- a/bug_min.json
+++ b/bug_min.json
@@ -12,9 +12,9 @@
   "out"
  ],
  "dimensions": [
-  8,
-  8,
-  8
+  256,
+  256,
+  256
  ],
  "vectorization": 1,
  "program": {

From ff683e158a8718a74ee52bf6f9392af6d22ec715 Mon Sep 17 00:00:00 2001
From: Andreas Kuster <mail@andreaskuster.ch>
Date: Sat, 11 Sep 2021 21:45:31 +0200
Subject: [PATCH 17/27] Make example more distinct to test function
 correctness.

---
 bug_min.json | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/bug_min.json b/bug_min.json
index 47815e5..cf2679b 100644
--- a/bug_min.json
+++ b/bug_min.json
@@ -12,9 +12,9 @@
   "out"
  ],
  "dimensions": [
-  256,
-  256,
-  256
+  10,
+  10,
+  10
  ],
  "vectorization": 1,
  "program": {
@@ -40,7 +40,7 @@
   },
   "k2": {
    "data_type": "float32",
-   "computation_string": "k2 = k1[i, j, k] + k0[i + 1, j, k] + k0[i, j, k]",
+   "computation_string": "k2 = k1[i, j, k] + k0[i+1, j, k] + k0[i, j, k]",
    "boundary_conditions": {
     "k1": {
        "type": "constant",
@@ -54,7 +54,7 @@
   },
   "k3": {
    "data_type": "float32",
-   "computation_string": "k3 = k0[i, j, k] + k1[i + 1, j, k] + k1[i, j, k]",
+   "computation_string": "k3 = k0[i, j, k] + k1[i+1, j+1, k+1] + k1[i, j, k]",
    "boundary_conditions": {
     "k0": {
        "type": "constant",
@@ -68,7 +68,7 @@
   },
   "out": {
    "data_type": "float32",
-   "computation_string": "out = k2[i,j,k] + k3[i,j,k]",
+   "computation_string": "out = k2[i, j, k] + k3[i, j, k]",
    "boundary_conditions": {
     "k2":{
        "type": "constant",

From 0e35d03e928ab1fd82faaf60cf7a15aeb6d09707 Mon Sep 17 00:00:00 2001
From: Andreas Kuster <mail@andreaskuster.ch>
Date: Sat, 11 Sep 2021 21:46:16 +0200
Subject: [PATCH 18/27] Add path inclusion for direct file execution. Extend
 optimization functionality.

---
 stencilflow/kernel_chain_graph.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/stencilflow/kernel_chain_graph.py b/stencilflow/kernel_chain_graph.py
index 2763b65..52c73b6 100644
--- a/stencilflow/kernel_chain_graph.py
+++ b/stencilflow/kernel_chain_graph.py
@@ -15,6 +15,8 @@
 import operator
 import re
 import os
+import sys
+sys.path.append(os.path.dirname(os.path.dirname(__file__)))
 
 from typing import Any, List, Dict, Tuple
 
@@ -817,6 +819,14 @@ def runtime_lower_bound(self):
                         type=int)
     parser.add_argument("-report", action="store_true")
     parser.add_argument("-simulate", action="store_true")
+    parser.add_argument("-opt", action="store_true")
+    parser.add_argument("-opt_goal", default=["min_fast_mem", 12000], nargs="+")
+    """
+        choices:
+        - min_com_vol, FAST_MEM_BOUND, SLOW_MEM_BOUND
+        - min_fast_mem, COM_VOL_BOUND
+        - opt_ratio, RATIO
+    """
     args = parser.parse_args()
     args.log_level = stencilflow.log_level.LogLevel(args.log_level)
     program_description = stencilflow.parse_json(args.stencil_file)
@@ -837,6 +847,17 @@ def runtime_lower_bound(self):
                         log_level=LogLevel(args.log_level))
         sim.simulate()
 
+    # choose optimization goal
+    if args.opt:
+        from stencilflow import Optimizer
+        opt = Optimizer(self.kernel_nodes, self.dimensions)
+        if args.opt_goal[0] == "min_com_vol":
+            opt.minimize_comm_vol(fast_memory_bound=args.opt_goal[1], slow_memory_bound=args.opt_goal[2])
+        if args.opt_goal[0] == "min_fast_mem":
+            opt.minimize_fast_mem(communication_volume_bound=args.opt_goal[1])
+        if args.opt_goal[0] == "opt_ratio":
+            opt.optimize_to_ratio(ratio=args.opt_goal[1])
+
     # output a report if argument -report is true
     if args.report:
         chain.report(args.stencil_file)

From fb9966c163cd4bab69935a85a027b07f0bbb8712 Mon Sep 17 00:00:00 2001
From: Andreas Kuster <mail@andreaskuster.ch>
Date: Sat, 11 Sep 2021 21:46:44 +0200
Subject: [PATCH 19/27] Several readme extensions

---
 README.md | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/README.md b/README.md
index 7f9989c..08d3765 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,7 @@ To run the code, the following software must be available:
 - Python 3.6.x or newer.
 - The `virtualenv` module (installed with `pip install virtualenv`).
 - A C++17-capable compiler (e.g., GCC 7.x or Clang 6.x).
+- Graphviz (for graph plotting support).
 - One or both FPGA compilers:
   - Intel FPGA OpenCL SDK (tested with 18.1.1 and 19.1)
   - Xilinx Vitis (tested with 2020.2) 
@@ -47,6 +48,13 @@ kernel source files themselves in:
 .dacecache/<kernel name>/src/intel_fpga/device
 ```
 
+To run low-level analysis of the buffer size and stencil program visualization, you can invoke the executable `stencilflow/kernel_chain_graph.py`.
+Example usage:
+
+```bash
+stencilflow/kernel_chain_graph.py -stencil_file test/stencils/jacobi3d_32x32x32_8itr_8vec.json -plot -simulate -report -opt
+```
+
 Verification
 ------------
 
@@ -81,3 +89,16 @@ It is a known issue that launching multiple Intel FPGA kernels in quick
 succession (such as is done in the tests) can sometimes fail sporadically,
 seemingly due to file I/O issues. Running individual programs should never fail.
 
+Publication
+-----------
+
+If you use StencilFlow, cite us:
+```bibtex
+@inproceedings{dace,
+  author    = {Johannes de Fine Licht, Andreas Kuster, Tiziano De Matteis, Tal Ben-Nun, Dominic Hofer, Torsten Hoefler},
+  title     = {StencilFlow: Mapping Large Stencil Programs to Distributed Spatial Computing Systems},
+  year      = {2021},
+  booktitle = {Proceedings of the IEEE/ACM International Symposium on Code Generation and Optimization (CGO)},
+  series = {CGO '21}
+}
+```
\ No newline at end of file

From 67a6b93a87e1163470f2317645c287eaebc22ace Mon Sep 17 00:00:00 2001
From: Andreas Kuster <mail@andreaskuster.ch>
Date: Sat, 11 Sep 2021 22:40:02 +0200
Subject: [PATCH 20/27] Remove horidiff hotfix

---
 stencilflow/kernel_chain_graph.py | 73 +++++++++++--------------------
 1 file changed, 25 insertions(+), 48 deletions(-)

diff --git a/stencilflow/kernel_chain_graph.py b/stencilflow/kernel_chain_graph.py
index 52c73b6..8723851 100644
--- a/stencilflow/kernel_chain_graph.py
+++ b/stencilflow/kernel_chain_graph.py
@@ -16,6 +16,7 @@
 import re
 import os
 import sys
+
 sys.path.append(os.path.dirname(os.path.dirname(__file__)))
 
 from typing import Any, List, Dict, Tuple
@@ -87,29 +88,6 @@ def __init__(self,
         if self.log_level >= LogLevel.MODERATE:
             print("Compute delay buffer sizes.")
         self.compute_delay_buffer()  # compute the delay buffer sizes
-
-        for node in self.graph.nodes():
-            if node.name == "__tmp_T" or node.name == "__tmp_T_sqr_s_1351":
-                name = "u_tmp"
-                max_size = self.dimensions[0]*self.dimensions[1]
-                node.delay_buffer[name] = BoundedQueue(name=name, maxsize=max_size)
-                node.delay_buffer[name].import_data([None] * node.delay_buffer[name].maxsize)
-            if node.name == "__tmp_S" or node.name == "__tmp_S_sqr_uv_1352":
-                name = "v_tmp"
-                max_size = self.dimensions[0] * self.dimensions[1]
-                node.delay_buffer[name] = BoundedQueue(name=name, maxsize=max_size)
-                node.delay_buffer[name].import_data([None] * node.delay_buffer[name].maxsize)
-            if node.name == "__tmp_T_sqr_s_1351":
-                name = "ms_sdfg_1330___local_frac_1_dx_1660"
-                max_size = self.dimensions[0]*self.dimensions[1]
-                node.delay_buffer[name] = BoundedQueue(name=name, maxsize=max_size)
-                node.delay_buffer[name].import_data([None] * node.delay_buffer[name].maxsize)
-            if node.name == "__tmp_S_sqr_uv_1352":
-                name = "ms_sdfg_1330___local_frac_1_dx_1660"
-                max_size = self.dimensions[0] * self.dimensions[1]
-                node.delay_buffer[name] = BoundedQueue(name=name, maxsize=max_size)
-                node.delay_buffer[name].import_data([None] * node.delay_buffer[name].maxsize)
-
         if self.log_level >= LogLevel.MODERATE:
             print("Add channels to the graph edges.")
         # plot kernel graphs if flag set to true
@@ -314,14 +292,14 @@ def add_channels(self) -> None:
                                 name = src.name + "_" + dest.name
                                 channel = {
                                     "name":
-                                    name,
+                                        name,
                                     "delay_buffer":
-                                    self.kernel_nodes[dest.name].delay_buffer[
-                                        src.name],
+                                        self.kernel_nodes[dest.name].delay_buffer[
+                                            src.name],
                                     "internal_buffer":
-                                    dest.internal_buffer[src.name],
+                                        dest.internal_buffer[src.name],
                                     "data_type":
-                                    src.data_type
+                                        src.data_type
                                 }
                                 # add channel reference to global channel dictionary
                                 self.channels[name] = channel
@@ -339,18 +317,18 @@ def add_channels(self) -> None:
                                 name = src.name + "_" + dest.name
                                 channel = {
                                     "name":
-                                    name,
+                                        name,
                                     "delay_buffer":
-                                    self.kernel_nodes[dest.name].delay_buffer[
-                                        src.name],
+                                        self.kernel_nodes[dest.name].delay_buffer[
+                                            src.name],
                                     "internal_buffer":
-                                    dest.internal_buffer[src.name],
+                                        dest.internal_buffer[src.name],
                                     "data_type":
-                                    src.data_type,
+                                        src.data_type,
                                     "input_dims":
-                                    self.inputs[src.name]["input_dims"]
-                                    if "input_dims" in self.inputs[src.name]
-                                    else None
+                                        self.inputs[src.name]["input_dims"]
+                                        if "input_dims" in self.inputs[src.name]
+                                        else None
                                 }
                                 # add channel reference to global channel dictionary
                                 self.channels[name] = channel
@@ -367,13 +345,13 @@ def add_channels(self) -> None:
                             name = src.name + "_" + dest.name
                             channel = {
                                 "name":
-                                name,
+                                    name,
                                 "delay_buffer":
-                                self.output_nodes[dest.name].delay_buffer[
-                                    src.name],
+                                    self.output_nodes[dest.name].delay_buffer[
+                                        src.name],
                                 "internal_buffer": {},
                                 "data_type":
-                                src.data_type
+                                    src.data_type
                             }
                             # add channel reference to global channel dictionary
                             self.channels[name] = channel
@@ -411,7 +389,7 @@ def import_input(self) -> None:
                 else:
                     i["input_dims"] = stencilflow.ITERATORS[len(stencilflow.
                                                                 ITERATORS) -
-                                                        self.kernel_dimensions:]
+                                                            self.kernel_dimensions:]
         self.outputs = inp["outputs"]
         # handle stencil program output dimensions
         if self.kernel_dimensions == 1:  # 1D
@@ -419,8 +397,8 @@ def import_input(self) -> None:
                 self.program[entry]["computation_string"] = \
                     self.program[entry]["computation_string"].replace("[", "[i, j,")  # add two extra indices
             self.dimensions = [
-                1, 1
-            ] + inp["dimensions"]  # add two extra dimensions
+                                  1, 1
+                              ] + inp["dimensions"]  # add two extra dimensions
         elif self.kernel_dimensions == 2:  # 2D
             for entry in self.program:
                 self.program[entry]["computation_string"] = self.program[entry]["computation_string"] \
@@ -514,16 +492,14 @@ def compute_delay_buffer(self) -> None:
             order = list(nx.topological_sort(self.graph))
         except nx.exception.NetworkXUnfeasible:
             cycle = next(nx.algorithms.cycles.simple_cycles(self.graph))
-            raise ValueError("Cycle detected: {}".format(
-                [c.name for c in cycle]))
+            raise ValueError("Cycle detected: {}".format([c.name for c in cycle]))
         # go through all nodes
         for node in order:
             # process delay buffer (no additional delay buffer will appear because of the topological order)
             for inp in node.input_paths:
                 # compute maximum delay size per input
                 max_delay = max(node.input_paths[inp])
-                max_delay[
-                    2] += 1  # add an extra delay cycle for the processing in the kernel node
+                max_delay[2] += 1  # add an extra delay cycle for the processing in the kernel node
                 # loop over all inputs and set their size relative to the max size to have data ready at the exact
                 # same time
                 for entry in node.input_paths[inp]:
@@ -746,7 +722,7 @@ def report(self, name):
                         u.name, v.name, entry.name, entry.maxsize))
                     total_fast += entry.maxsize
         print("buffer size slow memory: {} \nbuffer size fast memory: {}".format(
-                total_slow, total_fast))
+            total_slow, total_fast))
 
     def operation_count(self):
         """For each operation type found in the ASTs, return a tuple of
@@ -850,6 +826,7 @@ def runtime_lower_bound(self):
     # choose optimization goal
     if args.opt:
         from stencilflow import Optimizer
+
         opt = Optimizer(self.kernel_nodes, self.dimensions)
         if args.opt_goal[0] == "min_com_vol":
             opt.minimize_comm_vol(fast_memory_bound=args.opt_goal[1], slow_memory_bound=args.opt_goal[2])

From 47dfc5874ac45f0758e6429bd9e453a99dd6a18e Mon Sep 17 00:00:00 2001
From: Andreas Kuster <20418060+andreaskuster@users.noreply.github.com>
Date: Mon, 13 Sep 2021 20:34:47 +0200
Subject: [PATCH 21/27] Update README.md

Co-authored-by: definelicht <definelicht@inf.ethz.ch>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 08d3765..6f4c9d2 100644
--- a/README.md
+++ b/README.md
@@ -98,7 +98,7 @@ If you use StencilFlow, cite us:
   author    = {Johannes de Fine Licht, Andreas Kuster, Tiziano De Matteis, Tal Ben-Nun, Dominic Hofer, Torsten Hoefler},
   title     = {StencilFlow: Mapping Large Stencil Programs to Distributed Spatial Computing Systems},
   year      = {2021},
-  booktitle = {Proceedings of the IEEE/ACM International Symposium on Code Generation and Optimization (CGO)},
+  booktitle = {Proceedings of the IEEE/ACM International Symposium on Code Generation and Optimization (CGO'21)},
   series = {CGO '21}
 }
 ```
\ No newline at end of file

From 6b737df8ad9700b000e69b458f8da60b42214397 Mon Sep 17 00:00:00 2001
From: Andreas Kuster <20418060+andreaskuster@users.noreply.github.com>
Date: Mon, 13 Sep 2021 20:35:00 +0200
Subject: [PATCH 22/27] Update README.md

Co-authored-by: definelicht <definelicht@inf.ethz.ch>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6f4c9d2..f86405d 100644
--- a/README.md
+++ b/README.md
@@ -95,7 +95,7 @@ Publication
 If you use StencilFlow, cite us:
 ```bibtex
 @inproceedings{dace,
-  author    = {Johannes de Fine Licht, Andreas Kuster, Tiziano De Matteis, Tal Ben-Nun, Dominic Hofer, Torsten Hoefler},
+  author    = {Johannes de~Fine~Licht and Andreas Kuster and Tiziano De~Matteis and Tal Ben-Nun and Dominic Hofer and Torsten Hoefler},
   title     = {StencilFlow: Mapping Large Stencil Programs to Distributed Spatial Computing Systems},
   year      = {2021},
   booktitle = {Proceedings of the IEEE/ACM International Symposium on Code Generation and Optimization (CGO'21)},

From 6a27a5f1ae7bb292202a8276bebde933bd1d6e75 Mon Sep 17 00:00:00 2001
From: Andreas Kuster <mail@andreaskuster.ch>
Date: Mon, 13 Sep 2021 21:22:44 +0200
Subject: [PATCH 23/27] Move test config to default location

---
 bug_min_ext.json                              | 94 -------------------
 .../stencils/horidiff_min.json                |  0
 2 files changed, 94 deletions(-)
 delete mode 100644 bug_min_ext.json
 rename bug_min.json => test/stencils/horidiff_min.json (100%)

diff --git a/bug_min_ext.json b/bug_min_ext.json
deleted file mode 100644
index adc7d4f..0000000
--- a/bug_min_ext.json
+++ /dev/null
@@ -1,94 +0,0 @@
-{
- "inputs": {
-  "inA": {
-   "data": "inA_float32.dat",
-   "data_type": "float32",
-   "input_dims": [
-    "i"
-   ]
-  }
- },
- "outputs": [
-  "out"
- ],
- "dimensions": [
-  8,
-  8,
-  8
- ],
- "vectorization": 1,
- "program": {
-  "k0": {
-   "data_type": "float32",
-   "computation_string": "k0 = inA[i]",
-   "boundary_conditions": {
-      "inA": {
-          "type": "constant",
-          "value": 0.0
-      }
-   }
-  },
-  "k1": {
-   "data_type": "float32",
-   "computation_string": "k1 = inA[i]",
-   "boundary_conditions": {
-      "inA": {
-          "type": "constant",
-          "value": 0.0
-      }
-   }
-  },
-  "k2": {
-   "data_type": "float32",
-   "computation_string": "k2 = k1[i, j, k] + k0[i + 1, j, k] + k0[i, j, k]",
-   "boundary_conditions": {
-    "k1": {
-       "type": "constant",
-       "value": 0.0
-    },
-    "k0": {
-       "type": "constant",
-       "value": 0.0
-    }
-   }
-  },
-  "k3": {
-   "data_type": "float32",
-   "computation_string": "k3 = k0[i, j, k] + k4[i + 1, j, k] + k4[i, j, k]",
-   "boundary_conditions": {
-    "k0": {
-       "type": "constant",
-       "value": 0.0
-    },
-    "k4": {
-       "type": "constant",
-       "value": 0.0
-    }
-   }
-  },
-   "k4": {
-   "data_type": "float32",
-   "computation_string": "k4 = k1[i, j, k] + k1[i+1, j, k]",
-   "boundary_conditions": {
-    "k1": {
-       "type": "constant",
-       "value": 0.0
-    }
-   }
-  },
-  "out": {
-   "data_type": "float32",
-   "computation_string": "out = k2[i,j,k] + k3[i,j,k]",
-   "boundary_conditions": {
-    "k2":{
-       "type": "constant",
-       "value": 0.0
-    },
-    "k3": {
-       "type": "constant",
-       "value": 0.0
-    }
-    }
-   }
- }
-}
diff --git a/bug_min.json b/test/stencils/horidiff_min.json
similarity index 100%
rename from bug_min.json
rename to test/stencils/horidiff_min.json

From b9ae43b6083c5623f1aedae96c27ea1d926ffb74 Mon Sep 17 00:00:00 2001
From: Andreas Kuster <mail@andreaskuster.ch>
Date: Mon, 13 Sep 2021 21:23:00 +0200
Subject: [PATCH 24/27] Remove local env setup

---
 vars.sh | 16 ----------------
 1 file changed, 16 deletions(-)
 delete mode 100644 vars.sh

diff --git a/vars.sh b/vars.sh
deleted file mode 100644
index 80989aa..0000000
--- a/vars.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-# intel fpga
-export INTELFPGAOCLSDKROOT=/opt/intelFPGA_pro/19.1/hld
-export PATH=$INTELFPGAOCLSDKROOT/bin/:$PATH
-export AOCL_BOARD_PACKAGE_ROOT=$INTELFPGAOCLSDKROOT/board/bittware_pcie/s10
-# /opt/intelFPGA_pro/19.4/hld/board/bittware_pcie/s10/board_env.xml
-# /opt/intelFPGA_pro/19.4/hld/board/bittware_pcie/s10_hpc_default/board_env.xml
-export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$AOCL_BOARD_PACKAGE_ROOT/linux64/lib
-
-# xilinx fpga
-export PATH=/opt/Xilinx/Vitis/2019.2/bin:/opt/Xilinx/Vitis_HLS/2019.2/bin:/opt/Xilinx/Vivado/2019.2/bin:$PATH
-export XILINX_XRT=/opt/xilinx/xrt
-export PATH=$XILINX_XRT/bin:$PATH
-export LD_LIBRARY_PATH=$XILINX_XRT/lib:$LD_LIBRARY_PATH
-export XILINXD_LICENSE_FILE=2100@sgv-license-01
-export LIBRARY_PATH=/usr/lib/x86_64-linux-gnu
-

From 86ab704c8d114a105a72a96a108a933c7558c627 Mon Sep 17 00:00:00 2001
From: Andreas Kuster <mail@andreaskuster.ch>
Date: Tue, 14 Sep 2021 13:25:39 +0200
Subject: [PATCH 25/27] Add extended horidiff example. Adjust delay buffer
 computation.

---
 stencilflow/kernel_chain_graph.py   | 24 +++++---
 test/stencils/horidiff_min_ext.json | 94 +++++++++++++++++++++++++++++
 2 files changed, 109 insertions(+), 9 deletions(-)
 create mode 100644 test/stencils/horidiff_min_ext.json

diff --git a/stencilflow/kernel_chain_graph.py b/stencilflow/kernel_chain_graph.py
index 8723851..8aad222 100644
--- a/stencilflow/kernel_chain_graph.py
+++ b/stencilflow/kernel_chain_graph.py
@@ -497,6 +497,13 @@ def compute_delay_buffer(self) -> None:
         for node in order:
             # process delay buffer (no additional delay buffer will appear because of the topological order)
             for inp in node.input_paths:
+
+                # add internal buffer latency for internal computation
+                if not isinstance(node, Output):
+                    for entry in node.input_paths[inp]:
+                        name = entry[-1]
+                        entry[2] += node.dist_to_center[name]
+
                 # compute maximum delay size per input
                 max_delay = max(node.input_paths[inp])
                 max_delay[2] += 1  # add an extra delay cycle for the processing in the kernel node
@@ -506,17 +513,16 @@ def compute_delay_buffer(self) -> None:
                     name = entry[-1]
                     max_size = stencilflow.convert_3d_to_1d(
                         dimensions=self.dimensions,
-                        index=stencilflow.list_subtract_cwise(
-                            max_delay[:-1], entry[:-1]))
+                        index=stencilflow.list_subtract_cwise(max_delay[:-1], entry[:-1]))
+                    node.delay_buffer[name] = BoundedQueue(name=name, maxsize=max_size)
+                    node.delay_buffer[name].import_data([None] * node.delay_buffer[name].maxsize)
 
-                    if not isinstance(node, Output):
-                        max_offset = node.dist_to_center[max(node.dist_to_center, key=lambda x: node.dist_to_center[x])]
-                        max_size = max_offset - node.dist_to_center[entry[-1]]
+                # remove internal buffer latency for internal computation
+                if not isinstance(node, Output):
+                    for entry in node.input_paths[inp]:
+                        name = entry[-1]
+                        entry[2] -= node.dist_to_center[name]
 
-                    node.delay_buffer[name] = BoundedQueue(name=name,
-                                                           maxsize=max_size)
-                    node.delay_buffer[name].import_data(
-                        [None] * node.delay_buffer[name].maxsize)
             # set input node delay buffers to 1
             if isinstance(node, Input):
                 node.delay_buffer = BoundedQueue(name=node.name,
diff --git a/test/stencils/horidiff_min_ext.json b/test/stencils/horidiff_min_ext.json
new file mode 100644
index 0000000..607ff61
--- /dev/null
+++ b/test/stencils/horidiff_min_ext.json
@@ -0,0 +1,94 @@
+{
+ "inputs": {
+  "inA": {
+   "data": "inA_float32.dat",
+   "data_type": "float32",
+   "input_dims": [
+    "i"
+   ]
+  }
+ },
+ "outputs": [
+  "out"
+ ],
+ "dimensions": [
+  10,
+  10,
+  10
+ ],
+ "vectorization": 1,
+ "program": {
+  "k0": {
+   "data_type": "float32",
+   "computation_string": "k0 = inA[i]",
+   "boundary_conditions": {
+      "inA": {
+          "type": "constant",
+          "value": 0.0
+      }
+   }
+  },
+  "k1": {
+   "data_type": "float32",
+   "computation_string": "k1 = inA[i]",
+   "boundary_conditions": {
+      "inA": {
+          "type": "constant",
+          "value": 0.0
+      }
+   }
+  },
+  "k2": {
+   "data_type": "float32",
+   "computation_string": "k2 = k1[i, j, k] + k0[i + 1, j, k] + k0[i, j, k]",
+   "boundary_conditions": {
+    "k1": {
+       "type": "constant",
+       "value": 0.0
+    },
+    "k0": {
+       "type": "constant",
+       "value": 0.0
+    }
+   }
+  },
+  "k3": {
+   "data_type": "float32",
+   "computation_string": "k3 = k0[i, j, k] + k4[i+1, j+1, k+1] + k4[i, j, k]",
+   "boundary_conditions": {
+    "k0": {
+       "type": "constant",
+       "value": 0.0
+    },
+    "k4": {
+       "type": "constant",
+       "value": 0.0
+    }
+   }
+  },
+   "k4": {
+   "data_type": "float32",
+   "computation_string": "k4 = k1[i, j, k] + k1[i+1, j+1, k+1]",
+   "boundary_conditions": {
+    "k1": {
+       "type": "constant",
+       "value": 0.0
+    }
+   }
+  },
+  "out": {
+   "data_type": "float32",
+   "computation_string": "out = k2[i,j,k] + k3[i,j,k]",
+   "boundary_conditions": {
+    "k2":{
+       "type": "constant",
+       "value": 0.0
+    },
+    "k3": {
+       "type": "constant",
+       "value": 0.0
+    }
+    }
+   }
+ }
+}

From d442f68c9d28a43d0e415ef75ab34376947b836e Mon Sep 17 00:00:00 2001
From: Andreas Kuster <mail@andreaskuster.ch>
Date: Wed, 15 Sep 2021 14:34:00 +0200
Subject: [PATCH 26/27] Reduce min channel depth to 1024

---
 stencilflow/sdfg_generator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stencilflow/sdfg_generator.py b/stencilflow/sdfg_generator.py
index 22629b4..87e54e5 100644
--- a/stencilflow/sdfg_generator.py
+++ b/stencilflow/sdfg_generator.py
@@ -28,7 +28,7 @@
 
 import networkx as nx
 
-MINIMUM_CHANNEL_DEPTH = 2048
+MINIMUM_CHANNEL_DEPTH = 1024
 
 NUM_BANKS = 4
 

From 6a0cf1b806622f7a2c62fb967b610c459286e94b Mon Sep 17 00:00:00 2001
From: Andreas Kuster <mail@andreaskuster.ch>
Date: Wed, 15 Sep 2021 23:13:29 +0200
Subject: [PATCH 27/27] Update dace version

---
 dace | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dace b/dace
index 1fc6ddd..e732b1d 160000
--- a/dace
+++ b/dace
@@ -1 +1 @@
-Subproject commit 1fc6dddd94ee7fd467f1802398f4dad778c9a68a
+Subproject commit e732b1d7ff83debeac9c7075f9ec78f4d5facc05