From 3e9e10eb1156be7185fe8e34bc1b25d34360443d Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Wed, 13 Nov 2024 09:33:17 +0100
Subject: [PATCH 01/11] GPU TPC: Reject clusters with too small radius during
 refit instead of giving them IFC mask errors

---
 GPU/GPUTracking/Definitions/GPUSettingsList.h |  1 +
 GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx | 26 ++++++++++++-------
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h
index 0b2da89b79ad5..106a222862f49 100644
--- a/GPU/GPUTracking/Definitions/GPUSettingsList.h
+++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -131,6 +131,7 @@ AddOptionRTC(cfNoiseSuppressionEpsilonRelative, uint8_t, 76, "", 0, "Cluster Fin
 AddOptionRTC(nWays, uint8_t, 3, "", 0, "Do N fit passes in final fit of merger")
 AddOptionRTC(nWaysOuter, int8_t, 0, "", 0, "Store outer param")
 AddOptionRTC(trackFitRejectMode, int8_t, 5, "", 0, "0: no limit on rejection or missed hits, >0: break after n rejected hits, <0: reject at max -n hits")
+AddOptionRTC(rejectIFCLowRadiusCluster, uint8_t, 0, "", 0, "Reject clusters that get the IFC mask error during refit")
 AddOptionRTC(dEdxTruncLow, uint8_t, 2, "", 0, "Low truncation threshold, fraction of 128")
 AddOptionRTC(dEdxTruncHigh, uint8_t, 77, "", 0, "High truncation threshold, fraction of 128")
 AddOptionRTC(globalTracking, int8_t, 1, "", 0, "Enable Global Tracking (prolong tracks to adjacent sectors to find short segments)")
diff --git a/GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx b/GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx
index 13244dcb4b621..0b1c282f3b2f0 100644
--- a/GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx
+++ b/GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx
@@ -68,7 +68,7 @@ GPUd() bool GPUTPCGMTrackParam::Fit(GPUTPCGMMerger* GPUrestrict() merger, int32_
   GPUTPCGMPropagator prop;
   gputpcgmmergertypes::InterpolationErrors interpolation;
   prop.SetMaterialTPC();
-  prop.SetPolynomialField(&merger->Param().polynomialField);
+  prop.SetPolynomialField(&param.polynomialField);
   prop.SetMaxSinPhi(maxSinPhi);
   prop.SetToyMCEventsFlag(param.par.toyMCEventsFlag);
   if ((clusters[0].slice < 18) == (clusters[N - 1].slice < 18)) {
@@ -157,7 +157,7 @@ GPUd() bool GPUTPCGMTrackParam::Fit(GPUTPCGMMerger* GPUrestrict() merger, int32_
       uint8_t clusterState = clusters[ihit].state;
       const float clAlpha = param.Alpha(clusters[ihit].slice);
       float xx, yy, zz;
-      if (merger->Param().par.earlyTpcTransform) {
+      if (param.par.earlyTpcTransform) {
         const float zOffset = (clusters[ihit].slice < 18) == (clusters[0].slice < 18) ? mTZOffset : -mTZOffset;
         xx = clustersXYZ[ihit].x;
         yy = clustersXYZ[ihit].y;
@@ -177,6 +177,14 @@ GPUd() bool GPUTPCGMTrackParam::Fit(GPUTPCGMMerger* GPUrestrict() merger, int32_
         continue;
       }
 
+      if (param.rec.tpc.rejectIFCLowRadiusCluster) { // optional (option default is 0): flag clusters whose radius sqrt(xx^2 + yy^2) is below the threshold computed here
+        const float r2 = xx * xx + yy * yy;
+        const float rmax = (83.5f + param.rec.tpc.sysClusErrorMinDist); // 83.5f is hard-coded; presumably the IFC radius in cm - TODO confirm and consider a named constant
+        if (r2 < rmax * rmax) {
+          MarkClusters(clusters, ihitMergeFirst, ihit, wayDirection, GPUTPCGMMergedTrackHit::flagRejectErr); // NOTE(review): only sets flagRejectErr, there is no 'continue' here, so the code below still runs for this hit - confirm this is intended
+        }
+      }
+
       const auto& cluster = clusters[ihit];
 
       bool changeDirection = (cluster.leg - lastLeg) & 1;
@@ -212,7 +220,7 @@ GPUd() bool GPUTPCGMTrackParam::Fit(GPUTPCGMMerger* GPUrestrict() merger, int32_
           continue;
         }
       } else if (allowModification && lastRow != 255 && CAMath::Abs(cluster.row - lastRow) > 1) {
-        bool dodEdx = merger->Param().par.dodEdx && merger->Param().dodEdxDownscaled && merger->Param().rec.tpc.adddEdxSubThresholdClusters && iWay == nWays - 1 && CAMath::Abs(cluster.row - lastRow) == 2 && cluster.leg == clusters[maxN - 1].leg;
+        bool dodEdx = param.par.dodEdx && param.dodEdxDownscaled && param.rec.tpc.adddEdxSubThresholdClusters && iWay == nWays - 1 && CAMath::Abs(cluster.row - lastRow) == 2 && cluster.leg == clusters[maxN - 1].leg;
         dodEdx = AttachClustersPropagate(merger, cluster.slice, lastRow, cluster.row, iTrk, cluster.leg == clusters[maxN - 1].leg, prop, inFlyDirection, GPUCA_MAX_SIN_PHI, dodEdx);
         if (dodEdx) {
           dEdx.fillSubThreshold(lastRow - 1, param);
@@ -323,7 +331,7 @@ GPUd() bool GPUTPCGMTrackParam::Fit(GPUTPCGMMerger* GPUrestrict() merger, int32_
         }
 #endif
         GPUCA_DEBUG_STREAMER_CHECK(GPUTPCGMPropagator::DebugStreamerVals debugVals;);
-        if (merger->Param().rec.tpc.rejectEdgeClustersInTrackFit && uncorrectedY > -1e6f && merger->Param().rejectEdgeClusterByY(uncorrectedY, cluster.row, CAMath::Sqrt(mC[0]))) { // uncorrectedY > -1e6f implies allowModification
+        if (param.rec.tpc.rejectEdgeClustersInTrackFit && uncorrectedY > -1e6f && param.rejectEdgeClusterByY(uncorrectedY, cluster.row, CAMath::Sqrt(mC[0]))) { // uncorrectedY > -1e6f implies allowModification
           retVal = GPUTPCGMPropagator::updateErrorEdgeCluster;
         } else {
           const float time = merger->GetConstantMem()->ioPtrs.clustersNative ? merger->GetConstantMem()->ioPtrs.clustersNative->clustersLinear[cluster.num].getTime() : -1.f;
@@ -358,11 +366,11 @@ GPUd() bool GPUTPCGMTrackParam::Fit(GPUTPCGMMerger* GPUrestrict() merger, int32_
         ihitStart = ihit;
         float dy = mP[0] - prop.Model().Y();
         float dz = mP[1] - prop.Model().Z();
-        if (CAMath::Abs(mP[4]) * merger->Param().qptB5Scaler > 10 && --resetT0 <= 0 && CAMath::Abs(mP[2]) < 0.15f && dy * dy + dz * dz > 1) {
+        if (CAMath::Abs(mP[4]) * param.qptB5Scaler > 10 && --resetT0 <= 0 && CAMath::Abs(mP[2]) < 0.15f && dy * dy + dz * dz > 1) {
           CADEBUG(printf("Reinit linearization\n"));
           prop.SetTrack(this, prop.GetAlpha());
         }
-        if (merger->Param().par.dodEdx && merger->Param().dodEdxDownscaled && iWay == nWays - 1 && cluster.leg == clusters[maxN - 1].leg && !(clusterState & GPUTPCGMMergedTrackHit::flagEdge)) {
+        if (param.par.dodEdx && param.dodEdxDownscaled && iWay == nWays - 1 && cluster.leg == clusters[maxN - 1].leg && !(clusterState & GPUTPCGMMergedTrackHit::flagEdge)) {
           float qtot = 0, qmax = 0, pad = 0, relTime = 0;
           const int32_t clusterCount = (ihit - ihitMergeFirst) * wayDirection + 1;
           for (int32_t iTmp = ihitMergeFirst; iTmp != ihit + wayDirection; iTmp += wayDirection) {
@@ -404,16 +412,16 @@ GPUd() bool GPUTPCGMTrackParam::Fit(GPUTPCGMMerger* GPUrestrict() merger, int32_
     o2::utils::DebugStreamer::instance()->getStreamer("debug_accept_track", "UPDATE") << o2::utils::DebugStreamer::instance()->getUniqueTreeName("debug_accept_track").data() << "iTrk=" << iTrk << "outerParam=" << *outerParam << "track=" << this << "ihitStart=" << ihitStart << "\n";
   })
 
-  if (!(N + NTolerated >= GPUCA_TRACKLET_SELECTOR_MIN_HITS_B5(mP[4] * merger->Param().qptB5Scaler) && 2 * NTolerated <= CAMath::Max(10, N) && CheckNumericalQuality(covYYUpd))) {
+  if (!(N + NTolerated >= GPUCA_TRACKLET_SELECTOR_MIN_HITS_B5(mP[4] * param.qptB5Scaler) && 2 * NTolerated <= CAMath::Max(10, N) && CheckNumericalQuality(covYYUpd))) {
     return false; // TODO: NTolerated should never become that large, check what is going wrong!
   }
-  if (merger->Param().rec.tpc.minNClustersFinalTrack != -1 && N + NTolerated < merger->Param().rec.tpc.minNClustersFinalTrack) {
+  if (param.rec.tpc.minNClustersFinalTrack != -1 && N + NTolerated < param.rec.tpc.minNClustersFinalTrack) {
     return false;
   }
 
   // TODO: we have looping tracks here with 0 accepted clusters in the primary leg. In that case we should refit the track using only the primary leg.
 
-  if (merger->Param().par.dodEdx && merger->Param().dodEdxDownscaled) {
+  if (param.par.dodEdx && param.dodEdxDownscaled) {
     dEdx.computedEdx(merger->OutputTracksdEdx()[iTrk], param);
   }
   Alpha = prop.GetAlpha();

From 8e2e28a2f7a47e9bc6e485b298ff815ba2e6ed58 Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Mon, 11 Nov 2024 14:11:45 +0100
Subject: [PATCH 02/11] Add empty streaming operator, so that std::cout <<
 SMatrixGPU() does not fail

---
 Common/MathUtils/include/MathUtils/SMatrixGPU.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/Common/MathUtils/include/MathUtils/SMatrixGPU.h b/Common/MathUtils/include/MathUtils/SMatrixGPU.h
index 60965a4fa2776..2bfdcf54752b2 100644
--- a/Common/MathUtils/include/MathUtils/SMatrixGPU.h
+++ b/Common/MathUtils/include/MathUtils/SMatrixGPU.h
@@ -29,6 +29,7 @@
 #include "GPUCommonMath.h"
 #include "GPUCommonAlgorithm.h"
 #include "GPUCommonLogger.h"
+#include "GPUCommonTypeTraits.h"
 
 namespace o2::math_utils::detail
 {
@@ -468,6 +469,9 @@ class SMatrixGPU
   GPUd() const T& operator()(unsigned int i, unsigned int j) const;
   GPUd() T& operator()(unsigned int i, unsigned int j);
 
+  template <typename Y, typename X>
+  GPUd() friend X& operator<<(Y& y, const SMatrixGPU&); // NOTE(review): X appears only in the return type (not deducible) and this template head differs from the free operator<< defined after the class, so this looks like a separate, never-defined template - confirm the friend declaration is needed
+
   class SMatrixRowGPU
   {
    public:
@@ -512,6 +516,13 @@ class SMatrixGPU
   R mRep;
 };
 
+template <class T, unsigned int D1, unsigned int D2, class R, typename Y, typename X = Y>
+  requires(sizeof(typename X::traits_type::pos_type) != 0) // only participate for stream types that have traits_type::pos_type (a member type of all std::ostream classes), so that fair::Logger etc. do not pick up this overload
+GPUd() X& operator<<(Y& y, const SMatrixGPU<T, D1, D2, R>&)
+{
+  return y; // intentionally writes nothing: this overload only exists so that streaming an SMatrixGPU (e.g. std::cout << SMatrixGPU()) compiles
+}
+
 template <class T, unsigned int D1, unsigned int D2, class R>
 GPUdi() SMatrixGPU<T, D1, D2, R>::SMatrixGPU(SMatrixIdentity)
 {

From 5e7a6b0c15fa2d90d46888ea44221d2db227c643 Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Mon, 11 Nov 2024 14:12:30 +0100
Subject: [PATCH 03/11] GPU: Fix includes of certain headers (fix order, avoid
 ROOT in GPU code)

---
 GPU/GPUTracking/DataTypes/CalibdEdxContainer.cxx           | 5 ++---
 GPU/GPUTracking/DataTypes/CalibdEdxTrackTopologyPol.cxx    | 1 +
 GPU/GPUTracking/DataTypes/CalibdEdxTrackTopologySpline.cxx | 3 +--
 GPU/GPUTracking/DataTypes/CalibdEdxTrackTopologySpline.h   | 2 +-
 GPU/GPUTracking/Interface/GPUO2InterfaceQA.cxx             | 2 +-
 GPU/GPUTracking/Refit/GPUTrackingRefitKernel.cxx           | 2 +-
 6 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/GPU/GPUTracking/DataTypes/CalibdEdxContainer.cxx b/GPU/GPUTracking/DataTypes/CalibdEdxContainer.cxx
index a632bf361498c..002bb1ed9e9d7 100644
--- a/GPU/GPUTracking/DataTypes/CalibdEdxContainer.cxx
+++ b/GPU/GPUTracking/DataTypes/CalibdEdxContainer.cxx
@@ -12,14 +12,13 @@
 /// \file  CalibdEdxContainer.cxx
 /// \author Matthias Kleiner <mkleiner@ikf.uni-frankfurt.de>
 
-#include "CalibdEdxContainer.h"
-
-#if !defined(GPUCA_GPUCODE) && !defined(GPUCA_STANDALONE)
+#if !defined(GPUCA_STANDALONE)
 #include "TFile.h"
 #include "TPCBase/CalDet.h"
 #include "Framework/Logger.h"
 #include "clusterFinderDefs.h"
 #endif
+#include "CalibdEdxContainer.h"
 
 using namespace GPUCA_NAMESPACE::gpu;
 using namespace o2::tpc;
diff --git a/GPU/GPUTracking/DataTypes/CalibdEdxTrackTopologyPol.cxx b/GPU/GPUTracking/DataTypes/CalibdEdxTrackTopologyPol.cxx
index 548bbafae686d..533763e14c6d7 100644
--- a/GPU/GPUTracking/DataTypes/CalibdEdxTrackTopologyPol.cxx
+++ b/GPU/GPUTracking/DataTypes/CalibdEdxTrackTopologyPol.cxx
@@ -9,6 +9,7 @@
 // granted to it by virtue of its status as an Intergovernmental Organization
 // or submit itself to any jurisdiction.
 
+#include "Rtypes.h"
 #include "CalibdEdxTrackTopologyPol.h"
 
 #include <cstddef>
diff --git a/GPU/GPUTracking/DataTypes/CalibdEdxTrackTopologySpline.cxx b/GPU/GPUTracking/DataTypes/CalibdEdxTrackTopologySpline.cxx
index 4c6e750355397..3b0e718026536 100644
--- a/GPU/GPUTracking/DataTypes/CalibdEdxTrackTopologySpline.cxx
+++ b/GPU/GPUTracking/DataTypes/CalibdEdxTrackTopologySpline.cxx
@@ -14,11 +14,10 @@
 ///
 /// \author  Matthias Kleiner <matthias.kleiner@cern.ch>
 
-#include "CalibdEdxTrackTopologySpline.h"
-
 #if !defined(GPUCA_STANDALONE)
 #include "TFile.h"
 #endif
+#include "CalibdEdxTrackTopologySpline.h"
 
 using namespace GPUCA_NAMESPACE::gpu;
 using namespace o2::tpc;
diff --git a/GPU/GPUTracking/DataTypes/CalibdEdxTrackTopologySpline.h b/GPU/GPUTracking/DataTypes/CalibdEdxTrackTopologySpline.h
index 563872fb90d4d..d9d4b9e35592d 100644
--- a/GPU/GPUTracking/DataTypes/CalibdEdxTrackTopologySpline.h
+++ b/GPU/GPUTracking/DataTypes/CalibdEdxTrackTopologySpline.h
@@ -19,12 +19,12 @@
 
 #include "FlatObject.h"
 #include "Spline.h"
+#include "GPUCommonRtypes.h"
 #ifdef GPUCA_HAVE_O2HEADERS
 #include "DataFormatsTPC/Defs.h"
 #endif
 
 #if !defined(GPUCA_GPUCODE) && !defined(GPUCA_STANDALONE) // code invisible on GPU and in the standalone compilation
-#include "Rtypes.h"                                       // for ClassDefNV
 #include <fmt/format.h>
 #endif
 
diff --git a/GPU/GPUTracking/Interface/GPUO2InterfaceQA.cxx b/GPU/GPUTracking/Interface/GPUO2InterfaceQA.cxx
index db6df3f9f1ede..7005fbb3bab25 100644
--- a/GPU/GPUTracking/Interface/GPUO2InterfaceQA.cxx
+++ b/GPU/GPUTracking/Interface/GPUO2InterfaceQA.cxx
@@ -12,11 +12,11 @@
 /// \file GPUO2InterfaceQA.cxx
 /// \author David Rohr
 
+#include "TGraphAsymmErrors.h"
 #include "GPUParam.h"
 #include "GPUQA.h"
 #include "GPUO2InterfaceConfiguration.h"
 #include "GPUO2InterfaceQA.h"
-#include "TGraphAsymmErrors.h"
 
 using namespace o2::gpu;
 using namespace o2::tpc;
diff --git a/GPU/GPUTracking/Refit/GPUTrackingRefitKernel.cxx b/GPU/GPUTracking/Refit/GPUTrackingRefitKernel.cxx
index 6baea86f05d36..f7e3bca47a0fc 100644
--- a/GPU/GPUTracking/Refit/GPUTrackingRefitKernel.cxx
+++ b/GPU/GPUTracking/Refit/GPUTrackingRefitKernel.cxx
@@ -12,9 +12,9 @@
 /// \file GPUTrackingRefitKernel.cxx
 /// \author David Rohr
 
+#include "GPUROOTDump.h"
 #include "GPUTrackingRefitKernel.h"
 #include "GPUTrackingRefit.h"
-#include "GPUROOTDump.h"
 
 using namespace GPUCA_NAMESPACE::gpu;
 

From 71faa853dba5198907cfca9c06feb37f6b5335e6 Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Mon, 11 Nov 2024 14:36:09 +0100
Subject: [PATCH 04/11] FST: Force correct number of orbits to gpu-reco

---
 prodtests/full_system_test.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/prodtests/full_system_test.sh b/prodtests/full_system_test.sh
index f8b6d66ce87e4..8d6a0ca3cf1f9 100755
--- a/prodtests/full_system_test.sh
+++ b/prodtests/full_system_test.sh
@@ -227,6 +227,7 @@ if [[ ${RANS_OPT:-} =~ (--ans-version +)(compat) ]] ; then
   # for decoding we use either just produced or externally provided common local file
   export ARGS_EXTRA_PROCESS_o2_ctf_reader_workflow+="--ctf-dict $CTFDICTFILE"
 fi
+export CONFIG_EXTRA_PROCESS_o2_gpu_reco_workflow+="GPU_global.overrideNHbfPerTF=$NHBPERTF;"
 
 for STAGE in $STAGES; do
   logfile=reco_${STAGE}.log

From e9587cf6137569d7c6994abebd09cb288103a40a Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Mon, 11 Nov 2024 14:36:55 +0100
Subject: [PATCH 05/11] Calibration aggregator-workflow.sh: Update default
 lanes/threads for TPC IDC calib

---
 prodtests/full-system-test/aggregator-workflow.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/prodtests/full-system-test/aggregator-workflow.sh b/prodtests/full-system-test/aggregator-workflow.sh
index 4c20e901a2978..23336cafffab8 100755
--- a/prodtests/full-system-test/aggregator-workflow.sh
+++ b/prodtests/full-system-test/aggregator-workflow.sh
@@ -295,8 +295,8 @@ fi
 
 # TPC IDCs and SAC
 crus="0-359"  # to be used with $AGGREGATOR_TASKS == TPC_IDCBOTH_SAC or ALL
-lanesFactorize=${O2_TPC_IDC_FACTORIZE_NLANES:-10}
-threadFactorize=${O2_TPC_IDC_FACTORIZE_NTHREADS:-8}
+lanesFactorize=${O2_TPC_IDC_FACTORIZE_NLANES:-12}
+threadFactorize=${O2_TPC_IDC_FACTORIZE_NTHREADS:-16}
 nTFs=$((1000 * 128 / ${NHBPERTF}))
 nTFs_SAC=$((1000 * 128 / ${NHBPERTF}))
 nBuffer=$((100 * 128 / ${NHBPERTF}))

From b162faae42c624387ed0f2e156361ff9d10e229e Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Mon, 11 Nov 2024 18:06:24 +0100
Subject: [PATCH 06/11] GPU Display: Fix race condition

---
 GPU/GPUTracking/display/render/GPUDisplayDraw.cxx | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx b/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx
index ab7ebf6811766..746c41938e2e1 100644
--- a/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx
+++ b/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx
@@ -27,6 +27,7 @@
 #include "GPUTPCGMPropagator.h"
 #include "GPUTPCMCInfo.h"
 #include "GPUParam.inc"
+#include "GPUCommonMath.h"
 
 #include <type_traits>
 
@@ -66,8 +67,12 @@ inline void GPUDisplay::insertVertexList(int32_t iSlice, size_t first, size_t la
 inline void GPUDisplay::drawPointLinestrip(int32_t iSlice, int32_t cid, int32_t id, int32_t id_limit)
 {
   mVertexBuffer[iSlice].emplace_back(mGlobalPos[cid].x, mGlobalPos[cid].y * mYFactor, mCfgH.projectXY ? 0 : mGlobalPos[cid].z);
-  if (mGlobalPos[cid].w < id_limit) {
-    mGlobalPos[cid].w = id;
+  // Store id into mGlobalPos[cid].w, but only while the value currently stored there is still below id_limit.
+  // AtomicCAS presumably reports failure when w changed since it was read (e.g. written by another thread); then w is reloaded and re-checked.
+  for (float curVal = mGlobalPos[cid].w; curVal < id_limit; curVal = mGlobalPos[cid].w) {
+    if (GPUCommonMath::AtomicCAS(&mGlobalPos[cid].w, curVal, (float)id)) {
+      break;
+    }
   }
 }
 

From d7e0151da5af30685923def5190fd5f4c4466ffe Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Wed, 13 Nov 2024 09:30:47 +0100
Subject: [PATCH 07/11] GPU: Split NDPiecewisePolynomials in header and inc
 file, get rid of ROOT in the header

---
 .../DataTypes/CalibdEdxTrackTopologyPol.cxx   |   1 +
 .../NDPiecewisePolynomials.h                  | 351 +++---------------
 .../NDPiecewisePolynomials.inc                | 276 ++++++++++++++
 .../test/testMultivarPolynomials.cxx          |   2 +-
 4 files changed, 330 insertions(+), 300 deletions(-)
 create mode 100644 GPU/TPCFastTransformation/NDPiecewisePolynomials.inc

diff --git a/GPU/GPUTracking/DataTypes/CalibdEdxTrackTopologyPol.cxx b/GPU/GPUTracking/DataTypes/CalibdEdxTrackTopologyPol.cxx
index 533763e14c6d7..47a6e4cff72df 100644
--- a/GPU/GPUTracking/DataTypes/CalibdEdxTrackTopologyPol.cxx
+++ b/GPU/GPUTracking/DataTypes/CalibdEdxTrackTopologyPol.cxx
@@ -21,6 +21,7 @@
 using namespace o2::tpc;
 
 #if !defined(GPUCA_GPUCODE) && !defined(GPUCA_STANDALONE) // code invisible on GPU and in the standalone compilation
+#include "NDPiecewisePolynomials.inc"
 void CalibdEdxTrackTopologyPol::dumpToTree(const uint32_t nSamplingPoints[/* Dim */], const char* outName) const
 {
   for (uint32_t i = 0; i < FFits; i++) {
diff --git a/GPU/TPCFastTransformation/NDPiecewisePolynomials.h b/GPU/TPCFastTransformation/NDPiecewisePolynomials.h
index 6de2bc7afbae8..9498645b76220 100644
--- a/GPU/TPCFastTransformation/NDPiecewisePolynomials.h
+++ b/GPU/TPCFastTransformation/NDPiecewisePolynomials.h
@@ -20,17 +20,12 @@
 #include "MultivariatePolynomialHelper.h"
 #include "GPUCommonMath.h"
 
-#if !defined(GPUCA_GPUCODE)
+#if !defined(GPUCA_GPUCODE) && !defined(GPUCA_STANDALONE)
 #include <vector>
-#if !defined(GPUCA_STANDALONE)
-#include "TLinearFitter.h"
-#ifndef GPUCA_ALIROOT_LIB
-#include "CommonUtils/TreeStreamRedirector.h"
-#endif
-#include <TFile.h>
-#endif
 #endif
 
+class TFile;
+
 namespace GPUCA_NAMESPACE::gpu
 {
 
@@ -81,23 +76,20 @@ template <uint32_t Dim, uint32_t Degree, bool InteractionOnly>
 class NDPiecewisePolynomials : public FlatObject
 {
  public:
-#ifndef GPUCA_GPUCODE
+#if !defined(GPUCA_GPUCODE) && !defined(GPUCA_STANDALONE)
   /// constructor
   /// \param min minimum coordinates of the grid
   /// \param max maximum coordinates of the grid (note: the resulting polynomials can NOT be evaluated at the maximum coordinates: only at min <= X < max)
   /// \param n number of vertices: defines number of fits per dimension: nFits = n - 1. n should be at least 2 to perform one fit
   NDPiecewisePolynomials(const float min[/* Dim */], const float max[/* Dim */], const uint32_t n[/* Dim */]) { init(min, max, n); }
-#endif
-#if !defined(GPUCA_GPUCODE) && !defined(GPUCA_STANDALONE)
   /// constructor construct and object by initializing it from an object stored in a Root file
   /// \param fileName name of the file
   /// \param name name of the object
   NDPiecewisePolynomials(const char* fileName, const char* name)
   {
-    TFile f(fileName, "READ");
-    loadFromFile(f, name);
+    loadFromFile(fileName, name);
   };
-#endif
+#endif // !defined(GPUCA_GPUCODE) && !defined(GPUCA_STANDALONE)
   /// default constructor
   NDPiecewisePolynomials() CON_DEFAULT;
 
@@ -115,7 +107,7 @@ class NDPiecewisePolynomials : public FlatObject
   /// move flat buffer to new location
   /// \param newBufferPtr new buffer location
   void moveBufferTo(char* newBufferPtr);
-#endif
+#endif // !defined(GPUCA_GPUCODE)
 
   /// destroy the object (release internal flat buffer)
   void destroy();
@@ -168,17 +160,16 @@ class NDPiecewisePolynomials : public FlatObject
   /// \return returns the parameters of the coefficients
   GPUd() const float* getParams() const { return mParams; }
 
-#if !defined(GPUCA_GPUCODE)
-  /// Setting directly the parameters of the polynomials
-  void setParams(const float params[/* getNParameters() */]) { std::copy(params, params + getNParameters(), mParams); }
-
   /// initalize the members
   /// \param min minimum coordinates of the grid
   /// \param max maximum coordinates of the grid (note: the resulting polynomials can NOT be evaluated at the maximum coordinates: only at min <= X < max)
   /// \param n number of vertices: defines number of fits per dimension: nFits = n - 1. n should be at least 2 to perform one fit
   void init(const float min[/* Dim */], const float max[/* Dim */], const uint32_t n[/* Dim */]);
 
-#ifndef GPUCA_STANDALONE
+#if !defined(GPUCA_GPUCODE) && !defined(GPUCA_STANDALONE)
+  /// Setting directly the parameters of the polynomials
+  void setParams(const float params[/* getNParameters() */]) { std::copy(params, params + getNParameters(), mParams); }
+
   /// perform the polynomial fits on the grid
   /// \param func function which returns for every input x on the defined grid the true value
   /// \param nAuxiliaryPoints number of points which will be used for the fits (should be at least 2)
@@ -194,6 +185,8 @@ class NDPiecewisePolynomials : public FlatObject
   /// \param name name of the object in the file
   void loadFromFile(TFile& inpf, const char* name);
 
+  void loadFromFile(const char* fileName, const char* name);
+
   /// write parameters to file
   /// \param outf output file
   /// \param name name of the output object
@@ -211,7 +204,6 @@ class NDPiecewisePolynomials : public FlatObject
 
   /// \return returns total number of polynomial fits
   uint32_t getNPolynomials() const;
-#endif
 
   /// converts the class to a container which can be written to a root file
   NDPiecewisePolynomialContainer getContainer() const { return NDPiecewisePolynomialContainer{Dim, Degree, getNParameters(), mParams, InteractionOnly, mMin, mMax, mN}; }
@@ -219,10 +211,10 @@ class NDPiecewisePolynomials : public FlatObject
   /// set the parameters from NDPiecewisePolynomialContainer
   /// \param container container for the parameters
   void setFromContainer(const NDPiecewisePolynomialContainer& container);
+#endif // !defined(GPUCA_GPUCODE) && !defined(GPUCA_STANDALONE)
 
   /// \return returns the total number of stored parameters
   uint32_t getNParameters() const { return getNPolynomials() * MultivariatePolynomialParametersHelper::getNParameters(Degree, Dim, InteractionOnly); }
-#endif
 
   /// \return returns number of dimensions of the polynomials
   GPUd() static constexpr uint32_t getDim() { return Dim; }
@@ -292,15 +284,15 @@ class NDPiecewisePolynomials : public FlatObject
   /// \param ix index
   /// \param dim dimension
   double getVertexPosition(const uint32_t ix, const int32_t dim) const { return ix / static_cast<double>(mInvSpacing[dim]) + mMin[dim]; }
-#endif
+#endif // !defined(GPUCA_GPUCODE) && !defined(GPUCA_STANDALONE)
 
 #if !defined(GPUCA_GPUCODE)
   /// \return returns the size of the parameters
   std::size_t sizeOfParameters() const { return getNParameters() * sizeof(DataTParams); }
+#endif // #if !defined(GPUCA_GPUCODE)
 
   // construct the object (flatbuffer)
   void construct();
-#endif
 
 #ifndef GPUCA_ALIROOT_LIB
   ClassDefNV(NDPiecewisePolynomials, 1);
@@ -313,20 +305,6 @@ class NDPiecewisePolynomials : public FlatObject
 
 #if !defined(GPUCA_GPUCODE) && !defined(GPUCA_STANDALONE)
 template <uint32_t Dim, uint32_t Degree, bool InteractionOnly>
-void NDPiecewisePolynomials<Dim, Degree, InteractionOnly>::loadFromFile(TFile& inpf, const char* name)
-{
-  NDPiecewisePolynomialContainer* gridTmp = nullptr;
-  inpf.GetObject(name, gridTmp);
-  if (gridTmp) {
-    setFromContainer(*gridTmp);
-    delete gridTmp;
-  } else {
-#ifndef GPUCA_ALIROOT_LIB
-    LOGP(info, "couldnt load object {} from input file", name);
-#endif
-  }
-}
-template <uint32_t Dim, uint32_t Degree, bool InteractionOnly>
 void NDPiecewisePolynomials<Dim, Degree, InteractionOnly>::setFromContainer(const NDPiecewisePolynomialContainer& container)
 {
   if (Dim != container.mDim) {
@@ -350,12 +328,6 @@ void NDPiecewisePolynomials<Dim, Degree, InteractionOnly>::setFromContainer(cons
   init(container.mMin.data(), container.mMax.data(), container.mN.data());
   setParams(container.mParams.data());
 }
-template <uint32_t Dim, uint32_t Degree, bool InteractionOnly>
-void NDPiecewisePolynomials<Dim, Degree, InteractionOnly>::writeToFile(TFile& outf, const char* name) const
-{
-  const NDPiecewisePolynomialContainer cont = getContainer();
-  outf.WriteObject(&cont, name);
-}
 
 template <uint32_t Dim, uint32_t Degree, bool InteractionOnly>
 void NDPiecewisePolynomials<Dim, Degree, InteractionOnly>::setDefault()
@@ -368,7 +340,29 @@ void NDPiecewisePolynomials<Dim, Degree, InteractionOnly>::setDefault()
     std::copy(params.begin(), params.end(), &mParams[i * nParamsPerPol]);
   }
 }
-#endif
+
+template <uint32_t Dim, uint32_t Degree, bool InteractionOnly>
+uint32_t NDPiecewisePolynomials<Dim, Degree, InteractionOnly>::getNPolynomials() const
+{
+  uint32_t nP = getNPolynomials(0); // total number of polynomials = product of the per-dimension counts getNPolynomials(dim) over all Dim dimensions
+  for (uint32_t i = 1; i < Dim; ++i) {
+    nP *= getNPolynomials(i);
+  }
+  return nP;
+}
+
+template <uint32_t Dim, uint32_t Degree, bool InteractionOnly>
+void NDPiecewisePolynomials<Dim, Degree, InteractionOnly>::checkPos(const uint32_t iMax[/* Dim */], int32_t pos[/* Dim + 1 */]) const
+{
+  for (uint32_t i = 0; i < Dim; ++i) {
+    if (pos[i] == int32_t(iMax[i])) { // counter i reached its limit: carry into counter i + 1 (this is pos[Dim] for the last dimension, hence Dim + 1 entries) and reset counters 0..i to zero
+      ++pos[i + 1];
+      std::fill_n(pos, i + 1, 0);
+    }
+  }
+}
+
+#endif // !defined(GPUCA_GPUCODE) && !defined(GPUCA_STANDALONE)
 
 #ifndef GPUCA_GPUCODE
 template <uint32_t Dim, uint32_t Degree, bool InteractionOnly>
@@ -405,7 +399,19 @@ void NDPiecewisePolynomials<Dim, Degree, InteractionOnly>::construct()
   FlatObject::finishConstruction(flatbufferSize);
   mParams = reinterpret_cast<DataTParams*>(mFlatBufferPtr);
 }
-#endif
+
+template <uint32_t Dim, uint32_t Degree, bool InteractionOnly>
+void NDPiecewisePolynomials<Dim, Degree, InteractionOnly>::init(const float min[], const float max[], const uint32_t n[])
+{
+  for (uint32_t i = 0; i < Dim; ++i) {
+    mMin[i] = min[i];
+    mMax[i] = max[i];
+    mN[i] = n[i];
+    mInvSpacing[i] = (mN[i] - 1) / (mMax[i] - mMin[i]); // n vertices span n - 1 intervals between min and max; NOTE(review): nothing here guards against max == min or n == 0
+  }
+  construct();
+}
+#endif // !GPUCA_GPUCODE
 
 template <uint32_t Dim, uint32_t Degree, bool InteractionOnly>
 void NDPiecewisePolynomials<Dim, Degree, InteractionOnly>::destroy()
@@ -472,259 +478,6 @@ GPUdi() void NDPiecewisePolynomials<Dim, Degree, InteractionOnly>::clamp(float x
   }
 }
 
-#ifndef GPUCA_GPUCODE
-template <uint32_t Dim, uint32_t Degree, bool InteractionOnly>
-void NDPiecewisePolynomials<Dim, Degree, InteractionOnly>::init(const float min[], const float max[], const uint32_t n[])
-{
-  for (uint32_t i = 0; i < Dim; ++i) {
-    mMin[i] = min[i];
-    mMax[i] = max[i];
-    mN[i] = n[i];
-    mInvSpacing[i] = (mN[i] - 1) / (mMax[i] - mMin[i]);
-  }
-  construct();
-}
-#endif
-
-#if !defined(GPUCA_GPUCODE) && !defined(GPUCA_STANDALONE)
-template <uint32_t Dim, uint32_t Degree, bool InteractionOnly>
-uint32_t NDPiecewisePolynomials<Dim, Degree, InteractionOnly>::getNPolynomials() const
-{
-  uint32_t nP = getNPolynomials(0);
-  for (uint32_t i = 1; i < Dim; ++i) {
-    nP *= getNPolynomials(i);
-  }
-  return nP;
-}
-
-template <uint32_t Dim, uint32_t Degree, bool InteractionOnly>
-void NDPiecewisePolynomials<Dim, Degree, InteractionOnly>::checkPos(const uint32_t iMax[/* Dim */], int32_t pos[/* Dim */]) const
-{
-  for (uint32_t i = 0; i < Dim; ++i) {
-    if (pos[i] == int32_t(iMax[i])) {
-      ++pos[i + 1];
-      std::fill_n(pos, i + 1, 0);
-    }
-  }
-}
-
-template <uint32_t Dim, uint32_t Degree, bool InteractionOnly>
-void NDPiecewisePolynomials<Dim, Degree, InteractionOnly>::performFits(const std::function<double(const double x[/* Dim */])>& func, const uint32_t nAuxiliaryPoints[/* Dim */])
-{
-  const int32_t nTotalFits = getNPolynomials();
-#ifndef GPUCA_ALIROOT_LIB
-  LOGP(info, "Perform fitting of {}D-Polynomials of degree {} for a total of {} fits.", Dim, Degree, nTotalFits);
-#endif
-
-  MultivariatePolynomialHelper<0, 0, false> pol(Dim, Degree, InteractionOnly);
-  TLinearFitter fitter = pol.getTLinearFitter();
-
-  uint32_t nPoints = 1;
-  for (uint32_t i = 0; i < Dim; ++i) {
-    nPoints *= nAuxiliaryPoints[i];
-  }
-
-  std::vector<double> xCords;
-  std::vector<double> response;
-  xCords.reserve(Dim * nPoints);
-  response.reserve(nPoints);
-
-  uint32_t nPolynomials[Dim]{0};
-  for (uint32_t i = 0; i < Dim; ++i) {
-    nPolynomials[i] = getNPolynomials(i);
-  }
-
-  int32_t pos[Dim + 1]{0};
-  uint32_t counter = 0;
-  const int32_t printDebugForNFits = int32_t(nTotalFits / 20) + 1;
-
-  for (;;) {
-    const bool debug = !(++counter % printDebugForNFits);
-    if (debug) {
-#ifndef GPUCA_ALIROOT_LIB
-      LOGP(info, "Performing fit {} out of {}", counter, nTotalFits);
-#endif
-    }
-
-    checkPos(nPolynomials, pos);
-
-    if (pos[Dim] == 1) {
-      break;
-    }
-
-    xCords.clear();
-    response.clear();
-    fitInnerGrid(func, nAuxiliaryPoints, pos, fitter, xCords, response);
-    ++pos[0];
-  }
-}
-
-template <uint32_t Dim, uint32_t Degree, bool InteractionOnly>
-void NDPiecewisePolynomials<Dim, Degree, InteractionOnly>::performFits(const std::vector<float>& x, const std::vector<float>& y)
-{
-  const int32_t nTotalFits = getNPolynomials();
-#ifndef GPUCA_ALIROOT_LIB
-  LOGP(info, "Perform fitting of {}D-Polynomials of degree {} for a total of {} fits.", Dim, Degree, nTotalFits);
-#endif
-
-  // approximate number of points
-  uint32_t nPoints = 2 * y.size() / nTotalFits;
-
-  // polynomial index -> indices to datapoints
-  std::unordered_map<int32_t, std::vector<size_t>> dataPointsIndices;
-  for (int32_t i = 0; i < nTotalFits; ++i) {
-    dataPointsIndices[i].reserve(nPoints);
-  }
-
-  // check for each data point which polynomial to use
-  for (size_t i = 0; i < y.size(); ++i) {
-    std::array<int32_t, Dim> index;
-    float xVal[Dim];
-    std::copy(x.begin() + i * Dim, x.begin() + i * Dim + Dim, xVal);
-    setIndex<Dim - 1>(xVal, index.data());
-
-    std::array<int32_t, Dim> indexClamped{index};
-    clamp<Dim - 1>(xVal, indexClamped.data());
-
-    // check if data points are in the grid
-    if (index == indexClamped) {
-      // index of the polyniomial
-      const uint32_t idx = getDataIndex(index.data()) / MultivariatePolynomialParametersHelper::getNParameters(Degree, Dim, InteractionOnly);
-
-      // store index to data point
-      dataPointsIndices[idx].emplace_back(i);
-    }
-  }
-
-  // for fitting
-  MultivariatePolynomialHelper<0, 0, false> pol(Dim, Degree, InteractionOnly);
-  TLinearFitter fitter = pol.getTLinearFitter();
-
-  uint32_t counter = 0;
-  const int32_t printDebugForNFits = int32_t(nTotalFits / 20) + 1;
-
-  // temp storage for x and y values for fitting
-  std::vector<double> xCords;
-  std::vector<double> response;
-
-  for (int32_t i = 0; i < nTotalFits; ++i) {
-    const bool debug = !(++counter % printDebugForNFits);
-    if (debug) {
-#ifndef GPUCA_ALIROOT_LIB
-      LOGP(info, "Performing fit {} out of {}", counter, nTotalFits);
-#endif
-    }
-
-    // store values for fitting
-    if (dataPointsIndices[i].empty()) {
-#ifndef GPUCA_ALIROOT_LIB
-      LOGP(info, "No data points to fit");
-#endif
-      continue;
-    }
-
-    const auto nP = dataPointsIndices[i].size();
-    xCords.reserve(Dim * nP);
-    response.reserve(nP);
-    xCords.clear();
-    response.clear();
-
-    // add datapoints to fit
-    for (size_t j = 0; j < nP; ++j) {
-      const size_t idxOrig = dataPointsIndices[i][j];
-
-      // insert x values at the end of xCords
-      const int32_t idxXStart = idxOrig * Dim;
-      xCords.insert(xCords.end(), x.begin() + idxXStart, x.begin() + idxXStart + Dim);
-      response.emplace_back(y[idxOrig]);
-    }
-
-    // perform the fit on the points TODO make errors configurable
-    std::vector<double> error;
-    const auto params = MultivariatePolynomialHelper<0, 0, false>::fit(fitter, xCords, response, error, true);
-
-    // store parameters
-    std::copy(params.begin(), params.end(), &mParams[i * MultivariatePolynomialParametersHelper::getNParameters(Degree, Dim, InteractionOnly)]);
-  }
-}
-
-template <uint32_t Dim, uint32_t Degree, bool InteractionOnly>
-void NDPiecewisePolynomials<Dim, Degree, InteractionOnly>::fitInnerGrid(const std::function<double(const double x[/* Dim */])>& func, const uint32_t nAuxiliaryPoints[/* Dim */], const int32_t currentIndex[/* Dim */], TLinearFitter& fitter, std::vector<double>& xCords, std::vector<double>& response)
-{
-  int32_t pos[Dim + 1]{0};
-
-  // add points which will be used for the fit
-  for (;;) {
-    checkPos(nAuxiliaryPoints, pos);
-
-    if (pos[Dim] == 1) {
-      break;
-    }
-
-    for (uint32_t iDim = 0; iDim < Dim; ++iDim) {
-      const double stepWidth = getStepWidth(iDim, nAuxiliaryPoints[iDim]);
-      const double vertexPos = getVertexPosition(currentIndex[iDim], iDim);
-      const double realPosTmp = vertexPos + pos[iDim] * stepWidth;
-      xCords.emplace_back(realPosTmp);
-    }
-
-    // get response for last added points
-    const double responseTmp = func(&xCords[xCords.size() - Dim]);
-    response.emplace_back(responseTmp);
-    ++pos[0];
-  }
-
-  // perform the fit on the points TODO make errors configurable
-  std::vector<double> error;
-  const auto params = MultivariatePolynomialHelper<0, 0, false>::fit(fitter, xCords, response, error, true);
-
-  // store parameters
-  const uint32_t index = getDataIndex(currentIndex);
-  std::copy(params.begin(), params.end(), &mParams[index]);
-}
-
-#ifndef GPUCA_ALIROOT_LIB
-template <uint32_t Dim, uint32_t Degree, bool InteractionOnly>
-void NDPiecewisePolynomials<Dim, Degree, InteractionOnly>::dumpToTree(const uint32_t nSamplingPoints[/* Dim */], const char* outName, const char* treeName, const bool recreateFile) const
-{
-  o2::utils::TreeStreamRedirector pcstream(outName, recreateFile ? "RECREATE" : "UPDATE");
-
-  double factor[Dim]{};
-  for (uint32_t iDim = 0; iDim < Dim; ++iDim) {
-    factor[iDim] = (mMax[iDim] - mMin[iDim]) / (nSamplingPoints[iDim] - 1);
-  }
-
-  std::vector<float> x(Dim);
-  std::vector<uint32_t> ix(Dim);
-  int32_t pos[Dim + 1]{0};
-
-  for (;;) {
-    checkPos(nSamplingPoints, pos);
-
-    if (pos[Dim] == 1) {
-      break;
-    }
-
-    for (uint32_t iDim = 0; iDim < Dim; ++iDim) {
-      ix[iDim] = pos[iDim];
-      x[iDim] = mMin[iDim] + pos[iDim] * factor[iDim];
-    }
-
-    float value = eval(x.data());
-    pcstream << treeName
-             << "ix=" << ix
-             << "x=" << x
-             << "value=" << value
-             << "\n";
-
-    ++pos[0];
-  }
-  pcstream.Close();
-}
-#endif
-
-#endif
-
 } // namespace GPUCA_NAMESPACE::gpu
 
 #endif
diff --git a/GPU/TPCFastTransformation/NDPiecewisePolynomials.inc b/GPU/TPCFastTransformation/NDPiecewisePolynomials.inc
new file mode 100644
index 0000000000000..d7bb9d702e96f
--- /dev/null
+++ b/GPU/TPCFastTransformation/NDPiecewisePolynomials.inc
@@ -0,0 +1,276 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file NDPiecewisePolynomials.inc
+/// \author Matthias Kleiner <mkleiner@ikf.uni-frankfurt.de>
+
+#ifndef ALICEO2_TPC_NDPIECEWISEPOLYNOMIALS_INC
+#define ALICEO2_TPC_NDPIECEWISEPOLYNOMIALS_INC
+
+#include <TLinearFitter.h>
+#include <TFile.h>
+#include "CommonUtils/TreeStreamRedirector.h"
+#include "NDPiecewisePolynomials.h"
+
+namespace GPUCA_NAMESPACE::gpu
+{
+
+#ifndef GPUCA_ALIROOT_LIB
+template <uint32_t Dim, uint32_t Degree, bool InteractionOnly>
+void NDPiecewisePolynomials<Dim, Degree, InteractionOnly>::dumpToTree(const uint32_t nSamplingPoints[/* Dim */], const char* outName, const char* treeName, const bool recreateFile) const
+{
+  o2::utils::TreeStreamRedirector pcstream(outName, recreateFile ? "RECREATE" : "UPDATE");
+
+  double factor[Dim]{};
+  for (uint32_t iDim = 0; iDim < Dim; ++iDim) {
+    factor[iDim] = (mMax[iDim] - mMin[iDim]) / (nSamplingPoints[iDim] - 1);
+  }
+
+  std::vector<float> x(Dim);
+  std::vector<uint32_t> ix(Dim);
+  int32_t pos[Dim + 1]{0};
+
+  for (;;) {
+    checkPos(nSamplingPoints, pos);
+
+    if (pos[Dim] == 1) {
+      break;
+    }
+
+    for (uint32_t iDim = 0; iDim < Dim; ++iDim) {
+      ix[iDim] = pos[iDim];
+      x[iDim] = mMin[iDim] + pos[iDim] * factor[iDim];
+    }
+
+    float value = eval(x.data());
+    pcstream << treeName
+             << "ix=" << ix
+             << "x=" << x
+             << "value=" << value
+             << "\n";
+
+    ++pos[0];
+  }
+  pcstream.Close();
+}
+#endif // GPUCA_ALIROOT_LIB
+
+#if !defined(GPUCA_GPUCODE) && !defined(GPUCA_STANDALONE)
+
+template <uint32_t Dim, uint32_t Degree, bool InteractionOnly>
+void NDPiecewisePolynomials<Dim, Degree, InteractionOnly>::loadFromFile(TFile& inpf, const char* name)
+{
+  NDPiecewisePolynomialContainer* gridTmp = nullptr;
+  inpf.GetObject(name, gridTmp);
+  if (gridTmp) {
+    setFromContainer(*gridTmp);
+    delete gridTmp;
+  } else {
+#ifndef GPUCA_ALIROOT_LIB
+    LOGP(info, "couldnt load object {} from input file", name);
+#endif
+  }
+}
+
+template <uint32_t Dim, uint32_t Degree, bool InteractionOnly>
+void NDPiecewisePolynomials<Dim, Degree, InteractionOnly>::loadFromFile(const char* fileName, const char* name)
+{
+  TFile f(fileName, "READ");
+  loadFromFile(f, name);
+}
+
+template <uint32_t Dim, uint32_t Degree, bool InteractionOnly>
+void NDPiecewisePolynomials<Dim, Degree, InteractionOnly>::writeToFile(TFile& outf, const char* name) const
+{
+  const NDPiecewisePolynomialContainer cont = getContainer();
+  outf.WriteObject(&cont, name);
+}
+
+template <uint32_t Dim, uint32_t Degree, bool InteractionOnly>
+void NDPiecewisePolynomials<Dim, Degree, InteractionOnly>::performFits(const std::function<double(const double x[/* Dim */])>& func, const uint32_t nAuxiliaryPoints[/* Dim */])
+{
+  const int32_t nTotalFits = getNPolynomials();
+#ifndef GPUCA_ALIROOT_LIB
+  LOGP(info, "Perform fitting of {}D-Polynomials of degree {} for a total of {} fits.", Dim, Degree, nTotalFits);
+#endif
+
+  MultivariatePolynomialHelper<0, 0, false> pol(Dim, Degree, InteractionOnly);
+  TLinearFitter fitter = pol.getTLinearFitter();
+
+  uint32_t nPoints = 1;
+  for (uint32_t i = 0; i < Dim; ++i) {
+    nPoints *= nAuxiliaryPoints[i];
+  }
+
+  std::vector<double> xCords;
+  std::vector<double> response;
+  xCords.reserve(Dim * nPoints);
+  response.reserve(nPoints);
+
+  uint32_t nPolynomials[Dim]{0};
+  for (uint32_t i = 0; i < Dim; ++i) {
+    nPolynomials[i] = getNPolynomials(i);
+  }
+
+  int32_t pos[Dim + 1]{0};
+  uint32_t counter = 0;
+  const int32_t printDebugForNFits = int32_t(nTotalFits / 20) + 1;
+
+  for (;;) {
+    const bool debug = !(++counter % printDebugForNFits);
+    if (debug) {
+#ifndef GPUCA_ALIROOT_LIB
+      LOGP(info, "Performing fit {} out of {}", counter, nTotalFits);
+#endif
+    }
+
+    checkPos(nPolynomials, pos);
+
+    if (pos[Dim] == 1) {
+      break;
+    }
+
+    xCords.clear();
+    response.clear();
+    fitInnerGrid(func, nAuxiliaryPoints, pos, fitter, xCords, response);
+    ++pos[0];
+  }
+}
+
+template <uint32_t Dim, uint32_t Degree, bool InteractionOnly>
+void NDPiecewisePolynomials<Dim, Degree, InteractionOnly>::performFits(const std::vector<float>& x, const std::vector<float>& y)
+{
+  const int32_t nTotalFits = getNPolynomials();
+#ifndef GPUCA_ALIROOT_LIB
+  LOGP(info, "Perform fitting of {}D-Polynomials of degree {} for a total of {} fits.", Dim, Degree, nTotalFits);
+#endif
+
+  // approximate number of points
+  uint32_t nPoints = 2 * y.size() / nTotalFits;
+
+  // polynomial index -> indices to datapoints
+  std::unordered_map<int32_t, std::vector<size_t>> dataPointsIndices;
+  for (int32_t i = 0; i < nTotalFits; ++i) {
+    dataPointsIndices[i].reserve(nPoints);
+  }
+
+  // check for each data point which polynomial to use
+  for (size_t i = 0; i < y.size(); ++i) {
+    std::array<int32_t, Dim> index;
+    float xVal[Dim];
+    std::copy(x.begin() + i * Dim, x.begin() + i * Dim + Dim, xVal);
+    setIndex<Dim - 1>(xVal, index.data());
+
+    std::array<int32_t, Dim> indexClamped{index};
+    clamp<Dim - 1>(xVal, indexClamped.data());
+
+    // check if data points are in the grid
+    if (index == indexClamped) {
+      // index of the polynomial
+      const uint32_t idx = getDataIndex(index.data()) / MultivariatePolynomialParametersHelper::getNParameters(Degree, Dim, InteractionOnly);
+
+      // store index to data point
+      dataPointsIndices[idx].emplace_back(i);
+    }
+  }
+
+  // for fitting
+  MultivariatePolynomialHelper<0, 0, false> pol(Dim, Degree, InteractionOnly);
+  TLinearFitter fitter = pol.getTLinearFitter();
+
+  uint32_t counter = 0;
+  const int32_t printDebugForNFits = int32_t(nTotalFits / 20) + 1;
+
+  // temp storage for x and y values for fitting
+  std::vector<double> xCords;
+  std::vector<double> response;
+
+  for (int32_t i = 0; i < nTotalFits; ++i) {
+    const bool debug = !(++counter % printDebugForNFits);
+    if (debug) {
+#ifndef GPUCA_ALIROOT_LIB
+      LOGP(info, "Performing fit {} out of {}", counter, nTotalFits);
+#endif
+    }
+
+    // store values for fitting
+    if (dataPointsIndices[i].empty()) {
+#ifndef GPUCA_ALIROOT_LIB
+      LOGP(info, "No data points to fit");
+#endif
+      continue;
+    }
+
+    const auto nP = dataPointsIndices[i].size();
+    xCords.reserve(Dim * nP);
+    response.reserve(nP);
+    xCords.clear();
+    response.clear();
+
+    // add datapoints to fit
+    for (size_t j = 0; j < nP; ++j) {
+      const size_t idxOrig = dataPointsIndices[i][j];
+
+      // insert x values at the end of xCords
+      const int32_t idxXStart = idxOrig * Dim;
+      xCords.insert(xCords.end(), x.begin() + idxXStart, x.begin() + idxXStart + Dim);
+      response.emplace_back(y[idxOrig]);
+    }
+
+    // perform the fit on the points (TODO: make errors configurable)
+    std::vector<double> error;
+    const auto params = MultivariatePolynomialHelper<0, 0, false>::fit(fitter, xCords, response, error, true);
+
+    // store parameters
+    std::copy(params.begin(), params.end(), &mParams[i * MultivariatePolynomialParametersHelper::getNParameters(Degree, Dim, InteractionOnly)]);
+  }
+}
+
+template <uint32_t Dim, uint32_t Degree, bool InteractionOnly>
+void NDPiecewisePolynomials<Dim, Degree, InteractionOnly>::fitInnerGrid(const std::function<double(const double x[/* Dim */])>& func, const uint32_t nAuxiliaryPoints[/* Dim */], const int32_t currentIndex[/* Dim */], TLinearFitter& fitter, std::vector<double>& xCords, std::vector<double>& response)
+{
+  int32_t pos[Dim + 1]{0};
+
+  // add points which will be used for the fit
+  for (;;) {
+    checkPos(nAuxiliaryPoints, pos);
+
+    if (pos[Dim] == 1) {
+      break;
+    }
+
+    for (uint32_t iDim = 0; iDim < Dim; ++iDim) {
+      const double stepWidth = getStepWidth(iDim, nAuxiliaryPoints[iDim]);
+      const double vertexPos = getVertexPosition(currentIndex[iDim], iDim);
+      const double realPosTmp = vertexPos + pos[iDim] * stepWidth;
+      xCords.emplace_back(realPosTmp);
+    }
+
+    // get response for last added points
+    const double responseTmp = func(&xCords[xCords.size() - Dim]);
+    response.emplace_back(responseTmp);
+    ++pos[0];
+  }
+
+  // perform the fit on the points (TODO: make errors configurable)
+  std::vector<double> error;
+  const auto params = MultivariatePolynomialHelper<0, 0, false>::fit(fitter, xCords, response, error, true);
+
+  // store parameters
+  const uint32_t index = getDataIndex(currentIndex);
+  std::copy(params.begin(), params.end(), &mParams[index]);
+}
+
+} // namespace GPUCA_NAMESPACE::gpu
+
+#endif // !defined(GPUCA_GPUCODE) && !defined(GPUCA_STANDALONE)
+
+#endif // ALICEO2_TPC_NDPIECEWISEPOLYNOMIALS_INC
diff --git a/GPU/TPCFastTransformation/test/testMultivarPolynomials.cxx b/GPU/TPCFastTransformation/test/testMultivarPolynomials.cxx
index c3373cdad63f0..a9c39e8528354 100644
--- a/GPU/TPCFastTransformation/test/testMultivarPolynomials.cxx
+++ b/GPU/TPCFastTransformation/test/testMultivarPolynomials.cxx
@@ -18,7 +18,7 @@
 
 #include <boost/test/unit_test.hpp>
 #include "MultivariatePolynomial.h"
-#include "NDPiecewisePolynomials.h"
+#include "NDPiecewisePolynomials.inc"
 #include <vector>
 
 namespace o2::gpu

From 2541e978c1d3575c0d52e482f82a4596f7fbbd0c Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Mon, 11 Nov 2024 14:12:16 +0100
Subject: [PATCH 08/11] GPU: Some protection so we get a compiler warning when
 headers are included in wrong order

---
 GPU/Common/GPUCommonRtypes.h   | 4 ++--
 GPU/Common/GPUROOTSMatrixFwd.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/GPU/Common/GPUCommonRtypes.h b/GPU/Common/GPUCommonRtypes.h
index 5ae2ddbb83b26..7aaf5a36befe2 100644
--- a/GPU/Common/GPUCommonRtypes.h
+++ b/GPU/Common/GPUCommonRtypes.h
@@ -20,14 +20,14 @@
 #if defined(GPUCA_STANDALONE) || (defined(GPUCA_O2_LIB) && !defined(GPUCA_O2_INTERFACE)) || defined(GPUCA_GPUCODE) // clang-format off
   #if !defined(ROOT_Rtypes) && !defined(__CLING__)
     #define GPUCOMMONRTYPES_H_ACTIVE
+    struct MUST_NOT_USE_Rtypes_h {};
+    typedef MUST_NOT_USE_Rtypes_h TClass;
     #define ClassDef(name,id)
     #define ClassDefNV(name, id)
     #define ClassDefOverride(name, id)
     #define ClassImp(name)
     #define templateClassImp(name)
     #ifndef GPUCA_GPUCODE_DEVICE
-//      typedef uint64_t ULong64_t;
-//      typedef uint32_t UInt_t;
       #include <iostream>
     #endif
   #endif
diff --git a/GPU/Common/GPUROOTSMatrixFwd.h b/GPU/Common/GPUROOTSMatrixFwd.h
index a3b5abc55d3bc..44b2254949df2 100644
--- a/GPU/Common/GPUROOTSMatrixFwd.h
+++ b/GPU/Common/GPUROOTSMatrixFwd.h
@@ -52,7 +52,7 @@ template <class T, uint32_t D1, uint32_t D2>
 class MatRepStdGPU;
 } // namespace detail
 
-#if !defined(GPUCA_STANDALONE) && !defined(GPUCA_GPUCODE)
+#if !defined(GPUCA_STANDALONE) && !defined(GPUCA_GPUCODE) && !defined(GPUCOMMONRTYPES_H_ACTIVE)
 template <typename T, uint32_t N>
 using SVector = ROOT::Math::SVector<T, N>;
 template <class T, uint32_t D1, uint32_t D2, class R>

From 3b160b5c28219901e453bddea4e725b56eb14e49 Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Wed, 13 Nov 2024 19:37:19 +0100
Subject: [PATCH 09/11] GPU: Workaround for OpenCL

---
 Common/MathUtils/include/MathUtils/SMatrixGPU.h | 2 ++
 GPU/GPUTracking/dEdx/GPUdEdx.h                  | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/Common/MathUtils/include/MathUtils/SMatrixGPU.h b/Common/MathUtils/include/MathUtils/SMatrixGPU.h
index 2bfdcf54752b2..5ecdcd75a9906 100644
--- a/Common/MathUtils/include/MathUtils/SMatrixGPU.h
+++ b/Common/MathUtils/include/MathUtils/SMatrixGPU.h
@@ -516,12 +516,14 @@ class SMatrixGPU
   R mRep;
 };
 
+#ifndef __OPENCL__ // TODO: current C++ for OpenCL 2021 is at C++17, so no concepts. But we don't need this trick for OpenCL anyway, so we can just hide it.
 template <class T, unsigned int D1, unsigned int D2, class R, typename Y, typename X = Y>
   requires(sizeof(typename X::traits_type::pos_type) != 0) // do not provide a template to fair::Logger, etc... (pos_type is a member type of all std::ostream classes)
 GPUd() X& operator<<(Y& y, const SMatrixGPU<T, D1, D2, R>&)
 {
   return y;
 }
+#endif
 
 template <class T, unsigned int D1, unsigned int D2, class R>
 GPUdi() SMatrixGPU<T, D1, D2, R>::SMatrixGPU(SMatrixIdentity)
diff --git a/GPU/GPUTracking/dEdx/GPUdEdx.h b/GPU/GPUTracking/dEdx/GPUdEdx.h
index 9a1784e2be49a..516d1fced0a20 100644
--- a/GPU/GPUTracking/dEdx/GPUdEdx.h
+++ b/GPU/GPUTracking/dEdx/GPUdEdx.h
@@ -212,7 +212,7 @@ GPUdi() void GPUdEdx::fillSubThreshold(int32_t padRow, const GPUParam& GPUrestri
   mNSubThresh++;
 }
 
-#endif // !GPUCA_HAVE_O2HEADERS || __OPENCL1__
+#endif // !GPUCA_HAVE_O2HEADERS || GPUCA_OPENCL1
 } // namespace gpu
 } // namespace GPUCA_NAMESPACE
 

From 4dd8d1dd307374308057516f6b7fb0d14f58b3e3 Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Wed, 13 Nov 2024 20:05:36 +0100
Subject: [PATCH 10/11] GPU: Simplify __OPENCL__ macros using __OPENCL1__

---
 GPU/Common/GPUCommonConstants.h                  |  2 +-
 GPU/Common/GPUCommonDef.h                        |  4 ++--
 GPU/Common/GPUCommonMath.h                       | 10 +++++-----
 GPU/Common/GPUCommonTypeTraits.h                 |  2 +-
 GPU/GPUTracking/Base/GPUParam.inc                |  4 ++--
 .../Base/opencl-common/GPUReconstructionOCL.cl   |  5 +++++
 GPU/GPUTracking/Base/opencl2/CMakeLists.txt      |  2 --
 GPU/GPUTracking/DataTypes/GPUDataTypes.h         |  2 +-
 GPU/GPUTracking/DataTypes/GPUO2DataTypes.h       |  4 ++--
 GPU/GPUTracking/DataTypes/GPUSettings.h          |  2 +-
 .../DataTypes/GPUTPCGMPolynomialField.h          |  4 ++--
 GPU/GPUTracking/DataTypes/GPUTPCGeometry.h       | 10 +++++-----
 .../Definitions/GPUDefConstantsAndSettings.h     |  2 +-
 .../SliceTracker/GPUTPCGlobalTracking.cxx        |  4 ++--
 .../SliceTracker/GPUTPCGlobalTracking.h          |  2 +-
 GPU/GPUTracking/SliceTracker/GPUTPCSliceOutput.h |  2 +-
 GPU/GPUTracking/SliceTracker/GPUTPCTracker.cxx   |  2 +-
 GPU/GPUTracking/SliceTracker/GPUTPCTracker.h     |  2 +-
 .../SliceTracker/GPUTPCTrackletConstructor.cxx   | 16 ++++++++--------
 .../SliceTracker/GPUTPCTrackletConstructor.h     |  2 +-
 20 files changed, 43 insertions(+), 40 deletions(-)

diff --git a/GPU/Common/GPUCommonConstants.h b/GPU/Common/GPUCommonConstants.h
index 5744c078dc197..883f64b7bdd12 100644
--- a/GPU/Common/GPUCommonConstants.h
+++ b/GPU/Common/GPUCommonConstants.h
@@ -17,7 +17,7 @@
 
 #include "GPUCommonDef.h"
 
-#if !defined(__OPENCL__) || defined(__OPENCLCPP__)
+#if !defined(__OPENCL1__)
 namespace GPUCA_NAMESPACE::gpu::gpu_common_constants
 {
 static CONSTEXPR const float kCLight = 0.000299792458f;
diff --git a/GPU/Common/GPUCommonDef.h b/GPU/Common/GPUCommonDef.h
index a8bf772d7aacc..ac3d7279fbaf4 100644
--- a/GPU/Common/GPUCommonDef.h
+++ b/GPU/Common/GPUCommonDef.h
@@ -30,7 +30,7 @@
 //Some GPU configuration settings, must be included first
 #include "GPUCommonDefSettings.h"
 
-#if (!defined(__OPENCL__) || defined(__OPENCLCPP__)) && (!(defined(__CINT__) || defined(__ROOTCINT__)) || defined(__CLING__)) && defined(__cplusplus) && __cplusplus >= 201103L
+#if !defined(__OPENCL1__) && (!(defined(__CINT__) || defined(__ROOTCINT__)) || defined(__CLING__)) && defined(__cplusplus) && __cplusplus >= 201103L
   #define GPUCA_NOCOMPAT // C++11 + No old ROOT5 + No old OpenCL
   #ifndef __OPENCL__
     #define GPUCA_NOCOMPAT_ALLOPENCL // + No OpenCL at all
@@ -82,7 +82,7 @@
   #define GPUCA_NAMESPACE o2
 #endif
 
-#if (defined(__CUDACC__) && defined(GPUCA_CUDA_NO_CONSTANT_MEMORY)) || (defined(__HIPCC__) && defined(GPUCA_HIP_NO_CONSTANT_MEMORY)) || (defined(__OPENCL__) && !defined(__OPENCLCPP__) && defined(GPUCA_OPENCL_NO_CONSTANT_MEMORY)) || (defined(__OPENCLCPP__) && defined(GPUCA_OPENCLCPP_NO_CONSTANT_MEMORY))
+#if (defined(__CUDACC__) && defined(GPUCA_CUDA_NO_CONSTANT_MEMORY)) || (defined(__HIPCC__) && defined(GPUCA_HIP_NO_CONSTANT_MEMORY)) || (defined(__OPENCL1__) && defined(GPUCA_OPENCL_NO_CONSTANT_MEMORY)) || (defined(__OPENCLCPP__) && defined(GPUCA_OPENCLCPP_NO_CONSTANT_MEMORY))
   #define GPUCA_NO_CONSTANT_MEMORY
 #elif defined(__CUDACC__) || defined(__HIPCC__)
   #define GPUCA_HAS_GLOBAL_SYMBOL_CONSTANT_MEM
diff --git a/GPU/Common/GPUCommonMath.h b/GPU/Common/GPUCommonMath.h
index 8b129ff29a987..bc842d00c6568 100644
--- a/GPU/Common/GPUCommonMath.h
+++ b/GPU/Common/GPUCommonMath.h
@@ -31,7 +31,7 @@
 #include <cstdint>
 #endif
 
-#if !defined(__OPENCL__) || defined(__OPENCLCPP__)
+#if !defined(__OPENCL1__)
 namespace GPUCA_NAMESPACE
 {
 namespace gpu
@@ -220,7 +220,7 @@ GPUdi() uint32_t GPUCommonMath::Float2UIntReint(const float& x)
 {
 #if defined(GPUCA_GPUCODE_DEVICE) && (defined(__CUDACC__) || defined(__HIPCC__))
   return __float_as_uint(x);
-#elif defined(GPUCA_GPUCODE_DEVICE) && (defined(__OPENCL__) || defined(__OPENCLCPP__))
+#elif defined(GPUCA_GPUCODE_DEVICE) && defined(__OPENCL__)
   return as_uint(x);
 #else
   return reinterpret_cast<const uint32_t&>(x);
@@ -289,7 +289,7 @@ GPUhdi() void GPUCommonMath::SinCosd(double x, double& s, double& c)
 
 GPUdi() uint32_t GPUCommonMath::Clz(uint32_t x)
 {
-#if (defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) || defined(__HIPCC__)) && (!defined(__OPENCL__) || defined(__OPENCLCPP__))
+#if (defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) || defined(__HIPCC__)) && !defined(__OPENCL1__)
   return x == 0 ? 32 : CHOICE(__builtin_clz(x), __clz(x), __builtin_clz(x)); // use builtin if available
 #else
   for (int32_t i = 31; i >= 0; i--) {
@@ -303,7 +303,7 @@ GPUdi() uint32_t GPUCommonMath::Clz(uint32_t x)
 
 GPUdi() uint32_t GPUCommonMath::Popcount(uint32_t x)
 {
-#if (defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) || defined(__HIPCC__)) && (!defined(__OPENCL__) /*|| defined(__OPENCLCPP__)*/) // TODO: remove OPENCLCPP workaround when reported SPIR-V bug is fixed
+#if (defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__) || defined(__HIPCC__)) && (!defined(__OPENCL__) /* !defined(__OPENCL1__)*/) // TODO: exclude only OPENCLC (not CPP) when reported SPIR-V bug is fixed
   // use builtin if available
   return CHOICE(__builtin_popcount(x), __popc(x), __builtin_popcount(x));
 #else
@@ -563,7 +563,7 @@ GPUdii() void GPUCommonMath::AtomicMinInternal(GPUglobalref() GPUgeneric() GPUAt
 
 #undef CHOICE
 
-#if !defined(__OPENCL__) || defined(__OPENCLCPP__)
+#if !defined(__OPENCL1__)
 }
 }
 #endif
diff --git a/GPU/Common/GPUCommonTypeTraits.h b/GPU/Common/GPUCommonTypeTraits.h
index 2ae524f8d1c76..88fcc9b838a65 100644
--- a/GPU/Common/GPUCommonTypeTraits.h
+++ b/GPU/Common/GPUCommonTypeTraits.h
@@ -21,7 +21,7 @@
 #ifndef GPUCA_GPUCODE_COMPILEKERNELS
 #include <type_traits>
 #endif
-#elif !defined(__OPENCL__) || defined(__OPENCLCPP__)
+#elif !defined(__OPENCL1__)
 // We just reimplement some type traits in std for the GPU
 namespace std
 {
diff --git a/GPU/GPUTracking/Base/GPUParam.inc b/GPU/GPUTracking/Base/GPUParam.inc
index c7c526471d505..41ed3c8f203cb 100644
--- a/GPU/GPUTracking/Base/GPUParam.inc
+++ b/GPU/GPUTracking/Base/GPUParam.inc
@@ -17,7 +17,7 @@
 
 #include "GPUParam.h"
 #include "GPUTPCGMMergedTrackHit.h"
-#if !defined(__OPENCL__) || defined(__OPENCLCPP__)
+#if !defined(__OPENCL1__)
 #include "GPUTPCClusterOccupancyMap.h"
 #endif
 
@@ -228,7 +228,7 @@ GPUdi() void MEM_LG(GPUParam)::UpdateClusterError2ByState(int16_t clusterState,
 MEM_CLASS_PRE()
 GPUdi() float MEM_LG(GPUParam)::GetUnscaledMult(float time) const
 {
-#if !defined(__OPENCL__) || defined(__OPENCLCPP__)
+#if !defined(__OPENCL1__)
   if (!occupancyMap) {
     return 0.f;
   }
diff --git a/GPU/GPUTracking/Base/opencl-common/GPUReconstructionOCL.cl b/GPU/GPUTracking/Base/opencl-common/GPUReconstructionOCL.cl
index 42a640579e9e3..672c4b63eb476 100644
--- a/GPU/GPUTracking/Base/opencl-common/GPUReconstructionOCL.cl
+++ b/GPU/GPUTracking/Base/opencl-common/GPUReconstructionOCL.cl
@@ -14,6 +14,11 @@
 
 // clang-format off
 #define __OPENCL__
+#if defined(__cplusplus) && __cplusplus >= 201703L
+  #define __OPENCLCPP__
+#else
+  #define __OPENCL1__
+#endif
 #define GPUCA_GPUTYPE_OPENCL
 
 #ifdef __OPENCLCPP__
diff --git a/GPU/GPUTracking/Base/opencl2/CMakeLists.txt b/GPU/GPUTracking/Base/opencl2/CMakeLists.txt
index ec2a4446142c8..0a4168b130766 100644
--- a/GPU/GPUTracking/Base/opencl2/CMakeLists.txt
+++ b/GPU/GPUTracking/Base/opencl2/CMakeLists.txt
@@ -32,8 +32,6 @@ set(OCL_DEFINECL "-D$<JOIN:$<TARGET_PROPERTY:O2::GPUTracking,COMPILE_DEFINITIONS
             -I${CMAKE_SOURCE_DIR}/Detectors/TRD/base/src
             -I${CMAKE_SOURCE_DIR}/Detectors/Base/src
             -I${CMAKE_SOURCE_DIR}/DataFormats/Reconstruction/src
-            -I${CMAKE_SOURCE_DIR}/Detectors/ITSMFT/ITS/tracking/cuda/include
-            -D__OPENCLCPP__
 )
 
 set(SRCS GPUReconstructionOCL2.cxx)
diff --git a/GPU/GPUTracking/DataTypes/GPUDataTypes.h b/GPU/GPUTracking/DataTypes/GPUDataTypes.h
index c746dc1af5a0b..d3b88f0239c7b 100644
--- a/GPU/GPUTracking/DataTypes/GPUDataTypes.h
+++ b/GPU/GPUTracking/DataTypes/GPUDataTypes.h
@@ -125,7 +125,7 @@ namespace gpu
 #define GPUCA_RECO_STEP GPUDataTypes
 #endif
 
-#if defined(__OPENCL__) && !defined(__OPENCLCPP__)
+#if defined(__OPENCL1__)
 MEM_CLASS_PRE() // Macro with some template magic for OpenCL 1.2
 #endif
 class GPUTPCTrack;
diff --git a/GPU/GPUTracking/DataTypes/GPUO2DataTypes.h b/GPU/GPUTracking/DataTypes/GPUO2DataTypes.h
index 3ffdd42b9cf81..1015b31fe6556 100644
--- a/GPU/GPUTracking/DataTypes/GPUO2DataTypes.h
+++ b/GPU/GPUTracking/DataTypes/GPUO2DataTypes.h
@@ -17,7 +17,7 @@
 
 // Pull in several O2 headers with basic data types, or load a header with empty fake classes if O2 headers not available
 
-#if defined(GPUCA_HAVE_O2HEADERS) && (!defined(__OPENCL__) || defined(__OPENCLCPP__))
+#if defined(GPUCA_HAVE_O2HEADERS) && !defined(__OPENCL1__)
 #include "DataFormatsTPC/ClusterNative.h"
 #include "DataFormatsTPC/Digit.h"
 #include "DetectorsBase/MatLayerCylSet.h"
@@ -27,7 +27,7 @@
 #include "GPUO2FakeClasses.h"
 #endif
 
-#if !defined(__OPENCL__) || defined(__OPENCLCPP__)
+#if !defined(__OPENCL1__)
 #include "GPUdEdxInfo.h"
 #endif
 
diff --git a/GPU/GPUTracking/DataTypes/GPUSettings.h b/GPU/GPUTracking/DataTypes/GPUSettings.h
index 738457ec99d7b..69bfb15e3f4b0 100644
--- a/GPU/GPUTracking/DataTypes/GPUSettings.h
+++ b/GPU/GPUTracking/DataTypes/GPUSettings.h
@@ -45,7 +45,7 @@ class GPUSettings
                               RejectionStrategyA = 1,
                               RejectionStrategyB = 2 };
 
-#if !defined(__OPENCL__) || defined(__OPENCLCPP__)
+#if !defined(__OPENCL1__)
   static CONSTEXPR const uint32_t TPC_MAX_TF_TIME_BIN = ((256 * 3564 + 2 * 8 - 2) / 8);
 #endif
 };
diff --git a/GPU/GPUTracking/DataTypes/GPUTPCGMPolynomialField.h b/GPU/GPUTracking/DataTypes/GPUTPCGMPolynomialField.h
index a7e38bc31fc14..09193e76b9382 100644
--- a/GPU/GPUTracking/DataTypes/GPUTPCGMPolynomialField.h
+++ b/GPU/GPUTracking/DataTypes/GPUTPCGMPolynomialField.h
@@ -29,7 +29,7 @@ namespace gpu
 class GPUTPCGMPolynomialField
 {
  public:
-#if !defined(__OPENCL__) || defined(__OPENCLCPP__)
+#if !defined(__OPENCL1__)
   GPUTPCGMPolynomialField() : mNominalBz(0.f)
   {
     Reset();
@@ -94,7 +94,7 @@ class GPUTPCGMPolynomialField
   float mItsBz[NITSM];
 };
 
-#if !defined(__OPENCL__) || defined(__OPENCLCPP__)
+#if !defined(__OPENCL1__)
 
 inline void GPUTPCGMPolynomialField::Reset()
 {
diff --git a/GPU/GPUTracking/DataTypes/GPUTPCGeometry.h b/GPU/GPUTracking/DataTypes/GPUTPCGeometry.h
index 40d711a4a672b..515905abe48b5 100644
--- a/GPU/GPUTracking/DataTypes/GPUTPCGeometry.h
+++ b/GPU/GPUTracking/DataTypes/GPUTPCGeometry.h
@@ -34,7 +34,7 @@ namespace gpu
 // Should be unified, but cannot take the contants from the official headers for now, since we want it to be constexpr
 class GPUTPCGeometry // TODO: Make values constexpr
 {
-#if defined(__OPENCL__) && !defined(__OPENCLCPP__)
+#if defined(__OPENCL1__)
   GPUTPCGeometry(); // Fake constructor declaration for OpenCL due to static members, does not exist!
 #endif
 #ifdef GPUCA_TPC_GEOMETRY_O2
@@ -63,7 +63,7 @@ class GPUTPCGeometry // TODO: Make values constexpr
   const float mPadHeight[10] GPUCA_CPP11_INIT(= {.75f, .75f, .75f, .75f, 1.f, 1.f, 1.2f, 1.2f, 1.5f, 1.5f});
   const float mPadWidth[10] GPUCA_CPP11_INIT(= {.416f, .420f, .420f, .436f, .6f, .6f, .608f, .588f, .604f, .607f});
 
-#if !defined(__OPENCL__) || defined(__OPENCLCPP__)
+#if !defined(__OPENCL1__)
   static CONSTEXPR float FACTOR_T2Z GPUCA_CPP11_INIT(= 250.f / 512.f); // Used in compression, must remain constant at 250cm, 512 time bins!
 #endif
 
@@ -95,7 +95,7 @@ class GPUTPCGeometry // TODO: Make values constexpr
   const float mPadHeight[3] GPUCA_CPP11_INIT(= {.75f, 1.f, 1.5f});
   const float mPadWidth[3] GPUCA_CPP11_INIT(= {.4f, .6f, .6f});
 
-#if !defined(__OPENCL__) || defined(__OPENCLCPP__)
+#if !defined(__OPENCL1__)
   static CONSTEXPR float FACTOR_T2Z GPUCA_CPP11_INIT(= 250.f / 1024.f); // Used in compression, must remain constant at 250cm, 1024 time bins!
 #endif
 
@@ -109,7 +109,7 @@ class GPUTPCGeometry // TODO: Make values constexpr
   GPUd() int32_t EndOROC2() const { return GPUCA_ROW_COUNT; }
 #endif
  private:
-#if !defined(__OPENCL__) || defined(__OPENCLCPP__)
+#if !defined(__OPENCL1__)
   static CONSTEXPR float FACTOR_Z2T GPUCA_CPP11_INIT(= 1.f / FACTOR_T2Z);
 #endif
  public:
@@ -120,7 +120,7 @@ class GPUTPCGeometry // TODO: Make values constexpr
   GPUd() float PadWidth(int32_t row) const { return (mPadWidth[GetRegion(row)]); }
   GPUd() uint8_t NPads(int32_t row) const { return mNPads[row]; }
 
-#if !defined(__OPENCL__) || defined(__OPENCLCPP__)
+#if !defined(__OPENCL1__)
   GPUd() float LinearPad2Y(int32_t slice, int32_t row, float pad) const
   {
     const float u = (pad - 0.5f * mNPads[row]) * PadWidth(row);
diff --git a/GPU/GPUTracking/Definitions/GPUDefConstantsAndSettings.h b/GPU/GPUTracking/Definitions/GPUDefConstantsAndSettings.h
index ddb3d5e73bb53..7693ee8553b77 100644
--- a/GPU/GPUTracking/Definitions/GPUDefConstantsAndSettings.h
+++ b/GPU/GPUTracking/Definitions/GPUDefConstantsAndSettings.h
@@ -51,7 +51,7 @@
 #if defined(GPUCA_NSLICES) || defined(GPUCA_ROW_COUNT)
   #error GPUCA_NSLICES or GPUCA_ROW_COUNT already defined, do not include GPUTPCGeometry.h before!
 #endif
-#if defined(GPUCA_HAVE_O2HEADERS) && defined(GPUCA_TPC_GEOMETRY_O2) && (!defined(__OPENCL__) || defined(__OPENCLCPP__)) && !(defined(ROOT_VERSION_CODE) && ROOT_VERSION_CODE < 393216)
+#if defined(GPUCA_HAVE_O2HEADERS) && defined(GPUCA_TPC_GEOMETRY_O2) && !defined(__OPENCL1__) && !(defined(ROOT_VERSION_CODE) && ROOT_VERSION_CODE < 393216)
   //Use definitions from the O2 headers if available for nicer code and type safety
   #include "DataFormatsTPC/Constants.h"
   #define GPUCA_NSLICES o2::tpc::constants::MAXSECTOR
diff --git a/GPU/GPUTracking/SliceTracker/GPUTPCGlobalTracking.cxx b/GPU/GPUTracking/SliceTracker/GPUTPCGlobalTracking.cxx
index e8d7a405261f2..c1a3c685947d6 100644
--- a/GPU/GPUTracking/SliceTracker/GPUTPCGlobalTracking.cxx
+++ b/GPU/GPUTracking/SliceTracker/GPUTPCGlobalTracking.cxx
@@ -22,7 +22,7 @@
 
 using namespace GPUCA_NAMESPACE::gpu;
 
-#if !defined(__OPENCL__) || defined(__OPENCLCPP__)
+#if !defined(__OPENCL1__)
 
 GPUd() int32_t GPUTPCGlobalTracking::PerformGlobalTrackingRun(GPUTPCTracker& tracker, GPUsharedref() MEM_LOCAL(GPUSharedMemory) & smem, const GPUTPCTracker& GPUrestrict() sliceSource, int32_t iTrack, int32_t rowIndex, float angle, int32_t direction)
 {
@@ -200,7 +200,7 @@ GPUd() void GPUTPCGlobalTracking::GlobalTrackingSliceLeftRight(uint32_t iSlice,
     right += GPUDataTypes::NSLICES / 2;
   }
 }
-#endif // !__OPENCL__ || __OPENCLCPP__
+#endif // !__OPENCL1__
 
 template <>
 GPUdii() void GPUTPCGlobalTrackingCopyNumbers::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() MEM_LOCAL(GPUSharedMemory) & smem, processorType& GPUrestrict() tracker, int32_t n)
diff --git a/GPU/GPUTracking/SliceTracker/GPUTPCGlobalTracking.h b/GPU/GPUTracking/SliceTracker/GPUTPCGlobalTracking.h
index 075957ff4c8c8..9d732a582b1c4 100644
--- a/GPU/GPUTracking/SliceTracker/GPUTPCGlobalTracking.h
+++ b/GPU/GPUTracking/SliceTracker/GPUTPCGlobalTracking.h
@@ -25,7 +25,7 @@ namespace gpu
 MEM_CLASS_PRE()
 class GPUTPCTracker;
 
-#if !defined(__OPENCL__) || defined(__OPENCLCPP__)
+#if !defined(__OPENCL1__)
 class GPUTPCGlobalTracking : public GPUKernelTemplate
 {
  public:
diff --git a/GPU/GPUTracking/SliceTracker/GPUTPCSliceOutput.h b/GPU/GPUTracking/SliceTracker/GPUTPCSliceOutput.h
index 8892225f119cd..3ab5b0a331f31 100644
--- a/GPU/GPUTracking/SliceTracker/GPUTPCSliceOutput.h
+++ b/GPU/GPUTracking/SliceTracker/GPUTPCSliceOutput.h
@@ -44,7 +44,7 @@ class GPUTPCSliceOutput
   }
   GPUhd() uint32_t NLocalTracks() const { return mNLocalTracks; }
   GPUhd() uint32_t NTrackClusters() const { return mNTrackClusters; }
-#if !defined(__OPENCL__) || defined(__OPENCLCPP__)
+#if !defined(__OPENCL1__)
   GPUhd() const GPUTPCTrack* GetFirstTrack() const
   {
     return (const GPUTPCTrack*)((const char*)this + sizeof(*this));
diff --git a/GPU/GPUTracking/SliceTracker/GPUTPCTracker.cxx b/GPU/GPUTracking/SliceTracker/GPUTPCTracker.cxx
index 552d61a88fc39..7428a4ccbd0ed 100644
--- a/GPU/GPUTracking/SliceTracker/GPUTPCTracker.cxx
+++ b/GPU/GPUTracking/SliceTracker/GPUTPCTracker.cxx
@@ -22,7 +22,7 @@
 #include "GPUO2DataTypes.h"
 #include "GPUTPCTrackParam.h"
 #include "GPUParam.inc"
-#if !defined(__OPENCL__) || defined(__OPENCLCPP__)
+#if !defined(__OPENCL1__)
 #include "GPUTPCConvertImpl.h"
 #endif
 
diff --git a/GPU/GPUTracking/SliceTracker/GPUTPCTracker.h b/GPU/GPUTracking/SliceTracker/GPUTPCTracker.h
index f19b4f0a6c0a7..da8d3d1fb28d4 100644
--- a/GPU/GPUTracking/SliceTracker/GPUTPCTracker.h
+++ b/GPU/GPUTracking/SliceTracker/GPUTPCTracker.h
@@ -94,7 +94,7 @@ class GPUTPCTracker : public GPUProcessor
     StructGPUParameters gpuParameters;  // GPU parameters
   };
 
-#if !defined(__OPENCL__) || defined(__OPENCLCPP__)
+#if !defined(__OPENCL1__)
   GPUhdi() GPUglobalref() const GPUTPCClusterData* ClusterData() const
   {
     return mData.ClusterData();
diff --git a/GPU/GPUTracking/SliceTracker/GPUTPCTrackletConstructor.cxx b/GPU/GPUTracking/SliceTracker/GPUTPCTrackletConstructor.cxx
index 9d6ed630dee8c..ba17b88436845 100644
--- a/GPU/GPUTracking/SliceTracker/GPUTPCTrackletConstructor.cxx
+++ b/GPU/GPUTracking/SliceTracker/GPUTPCTrackletConstructor.cxx
@@ -21,7 +21,7 @@
 #include "GPUTPCTracker.h"
 #include "GPUTPCTracklet.h"
 #include "GPUTPCTrackletConstructor.h"
-#if !defined(__OPENCL__) || defined(__OPENCLCPP__)
+#if !defined(__OPENCL1__)
 #include "GPUTPCGlobalTracking.h"
 #include "CorrectionMapsHelper.h"
 #ifdef GPUCA_HAVE_O2HEADERS
@@ -140,14 +140,14 @@ GPUdic(2, 1) void GPUTPCTrackletConstructor::UpdateTracklet(int32_t /*nBlocks*/,
       float z = z0 + hh.y * stepZ;
       if (iRow != r.mStartRow || !tracker.Param().par.continuousTracking) {
         tParam.ConstrainZ(z, tracker.ISlice(), z0, r.mLastZ);
-#if !defined(__OPENCL__) || defined(__OPENCLCPP__)
+#if !defined(__OPENCL1__)
         tracker.GetConstantMem()->calibObjects.fastTransformHelper->TransformXYZ(tracker.ISlice(), iRow, x, y, z);
 #endif
       }
       if (iRow == r.mStartRow) {
         if (tracker.Param().par.continuousTracking) {
           float refZ = ((z > 0) ? tracker.Param().rec.tpc.defaultZOffsetOverR : -tracker.Param().rec.tpc.defaultZOffsetOverR) * x;
-#if !defined(__OPENCL__) || defined(__OPENCLCPP__)
+#if !defined(__OPENCL1__)
           float zTmp = refZ;
           tracker.GetConstantMem()->calibObjects.fastTransformHelper->TransformXYZ(tracker.ISlice(), iRow, x, y, zTmp);
           z += zTmp - refZ; // Add zCorrection (=zTmp - refZ) to z, such that zOffset is set such, that transformed (z - zOffset) becomes refZ
@@ -266,7 +266,7 @@ GPUdic(2, 1) void GPUTPCTrackletConstructor::UpdateTracklet(int32_t /*nBlocks*/,
       r.mNMissed++;
 
       float x = row.X();
-#if !defined(__OPENCL__) || defined(__OPENCLCPP__)
+#if !defined(__OPENCL1__)
       {
         float tmpY, tmpZ;
         if (!tParam.GetPropagatedYZ(tracker.Param().bzCLight, x, tmpY, tmpZ)) {
@@ -299,7 +299,7 @@ GPUdic(2, 1) void GPUTPCTrackletConstructor::UpdateTracklet(int32_t /*nBlocks*/,
         GPUglobalref() const cahit2* hits = tracker.HitData(row);
         GPUglobalref() const calink* firsthit = tracker.FirstHitInBin(row);
 #endif //! GPUCA_TEXTURE_FETCH_CONSTRUCTOR
-#if !defined(__OPENCL__) || defined(__OPENCLCPP__)
+#if !defined(__OPENCL1__)
         tracker.GetConstantMem()->calibObjects.fastTransformHelper->InverseTransformYZtoNominalYZ(tracker.ISlice(), iRow, yUncorrected, zUncorrected, yUncorrected, zUncorrected);
 #endif
 
@@ -391,7 +391,7 @@ GPUdic(2, 1) void GPUTPCTrackletConstructor::UpdateTracklet(int32_t /*nBlocks*/,
         }
       } while (false);
       (void)found;
-#if defined(GPUCA_HAVE_O2HEADERS) && (!defined(__OPENCL__) || defined(__OPENCLCPP__))
+#if defined(GPUCA_HAVE_O2HEADERS) && !defined(__OPENCL1__)
       if (!found && tracker.GetConstantMem()->calibObjects.dEdxCalibContainer) {
         uint32_t pad = CAMath::Float2UIntRn(tracker.Param().tpcGeometry.LinearY2Pad(tracker.ISlice(), iRow, yUncorrected));
         if (pad < tracker.Param().tpcGeometry.NPads(iRow) && tracker.GetConstantMem()->calibObjects.dEdxCalibContainer->isDead(tracker.ISlice(), iRow, pad)) {
@@ -461,7 +461,7 @@ GPUdic(2, 1) void GPUTPCTrackletConstructor::DoTracklet(GPUconstantref() MEM_GLO
       iRow = r.mEndRow;
       iRowEnd = -1;
       float x = tracker.Row(r.mEndRow).X();
-#if !defined(__OPENCL__) || defined(__OPENCLCPP__)
+#if !defined(__OPENCL1__)
       {
         float tmpY, tmpZ;
         if (tParam.GetPropagatedYZ(tracker.Param().bzCLight, x, tmpY, tmpZ)) {
@@ -584,7 +584,7 @@ GPUd() int32_t GPUTPCTrackletConstructor::FetchTracklet(GPUconstantref() MEM_GLO
 
 #endif // GPUCA_GPUCODE
 
-#if !defined(__OPENCL__) || defined(__OPENCLCPP__)
+#if !defined(__OPENCL1__)
 template <>
 GPUd() int32_t GPUTPCTrackletConstructor::GPUTPCTrackletConstructorGlobalTracking<GPUTPCGlobalTracking::GPUSharedMemory>(GPUconstantref() MEM_GLOBAL(GPUTPCTracker) & GPUrestrict() tracker, GPUsharedref() GPUTPCGlobalTracking::GPUSharedMemory& sMem, MEM_LG(GPUTPCTrackParam) & GPUrestrict() tParam, int32_t row, int32_t increment, int32_t iTracklet, calink* rowHits)
 {
diff --git a/GPU/GPUTracking/SliceTracker/GPUTPCTrackletConstructor.h b/GPU/GPUTracking/SliceTracker/GPUTPCTrackletConstructor.h
index 06dd941ca5cf7..effee4fa757b8 100644
--- a/GPU/GPUTracking/SliceTracker/GPUTPCTrackletConstructor.h
+++ b/GPU/GPUTracking/SliceTracker/GPUTPCTrackletConstructor.h
@@ -100,7 +100,7 @@ class GPUTPCTrackletConstructor
   GPUd() static int32_t FetchTracklet(GPUconstantref() MEM_GLOBAL(GPUTPCTracker) & tracker, GPUsharedref() MEM_LOCAL(GPUSharedMemory) & sMem);
 #endif // GPUCA_GPUCODE
 
-#if !defined(__OPENCL__) || defined(__OPENCLCPP__)
+#if !defined(__OPENCL1__)
   template <class T>
   GPUd() static int32_t GPUTPCTrackletConstructorGlobalTracking(GPUconstantref() MEM_GLOBAL(GPUTPCTracker) & tracker, GPUsharedref() T& sMem, GPUTPCTrackParam& tParam, int32_t startrow, int32_t increment, int32_t iTracklet, calink* rowHits);
 #endif

From 6c81c31a8ec56a064414f646580a547d9158a6be Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Wed, 13 Nov 2024 09:35:07 +0100
Subject: [PATCH 11/11] GPU Display: make connecting A and C side segments of a
 track optional

---
 GPU/GPUTracking/Definitions/GPUSettingsList.h |  3 +-
 GPU/GPUTracking/display/GPUDisplay.cxx        |  1 +
 GPU/GPUTracking/display/GPUDisplay.h          |  1 +
 .../display/frontend/GPUDisplayKeys.cxx       |  7 ++--
 .../display/helpers/GPUDisplayHelpers.cxx     |  7 ++++
 .../display/render/GPUDisplayDraw.cxx         | 33 +++++++++++--------
 6 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h
index 106a222862f49..c4e0dadb87659 100644
--- a/GPU/GPUTracking/Definitions/GPUSettingsList.h
+++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -346,7 +346,8 @@ AddOption(drawTracksAndFilter, bool, false, "", 0, "Use AND filter instead of OR
 AddOption(propagateLoopers, bool, false, "", 0, "Enabale propagation of loopers")
 AddOption(clustersOnly, bool, false, "", 0, "Visualize clusters only")
 AddOption(clustersOnNominalRow, bool, false, "", 0, "Show clusters at nominal x of pad row for early-transformed data")
-AddOption(separateGlobalTracks, bool, false, "", 0, "Separate global tracks")
+AddOption(separateGlobalTracks, bool, false, "", 0, "Draw track segments propagated to adjacent sectors separately")
+AddOption(splitCETracks, int8_t, -1, "", 0, "Split CE tracks when they cross the central electrode (-1 = for triggered data)")
 AddOption(markClusters, int32_t, 0, "", 0, "Mark clusters")
 AddOption(markFakeClusters, int32_t, 0, "", 0, "Mark fake clusters")
 AddOption(markAdjacentClusters, int32_t, 0, "", 0, "Mark adjacent clusters")
diff --git a/GPU/GPUTracking/display/GPUDisplay.cxx b/GPU/GPUTracking/display/GPUDisplay.cxx
index 74d89fbf6de81..56e59d664491a 100644
--- a/GPU/GPUTracking/display/GPUDisplay.cxx
+++ b/GPU/GPUTracking/display/GPUDisplay.cxx
@@ -611,6 +611,7 @@ void GPUDisplay::DrawGLScene_internal(float animateTime, bool renderToMixBuffer)
   bool showTimer = false;
   bool doScreenshot = (mRequestScreenshot || mAnimateScreenshot) && animateTime < 0;
 
+  updateOptions();
   if (animateTime < 0 && (mUpdateEventData || mResetScene || mUpdateVertexLists) && mIOPtrs) {
     disableUnsupportedOptions();
   }
diff --git a/GPU/GPUTracking/display/GPUDisplay.h b/GPU/GPUTracking/display/GPUDisplay.h
index 38dacae60c51a..ab6fe540d01bf 100644
--- a/GPU/GPUTracking/display/GPUDisplay.h
+++ b/GPU/GPUTracking/display/GPUDisplay.h
@@ -150,6 +150,7 @@ class GPUDisplay : public GPUDisplayInterface
   void DrawGLScene_drawCommands();
   int32_t InitDisplay_internal();
   int32_t getNumThreads();
+  void updateOptions();
   void disableUnsupportedOptions();
   int32_t buildTrackFilter();
   const GPUTPCTracker& sliceTracker(int32_t iSlice);
diff --git a/GPU/GPUTracking/display/frontend/GPUDisplayKeys.cxx b/GPU/GPUTracking/display/frontend/GPUDisplayKeys.cxx
index 1842c276a580c..8dccdc60c0d93 100644
--- a/GPU/GPUTracking/display/frontend/GPUDisplayKeys.cxx
+++ b/GPU/GPUTracking/display/frontend/GPUDisplayKeys.cxx
@@ -35,7 +35,7 @@ const char* HelpText[] = {
   "[L] / [K]                     Draw single collisions (next / previous)",
   "[C]                           Colorcode clusters of different collisions",
   "[v]                           Hide rejected clusters from tracks",
-  "[j]                           Show global tracks as additional segments of final tracks",
+  "[j]                           Show track segments propagated to adjacent sectors in different color / split CE tracks",
   "[u]                           Cycle through track filter",
   "[E] / [G]                     Extrapolate tracks / loopers",
   "[t] / [T]                     Take Screenshot / Record Animation to pictures",
@@ -164,8 +164,11 @@ void GPUDisplay::HandleKey(uint8_t key)
     mPrintInfoText &= 3;
     SetInfo("Info text display - console: %s, onscreen %s", (mPrintInfoText & 2) ? "enabled" : "disabled", (mPrintInfoText & 1) ? "enabled" : "disabled");
   } else if (key == 'j') {
+    if (mCfgH.separateGlobalTracks) {
+      mCfgH.splitCETracks ^= 1;
+    }
     mCfgH.separateGlobalTracks ^= 1;
-    SetInfo("Seperated display of global tracks %s", mCfgH.separateGlobalTracks ? "enabled" : "disabled");
+    SetInfo("Seperated display of tracks propagated to adjacent sectors %s / of CE tracks %s", mCfgH.separateGlobalTracks ? "enabled" : "disabled", mCfgH.splitCETracks ? "enabled" : "disabled");
   } else if (key == 'c') {
     if (mCfgH.markClusters == 0) {
       mCfgH.markClusters = 1;
diff --git a/GPU/GPUTracking/display/helpers/GPUDisplayHelpers.cxx b/GPU/GPUTracking/display/helpers/GPUDisplayHelpers.cxx
index cd73cc0b9b34f..d782898380281 100644
--- a/GPU/GPUTracking/display/helpers/GPUDisplayHelpers.cxx
+++ b/GPU/GPUTracking/display/helpers/GPUDisplayHelpers.cxx
@@ -36,6 +36,13 @@ int32_t GPUDisplay::getNumThreads()
   }
 }
 
+void GPUDisplay::updateOptions() // Replaces the splitCETracks placeholder value -1 by a concrete 0 / 1 once mParam is set
+{
+  if (mCfgH.splitCETracks == -1 && mParam) { // Only act while the option is still -1 and mParam is non-null
+    mCfgH.splitCETracks = mParam->continuousMaxTimeBin != 0; // NOTE(review): this enables splitting when continuousMaxTimeBin is non-zero, while the option help text says "-1 = for triggered data" - confirm the intended polarity
+  }
+}
+
 void GPUDisplay::disableUnsupportedOptions()
 {
   if (!mIOPtrs->mergedTrackHitAttachment) {
diff --git a/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx b/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx
index 746c41938e2e1..ffebc373b253f 100644
--- a/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx
+++ b/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx
@@ -45,7 +45,6 @@
 using namespace GPUCA_NAMESPACE::gpu;
 
 #define GET_CID(slice, i) (mParam->par.earlyTpcTransform ? mIOPtrs->clusterData[slice][i].id : (mIOPtrs->clustersNative->clusterOffset[slice][0] + i))
-#define SEPERATE_GLOBAL_TRACKS_LIMIT (mCfgH.separateGlobalTracks ? tGLOBALTRACK : TRACK_TYPE_ID_LIMIT)
 
 const GPUTRDGeometry* GPUDisplay::trdGeometry() { return (GPUTRDGeometry*)mCalib->trdGeometry; }
 const GPUTPCTracker& GPUDisplay::sliceTracker(int32_t iSlice) { return mChain->GetTPCSliceTrackers()[iSlice]; }
@@ -421,6 +420,8 @@ void GPUDisplay::DrawFinal(int32_t iSlice, int32_t /*iCol*/, GPUTPCGMPropagator*
       }
 
       // Print TPC part of track
+      int32_t separateGlobalTracksLimit = (mCfgH.separateGlobalTracks ? tGLOBALTRACK : TRACK_TYPE_ID_LIMIT);
+      uint32_t lastSide = -1;
       for (int32_t k = 0; k < nClusters; k++) {
         if constexpr (std::is_same_v<T, GPUTPCGMMergedTrack>) {
           if (mCfgH.hideRejectedClusters && (mIOPtrs->mergedTrackHits[track->FirstClusterRef() + k].state & GPUTPCGMMergedTrackHit::flagReject)) {
@@ -435,9 +436,15 @@ void GPUDisplay::DrawFinal(int32_t iSlice, int32_t /*iCol*/, GPUTPCGMPropagator*
         }
         int32_t w = mGlobalPos[cid].w;
         if (drawing) {
-          drawPointLinestrip(iSlice, cid, tFINALTRACK, SEPERATE_GLOBAL_TRACKS_LIMIT);
+          if (mCfgH.splitCETracks && lastSide != (mGlobalPos[cid].z < 0)) {
+            insertVertexList(vBuf[0], startCountInner, mVertexBuffer[iSlice].size());
+            drawing = false;
+            lastCluster = -1;
+          } else {
+            drawPointLinestrip(iSlice, cid, tFINALTRACK, separateGlobalTracksLimit);
+          }
         }
-        if (w == SEPERATE_GLOBAL_TRACKS_LIMIT) {
+        if (w == separateGlobalTracksLimit) {
           if (drawing) {
             insertVertexList(vBuf[0], startCountInner, mVertexBuffer[iSlice].size());
           }
@@ -445,21 +452,21 @@ void GPUDisplay::DrawFinal(int32_t iSlice, int32_t /*iCol*/, GPUTPCGMPropagator*
         } else {
           if (!drawing) {
             startCountInner = mVertexBuffer[iSlice].size();
-          }
-          if (!drawing) {
-            drawPointLinestrip(iSlice, cid, tFINALTRACK, SEPERATE_GLOBAL_TRACKS_LIMIT);
-          }
-          if (!drawing && lastCluster != -1) {
-            if constexpr (std::is_same_v<T, GPUTPCGMMergedTrack>) {
-              cid = mIOPtrs->mergedTrackHits[track->FirstClusterRef() + lastCluster].num;
-            } else {
-              cid = &track->getCluster(mIOPtrs->outputClusRefsTPCO2, lastCluster, *mIOPtrs->clustersNative) - mIOPtrs->clustersNative->clustersLinear;
+            if (lastCluster != -1 && (!mCfgH.splitCETracks || lastSide == (mGlobalPos[cid].z < 0))) {
+              int32_t lastcid;
+              if constexpr (std::is_same_v<T, GPUTPCGMMergedTrack>) {
+                lastcid = mIOPtrs->mergedTrackHits[track->FirstClusterRef() + lastCluster].num;
+              } else {
+                lastcid = &track->getCluster(mIOPtrs->outputClusRefsTPCO2, lastCluster, *mIOPtrs->clustersNative) - mIOPtrs->clustersNative->clustersLinear;
+              }
+              drawPointLinestrip(iSlice, lastcid, tFINALTRACK, separateGlobalTracksLimit);
             }
-            drawPointLinestrip(iSlice, cid, 7, SEPERATE_GLOBAL_TRACKS_LIMIT);
+            drawPointLinestrip(iSlice, cid, tFINALTRACK, separateGlobalTracksLimit);
           }
           drawing = true;
         }
         lastCluster = k;
+        lastSide = mGlobalPos[cid].z < 0;
       }
 
       // Print ITS part of track