From 89830e97b1582c1f4603b6b8f0260e0945876a27 Mon Sep 17 00:00:00 2001
From: Jan Gorecki <jangorecki@users.noreply.github.com>
Date: Thu, 18 Jun 2020 09:56:31 +0100
Subject: [PATCH] throttle threads for iterated small data tasks (#4484)

---
 NEWS.md               |  2 ++
 R/openmp-utils.R      |  6 +++---
 inst/tests/tests.Rraw |  6 +++++-
 man/openmp-utils.Rd   |  3 ++-
 src/between.c         | 16 +++++++--------
 src/cj.c              | 12 +++++------
 src/coalesce.c        |  8 ++++----
 src/data.table.h      |  2 +-
 src/fifelse.c         |  8 ++++----
 src/forder.c          | 35 ++++++++++++++++---------------
 src/froll.c           |  8 ++++----
 src/frollR.c          |  2 +-
 src/frolladaptive.c   | 16 +++++++--------
 src/fsort.c           |  6 +++---
 src/gsumm.c           | 40 ++++++++++++++++++------------------
 src/nafill.c          |  2 +-
 src/openmp-utils.c    | 48 ++++++++++++++++++++++++++-----------------
 src/reorder.c         |  6 +++---
 src/subset.c          |  6 +++---
 src/types.c           |  2 +-
 20 files changed, 126 insertions(+), 108 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 0ba902c87..98484687c 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -81,6 +81,8 @@ unit = "s")
 
 14. Added support for `round()` and `trunc()` to extend functionality of `ITime`. `round()` and `trunc()` can be used with argument units: "hours" or "minutes". Thanks to @JensPederM for the suggestion and PR.
 
+15. A new throttle feature has been introduced to speed up small data tasks that are repeated in a loop, [#3175](https://github.com/Rdatatable/data.table/issues/3175) [#3438](https://github.com/Rdatatable/data.table/issues/3438) [#3205](https://github.com/Rdatatable/data.table/issues/3205) [#3735](https://github.com/Rdatatable/data.table/issues/3735) [#3739](https://github.com/Rdatatable/data.table/issues/3739) [#4284](https://github.com/Rdatatable/data.table/issues/4284) [#4527](https://github.com/Rdatatable/data.table/issues/4527) [#4294](https://github.com/Rdatatable/data.table/issues/4294) [#1120](https://github.com/Rdatatable/data.table/issues/1120). The default throttle of 1024 means that a single thread will be used when nrow<=1024, two threads when nrow<=2048, etc. To change the default, use `setDTthreads(throttle=)` or set the new environment variable `R_DATATABLE_THROTTLE`. If you use `Sys.setenv()` in a running R session to change this environment variable, be sure to run an empty `setDTthreads()` call afterwards for the change to take effect; see `?setDTthreads`. The word *throttle* is used to convey that the number of threads is restricted (throttled) for small data tasks. Reducing throttle to 1 will turn off throttling and should revert behaviour to past versions (i.e. using many threads even for small data). Increasing throttle to, say, 65536 will utilize multi-threading only for larger datasets. The value 1024 is a guess. We welcome feedback and test results indicating what the best default should be.
+
 ## BUG FIXES
 
 1. A NULL timezone on POSIXct was interpreted by `as.IDate` and `as.ITime` as UTC rather than the session's default timezone (`tz=""`) , [#4085](https://github.com/Rdatatable/data.table/issues/4085).
diff --git a/R/openmp-utils.R b/R/openmp-utils.R
index 5e11222c5..9df55f114 100644
--- a/R/openmp-utils.R
+++ b/R/openmp-utils.R
@@ -1,12 +1,12 @@
-setDTthreads = function(threads=NULL, restore_after_fork=NULL, percent=NULL) {
+setDTthreads = function(threads=NULL, restore_after_fork=NULL, percent=NULL, throttle=NULL) {
   if (!missing(percent)) {
     if (!missing(threads)) stop("Provide either threads= or percent= but not both")
     if (length(percent)!=1) stop("percent= is provided but is length ", length(percent))
     percent=as.integer(percent)
     if (is.na(percent) || percent<2L || percent>100L) stop("percent==",percent," but should be a number between 2 and 100")
-    invisible(.Call(CsetDTthreads, percent, restore_after_fork, TRUE))
+    invisible(.Call(CsetDTthreads, percent, restore_after_fork, TRUE, as.integer(throttle)))
   } else {
-    invisible(.Call(CsetDTthreads, threads, restore_after_fork, FALSE))
+    invisible(.Call(CsetDTthreads, as.integer(threads), restore_after_fork, FALSE, as.integer(throttle)))
   }
 }
 
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 1def357eb..12790ed92 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -14188,7 +14188,7 @@ test(1996.2, d[, eval(qcall)], data.table(a=1L, b=3))
 # setDTthreads; #3435
 test(1997.01, setDTthreads(NULL, percent=75), error="Provide either threads= or percent= but not both")
 test(1997.02, setDTthreads(1L, percent=75), error="Provide either threads= or percent= but not both")
-test(1997.03, setDTthreads(-1L), error="must be either NULL or a single integer >= 0")
+test(1997.03, setDTthreads(-1L), error="threads= must be either NULL or a single number >= 0")
 test(1997.04, setDTthreads(percent=101), error="should be a number between 2 and 100")
 test(1997.05, setDTthreads(percent=1), error="should be a number between 2 and 100")
 test(1997.06, setDTthreads(percent=NULL), error="but is length 0")
@@ -14211,6 +14211,10 @@ test(1997.14, getDTthreads(), new)
 Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT=oldenv)
 test(1997.15, setDTthreads(old), new)
 test(1997.16, getDTthreads(), old)
+test(1997.17, setDTthreads(throttle=NA), error="throttle.*must be a single number, non-NA, and >=1")
+setDTthreads(throttle=65536)
+test(1997.18, getDTthreads(TRUE), output="throttle==65536")
+setDTthreads(throttle=1024)
 
 # test that a copy is being made and output is printed, #3385 after partial revert of #3281
 x = 5L
diff --git a/man/openmp-utils.Rd b/man/openmp-utils.Rd
index 8bb6dccc2..b8d014976 100644
--- a/man/openmp-utils.Rd
+++ b/man/openmp-utils.Rd
@@ -8,13 +8,14 @@
   Set and get number of threads to be used in \code{data.table} functions that are parallelized with OpenMP. The number of threads is initialized when \code{data.table} is first loaded in the R session using optional envioronment variables. Thereafter, the number of threads may be changed by calling \code{setDTthreads}. If you change an environment variable using \code{Sys.setenv} you will need to call \code{setDTthreads} again to reread the environment variables.
 }
 \usage{
-  setDTthreads(threads = NULL, restore_after_fork = NULL, percent = NULL)
+  setDTthreads(threads = NULL, restore_after_fork = NULL, percent = NULL, throttle = NULL)
   getDTthreads(verbose = getOption("datatable.verbose"))
 }
 \arguments{
   \item{threads}{ NULL (default) rereads environment variables. 0 means to use all logical CPUs available. Otherwise a number >= 1 }
   \item{restore_after_fork}{ Should data.table be multi-threaded after a fork has completed? NULL leaves the current setting unchanged which by default is TRUE. See details below. }
   \item{percent}{ If provided it should be a number between 2 and 100; the percentage of logical CPUs to use. By default on startup, 50\%. }
+  \item{throttle}{ 1024 (default) means that, roughly speaking, a single thread will be used when nrow(DT)<=1024, two threads when nrow(DT)<=2048, etc. The throttle speeds up small data tasks (especially when repeated many times) by not incurring the overhead of managing multiple threads. Hence the number of threads is throttled (restricted) for small tasks. }
   \item{verbose}{ Display the value of relevant OpenMP settings plus the \code{restore_after_fork} internal option. }
 }
 \value{
diff --git a/src/between.c b/src/between.c
index b4444d968..c5d91b30c 100644
--- a/src/between.c
+++ b/src/between.c
@@ -64,14 +64,14 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAboundsArg, S
         error(_("Item %d of lower (%d) is greater than item %d of upper (%d)"), (i&lowMask)+1, l, (i&uppMask)+1, u);
     }
     if (NAbounds) {  // default NAbounds==TRUE => NA bound means TRUE; i.e. asif lower=-Inf or upper==Inf)
-      #pragma omp parallel for num_threads(getDTthreads())
+      #pragma omp parallel for num_threads(getDTthreads(longest, true))
       for (int i=0; i<longest; ++i) {
         const int elem=xp[i & xMask], l=lp[i & lowMask], u=up[i & uppMask];
         ansp[i] = elem==NA_INTEGER ? NA_LOGICAL : (l==NA_INTEGER || l+open<=elem) && (u==NA_INTEGER || elem<=u-open);
         // +open so we can always use >= and <=.  NA_INTEGER+1 == -INT_MAX == INT_MIN+1 (so NA limit handled by this too)
       }
     } else {
-      #pragma omp parallel for num_threads(getDTthreads())
+      #pragma omp parallel for num_threads(getDTthreads(longest, true))
       for (int i=0; i<longest; ++i) {
         const int elem=xp[i & xMask], l=lp[i & lowMask], u=up[i & uppMask];
         if (elem==NA_INTEGER) { ansp[i]=NA_LOGICAL; continue; }
@@ -95,13 +95,13 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAboundsArg, S
           error(_("Item %d of lower (%"PRId64") is greater than item %d of upper (%"PRId64")"), (i&lowMask)+1, l, (i&uppMask)+1, u);
       }
       if (NAbounds) {
-        #pragma omp parallel for num_threads(getDTthreads())
+        #pragma omp parallel for num_threads(getDTthreads(longest, true))
         for (int i=0; i<longest; ++i) {
           const int64_t elem=xp[i & xMask], l=lp[i & lowMask], u=up[i & uppMask];
           ansp[i] = elem==NA_INTEGER64 ? NA_LOGICAL : (l==NA_INTEGER64 || l+open<=elem) && (u==NA_INTEGER64 || elem<=u-open);
         }
       } else {
-        #pragma omp parallel for num_threads(getDTthreads())
+        #pragma omp parallel for num_threads(getDTthreads(longest, true))
         for (int i=0; i<longest; ++i) {
           const int64_t elem=xp[i & xMask], l=lp[i & lowMask], u=up[i & uppMask];
           if (elem==NA_INTEGER64) { ansp[i]=NA_LOGICAL; continue; }
@@ -123,13 +123,13 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAboundsArg, S
       }
       if (open) {
         if (NAbounds) {
-          #pragma omp parallel for num_threads(getDTthreads())
+          #pragma omp parallel for num_threads(getDTthreads(longest, true))
           for (int i=0; i<longest; ++i) {
             const double elem=xp[i & xMask], l=lp[i & lowMask], u=up[i & uppMask];
             ansp[i] = isnan(elem) ? NA_LOGICAL : (isnan(l) || l<elem) && (isnan(u) || elem<u);
           }
         } else {
-          #pragma omp parallel for num_threads(getDTthreads())
+          #pragma omp parallel for num_threads(getDTthreads(longest, true))
           for (int i=0; i<longest; ++i) {
             const double elem=xp[i & xMask], l=lp[i & lowMask], u=up[i & uppMask];
             if (isnan(elem)) { ansp[i]=NA_LOGICAL; continue; }
@@ -140,13 +140,13 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAboundsArg, S
         if (verbose) Rprintf(_("between parallel processing of double with open bounds took %8.3fs\n"), omp_get_wtime()-tic);
       } else {
         if (NAbounds) {
-          #pragma omp parallel for num_threads(getDTthreads())
+          #pragma omp parallel for num_threads(getDTthreads(longest, true))
           for (int i=0; i<longest; ++i) {
             const double elem=xp[i & xMask], l=lp[i & lowMask], u=up[i & uppMask];
             ansp[i] = isnan(elem) ? NA_LOGICAL : (isnan(l) || l<=elem) && (isnan(u) || elem<=u);
           }
         } else {
-          #pragma omp parallel for num_threads(getDTthreads())
+          #pragma omp parallel for num_threads(getDTthreads(longest, true))
           for (int i=0; i<longest; ++i) {
             const double elem=xp[i & xMask], l=lp[i & lowMask], u=up[i & uppMask];
             if (isnan(elem)) { ansp[i]=NA_LOGICAL; continue; }
diff --git a/src/cj.c b/src/cj.c
index 6205ca517..d2c8fc22c 100644
--- a/src/cj.c
+++ b/src/cj.c
@@ -20,7 +20,7 @@ SEXP cj(SEXP base_list) {
     case INTSXP: {
       const int *restrict sourceP = INTEGER(source);
       int *restrict targetP = INTEGER(target);
-      #pragma omp parallel for num_threads(getDTthreads())
+      #pragma omp parallel for num_threads(getDTthreads(thislen*eachrep, true))
       // default static schedule so two threads won't write to same cache line in last column
       // if they did write to same cache line (and will when last column's thislen is small) there's no correctness issue
       for (int i=0; i<thislen; ++i) {
@@ -28,7 +28,7 @@ SEXP cj(SEXP base_list) {
         const int end = (i+1)*eachrep;
         for (int j=i*eachrep; j<end; ++j) targetP[j] = item;  // no div, mod or read ops inside loop; just rep a const contiguous write
       }
-      #pragma omp parallel for num_threads(getDTthreads())
+      #pragma omp parallel for num_threads(getDTthreads(ncopy*blocklen, true))
       for (int i=1; i<ncopy; ++i) {
         memcpy(targetP + i*blocklen, targetP, blocklen*sizeof(int));
       }
@@ -36,13 +36,13 @@ SEXP cj(SEXP base_list) {
     case REALSXP: {
       const double *restrict sourceP = REAL(source);
       double *restrict targetP = REAL(target);
-      #pragma omp parallel for num_threads(getDTthreads())
+      #pragma omp parallel for num_threads(getDTthreads(thislen*eachrep, true))
       for (int i=0; i<thislen; ++i) {
         const double item = sourceP[i];
         const int end=(i+1)*eachrep;
         for (int j=i*eachrep; j<end; ++j) targetP[j] = item;
       }
-      #pragma omp parallel for num_threads(getDTthreads())
+      #pragma omp parallel for num_threads(getDTthreads(ncopy*blocklen, true))
       for (int i=1; i<ncopy; ++i) {
         memcpy(targetP + i*blocklen, targetP, blocklen*sizeof(double));
       }
@@ -50,13 +50,13 @@ SEXP cj(SEXP base_list) {
     case CPLXSXP: {
       const Rcomplex *restrict sourceP = COMPLEX(source);
       Rcomplex *restrict targetP = COMPLEX(target);
-      #pragma omp parallel for num_threads(getDTthreads())
+      #pragma omp parallel for num_threads(getDTthreads(thislen*eachrep, true))
       for (int i=0; i<thislen; ++i) {
         const Rcomplex item = sourceP[i];
         const int end=(i+1)*eachrep;
         for (int j=i*eachrep; j<end; ++j) targetP[j] = item;
       }
-      #pragma omp parallel for num_threads(getDTthreads())
+      #pragma omp parallel for num_threads(getDTthreads(ncopy*blocklen, true))
       for (int i=1; i<ncopy; ++i) {
         memcpy(targetP + i*blocklen, targetP, blocklen*sizeof(Rcomplex));
       }
diff --git a/src/coalesce.c b/src/coalesce.c
index 9daf628d2..558d2d4da 100644
--- a/src/coalesce.c
+++ b/src/coalesce.c
@@ -65,7 +65,7 @@ SEXP coalesce(SEXP x, SEXP inplaceArg) {
       valP[k++] = INTEGER(item);
     }
     const bool final=(finalVal!=NA_INTEGER);
-    #pragma omp parallel for num_threads(getDTthreads())
+    #pragma omp parallel for num_threads(getDTthreads(nrow, true))
     for (int i=0; i<nrow; ++i) {
       int val = xP[i];
       if (val!=NA_INTEGER) continue;
@@ -88,7 +88,7 @@ SEXP coalesce(SEXP x, SEXP inplaceArg) {
         valP[k++] = REAL(item);
       }
       const bool final = (finalVal!=NA_INTEGER64);
-      #pragma omp parallel for num_threads(getDTthreads())
+      #pragma omp parallel for num_threads(getDTthreads(nrow, true))
       for (int i=0; i<nrow; ++i) {
         int64_t val=xP[i];
         if (val!=NA_INTEGER64) continue;
@@ -109,7 +109,7 @@ SEXP coalesce(SEXP x, SEXP inplaceArg) {
         valP[k++] = REAL(item);
       }
       const bool final = !ISNAN(finalVal);
-      #pragma omp parallel for num_threads(getDTthreads())
+      #pragma omp parallel for num_threads(getDTthreads(nrow, true))
       for (int i=0; i<nrow; ++i) {
         double val=xP[i];
         if (!ISNAN(val)) continue;
@@ -132,7 +132,7 @@ SEXP coalesce(SEXP x, SEXP inplaceArg) {
       valP[k++] = COMPLEX(item);
     }
     const bool final = !ISNAN(finalVal.r) && !ISNAN(finalVal.i);
-    #pragma omp parallel for num_threads(getDTthreads())
+    #pragma omp parallel for num_threads(getDTthreads(nrow, true))
     for (int i=0; i<nrow; ++i) {
       Rcomplex val=xP[i];
       if (!ISNAN(val.r) && !ISNAN(val.i)) continue;
diff --git a/src/data.table.h b/src/data.table.h
index eed481be6..6f907bfa7 100644
--- a/src/data.table.h
+++ b/src/data.table.h
@@ -185,7 +185,7 @@ double wallclock();
 
 // openmp-utils.c
 void initDTthreads();
-int getDTthreads();
+int getDTthreads(const int64_t n, const bool throttle);
 void avoid_openmp_hang_within_fork();
 
 // froll.c
diff --git a/src/fifelse.c b/src/fifelse.c
index 3a05fce6d..4c60b4bb3 100644
--- a/src/fifelse.c
+++ b/src/fifelse.c
@@ -77,7 +77,7 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na) {
     const int *restrict pa   = LOGICAL(a);
     const int *restrict pb   = LOGICAL(b);
     const int pna = nonna ? LOGICAL(na)[0] : NA_LOGICAL;
-    #pragma omp parallel for num_threads(getDTthreads())
+    #pragma omp parallel for num_threads(getDTthreads(len0, true))
     for (int64_t i=0; i<len0; ++i) {
       pans[i] = pl[i]==0 ? pb[i & bmask] : (pl[i]==1 ? pa[i & amask] : pna);
     }
@@ -87,7 +87,7 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na) {
     const int *restrict pa   = INTEGER(a);
     const int *restrict pb   = INTEGER(b);
     const int pna = nonna ? INTEGER(na)[0] : NA_INTEGER;
-    #pragma omp parallel for num_threads(getDTthreads())
+    #pragma omp parallel for num_threads(getDTthreads(len0, true))
     for (int64_t i=0; i<len0; ++i) {
       pans[i] = pl[i]==0 ? pb[i & bmask] : (pl[i]==1 ? pa[i & amask] : pna);
     }
@@ -98,7 +98,7 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na) {
     const double *restrict pb   = REAL(b);
     const double na_double = Rinherits(a, char_integer64) ? NA_INT64_D : NA_REAL; // Rinherits() is true for nanotime
     const double pna = nonna ? REAL(na)[0] : na_double;
-    #pragma omp parallel for num_threads(getDTthreads())
+    #pragma omp parallel for num_threads(getDTthreads(len0, true))
     for (int64_t i=0; i<len0; ++i) {
       pans[i] = pl[i]==0 ? pb[i & bmask] : (pl[i]==1 ? pa[i & amask] : pna);
     }
@@ -116,7 +116,7 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na) {
     const Rcomplex *restrict pa   = COMPLEX(a);
     const Rcomplex *restrict pb   = COMPLEX(b);
     const Rcomplex pna = nonna ? COMPLEX(na)[0] : NA_CPLX;
-    #pragma omp parallel for num_threads(getDTthreads())
+    #pragma omp parallel for num_threads(getDTthreads(len0, true))
     for (int64_t i=0; i<len0; ++i) {
       pans[i] = pl[i]==0 ? pb[i & bmask] : (pl[i]==1 ? pa[i & amask] : pna);
     }
diff --git a/src/forder.c b/src/forder.c
index 1ee8c74ce..79d126e4c 100644
--- a/src/forder.c
+++ b/src/forder.c
@@ -30,6 +30,7 @@
 
 // #define TIMING_ON
 
+static int nth = 1;                 // number of threads to use, throttled by default; used by cleanup() to ensure no mismatch in getDTthreads() calls
 static bool retgrp = true;          // return group sizes as well as the ordering vector? If so then use gs, gsalloc and gsn :
 static int nrow = 0;                // used as group size stack allocation limit (when all groups are 1 row)
 static int *gs = NULL;              // gs = final groupsizes e.g. 23,12,87,2,1,34,...
@@ -79,7 +80,7 @@ static void cleanup() {
   gs_alloc = 0;
   gs_n = 0;
 
-  if (gs_thread!=NULL) for (int i=0; i<getDTthreads(); i++) free(gs_thread[i]);
+  if (gs_thread!=NULL) for (int i=0; i<nth; i++) free(gs_thread[i]);
   free(gs_thread);       gs_thread=NULL;
   free(gs_thread_alloc); gs_thread_alloc=NULL;
   free(gs_thread_n);     gs_thread_n=NULL;
@@ -291,7 +292,7 @@ static void range_str(SEXP *x, int n, uint64_t *out_min, uint64_t *out_max, int
   if (ustr_n!=0) STOP(_("Internal error: ustr isn't empty when starting range_str: ustr_n=%d, ustr_alloc=%d"), ustr_n, ustr_alloc);  // # nocov
   if (ustr_maxlen!=0) STOP(_("Internal error: ustr_maxlen isn't 0 when starting range_str"));  // # nocov
   // savetl_init() has already been called at the start of forder
-  #pragma omp parallel for num_threads(getDTthreads())
+  #pragma omp parallel for num_threads(getDTthreads(n, true))
   for(int i=0; i<n; i++) {
     SEXP s = x[i];
     if (s==NA_STRING) {
@@ -491,7 +492,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S
   SEXP ans = PROTECT(allocVector(INTSXP, nrow)); n_protect++;
   anso = INTEGER(ans);
   TEND(0)
-  #pragma omp parallel for num_threads(getDTthreads())
+  #pragma omp parallel for num_threads(getDTthreads(nrow, true))
   for (int i=0; i<nrow; i++) anso[i]=i+1;   // gdb 8.1.0.20180409-git very slow here, oddly
   TEND(1)
   savetl_init();   // from now on use Error not error
@@ -650,7 +651,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S
     switch(TYPEOF(x)) {
     case INTSXP : case LGLSXP : {
       int32_t *xd = INTEGER(x);
-      #pragma omp parallel for num_threads(getDTthreads())
+      #pragma omp parallel for num_threads(getDTthreads(nrow, true))
       for (int i=0; i<nrow; i++) {
         uint64_t elem=0;
         if (xd[i]==NA_INTEGER) {  // TODO: go branchless if na_count==0
@@ -665,7 +666,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S
     case REALSXP :
       if (inherits(x, "integer64")) {
         int64_t *xd = (int64_t *)REAL(x);
-        #pragma omp parallel for num_threads(getDTthreads())
+        #pragma omp parallel for num_threads(getDTthreads(nrow, true))
         for (int i=0; i<nrow; i++) {
           uint64_t elem=0;
           if (xd[i]==INT64_MIN) {
@@ -678,7 +679,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S
         }
       } else {
         double *xd = REAL(x);     // TODO: revisit double compression (skip bytes/mult by 10,100 etc) as currently it's often 6-8 bytes even for 3.14,3.15
-        #pragma omp parallel for num_threads(getDTthreads())
+        #pragma omp parallel for num_threads(getDTthreads(nrow, true))
         for (int i=0; i<nrow; i++) {
           uint64_t elem=0;
           if (!R_FINITE(xd[i])) {
@@ -697,7 +698,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S
       break;
     case STRSXP : {
       SEXP *xd = STRING_PTR(x);
-      #pragma omp parallel for num_threads(getDTthreads())
+      #pragma omp parallel for num_threads(getDTthreads(nrow, true))
       for (int i=0; i<nrow; i++) {
         uint64_t elem=0;
         if (xd[i]==NA_STRING) {
@@ -722,7 +723,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S
   Rprintf(_("nradix=%d\n"), nradix);
   #endif
 
-  int nth = getDTthreads();
+  nth = getDTthreads(nrow, true);  // this nth is relied on in cleanup()
   TMP =  (int *)malloc(nth*UINT16_MAX*sizeof(int)); // used by counting sort (my_n<=65536) in radix_r()
   UGRP = (uint8_t *)malloc(nth*256);                // TODO: align TMP and UGRP to cache lines (and do the same for stack allocations too)
   if (!TMP || !UGRP /*|| TMP%64 || UGRP%64*/) STOP(_("Failed to allocate TMP or UGRP or they weren't cache line aligned: nth=%d"), nth);
@@ -747,7 +748,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S
     // Alternatively, we could try and avoid creating anso[] until it's needed, but that has similar complexity issues as (ii)
     // Note that if nalast==-1 (remove NA) anso will contain 0's for the NAs and will be considered not-sorted.
     bool stop = false;
-    #pragma omp parallel for num_threads(getDTthreads())
+    #pragma omp parallel for num_threads(getDTthreads(nrow, true))
     for (int i=0; i<nrow; i++) {
       if (stop) continue;
       if (anso[i]!=i+1) stop=true;
@@ -829,7 +830,7 @@ void radix_r(const int from, const int to, const int radix) {
     return;
   }
   else if (my_n<=256) {
-    // if (getDTthreads()==1)
+    // if nth==1
     // Rprintf(_("insert clause: radix=%d, my_n=%d, from=%d, to=%d\n"), radix, my_n, from, to);
     // insert sort with some twists:
     // i) detects if grouped; if sortType==0 can then skip
@@ -942,7 +943,7 @@ void radix_r(const int from, const int to, const int radix) {
     return;
   }
   else if (my_n<=UINT16_MAX) {    // UINT16_MAX==65535 (important not 65536)
-    // if (getDTthreads()==1) Rprintf(_("counting clause: radix=%d, my_n=%d\n"), radix, my_n);
+    // if (nth==1) Rprintf(_("counting clause: radix=%d, my_n=%d\n"), radix, my_n);
     uint16_t my_counts[256] = {0};  // Needs to be all-0 on entry. This ={0} initialization should be fast as it's on stack. Otherwise, we have to manage
                                     // a stack of counts anyway since this is called recursively and these counts are needed to make the recursive calls.
                                     // This thread-private stack alloc has no chance of false sharing and gives omp and compiler best chance.
@@ -1044,7 +1045,7 @@ void radix_r(const int from, const int to, const int radix) {
   }
   // else parallel batches. This is called recursively but only once or maybe twice before resolving to UINT16_MAX branch above
 
-  int batchSize = MIN(UINT16_MAX, 1+my_n/getDTthreads());  // (my_n-1)/nBatch + 1;   //UINT16_MAX == 65535
+  int batchSize = MIN(UINT16_MAX, 1+my_n/getDTthreads(my_n, true));  // (my_n-1)/nBatch + 1;   //UINT16_MAX == 65535
   int nBatch = (my_n-1)/batchSize + 1;   // TODO: make nBatch a multiple of nThreads?
   int lastBatchSize = my_n - (nBatch-1)*batchSize;
   uint16_t *counts = calloc(nBatch*256,sizeof(uint16_t));
@@ -1055,7 +1056,7 @@ void radix_r(const int from, const int to, const int radix) {
   bool skip=true;
   const int n_rem = nradix-radix-1;   // how many radix are remaining after this one
   TEND(16)
-  #pragma omp parallel num_threads(getDTthreads())
+  #pragma omp parallel num_threads(getDTthreads(nBatch, false))
   {
     int     *my_otmp = malloc(batchSize * sizeof(int)); // thread-private write
     uint8_t *my_ktmp = malloc(batchSize * sizeof(uint8_t) * n_rem);
@@ -1160,7 +1161,7 @@ void radix_r(const int from, const int to, const int radix) {
   if (!skip) {
     int *TMP = malloc(my_n * sizeof(int));
     if (!TMP) STOP(_("Unable to allocate TMP for my_n=%d items in parallel batch counting"), my_n);
-    #pragma omp parallel for num_threads(getDTthreads())
+    #pragma omp parallel for num_threads(getDTthreads(nBatch, false))
     for (int batch=0; batch<nBatch; batch++) {
       const int *restrict      my_starts = starts + batch*256;
       const uint16_t *restrict my_counts = counts + batch*256;
@@ -1176,7 +1177,7 @@ void radix_r(const int from, const int to, const int radix) {
     memcpy(anso+from, TMP, my_n*sizeof(int));
 
     for (int r=0; r<n_rem; r++) {    // TODO: groups of sizeof(anso)  4 byte int currently  (in future 8).  To save team startup cost (but unlikely significant anyway)
-      #pragma omp parallel for num_threads(getDTthreads())
+      #pragma omp parallel for num_threads(getDTthreads(nBatch, false))
       for (int batch=0; batch<nBatch; batch++) {
         const int *restrict      my_starts = starts + batch*256;
         const uint16_t *restrict my_counts = counts + batch*256;
@@ -1231,7 +1232,7 @@ void radix_r(const int from, const int to, const int radix) {
       // all groups are <=65535 and radix_r() will handle each one single-threaded. Therefore, this time
       // it does make sense to start a parallel team and there will be no nestedness here either.
       if (retgrp) {
-        #pragma omp parallel for ordered schedule(dynamic) num_threads(getDTthreads())
+        #pragma omp parallel for ordered schedule(dynamic) num_threads(getDTthreads(ngrp, false))
         for (int i=0; i<ngrp; i++) {
           int start = from + starts[ugrp[i]];
           radix_r(start, start+my_gs[i]-1, radix+1);
@@ -1240,7 +1241,7 @@ void radix_r(const int from, const int to, const int radix) {
         }
       } else {
         // flush() is only relevant when retgrp==true so save the redundant ordered clause
-        #pragma omp parallel for schedule(dynamic) num_threads(getDTthreads())
+        #pragma omp parallel for schedule(dynamic) num_threads(getDTthreads(ngrp, false))
         for (int i=0; i<ngrp; i++) {
           int start = from + starts[ugrp[i]];
           radix_r(start, start+my_gs[i]-1, radix+1);
diff --git a/src/froll.c b/src/froll.c
index 2229c5cdb..b044431de 100644
--- a/src/froll.c
+++ b/src/froll.c
@@ -140,7 +140,7 @@ void frollmeanExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool
   }
   bool truehasna = hasna>0;                                     // flag to re-run with NA support if NAs detected
   if (!truehasna || !narm) {
-    #pragma omp parallel for num_threads(getDTthreads())
+    #pragma omp parallel for num_threads(getDTthreads(nx, true))
     for (uint64_t i=k-1; i<nx; i++) {                           // loop on every observation with complete window, partial already filled in single threaded section
       if (narm && truehasna) {
         continue;                                               // if NAs detected no point to continue
@@ -178,7 +178,7 @@ void frollmeanExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool
     }
   }
   if (truehasna && narm) {
-    #pragma omp parallel for num_threads(getDTthreads())
+    #pragma omp parallel for num_threads(getDTthreads(nx, true))
     for (uint64_t i=k-1; i<nx; i++) {                           // loop on every observation with complete window, partial already filled in single threaded section
       long double w = 0.0;
       int nc = 0;                                               // NA counter within sliding window
@@ -338,7 +338,7 @@ void frollsumExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool
   }
   bool truehasna = hasna>0;
   if (!truehasna || !narm) {
-    #pragma omp parallel for num_threads(getDTthreads())
+    #pragma omp parallel for num_threads(getDTthreads(nx, true))
     for (uint64_t i=k-1; i<nx; i++) {
       if (narm && truehasna) {
         continue;
@@ -371,7 +371,7 @@ void frollsumExact(double *x, uint64_t nx, ans_t *ans, int k, double fill, bool
     }
   }
   if (truehasna && narm) {
-    #pragma omp parallel for num_threads(getDTthreads())
+    #pragma omp parallel for num_threads(getDTthreads(nx, true))
     for (uint64_t i=k-1; i<nx; i++) {
       long double w = 0.0;
       int nc = 0;
diff --git a/src/frollR.c b/src/frollR.c
index 46ddea0df..f8ff97781 100644
--- a/src/frollR.c
+++ b/src/frollR.c
@@ -199,7 +199,7 @@ SEXP frollfunR(SEXP fun, SEXP obj, SEXP k, SEXP fill, SEXP algo, SEXP align, SEX
     else if (ialgo==1)
       Rprintf(_("%s: %d column(s) and %d window(s), not entering parallel execution here because algo='exact' will compute results in parallel\n"), __func__, nx, nk);
   }
-  #pragma omp parallel for if (ialgo==0 && nx*nk>1) schedule(auto) collapse(2) num_threads(getDTthreads())
+  #pragma omp parallel for if (ialgo==0) schedule(dynamic) collapse(2) num_threads(getDTthreads(nx*nk, false))
   for (R_len_t i=0; i<nx; i++) {                                // loop over multiple columns
     for (R_len_t j=0; j<nk; j++) {                              // loop over multiple windows
       switch (sfun) {
diff --git a/src/frolladaptive.c b/src/frolladaptive.c
index b65512d20..e005cef69 100644
--- a/src/frolladaptive.c
+++ b/src/frolladaptive.c
@@ -42,7 +42,7 @@ void fadaptiverollmeanFast(double *x, uint64_t nx, ans_t *ans, int *k, double fi
       cs[i] = (double) w;
     }
     if (R_FINITE((double) w)) {                                 // no need to calc this if NAs detected as will re-calc all below in truehasna==1
-      #pragma omp parallel for num_threads(getDTthreads())
+      #pragma omp parallel for num_threads(getDTthreads(nx, true))
       for (uint64_t i=0; i<nx; i++) {                           // loop over observations to calculate final answer
         if (i+1 == k[i]) {
           ans->dbl_v[i] = cs[i]/k[i];                           // current obs window width exactly same as obs position in a vector
@@ -82,7 +82,7 @@ void fadaptiverollmeanFast(double *x, uint64_t nx, ans_t *ans, int *k, double fi
       cs[i] = (double) w;                                       // cumsum, na.rm=TRUE always, NAs handled using cum NA counter
       cn[i] = nc;                                               // cum NA counter
     }
-    #pragma omp parallel for num_threads(getDTthreads())
+    #pragma omp parallel for num_threads(getDTthreads(nx, true))
     for (uint64_t i=0; i<nx; i++) {                             // loop over observations to calculate final answer
       if (i+1 < k[i]) {                                         // partial window
         ans->dbl_v[i] = fill;
@@ -114,7 +114,7 @@ void fadaptiverollmeanExact(double *x, uint64_t nx, ans_t *ans, int *k, double f
     snprintf(end(ans->message[0]), 500, _("%s: running in parallel for input length %"PRIu64", hasna %d, narm %d\n"), "fadaptiverollmeanExact", (uint64_t)nx, hasna, (int) narm);
   bool truehasna = hasna>0;                                     // flag to re-run if NAs detected
   if (!truehasna || !narm) {                                    // narm=FALSE handled here as NAs properly propagated in exact algo
-    #pragma omp parallel for num_threads(getDTthreads())
+    #pragma omp parallel for num_threads(getDTthreads(nx, true))
     for (uint64_t i=0; i<nx; i++) {                             // loop on every observation to produce final answer
       if (narm && truehasna) {
         continue;                                               // if NAs detected no point to continue
@@ -156,7 +156,7 @@ void fadaptiverollmeanExact(double *x, uint64_t nx, ans_t *ans, int *k, double f
     }
   }
   if (truehasna && narm) {
-    #pragma omp parallel for num_threads(getDTthreads())
+    #pragma omp parallel for num_threads(getDTthreads(nx, true))
     for (uint64_t i=0; i<nx; i++) {                             // loop over observations to produce final answer
       if (i+1 < k[i]) {
         ans->dbl_v[i] = fill;                                   // partial window
@@ -231,7 +231,7 @@ void fadaptiverollsumFast(double *x, uint64_t nx, ans_t *ans, int *k, double fil
       cs[i] = (double) w;
     }
     if (R_FINITE((double) w)) {
-      #pragma omp parallel for num_threads(getDTthreads())
+      #pragma omp parallel for num_threads(getDTthreads(nx, true))
       for (uint64_t i=0; i<nx; i++) {
         if (i+1 == k[i]) {
           ans->dbl_v[i] = cs[i];
@@ -271,7 +271,7 @@ void fadaptiverollsumFast(double *x, uint64_t nx, ans_t *ans, int *k, double fil
       cs[i] = (double) w;
       cn[i] = nc;
     }
-    #pragma omp parallel for num_threads(getDTthreads())
+    #pragma omp parallel for num_threads(getDTthreads(nx, true))
     for (uint64_t i=0; i<nx; i++) {
       if (i+1 < k[i]) {
         ans->dbl_v[i] = fill;
@@ -298,7 +298,7 @@ void fadaptiverollsumExact(double *x, uint64_t nx, ans_t *ans, int *k, double fi
     snprintf(end(ans->message[0]), 500, _("%s: running in parallel for input length %"PRIu64", hasna %d, narm %d\n"), "fadaptiverollsumExact", (uint64_t)nx, hasna, (int) narm);
   bool truehasna = hasna>0;
   if (!truehasna || !narm) {
-    #pragma omp parallel for num_threads(getDTthreads())
+    #pragma omp parallel for num_threads(getDTthreads(nx, true))
     for (uint64_t i=0; i<nx; i++) {
       if (narm && truehasna) {
         continue;
@@ -335,7 +335,7 @@ void fadaptiverollsumExact(double *x, uint64_t nx, ans_t *ans, int *k, double fi
     }
   }
   if (truehasna && narm) {
-    #pragma omp parallel for num_threads(getDTthreads())
+    #pragma omp parallel for num_threads(getDTthreads(nx, true))
     for (uint64_t i=0; i<nx; i++) {
       if (i+1 < k[i]) {
         ans->dbl_v[i] = fill;
diff --git a/src/fsort.c b/src/fsort.c
index d3c695eac..00c7e5c10 100644
--- a/src/fsort.c
+++ b/src/fsort.c
@@ -117,7 +117,7 @@ SEXP fsort(SEXP x, SEXP verboseArg) {
   // allocate early in case fails if not enough RAM
   // TODO: document this is much cheaper than a copy followed by in-place.
 
-  int nth = getDTthreads();
+  int nth = getDTthreads(xlength(x), true);
   int nBatch=nth*2;  // at least nth; more to reduce last-man-home; but not too large to keep counts small in cache
   if (verbose) Rprintf(_("nth=%d, nBatch=%d\n"),nth,nBatch);
 
@@ -131,7 +131,7 @@ SEXP fsort(SEXP x, SEXP verboseArg) {
   t[1] = wallclock();
   double mins[nBatch], maxs[nBatch];
   const double *restrict xp = REAL(x);
-  #pragma omp parallel for schedule(dynamic) num_threads(nth)
+  #pragma omp parallel for schedule(dynamic) num_threads(getDTthreads(nBatch, false))
   for (int batch=0; batch<nBatch; batch++) {
     R_xlen_t thisLen = (batch==nBatch-1) ? lastBatchSize : batchSize;
     const double *restrict d = xp + batchSize*batch;
@@ -253,7 +253,7 @@ SEXP fsort(SEXP x, SEXP verboseArg) {
     }
 
     t[6] = wallclock();
-    #pragma omp parallel num_threads(getDTthreads())
+    #pragma omp parallel num_threads(getDTthreads(MSBsize, false))
     {
       R_xlen_t *counts = calloc((toBit/8 + 1)*256, sizeof(R_xlen_t));
       // each thread has its own (small) stack of counts
diff --git a/src/gsumm.c b/src/gsumm.c
index ef63519a3..372ae5944 100644
--- a/src/gsumm.c
+++ b/src/gsumm.c
@@ -79,7 +79,7 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) {
                                              // maybe better to malloc to avoid R's heap. This grp isn't global, so it doesn't need to be R_alloc
   const int *restrict fp = INTEGER(f);
 
-  nBatch = MIN((nrow+1)/2, getDTthreads()*2);  // *2 to reduce last-thread-home. TODO: experiment. The higher this is though, the bigger is counts[]
+  nBatch = MIN((nrow+1)/2, getDTthreads(nrow, true)*2);  // *2 to reduce last-thread-home. TODO: experiment. The higher this is though, the bigger is counts[]
   batchSize = MAX(1, (nrow-1)/nBatch);
   lastBatchSize = nrow - (nBatch-1)*batchSize;
   // We deliberate use, for example, 40 batches of just 14 rows, to stress-test tests. This strategy proved to be a good one as #3204 immediately came to light.
@@ -90,7 +90,7 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) {
            nrow, ngrp, nb, shift, highSize, nBatch, batchSize, lastBatchSize);                                              // # nocov
   }
   // initial population of g:
-  #pragma omp parallel for num_threads(getDTthreads())
+  #pragma omp parallel for num_threads(getDTthreads(ngrp, false))
   for (int g=0; g<ngrp; g++) {
     int *elem = grp + fp[g]-1;
     for (int j=0; j<grpsize[g]; j++)  elem[j] = g;
@@ -114,7 +114,7 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) {
     int *counts = calloc(nBatch*highSize, sizeof(int));  // TODO: cache-line align and make highSize a multiple of 64
     int *TMP   = malloc(nrow*2*sizeof(int));
     if (!counts || !TMP ) error(_("Internal error: Failed to allocate counts or TMP when assigning g in gforce"));
-    #pragma omp parallel for num_threads(getDTthreads())   // schedule(dynamic,1)
+    #pragma omp parallel for num_threads(getDTthreads(nBatch, false))   // schedule(dynamic,1)
     for (int b=0; b<nBatch; b++) {
       const int howMany = b==nBatch-1 ? lastBatchSize : batchSize;
       const int *my_o = op + b*batchSize;
@@ -138,7 +138,7 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) {
       }
     }
     //Rprintf(_("gforce assign TMP (o,g) pairs took %.3f\n"), wallclock()-started); started=wallclock();
-    #pragma omp parallel for num_threads(getDTthreads())
+    #pragma omp parallel for num_threads(getDTthreads(highSize, false))
     for (int h=0; h<highSize; h++) {  // very important that high is first loop here
       for (int b=0; b<nBatch; b++) {
         const int start = h==0 ? 0 : counts[ b*highSize + h - 1 ];
@@ -162,10 +162,10 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) {
   // TODO: reduce to the largest type present; won't be faster (untouched RAM won't be fetched) but it will increase the largest size that works.
 
   counts = (int *)S_alloc(nBatch*highSize, sizeof(int));  // (S_ zeros) TODO: cache-line align and make highSize a multiple of 64
-  tmpcounts = (int *)R_alloc(getDTthreads()*highSize, sizeof(int));
+  tmpcounts = (int *)R_alloc(getDTthreads(nBatch, false)*highSize, sizeof(int));
 
   const int *restrict gp = grp;
-  #pragma omp parallel for num_threads(getDTthreads())   // schedule(dynamic,1)
+  #pragma omp parallel for num_threads(getDTthreads(nBatch, false))   // schedule(dynamic,1)
   for (int b=0; b<nBatch; b++) {
     int *restrict my_counts = counts + b*highSize;
     uint16_t *restrict my_high = high + b*batchSize;
@@ -217,7 +217,7 @@ void *gather(SEXP x, bool *anyNA)
   switch (TYPEOF(x)) {
   case LGLSXP: case INTSXP: {
     const int *restrict thisx = INTEGER(x);
-    #pragma omp parallel for num_threads(getDTthreads())
+    #pragma omp parallel for num_threads(getDTthreads(nBatch, false))
     for (int b=0; b<nBatch; b++) {
       int *restrict my_tmpcounts = tmpcounts + omp_get_thread_num()*highSize;
       memcpy(my_tmpcounts, counts + b*highSize, highSize*sizeof(int));   // original cumulated   // already cumulated for this batch
@@ -246,7 +246,7 @@ void *gather(SEXP x, bool *anyNA)
   case REALSXP: {
     if (!INHERITS(x, char_integer64)) {
       const double *restrict thisx = REAL(x);
-      #pragma omp parallel for num_threads(getDTthreads())
+      #pragma omp parallel for num_threads(getDTthreads(nBatch, false))
       for (int b=0; b<nBatch; b++) {
         int *restrict my_tmpcounts = tmpcounts + omp_get_thread_num()*highSize;
         memcpy(my_tmpcounts, counts + b*highSize, highSize*sizeof(int));
@@ -273,7 +273,7 @@ void *gather(SEXP x, bool *anyNA)
       }
     } else {
       const int64_t *restrict thisx = (int64_t *)REAL(x);
-      #pragma omp parallel for num_threads(getDTthreads())
+      #pragma omp parallel for num_threads(getDTthreads(nBatch, false))
       for (int b=0; b<nBatch; b++) {
         int *restrict my_tmpcounts = tmpcounts + omp_get_thread_num()*highSize;
         memcpy(my_tmpcounts, counts + b*highSize, highSize*sizeof(int));
@@ -302,7 +302,7 @@ void *gather(SEXP x, bool *anyNA)
   } break;
   case CPLXSXP: {
     const Rcomplex *restrict thisx = COMPLEX(x);
-    #pragma omp parallel for num_threads(getDTthreads())
+    #pragma omp parallel for num_threads(getDTthreads(nBatch, false))
     for (int b=0; b<nBatch; b++) {
       int *restrict my_tmpcounts = tmpcounts + omp_get_thread_num()*highSize;
       memcpy(my_tmpcounts, counts + b*highSize, highSize*sizeof(int));
@@ -359,7 +359,7 @@ SEXP gsum(SEXP x, SEXP narmArg, SEXP warnOverflowArg)
     bool overflow=false;
     //double started = wallclock();
     if (!anyNA) {
-      #pragma omp parallel for num_threads(getDTthreads()) //schedule(dynamic,1)
+      #pragma omp parallel for num_threads(getDTthreads(highSize, false)) //schedule(dynamic,1)
       for (int h=0; h<highSize; h++) {   // very important that high is first loop here
         int *restrict _ans = ansp + (h<<shift);
         for (int b=0; b<nBatch; b++) {
@@ -376,7 +376,7 @@ SEXP gsum(SEXP x, SEXP narmArg, SEXP warnOverflowArg)
         }
       }
     } else {
-      #pragma omp parallel for num_threads(getDTthreads())
+      #pragma omp parallel for num_threads(getDTthreads(highSize, false))
       for (int h=0; h<highSize; h++) {
         int *restrict _ans = ansp + (h<<shift);
         for (int b=0; b<nBatch; b++) {
@@ -405,7 +405,7 @@ SEXP gsum(SEXP x, SEXP narmArg, SEXP warnOverflowArg)
       ans = PROTECT(allocVector(REALSXP, ngrp));
       double *restrict ansp = REAL(ans);
       memset(ansp, 0, ngrp*sizeof(double));
-      #pragma omp parallel for num_threads(getDTthreads())
+      #pragma omp parallel for num_threads(getDTthreads(highSize, false))
       for (int h=0; h<highSize; h++) {
         double *restrict _ans = ansp + (h<<shift);
         for (int b=0; b<nBatch; b++) {
@@ -433,7 +433,7 @@ SEXP gsum(SEXP x, SEXP narmArg, SEXP warnOverflowArg)
       double *restrict ansp = REAL(ans);
       memset(ansp, 0, ngrp*sizeof(double));
       if (!narm || !anyNA) {
-        #pragma omp parallel for num_threads(getDTthreads())
+        #pragma omp parallel for num_threads(getDTthreads(highSize, false))
         for (int h=0; h<highSize; h++) {
           double *restrict _ans = ansp + (h<<shift);
           for (int b=0; b<nBatch; b++) {
@@ -448,7 +448,7 @@ SEXP gsum(SEXP x, SEXP narmArg, SEXP warnOverflowArg)
         }
       } else {
         // narm==true and anyNA==true
-        #pragma omp parallel for num_threads(getDTthreads())
+        #pragma omp parallel for num_threads(getDTthreads(highSize, false))
         for (int h=0; h<highSize; h++) {
           double *restrict _ans = ansp + (h<<shift);
           for (int b=0; b<nBatch; b++) {
@@ -469,7 +469,7 @@ SEXP gsum(SEXP x, SEXP narmArg, SEXP warnOverflowArg)
       int64_t *restrict ansp = (int64_t *)REAL(ans);
       memset(ansp, 0, ngrp*sizeof(int64_t));
       if (!anyNA) {
-        #pragma omp parallel for num_threads(getDTthreads())
+        #pragma omp parallel for num_threads(getDTthreads(highSize, false))
         for (int h=0; h<highSize; h++) {
           int64_t *restrict _ans = ansp + (h<<shift);
           for (int b=0; b<nBatch; b++) {
@@ -484,7 +484,7 @@ SEXP gsum(SEXP x, SEXP narmArg, SEXP warnOverflowArg)
         }
       } else { // narm==true/false and anyNA==true
         if (!narm) {
-          #pragma omp parallel for num_threads(getDTthreads())
+          #pragma omp parallel for num_threads(getDTthreads(highSize, false))
           for (int h=0; h<highSize; h++) {
             int64_t *restrict _ans = ansp + (h<<shift);
             for (int b=0; b<nBatch; b++) {
@@ -504,7 +504,7 @@ SEXP gsum(SEXP x, SEXP narmArg, SEXP warnOverflowArg)
             }
           }
         } else {
-          #pragma omp parallel for num_threads(getDTthreads())
+          #pragma omp parallel for num_threads(getDTthreads(highSize, false))
           for (int h=0; h<highSize; h++) {
             int64_t *restrict _ans = ansp + (h<<shift);
             for (int b=0; b<nBatch; b++) {
@@ -528,7 +528,7 @@ SEXP gsum(SEXP x, SEXP narmArg, SEXP warnOverflowArg)
     Rcomplex *restrict ansp = COMPLEX(ans);
     memset(ansp, 0, ngrp*sizeof(Rcomplex));
     if (!narm || !anyNA) {
-      #pragma omp parallel for num_threads(getDTthreads())
+      #pragma omp parallel for num_threads(getDTthreads(highSize, false))
       for (int h=0; h<highSize; h++) {
         Rcomplex *restrict _ans = ansp + (h<<shift);
         for (int b=0; b<nBatch; b++) {
@@ -544,7 +544,7 @@ SEXP gsum(SEXP x, SEXP narmArg, SEXP warnOverflowArg)
       }
     } else {
       // narm==true and anyNA==true
-      #pragma omp parallel for num_threads(getDTthreads())
+      #pragma omp parallel for num_threads(getDTthreads(highSize, false))
       for (int h=0; h<highSize; h++) {
         Rcomplex *restrict _ans = ansp + (h<<shift);
         for (int b=0; b<nBatch; b++) {
diff --git a/src/nafill.c b/src/nafill.c
index eb4e5c0e2..56928a9e5 100644
--- a/src/nafill.c
+++ b/src/nafill.c
@@ -170,7 +170,7 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S
   double tic=0.0, toc=0.0;
   if (verbose)
     tic = omp_get_wtime();
-  #pragma omp parallel for if (nx>1) num_threads(getDTthreads())
+  #pragma omp parallel for schedule(dynamic) num_threads(getDTthreads(nx, false))
   for (R_len_t i=0; i<nx; i++) {
     SEXP this_x = VECTOR_ELT(x, i);
     switch (TYPEOF(this_x)) {
diff --git a/src/openmp-utils.c b/src/openmp-utils.c
index b90160184..cfd0e3806 100644
--- a/src/openmp-utils.c
+++ b/src/openmp-utils.c
@@ -5,7 +5,8 @@
 #include <errno.h>     // errno
 #include <ctype.h>     // isspace
 
-static int  DTthreads = -1;           // Never read directly hence static; use getDTthreads(). -1 so we know for sure initDTthreads() ran and set it >= 1.
+static int  DTthreads = -1;   // Never read directly hence static; use getDTthreads(n, /*throttle=*/0|1). -1 so we know for sure initDTthreads() ran and set it >= 1.
+static int  DTthrottle = -1;  // Thread 1 is assigned DTthrottle iterations before a 2nd thread is utilized; #4484.
 static bool RestoreAfterFork = true;  // see #2885 in v1.12.0
 
 static int getIntEnv(const char *name, int def)
@@ -50,12 +51,19 @@ void initDTthreads() {
   ans = imin(ans, getIntEnv("OMP_THREAD_LIMIT", INT_MAX));  // user might expect `Sys.setenv(OMP_THREAD_LIMIT=2);setDTthreads()` to work. Satisfy this
   ans = imin(ans, getIntEnv("OMP_NUM_THREADS", INT_MAX));   //   expectation by reading them again now. OpenMP just reads them on startup (quite reasonably)
   DTthreads = ans;
+  DTthrottle = imax(1, getIntEnv("R_DATATABLE_THROTTLE", 1024)); // 2nd thread is used only when n>1024, 3rd thread when n>2048, etc
 }
 
-int getDTthreads() {
-  // this is the main getter used by all parallel regions; they specify num_threads(getDTthreads())
-  // Therefore keep it light, simple and robust. Local static variable. initDTthreads() ensures 1 <= DTthreads <= omp_get_num_proc()
-  return DTthreads;
+int getDTthreads(const int64_t n, const bool throttle) {
+  // Main getter used by all parallel regions, which specify num_threads(getDTthreads(n, true|false)); never returns more than DTthreads.
+  // Keep this light, simple and robust. initDTthreads() ensures 1 <= DTthreads <= omp_get_num_procs()
+  // n is the number of iterations (throttle==true) or of pre-chunked batches (throttle==false). Throttle introduced in 1.12.10 (see NEWS item); #4484
+  // throttle==true  : a number of iterations per thread (DTthrottle) is applied before a second thread is utilized
+  // throttle==false : parallel region is already pre-chunked such as in fread; e.g. two batches intended for two threads
+  if (n<1) return 1; // 0 or negative could be deliberate in calling code for edge cases where loop is not intended to run at all
+  int64_t ans = throttle ? 1+(n-1)/DTthrottle :  // with the default DTthrottle of 1024: 1 thread for n<=1024, 2 threads for n<=2048, etc
+                           n;                    // don't use 20 threads for just one or two batches
+  return ans>=DTthreads ? DTthreads : (int)ans;  // cap at the file-scope static DTthreads saved there by initDTthreads() and setDTthreads()
+}
 
 static const char *mygetenv(const char *name, const char *unset) {
@@ -75,40 +83,42 @@ SEXP getDTthreads_R(SEXP verbose) {
     Rprintf(_("  omp_get_num_procs()            %d\n"), omp_get_num_procs());
     Rprintf(_("  R_DATATABLE_NUM_PROCS_PERCENT  %s\n"), mygetenv("R_DATATABLE_NUM_PROCS_PERCENT", "unset (default 50)"));
     Rprintf(_("  R_DATATABLE_NUM_THREADS        %s\n"), mygetenv("R_DATATABLE_NUM_THREADS", "unset"));
+    Rprintf(_("  R_DATATABLE_THROTTLE           %s\n"), mygetenv("R_DATATABLE_THROTTLE", "unset (default 1024)"));
     Rprintf(_("  omp_get_thread_limit()         %d\n"), omp_get_thread_limit());
     Rprintf(_("  omp_get_max_threads()          %d\n"), omp_get_max_threads());
     Rprintf(_("  OMP_THREAD_LIMIT               %s\n"), mygetenv("OMP_THREAD_LIMIT", "unset"));  // CRAN sets to 2
     Rprintf(_("  OMP_NUM_THREADS                %s\n"), mygetenv("OMP_NUM_THREADS", "unset"));
     Rprintf(_("  RestoreAfterFork               %s\n"), RestoreAfterFork ? "true" : "false");
-    Rprintf(_("  data.table is using %d threads. See ?setDTthreads.\n"), getDTthreads());
+    Rprintf(_("  data.table is using %d threads with throttle==%d. See ?setDTthreads.\n"), getDTthreads(INT_MAX, false), DTthrottle);
   }
-  return ScalarInteger(getDTthreads());
+  return ScalarInteger(getDTthreads(INT_MAX, false));
 }
 
-SEXP setDTthreads(SEXP threads, SEXP restore_after_fork, SEXP percent) {
+SEXP setDTthreads(SEXP threads, SEXP restore_after_fork, SEXP percent, SEXP throttle) {
   if (!isNull(restore_after_fork)) {
     if (!isLogical(restore_after_fork) || LOGICAL(restore_after_fork)[0]==NA_LOGICAL) {
       error(_("restore_after_fork= must be TRUE, FALSE, or NULL (default). getDTthreads(verbose=TRUE) reports the current setting.\n"));
     }
     RestoreAfterFork = LOGICAL(restore_after_fork)[0];  // # nocov
   }
+  if (length(throttle)) {
+    if (!isInteger(throttle) || LENGTH(throttle)!=1 || INTEGER(throttle)[0]<1)
+      error(_("'throttle' must be a single number, non-NA, and >=1"));
+    DTthrottle = INTEGER(throttle)[0];
+  }
   int old = DTthreads;
-  if (isNull(threads)) {
+  if (!length(threads) && !length(throttle)) {
     initDTthreads();
     // Rerun exactly the same function used on startup (re-reads env variables); this is now default setDTthreads() behavior from 1.12.2
     // Allows robust testing of environment variables using Sys.setenv() to experiment.
     // Default  is now (as from 1.12.2) threads=NULL which re-reads environment variables.
     // If a CPU has been unplugged (high end servers allow live hardware replacement) then omp_get_num_procs() will
     // reflect that and a call to setDTthreads(threads=NULL) will update DTthreads.
-  } else {
-    int n=0, protecti=0;
-    if (length(threads)!=1) error(_("threads= must be either NULL (default) or a single number. It has length %d"), length(threads));
-    if (isReal(threads)) { threads = PROTECT(coerceVector(threads, INTSXP)); protecti++; }
-    if (!isInteger(threads)) error(_("threads= must be either NULL (default) or type integer/numeric"));
-    if ((n=INTEGER(threads)[0]) < 0) {  // <0 catches NA too since NA is negative (INT_MIN)
-      error(_("threads= must be either NULL or a single integer >= 0. See ?setDTthreads."));
+  } else if (length(threads)) {
+    int n=0;
+    if (length(threads)!=1 || !isInteger(threads) || (n=INTEGER(threads)[0]) < 0) {  // <0 catches NA too since NA is negative (INT_MIN)
+      error(_("threads= must be either NULL or a single number >= 0. See ?setDTthreads."));
     }
-    UNPROTECT(protecti);
     int num_procs = imax(omp_get_num_procs(), 1); // max just in case omp_get_num_procs() returns <= 0 (perhaps error, or unsupported)
     if (!isLogical(percent) || length(percent)!=1 || LOGICAL(percent)[0]==NA_LOGICAL) {
       error(_("Internal error: percent= must be TRUE or FALSE at C level"));  // # nocov
@@ -124,8 +134,8 @@ SEXP setDTthreads(SEXP threads, SEXP restore_after_fork, SEXP percent) {
     DTthreads = imax(n, 1);  // imax just in case
     // Do not call omp_set_num_threads() here. Any calls to omp_set_num_threads() affect other
     // packages and R itself too which has some OpenMP usage. Instead we set our own DTthreads
-    // static variable and read that from getDTthreads().
-    // All parallel regions should include num_threads(getDTthreads()) and this is ensured via
+    // static variable and read that from getDTthreads(n, throttle).
+    // All parallel regions should include num_threads(getDTthreads(n, true|false)) and this is ensured via
     // a grep in CRAN_Release.cmd.
   }
   return ScalarInteger(old);
diff --git a/src/reorder.c b/src/reorder.c
index da3784e94..c2deea8ae 100644
--- a/src/reorder.c
+++ b/src/reorder.c
@@ -64,7 +64,7 @@ SEXP reorder(SEXP x, SEXP order)
     if (size==4) {
       const int *restrict vd = DATAPTR_RO(v);
       int *restrict tmp = (int *)TMP;
-      #pragma omp parallel for num_threads(getDTthreads())
+      #pragma omp parallel for num_threads(getDTthreads(end, true))
       for (int i=start; i<=end; ++i) {
         tmp[i-start] = vd[idx[i]-1];  // copies 4 bytes; e.g. INTSXP and also SEXP pointers on 32bit (STRSXP and VECSXP)
       }
@@ -75,14 +75,14 @@ SEXP reorder(SEXP x, SEXP order)
     } else if (size==8) {
       const double *restrict vd = DATAPTR_RO(v);
       double *restrict tmp = (double *)TMP;
-      #pragma omp parallel for num_threads(getDTthreads())
+      #pragma omp parallel for num_threads(getDTthreads(end, true))
       for (int i=start; i<=end; ++i) {
         tmp[i-start] = vd[idx[i]-1];  // copies 8 bytes; e.g. REALSXP and also SEXP pointers on 64bit (STRSXP and VECSXP)
       }
     } else { // size 16; checked up front
       const Rcomplex *restrict vd = DATAPTR_RO(v);
       Rcomplex *restrict tmp = (Rcomplex *)TMP;
-      #pragma omp parallel for num_threads(getDTthreads())
+      #pragma omp parallel for num_threads(getDTthreads(end, true))
       for (int i=start; i<=end; ++i) {
         tmp[i-start] = vd[idx[i]-1];
       }
diff --git a/src/subset.c b/src/subset.c
index d9fea2800..91a4018e2 100644
--- a/src/subset.c
+++ b/src/subset.c
@@ -13,13 +13,13 @@ void subsetVectorRaw(SEXP ans, SEXP source, SEXP idx, const bool anyNA)
 
   #define PARLOOP(_NAVAL_)                                        \
   if (anyNA) {                                                    \
-    _Pragma("omp parallel for num_threads(getDTthreads())")       \
+    _Pragma("omp parallel for num_threads(getDTthreads(n, true))") \
     for (int i=0; i<n; i++) {                                     \
       int elem = idxp[i];                                         \
       ap[i] = elem==NA_INTEGER ? _NAVAL_ : sp[elem-1];            \
     }                                                             \
   } else {                                                        \
-    _Pragma("omp parallel for num_threads(getDTthreads())")       \
+    _Pragma("omp parallel for num_threads(getDTthreads(n, true))") \
     for (int i=0; i<n; i++) {                                     \
       ap[i] = sp[idxp[i]-1];                                      \
     }                                                             \
@@ -121,7 +121,7 @@ SEXP convertNegAndZeroIdx(SEXP idx, SEXP maxArg, SEXP allowOverMax)
   int *idxp = INTEGER(idx);
 
   bool stop = false;
-  #pragma omp parallel for num_threads(getDTthreads())
+  #pragma omp parallel for num_threads(getDTthreads(n, true))
   for (int i=0; i<n; i++) {
     if (stop) continue;
     int elem = idxp[i];
diff --git a/src/types.c b/src/types.c
index 3418fc740..18f1993dc 100644
--- a/src/types.c
+++ b/src/types.c
@@ -69,7 +69,7 @@ SEXP testMsgR(SEXP status, SEXP x, SEXP k) {
     }
   }
 
-  #pragma omp parallel for if (nx*nk>1) schedule(auto) collapse(2) num_threads(getDTthreads())
+  #pragma omp parallel for schedule(dynamic) collapse(2) num_threads(getDTthreads(nx*nk, false))
   for (R_len_t i=0; i<nx; i++) {
     for (R_len_t j=0; j<nk; j++) {
       testRaiseMsg(&vans[i*nk+j], istatus, verbose);