diff --git a/.lintrunner.toml b/.lintrunner.toml
index 7667ac430d1..1a27228d266 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -218,6 +218,8 @@ exclude_patterns = [
     'examples/**',
     'extension/**',
     'kernels/optimized/**',
+    # Justified <functional> include.
+    'runtime/kernel/thread_parallel_interface.h',
     'scripts/**',
     'third-party/**',
     'util/**',
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 73b89b6171e..fabf667cbe1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -751,7 +751,6 @@ if(EXECUTORCH_BUILD_PTHREADPOOL
    AND EXECUTORCH_BUILD_CPUINFO
 )
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/threadpool)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/parallel)
 endif()
 
 if(EXECUTORCH_BUILD_PYBIND)
diff --git a/Test.cmake b/Test.cmake
index d4b5f6aa1db..6bd7a86e70b 100644
--- a/Test.cmake
+++ b/Test.cmake
@@ -13,7 +13,6 @@ if(BUILD_TESTING)
   add_subdirectory(extension/evalue_util/test)
   add_subdirectory(extension/kernel_util/test)
   add_subdirectory(extension/memory_allocator/test)
-  add_subdirectory(extension/parallel/test)
   add_subdirectory(extension/pytree/test)
   add_subdirectory(kernels/portable/cpu/util/test)
   add_subdirectory(kernels/prim_ops/test)
diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml
index 4811563269c..bbc9eec3a0e 100644
--- a/build/cmake_deps.toml
+++ b/build/cmake_deps.toml
@@ -58,6 +58,21 @@ deps = [
   "executorch_core",
 ]
 
+# HACK: prevent reduce_util from also showing up in custom_ops. The
+# actual medium-term fix is to stop using Buck to drive our CMake
+# builds.
+[targets.reduce_util]
+buck_targets = [
+  "//kernels/portable/cpu/util:reduce_util",
+]
+filters = [
+  ".cpp$",
+]
+deps = [
+  "executorch",
+  "executorch_core",
+]
+
 [targets.optimized_kernels]
 buck_targets = [
   "//kernels/optimized:generated_lib",
@@ -73,7 +88,6 @@ excludes = [
 deps = [
   "executorch",
   "executorch_core",
-  "extension_parallel",
   "extension_threadpool",
   "portable_kernels",
 ]
@@ -116,7 +130,7 @@ excludes = [
 deps = [
   "executorch_core",
   "executorch",
-  "extension_parallel",
+  "extension_threadpool",
 ]
 
 [targets.optimized_native_cpu_ops]
@@ -131,7 +145,6 @@ excludes = [
 deps = [
   "executorch_core",
   "executorch",
-  "extension_parallel",
   "extension_threadpool",
   "portable_kernels",
 ]
@@ -212,19 +225,6 @@ deps = [
   "extension_runner_util",
 ]
 
-[targets.extension_parallel]
-buck_targets = [
-  "//extension/parallel:thread_parallel",
-]
-filters = [
-  ".cpp$",
-]
-deps = [
-  "executorch",
-  "executorch_core",
-  "extension_threadpool",
-]
-
 [targets.extension_tensor]
 buck_targets = [
   "//extension/tensor:tensor",
@@ -364,6 +364,7 @@ excludes = [
 deps = [
   "executorch",
   "executorch_core",
+  "extension_threadpool",
   "xnnpack_backend",
   "portable_kernels",
 ]
@@ -378,6 +379,7 @@ filters = [
 deps = [
   "executorch",
   "executorch_core",
+  "extension_threadpool",
 ]
 
 [targets.xnnpack_schema]
@@ -412,8 +414,8 @@ deps = [
   "executorch",
   "executorch_core",
   "optimized_kernels",
-  "extension_parallel",
   "extension_threadpool",
+  "reduce_util",
   "xnnpack_backend",
 ]
 
@@ -449,7 +451,7 @@ deps = [
   "executorch_core",
   "extension_data_loader",
   "extension_module",
-  "extension_parallel",
+  "extension_threadpool",
   "portable_kernels",
   "quantized_kernels",
   "xnnpack_backend",
diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake
index 2c459b66ac8..931d31de8ef 100644
--- a/build/executorch-config.cmake
+++ b/build/executorch-config.cmake
@@ -1,4 +1,3 @@
-
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
@@ -16,20 +15,23 @@
 #
 # This will define the following variables:
 #
-#   EXECUTORCH_FOUND        -- True if the system has the ExecuTorch library
-#   EXECUTORCH_INCLUDE_DIRS -- The include directories for ExecuTorch
-#   EXECUTORCH_LIBRARIES    -- Libraries to link against
+# EXECUTORCH_FOUND        -- True if the system has the ExecuTorch library
+# EXECUTORCH_INCLUDE_DIRS -- The include directories for ExecuTorch
+# EXECUTORCH_LIBRARIES    -- Libraries to link against
 #
-# The actual values for these variables will be different from what executorch-config.cmake
-# in executorch pip package gives, but we wanted to keep the contract of exposing these
-# CMake variables.
+# The actual values for these variables will be different from what
+# executorch-config.cmake in executorch pip package gives, but we wanted to keep
+# the contract of exposing these CMake variables.
 
 cmake_minimum_required(VERSION 3.19)
 
 set(_root "${CMAKE_CURRENT_LIST_DIR}/../../..")
 set(required_lib_list executorch executorch_core portable_kernels)
 set(EXECUTORCH_LIBRARIES)
-set(EXECUTORCH_INCLUDE_DIRS ${_root}/include ${_root}/include/executorch/runtime/core/portable_type/c10 ${_root}/lib)
+set(EXECUTORCH_INCLUDE_DIRS
+    ${_root}/include ${_root}/include/executorch/runtime/core/portable_type/c10
+    ${_root}/lib
+)
 foreach(lib ${required_lib_list})
   set(lib_var "LIB_${lib}")
   add_library(${lib} STATIC IMPORTED)
@@ -40,7 +42,12 @@ foreach(lib ${required_lib_list})
   )
   set_target_properties(${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}")
   target_compile_definitions(${lib} INTERFACE C10_USING_CUSTOM_GENERATED_MACROS)
-  target_include_directories(${lib} INTERFACE ${_root}/include ${_root}/include/executorch/runtime/core/portable_type/c10 ${_root}/lib)
+  target_include_directories(
+    ${lib}
+    INTERFACE ${_root}/include
+              ${_root}/include/executorch/runtime/core/portable_type/c10
+              ${_root}/lib
+  )
   list(APPEND EXECUTORCH_LIBRARIES ${lib})
 endforeach()
 
@@ -65,9 +72,9 @@ set(lib_list
     neuron_backend
     qnn_executorch_backend
     portable_ops_lib
+    custom_ops
     extension_module
     extension_module_static
-    extension_parallel
     extension_runner_util
     extension_tensor
     extension_threadpool
@@ -111,20 +118,23 @@ foreach(lib ${lib_list})
       add_library(${lib} STATIC IMPORTED)
     endif()
     set_target_properties(${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}")
-    target_include_directories(${lib} INTERFACE ${_root}/include ${_root}/include/executorch/runtime/core/portable_type/c10 ${_root}/lib)
+    target_include_directories(
+      ${lib}
+      INTERFACE ${_root}/include
+                ${_root}/include/executorch/runtime/core/portable_type/c10
+                ${_root}/lib
+    )
     list(APPEND EXECUTORCH_LIBRARIES ${lib})
   endif()
 endforeach()
 
 # TODO: investigate use of install(EXPORT) to cleanly handle
 # target_compile_options/target_compile_definitions for everything.
-if(TARGET extension_parallel)
-  set_target_properties(
-    extension_parallel PROPERTIES INTERFACE_LINK_LIBRARIES extension_threadpool
-  )
-endif()
 if(TARGET cpublas)
   set_target_properties(
-    cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_parallel
+    cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_threadpool
   )
 endif()
+if(TARGET extension_threadpool)
+  target_compile_definitions(extension_threadpool INTERFACE ET_USE_THREADPOOL)
+endif()
diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt
index 5f49581ea25..96ff28d8f49 100644
--- a/examples/models/llama/CMakeLists.txt
+++ b/examples/models/llama/CMakeLists.txt
@@ -15,7 +15,7 @@
 # ~~~
 # It should also be cmake-lint clean.
 #
-cmake_minimum_required(VERSION 3.19)
+cmake_minimum_required(VERSION 3.24)  # 3.24 is required for WHOLE_ARCHIVE
 project(llama_runner)
 
 # Duplicating options as root CMakeLists.txt
@@ -84,14 +84,6 @@ if(CMAKE_TOOLCHAIN_IOS OR ANDROID)
   target_link_options_shared_lib(executorch)
 endif()
 
-# custom ops library
-if(EXECUTORCH_BUILD_KERNELS_CUSTOM)
-  add_subdirectory(
-    ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/custom_ops
-    ${CMAKE_CURRENT_BINARY_DIR}/../../../extension/llm/custom_ops
-  )
-endif()
-
 # llama_runner library
 add_subdirectory(runner)
 
@@ -119,8 +111,7 @@ target_link_options_shared_lib(quantized_ops_lib)
 list(APPEND link_libraries quantized_kernels quantized_ops_lib)
 
 if(EXECUTORCH_BUILD_KERNELS_CUSTOM)
-  target_link_options_shared_lib(custom_ops)
-  list(APPEND link_libraries custom_ops)
+  list(APPEND link_libraries $<LINK_LIBRARY:WHOLE_ARCHIVE,custom_ops>)
 endif()
 
 if(EXECUTORCH_BUILD_TORCHAO)
@@ -140,7 +131,6 @@ endif()
 set(XNNPACK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack)
 # Extra compile option and include dir for pthreadpool
 if(EXECUTORCH_BUILD_PTHREADPOOL)
-  list(APPEND _common_compile_options -DET_USE_THREADPOOL)
   list(APPEND link_libraries extension_threadpool pthreadpool)
   list(APPEND _common_include_directories
        ${XNNPACK_ROOT}/third-party/pthreadpool/include
diff --git a/examples/models/llava/CMakeLists.txt b/examples/models/llava/CMakeLists.txt
index ecd00809fdb..5d5857dd5af 100644
--- a/examples/models/llava/CMakeLists.txt
+++ b/examples/models/llava/CMakeLists.txt
@@ -93,14 +93,6 @@ if(CMAKE_TOOLCHAIN_IOS OR ANDROID)
   target_link_options_shared_lib(executorch)
 endif()
 
-# custom ops library
-if(EXECUTORCH_BUILD_KERNELS_CUSTOM)
-  add_subdirectory(
-    ${EXECUTORCH_ROOT}/extension/llm/custom_ops
-    ${CMAKE_CURRENT_BINARY_DIR}/../../../extension/llm/custom_ops
-  )
-endif()
-
 # llava_runner library
 add_subdirectory(runner)
 
@@ -132,14 +124,12 @@ target_link_options_shared_lib(quantized_ops_lib)
 list(APPEND link_libraries quantized_kernels quantized_ops_lib)
 
 if(EXECUTORCH_BUILD_KERNELS_CUSTOM)
-  target_link_options_shared_lib(custom_ops)
   list(APPEND link_libraries custom_ops)
 endif()
 
 set(XNNPACK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack)
 # Extra compile option and include dir for pthreadpool
 if(EXECUTORCH_BUILD_PTHREADPOOL)
-  list(APPEND _common_compile_options -DET_USE_THREADPOOL)
   list(APPEND link_libraries extension_threadpool pthreadpool)
   list(APPEND _common_include_directories
        ${XNNPACK_ROOT}/third-party/pthreadpool/include
diff --git a/examples/models/llava/targets.bzl b/examples/models/llava/targets.bzl
index 5efb099f06f..6f3a370acf4 100644
--- a/examples/models/llava/targets.bzl
+++ b/examples/models/llava/targets.bzl
@@ -7,9 +7,6 @@ def define_common_targets():
             "main.cpp",
         ],
         compiler_flags = ["-Wno-global-constructors"],
-        preprocessor_flags = [
-            "-DET_USE_THREADPOOL",
-        ],
         deps = [
             "//executorch/examples/models/llava/runner:runner",
             "//executorch/extension/evalue_util:print_evalue",
diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt
index 70f21f2751c..ba722d9c791 100644
--- a/extension/android/CMakeLists.txt
+++ b/extension/android/CMakeLists.txt
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-cmake_minimum_required(VERSION 3.19)
+cmake_minimum_required(VERSION 3.24) # 3.24 is required for WHOLE_ARCHIVE
 
 project(executorch_jni)
 
@@ -115,16 +115,10 @@ if(TARGET vulkan_backend)
 endif()
 
 if(EXECUTORCH_BUILD_KERNELS_CUSTOM)
-  add_subdirectory(
-    ${EXECUTORCH_ROOT}/extension/llm/custom_ops
-    ${CMAKE_CURRENT_BINARY_DIR}/../../extension/llm/custom_ops
-  )
-  list(APPEND link_libraries custom_ops)
-  target_link_options_shared_lib(custom_ops)
+  list(APPEND link_libraries $<LINK_LIBRARY:WHOLE_ARCHIVE,custom_ops>)
 endif()
 
 if(TARGET pthreadpool)
-  target_compile_definitions(executorch_jni PRIVATE ET_USE_THREADPOOL=1)
   target_include_directories(
     executorch_jni
     PUBLIC
diff --git a/extension/llm/custom_ops/CMakeLists.txt b/extension/llm/custom_ops/CMakeLists.txt
index c3969e6f9bf..eeb118d4344 100644
--- a/extension/llm/custom_ops/CMakeLists.txt
+++ b/extension/llm/custom_ops/CMakeLists.txt
@@ -78,7 +78,7 @@ target_include_directories(
 target_link_libraries(custom_ops PUBLIC ${custom_ops_libs} executorch_core)
 
 target_compile_options(
-  custom_ops PUBLIC ${_common_compile_options} -DET_USE_THREADPOOL
+  custom_ops PUBLIC ${_common_compile_options}
 )
 
 install(TARGETS custom_ops DESTINATION lib)
@@ -130,7 +130,7 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT)
   target_compile_options(
     custom_ops_aot_lib
     PUBLIC -Wno-deprecated-declarations -fPIC -frtti -fexceptions
-           ${_common_compile_options} -DET_USE_THREADPOOL
+           ${_common_compile_options}
   )
 
   install(TARGETS custom_ops_aot_lib
diff --git a/extension/llm/custom_ops/op_sdpa.cpp b/extension/llm/custom_ops/op_sdpa.cpp
index f0a7775e803..371fcf38a24 100644
--- a/extension/llm/custom_ops/op_sdpa.cpp
+++ b/extension/llm/custom_ops/op_sdpa.cpp
@@ -19,8 +19,8 @@
 #include <vector>
 
 #ifdef ET_USE_THREADPOOL
-#include <executorch/extension/parallel/thread_parallel.h>
 #include <executorch/extension/threadpool/threadpool.h>
+#include <executorch/runtime/kernel/thread_parallel_interface.h>
 #endif
 #include <executorch/extension/kernel_util/make_boxed_from_unboxed_functor.h>
 
diff --git a/extension/llm/custom_ops/targets.bzl b/extension/llm/custom_ops/targets.bzl
index e3e8b30520f..1c4686fe3d0 100644
--- a/extension/llm/custom_ops/targets.bzl
+++ b/extension/llm/custom_ops/targets.bzl
@@ -37,7 +37,6 @@ def define_common_targets():
                 "//executorch/kernels/optimized:libblas{}".format(mkl_dep),
                 "//executorch/kernels/optimized:libvec",
                 "//executorch/extension/kernel_util:kernel_util",
-                "//executorch/extension/parallel:thread_parallel",
                 "//executorch/extension/threadpool:threadpool",
             ],
             deps = [
diff --git a/extension/parallel/CMakeLists.txt b/extension/parallel/CMakeLists.txt
deleted file mode 100644
index 7f727aafe46..00000000000
--- a/extension/parallel/CMakeLists.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-# Please keep this file formatted by running:
-# ~~~
-# cmake-format -i CMakeLists.txt
-# ~~~
-
-if(NOT (EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO))
-  message(FATAL_ERROR "extension/parallel requires extension/threadpool")
-endif()
-
-add_library(extension_parallel thread_parallel.cpp)
-
-target_link_libraries(extension_parallel PUBLIC executorch_core extension_threadpool)
-target_compile_options(extension_parallel PUBLIC ${_common_compile_options})
-
-install(
-  TARGETS extension_parallel
-  DESTINATION lib
-  INCLUDES
-  DESTINATION ${_common_include_directories})
diff --git a/extension/parallel/TARGETS b/extension/parallel/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/extension/parallel/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/extension/parallel/targets.bzl b/extension/parallel/targets.bzl
deleted file mode 100644
index 82b3deab129..00000000000
--- a/extension/parallel/targets.bzl
+++ /dev/null
@@ -1,30 +0,0 @@
-load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime")
-
-def define_common_targets():
-    """Defines targets that should be shared between fbcode and xplat.
-
-    The directory containing this targets.bzl file should also contain both
-    TARGETS and BUCK files that call this function.
-    """
-
-    for aten_mode in get_aten_mode_options():
-        aten_suffix = ("_aten" if aten_mode else "")
-
-        runtime.cxx_library(
-            name = "thread_parallel" + aten_suffix,
-            srcs = [
-                "thread_parallel.cpp",
-            ],
-            exported_headers = [
-                "thread_parallel.h",
-            ],
-            visibility = [
-                "//executorch/...",
-                "@EXECUTORCH_CLIENTS",
-            ],
-            deps = [
-                "//executorch/extension/threadpool:threadpool",
-                "//executorch/runtime/core:core",
-                "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix,
-            ],
-        )
diff --git a/extension/parallel/test/TARGETS b/extension/parallel/test/TARGETS
deleted file mode 100644
index 2341af9282f..00000000000
--- a/extension/parallel/test/TARGETS
+++ /dev/null
@@ -1,8 +0,0 @@
-# Any targets that should be shared between fbcode and xplat must be defined in
-# targets.bzl. This file can contain fbcode-only targets.
-
-load(":targets.bzl", "define_common_targets")
-
-oncall("executorch")
-
-define_common_targets()
diff --git a/extension/parallel/test/targets.bzl b/extension/parallel/test/targets.bzl
deleted file mode 100644
index 791c0727471..00000000000
--- a/extension/parallel/test/targets.bzl
+++ /dev/null
@@ -1,19 +0,0 @@
-load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
-
-def define_common_targets():
-    """Defines targets that should be shared between fbcode and xplat.
-
-    The directory containing this targets.bzl file should also contain both
-    TARGETS and BUCK files that call this function.
-    """
-
-    runtime.cxx_test(
-        name = "thread_parallel_test",
-        srcs = [
-            "thread_parallel_test.cpp",
-        ],
-        deps = [
-            "//executorch/extension/parallel:thread_parallel",
-            "//executorch/runtime/platform:platform",
-        ],
-    )
diff --git a/extension/parallel/thread_parallel.h b/extension/parallel/thread_parallel.h
index 8b174075ae9..8bd1a572cd7 100644
--- a/extension/parallel/thread_parallel.h
+++ b/extension/parallel/thread_parallel.h
@@ -8,46 +8,7 @@
 
 #pragma once
 
-#include <cstdint>
-#include <functional>
-
-namespace executorch {
-namespace extension {
-
-/**
- * A helper to run function in parallel.
- *
- * begin, end: describe the extent of the workitems via first and last workitem
- * to be processed
- * grain_size: number of workitems processed by user callback which is
- * described below
- * f: user function applied in parallel to the chunks, signature:
- *   void f(int64_t begin, int64_t end)
- * Returns true if all work items are processed successfully, false otherwise
- *
- * Warning: parallel_for does NOT copy thread local states from the current
- * thread to the worker threads. Users need to protect the access to captured
- * data if they mutate them in f.
- */
-bool parallel_for(
-    const int64_t begin,
-    const int64_t end,
-    const int64_t grain_size,
-    const std::function<void(int64_t, int64_t)>& f);
-
-int64_t get_thread_num();
-
-void set_thread_num(int64_t thread_num);
-
-} // namespace extension
-} // namespace executorch
-
-namespace torch {
-namespace executor {
-// TODO(T197294990): Remove these deprecated aliases once all users have moved
-// to the new `::executorch` namespaces.
-using ::executorch::extension::get_thread_num;
-using ::executorch::extension::parallel_for;
-using ::executorch::extension::set_thread_num;
-} // namespace executor
-} // namespace torch
+// This header is a stub left behind after the move to
+// executorch/runtime/kernel. As such, it is deprecated; include and
+// use the below header directly instead.
+#include <executorch/runtime/kernel/thread_parallel_interface.h>
diff --git a/extension/threadpool/CMakeLists.txt b/extension/threadpool/CMakeLists.txt
index 90288656674..6e107cb6634 100644
--- a/extension/threadpool/CMakeLists.txt
+++ b/extension/threadpool/CMakeLists.txt
@@ -21,7 +21,8 @@ if(NOT CMAKE_CXX_STANDARD)
 endif()
 
 add_library(
-  extension_threadpool threadpool.cpp threadpool_guard.cpp cpuinfo_utils.cpp
+  extension_threadpool threadpool.cpp threadpool_guard.cpp thread_parallel.cpp
+                       cpuinfo_utils.cpp
 )
 target_link_libraries(
   extension_threadpool PUBLIC executorch_core cpuinfo pthreadpool
@@ -32,6 +33,7 @@ target_include_directories(
   PUBLIC ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include
          ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include
 )
+target_compile_definitions(extension_threadpool PUBLIC ET_USE_THREADPOOL)
 target_compile_options(extension_threadpool PUBLIC ${_common_compile_options})
 
 # Install libraries
@@ -41,3 +43,7 @@ install(
   INCLUDES
   DESTINATION ${_common_include_directories}
 )
+
+if(BUILD_TESTING)
+  add_subdirectory(test)
+endif()
diff --git a/extension/threadpool/targets.bzl b/extension/threadpool/targets.bzl
index 8bb0398b385..1c34dbbc7d4 100644
--- a/extension/threadpool/targets.bzl
+++ b/extension/threadpool/targets.bzl
@@ -9,6 +9,7 @@ def define_common_targets():
     """
 
     _THREADPOOL_SRCS = [
+        "thread_parallel.cpp",
         "threadpool.cpp",
         "threadpool_guard.cpp",
     ] + (["fb/threadpool_use_n_threads.cpp"] if not runtime.is_oss else [])
@@ -29,6 +30,8 @@ def define_common_targets():
         exported_deps = [
             third_party_dep("pthreadpool"),
             third_party_dep("cpuinfo"),
+            # Allow users to use the header without an extra deps entry.
+            "//executorch/runtime/kernel:thread_parallel_interface",
         ],
         exported_preprocessor_flags = [
             "-DET_USE_THREADPOOL",
diff --git a/extension/parallel/test/CMakeLists.txt b/extension/threadpool/test/CMakeLists.txt
similarity index 53%
rename from extension/parallel/test/CMakeLists.txt
rename to extension/threadpool/test/CMakeLists.txt
index ab37f66c17d..3f9b13f2ab4 100644
--- a/extension/parallel/test/CMakeLists.txt
+++ b/extension/threadpool/test/CMakeLists.txt
@@ -4,6 +4,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# @generated by test/utils/generate_gtest_cmakelists.py
+#
 # This file should be formatted with
 # ~~~
 # cmake-format -i CMakeLists.txt
@@ -12,28 +14,14 @@
 #
 
 cmake_minimum_required(VERSION 3.19)
-project(extension_parallel_test)
-
-# Use C++17 for test.
-set(CMAKE_CXX_STANDARD 17)
 
 set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
 
 include(${EXECUTORCH_ROOT}/build/Test.cmake)
 
-set(_test_srcs thread_parallel_test.cpp ../thread_parallel.cpp)
+set(_test_srcs thread_parallel_test.cpp threadpool_test.cpp)
 
 et_cxx_test(
-  extension_parallel_test
-  SOURCES
-  ${_test_srcs}
-  EXTRA_LIBS
-  pthreadpool
-  cpuinfo
+  extension_threadpool_test SOURCES ${_test_srcs} EXTRA_LIBS
   extension_threadpool
 )
-target_include_directories(
-  extension_parallel_test
-  PRIVATE ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include
-          ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include
-)
diff --git a/extension/threadpool/test/targets.bzl b/extension/threadpool/test/targets.bzl
index b8a39d8969a..8bdf776c825 100644
--- a/extension/threadpool/test/targets.bzl
+++ b/extension/threadpool/test/targets.bzl
@@ -18,3 +18,15 @@ def define_common_targets():
             "//executorch/extension/threadpool:threadpool",
         ],
     )
+
+    runtime.cxx_test(
+        name = "thread_parallel_test",
+        srcs = [
+            "thread_parallel_test.cpp",
+        ],
+        deps = [
+            "//executorch/extension/threadpool:threadpool",
+            "//executorch/runtime/kernel:thread_parallel_interface",
+            "//executorch/runtime/platform:platform",
+        ],
+    )
diff --git a/extension/parallel/test/thread_parallel_test.cpp b/extension/threadpool/test/thread_parallel_test.cpp
similarity index 76%
rename from extension/parallel/test/thread_parallel_test.cpp
rename to extension/threadpool/test/thread_parallel_test.cpp
index d386429100d..e31f16eee22 100644
--- a/extension/parallel/test/thread_parallel_test.cpp
+++ b/extension/threadpool/test/thread_parallel_test.cpp
@@ -11,13 +11,16 @@
 #include <array>
 #include <mutex>
 
-#include <executorch/extension/parallel/thread_parallel.h>
+#include <executorch/runtime/kernel/thread_parallel_interface.h>
 #include <executorch/runtime/platform/platform.h>
 
 using namespace ::testing;
 using ::executorch::extension::parallel_for;
 
-class ParallelTest : public ::testing::Test {
+#ifndef ET_USE_THREADPOOL
+#endif
+
+class ParallelTest : public ::testing::TestWithParam<bool> {
  protected:
   void SetUp() override {
     data_.fill(0);
@@ -42,12 +45,25 @@ class ParallelTest : public ::testing::Test {
     }
   }
 
+  template <typename Func>
+  bool parallel_for(
+      const int64_t begin,
+      const int64_t end,
+      const int64_t grain_size,
+      const Func& func) {
+    if (GetParam()) {
+      return executorch::extension::parallel_for(begin, end, grain_size, func);
+    }
+    return executorch::extension::internal::parallel_for_no_threadpool(
+        begin, end, grain_size, func);
+  }
+
   std::array<int, 10> data_;
   std::mutex mutex_;
   int sum_of_all_elements_;
 };
 
-TEST_F(ParallelTest, TestAllInvoked) {
+TEST_P(ParallelTest, TestAllInvoked) {
   EXPECT_TRUE(parallel_for(0, 10, 1, [this](int64_t begin, int64_t end) {
     this->RunTask(begin, end);
   }));
@@ -57,7 +73,7 @@ TEST_F(ParallelTest, TestAllInvoked) {
   }
 }
 
-TEST_F(ParallelTest, TestAllInvokedWithMutex) {
+TEST_P(ParallelTest, TestAllInvokedWithMutex) {
   EXPECT_TRUE(parallel_for(0, 10, 1, [this](int64_t begin, int64_t end) {
     this->RunExclusiveTask(begin, end);
   }));
@@ -70,7 +86,7 @@ TEST_F(ParallelTest, TestAllInvokedWithMutex) {
   EXPECT_EQ(sum_of_all_elements_, expected_sum);
 }
 
-TEST_F(ParallelTest, TestInvalidRange) {
+TEST_P(ParallelTest, TestInvalidRange) {
   et_pal_init();
   EXPECT_FALSE(parallel_for(10, 0, 1, [this](int64_t begin, int64_t end) {
     this->RunExclusiveTask(begin, end);
@@ -82,7 +98,7 @@ TEST_F(ParallelTest, TestInvalidRange) {
   EXPECT_EQ(sum_of_all_elements_, 0);
 }
 
-TEST_F(ParallelTest, TestInvalidRange2) {
+TEST_P(ParallelTest, TestInvalidRange2) {
   et_pal_init();
   EXPECT_FALSE(parallel_for(6, 5, 1, [this](int64_t begin, int64_t end) {
     this->RunExclusiveTask(begin, end);
@@ -94,7 +110,7 @@ TEST_F(ParallelTest, TestInvalidRange2) {
   EXPECT_EQ(sum_of_all_elements_, 0);
 }
 
-TEST_F(ParallelTest, TestInvokePartialFromBeginning) {
+TEST_P(ParallelTest, TestInvokePartialFromBeginning) {
   EXPECT_TRUE(parallel_for(0, 5, 1, [this](int64_t begin, int64_t end) {
     this->RunTask(begin, end);
   }));
@@ -107,7 +123,7 @@ TEST_F(ParallelTest, TestInvokePartialFromBeginning) {
   }
 }
 
-TEST_F(ParallelTest, TestInvokePartialToEnd) {
+TEST_P(ParallelTest, TestInvokePartialToEnd) {
   EXPECT_TRUE(parallel_for(5, 10, 1, [this](int64_t begin, int64_t end) {
     this->RunTask(begin, end);
   }));
@@ -120,7 +136,7 @@ TEST_F(ParallelTest, TestInvokePartialToEnd) {
   }
 }
 
-TEST_F(ParallelTest, TestInvokePartialMiddle) {
+TEST_P(ParallelTest, TestInvokePartialMiddle) {
   EXPECT_TRUE(parallel_for(2, 8, 1, [this](int64_t begin, int64_t end) {
     this->RunTask(begin, end);
   }));
@@ -136,7 +152,7 @@ TEST_F(ParallelTest, TestInvokePartialMiddle) {
   }
 }
 
-TEST_F(ParallelTest, TestChunkSize2) {
+TEST_P(ParallelTest, TestChunkSize2) {
   EXPECT_TRUE(parallel_for(0, 10, 2, [this](int64_t begin, int64_t end) {
     this->RunTask(begin, end);
   }));
@@ -146,7 +162,7 @@ TEST_F(ParallelTest, TestChunkSize2) {
   }
 }
 
-TEST_F(ParallelTest, TestChunkSize2Middle) {
+TEST_P(ParallelTest, TestChunkSize2Middle) {
   EXPECT_TRUE(parallel_for(3, 8, 2, [this](int64_t begin, int64_t end) {
     this->RunTask(begin, end);
   }));
@@ -162,7 +178,7 @@ TEST_F(ParallelTest, TestChunkSize2Middle) {
   }
 }
 
-TEST_F(ParallelTest, TestChunkSize3) {
+TEST_P(ParallelTest, TestChunkSize3) {
   EXPECT_TRUE(parallel_for(0, 10, 3, [this](int64_t begin, int64_t end) {
     this->RunTask(begin, end);
   }));
@@ -172,7 +188,7 @@ TEST_F(ParallelTest, TestChunkSize3) {
   }
 }
 
-TEST_F(ParallelTest, TestChunkSize6) {
+TEST_P(ParallelTest, TestChunkSize6) {
   EXPECT_TRUE(parallel_for(0, 10, 6, [this](int64_t begin, int64_t end) {
     this->RunTask(begin, end);
   }));
@@ -182,7 +198,7 @@ TEST_F(ParallelTest, TestChunkSize6) {
   }
 }
 
-TEST_F(ParallelTest, TestChunkSizeTooLarge) {
+TEST_P(ParallelTest, TestChunkSizeTooLarge) {
   EXPECT_TRUE(parallel_for(0, 10, 11, [this](int64_t begin, int64_t end) {
     this->RunTask(begin, end);
   }));
@@ -191,3 +207,8 @@ TEST_F(ParallelTest, TestChunkSizeTooLarge) {
     EXPECT_EQ(data_[i], i);
   }
 }
+
+INSTANTIATE_TEST_SUITE_P(
+    ParallelTestWithOrWithoutThreadpool,
+    ParallelTest,
+    ::testing::Values(true, false));
diff --git a/extension/parallel/thread_parallel.cpp b/extension/threadpool/thread_parallel.cpp
similarity index 85%
rename from extension/parallel/thread_parallel.cpp
rename to extension/threadpool/thread_parallel.cpp
index dfbb911d3a9..fa26742368f 100644
--- a/extension/parallel/thread_parallel.cpp
+++ b/extension/threadpool/thread_parallel.cpp
@@ -6,11 +6,12 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <cinttypes>
 #include <tuple>
 
-#include <executorch/extension/parallel/thread_parallel.h>
 #include <executorch/extension/threadpool/threadpool.h>
-#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/kernel/thread_parallel_interface.h>
 #include <executorch/runtime/platform/assert.h>
 
 namespace executorch {
@@ -53,9 +54,12 @@ bool parallel_for(
     const int64_t end,
     const int64_t grain_size,
     const std::function<void(int64_t, int64_t)>& f) {
-  ET_LOG_AND_RETURN_IF_FALSE(begin >= 0 && end >= 0);
-  ET_LOG_AND_RETURN_IF_FALSE(end >= begin);
-  ET_LOG_AND_RETURN_IF_FALSE(grain_size > 0);
+  ET_CHECK_OR_RETURN_FALSE(
+      begin >= 0 && end >= 0 && end >= begin,
+      "begin = %" PRId64 ", end = %" PRId64,
+      begin,
+      end);
+  ET_CHECK_OR_RETURN_FALSE(grain_size > 0, "grain_size = %" PRId64, grain_size);
   int64_t num_tasks = 0, chunk_size = 0;
   std::tie(num_tasks, chunk_size) =
       calc_num_tasks_and_chunk_size(begin, end, grain_size);
diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt
index c6d31c20263..23e26bfa72b 100644
--- a/kernels/optimized/CMakeLists.txt
+++ b/kernels/optimized/CMakeLists.txt
@@ -43,7 +43,7 @@ endif()
 list(TRANSFORM _optimized_cpublas__srcs PREPEND "${EXECUTORCH_ROOT}/")
 add_library(cpublas STATIC ${_optimized_cpublas__srcs})
 target_link_libraries(
-  cpublas PUBLIC executorch_core eigen_blas extension_parallel extension_threadpool
+  cpublas PUBLIC executorch_core eigen_blas extension_threadpool
 )
 target_compile_options(cpublas PUBLIC ${_common_compile_options})
 
diff --git a/kernels/optimized/blas/BlasKernel.h b/kernels/optimized/blas/BlasKernel.h
index c2b03cfebdd..fc47b4482d6 100644
--- a/kernels/optimized/blas/BlasKernel.h
+++ b/kernels/optimized/blas/BlasKernel.h
@@ -11,8 +11,8 @@
 #include <executorch/kernels/optimized/utils/math_utils.h>
 #include <executorch/kernels/optimized/utils/unroll.h>
 
-#include <executorch/extension/parallel/thread_parallel.h>
 #include <executorch/runtime/core/portable_type/bfloat16.h>
+#include <executorch/runtime/kernel/thread_parallel_interface.h>
 
 #include <array>
 
diff --git a/kernels/optimized/lib_defs.bzl b/kernels/optimized/lib_defs.bzl
index 659c7afe090..dd246f38984 100644
--- a/kernels/optimized/lib_defs.bzl
+++ b/kernels/optimized/lib_defs.bzl
@@ -186,7 +186,10 @@ def define_libs(is_fbcode=False):
         ],
     )
 
-    LIBBLAS_DEPS = [third_party_dep("cpuinfo")]
+    LIBBLAS_DEPS = [
+        third_party_dep("cpuinfo"),
+        "//executorch/extension/threadpool:threadpool",
+    ]
 
     for libblas_name, mkl_dep in [("libblas", "fbsource//third-party/mkl:mkl_lp64_omp"), ("libblas_mkl_noomp", "fbsource//third-party/mkl:mkl")]:
         runtime.cxx_library(
@@ -229,7 +232,6 @@ def define_libs(is_fbcode=False):
                 "DEFAULT": [],
             }) + LIBBLAS_DEPS,
             exported_deps = [
-                "//executorch/extension/parallel:thread_parallel",
                 "//executorch/kernels/optimized:libutils",
                 "//executorch/runtime/core/exec_aten:lib",
             ],
diff --git a/kernels/portable/cpu/util/reduce_util.h b/kernels/portable/cpu/util/reduce_util.h
index 35cfdfbaa72..6d7b17443ee 100644
--- a/kernels/portable/cpu/util/reduce_util.h
+++ b/kernels/portable/cpu/util/reduce_util.h
@@ -8,8 +8,10 @@
 
 #pragma once
 
+#include <c10/util/irange.h>
 #include <executorch/runtime/core/exec_aten/exec_aten.h>
 #include <executorch/runtime/core/exec_aten/util/tensor_util.h>
+#include <executorch/runtime/kernel/thread_parallel_interface.h>
 #include <cstring>
 #include <tuple>
 
@@ -24,9 +26,12 @@ void apply_on_flat_ix_with_stride_and_base(
     const size_t base,
     const size_t start,
     const size_t end) {
-  for (size_t i = start; i <= end; i++) {
-    fn(base + i * stride);
-  }
+  executorch::extension::parallel_for(
+      start, end + 1, [&](auto start_, auto end_) {
+        for (const auto i : c10::irange(start_, end_)) {
+          fn(base + i * stride);
+        }
+      });
 }
 
 template <typename Fn>
@@ -36,9 +41,12 @@ void apply_on_flat_and_dim_ix_with_stride_and_base(
     const size_t base,
     const size_t start,
     const size_t end) {
-  for (size_t i = start; i <= end; i++) {
-    fn(base + i * stride, i);
-  }
+  executorch::extension::parallel_for(
+      start, end + 1, [&](auto start_, auto end_) {
+        for (const auto i : c10::irange(start_, end_)) {
+          fn(base + i * stride, i);
+        }
+      });
 }
 
 template <typename Fn>
diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl
index 739bc117fbf..eb48a6e59b1 100644
--- a/kernels/portable/cpu/util/targets.bzl
+++ b/kernels/portable/cpu/util/targets.bzl
@@ -302,8 +302,12 @@ def define_common_targets():
             srcs = ["reduce_util.cpp"],
             exported_headers = ["reduce_util.h"],
             deps = [
-                "//executorch/runtime/kernel:kernel_includes{}".format(suffix),
                 "//executorch/runtime/core/exec_aten/util:tensor_util{}".format(suffix),
+                "//executorch/runtime/kernel:kernel_includes{}".format(suffix),
+            ],
+            exported_deps = [
+                "//executorch/runtime/kernel:thread_parallel_interface",
+                "//executorch/runtime/core/portable_type/c10/c10:c10",
             ],
             exported_preprocessor_flags = ["-DUSE_ATEN_LIB"] if aten_mode else [],
             visibility = [
diff --git a/runtime/kernel/targets.bzl b/runtime/kernel/targets.bzl
index d49435f2825..e67f76728b8 100644
--- a/runtime/kernel/targets.bzl
+++ b/runtime/kernel/targets.bzl
@@ -51,6 +51,19 @@ def define_common_targets():
         preprocessor_flags = ["-DMAX_KERNEL_NUM=1"],
     )
 
+    runtime.cxx_library(
+        name = "thread_parallel_interface",
+        exported_headers = ["thread_parallel_interface.h"],
+        exported_deps = [
+            "//executorch/runtime/core:core",
+            "//executorch/runtime/platform:platform",
+        ],
+        visibility = [
+            "//executorch/...",
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )
+
     for aten_mode in get_aten_mode_options():
         aten_suffix = "_aten" if aten_mode else ""
 
diff --git a/runtime/kernel/thread_parallel_interface.h b/runtime/kernel/thread_parallel_interface.h
new file mode 100644
index 00000000000..cd6901e48a6
--- /dev/null
+++ b/runtime/kernel/thread_parallel_interface.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <functional>
+
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/platform/assert.h>
+
+namespace executorch {
+namespace extension {
+namespace internal {
+template <typename Func>
+inline bool parallel_for_no_threadpool(
+    const int64_t begin,
+    const int64_t end,
+    const int64_t grain_size,
+    const Func& f) {
+  ET_CHECK_OR_RETURN_FALSE(
+      begin >= 0 && end >= 0 && end >= begin,
+      "begin = %" PRId64 ", end = %" PRId64,
+      begin,
+      end);
+  ET_CHECK_OR_RETURN_FALSE(grain_size > 0, "grain_size = %" PRId64, grain_size);
+  f(begin, end);
+  return true;
+}
+
+// Match GRAIN_SIZE from PyTorch core.
+// https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/TensorIterator.h#L78
+constexpr int64_t GRAIN_SIZE = 32768;
+
+} // namespace internal
+
+#ifdef ET_USE_THREADPOOL
+/**
+ * A helper to run a function in parallel.
+ *
+ * begin, end: describe the extent of the workitems via first and last workitem
+ * to be processed
+ * grain_size: number of workitems processed by user callback which is
+ * described below
+ * f: user function applied in parallel to the chunks, signature:
+ *   void f(int64_t begin, int64_t end)
+ * Returns true if all work items are processed successfully, false otherwise
+ *
+ * Warning: parallel_for does NOT copy thread local states from the current
+ * thread to the worker threads. Users need to protect the access to captured
+ * data if they mutate them in f.
+ */
+bool parallel_for(
+    const int64_t begin,
+    const int64_t end,
+    const int64_t grain_size,
+    const std::function<void(int64_t, int64_t)>& f);
+
+int64_t get_thread_num();
+
+void set_thread_num(int64_t thread_num);
+#else // ET_USE_THREADPOOL
+template <typename Func>
+bool parallel_for(
+    const int64_t begin,
+    const int64_t end,
+    const int64_t grain_size,
+    const Func& func) {
+  return internal::parallel_for_no_threadpool(begin, end, grain_size, func);
+}
+
+inline int64_t get_thread_num() {
+  return 0;
+}
+
+inline void set_thread_num(int64_t thread_num) {
+  ET_DCHECK_MSG(false, "cannot set_thread_num without threading support!");
+}
+#endif // ET_USE_THREADPOOL
+
+/**
+ * Convenience version of parallel_for that sets the grain size to
+ * internal::GRAIN_SIZE.
+ */
+template <typename Func>
+bool parallel_for(const int64_t begin, const int64_t end, const Func& func) {
+  return parallel_for(begin, end, internal::GRAIN_SIZE, func);
+}
+} // namespace extension
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::extension::get_thread_num;
+using ::executorch::extension::parallel_for;
+using ::executorch::extension::set_thread_num;
+} // namespace executor
+} // namespace torch
diff --git a/test/utils/OSSTestConfig.json b/test/utils/OSSTestConfig.json
index cc5e625f1e8..be594f9d5f4 100644
--- a/test/utils/OSSTestConfig.json
+++ b/test/utils/OSSTestConfig.json
@@ -59,6 +59,16 @@
             "extension_tensor"
         ]
     },
+    {
+        "directory": "extension/threadpool/test",
+        "sources": [
+            "thread_parallel_test.cpp",
+            "threadpool_test.cpp"
+        ],
+        "additional_libs": [
+            "extension_threadpool"
+        ]
+    },
     {
         "directory": "kernels/portable/cpu/util/test",
         "sources": [