diff --git a/.lintrunner.toml b/.lintrunner.toml index 7667ac430d1..1a27228d266 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -218,6 +218,8 @@ exclude_patterns = [ 'examples/**', 'extension/**', 'kernels/optimized/**', + # Justified include. + 'runtime/kernel/thread_parallel_interface.h', 'scripts/**', 'third-party/**', 'util/**', diff --git a/CMakeLists.txt b/CMakeLists.txt index 73b89b6171e..fabf667cbe1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -751,7 +751,6 @@ if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO ) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/threadpool) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/parallel) endif() if(EXECUTORCH_BUILD_PYBIND) diff --git a/Test.cmake b/Test.cmake index d4b5f6aa1db..6bd7a86e70b 100644 --- a/Test.cmake +++ b/Test.cmake @@ -13,7 +13,6 @@ if(BUILD_TESTING) add_subdirectory(extension/evalue_util/test) add_subdirectory(extension/kernel_util/test) add_subdirectory(extension/memory_allocator/test) - add_subdirectory(extension/parallel/test) add_subdirectory(extension/pytree/test) add_subdirectory(kernels/portable/cpu/util/test) add_subdirectory(kernels/prim_ops/test) diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index 4811563269c..bbc9eec3a0e 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -58,6 +58,21 @@ deps = [ "executorch_core", ] +# HACK: prevent reduce_util from also showing up in custom_ops. The +# actual medium-term fix is to stop using Buck to drive our CMake +# builds. +[targets.reduce_util] +buck_targets = [ + "//kernels/portable/cpu/util:reduce_util", +] +filters = [ + ".cpp$", +] +deps = [ + "executorch", + "executorch_core", +] + [targets.optimized_kernels] buck_targets = [ "//kernels/optimized:generated_lib", @@ -73,7 +88,6 @@ excludes = [ deps = [ "executorch", "executorch_core", - "extension_parallel", "extension_threadpool", "portable_kernels", ] @@ -116,7 +130,7 @@ excludes = [ deps = [ "executorch_core", "executorch", - "extension_parallel", + "extension_threadpool", ] [targets.optimized_native_cpu_ops] @@ -131,7 +145,6 @@ excludes = [ deps = [ "executorch_core", "executorch", - "extension_parallel", "extension_threadpool", "portable_kernels", ] @@ -212,19 +225,6 @@ deps = [ "extension_runner_util", ] -[targets.extension_parallel] -buck_targets = [ - "//extension/parallel:thread_parallel", -] -filters = [ - ".cpp$", -] -deps = [ - "executorch", - "executorch_core", - "extension_threadpool", -] - [targets.extension_tensor] buck_targets = [ "//extension/tensor:tensor", @@ -364,6 +364,7 @@ excludes = [ deps = [ "executorch", "executorch_core", + "extension_threadpool", "xnnpack_backend", "portable_kernels", ] @@ -378,6 +379,7 @@ filters = [ deps = [ "executorch", "executorch_core", + "extension_threadpool", ] [targets.xnnpack_schema] @@ -412,8 +414,8 @@ deps = [ "executorch", "executorch_core", "optimized_kernels", - "extension_parallel", "extension_threadpool", + "reduce_util", "xnnpack_backend", ] @@ -449,7 +451,7 @@ deps = [ "executorch_core", "extension_data_loader", "extension_module", - "extension_parallel", + "extension_threadpool", "portable_kernels", "quantized_kernels", "xnnpack_backend", diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 2c459b66ac8..931d31de8ef 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -1,4 +1,3 @@ - # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # @@ -16,20 +15,23 @@ # # This will define the following variables: # -# EXECUTORCH_FOUND -- True if the system has the ExecuTorch library -# EXECUTORCH_INCLUDE_DIRS -- The include directories for ExecuTorch -# EXECUTORCH_LIBRARIES -- Libraries to link against +# EXECUTORCH_FOUND -- True if the system has the ExecuTorch library +# EXECUTORCH_INCLUDE_DIRS -- The include directories for ExecuTorch +# EXECUTORCH_LIBRARIES -- Libraries to link against # -# The actual values for these variables will be different from what executorch-config.cmake -# in executorch pip package gives, but we wanted to keep the contract of exposing these -# CMake variables. +# The actual values for these variables will be different from what +# executorch-config.cmake in executorch pip package gives, but we wanted to keep +# the contract of exposing these CMake variables. cmake_minimum_required(VERSION 3.19) set(_root "${CMAKE_CURRENT_LIST_DIR}/../../..") set(required_lib_list executorch executorch_core portable_kernels) set(EXECUTORCH_LIBRARIES) -set(EXECUTORCH_INCLUDE_DIRS ${_root}/include ${_root}/include/executorch/runtime/core/portable_type/c10 ${_root}/lib) +set(EXECUTORCH_INCLUDE_DIRS + ${_root}/include ${_root}/include/executorch/runtime/core/portable_type/c10 + ${_root}/lib +) foreach(lib ${required_lib_list}) set(lib_var "LIB_${lib}") add_library(${lib} STATIC IMPORTED) @@ -40,7 +42,12 @@ foreach(lib ${required_lib_list}) ) set_target_properties(${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}") target_compile_definitions(${lib} INTERFACE C10_USING_CUSTOM_GENERATED_MACROS) - target_include_directories(${lib} INTERFACE ${_root}/include ${_root}/include/executorch/runtime/core/portable_type/c10 ${_root}/lib) + target_include_directories( + ${lib} + INTERFACE ${_root}/include + ${_root}/include/executorch/runtime/core/portable_type/c10 + ${_root}/lib + ) list(APPEND EXECUTORCH_LIBRARIES ${lib}) endforeach() @@ -65,9 +72,9 @@ set(lib_list neuron_backend qnn_executorch_backend portable_ops_lib + custom_ops extension_module extension_module_static - extension_parallel extension_runner_util extension_tensor extension_threadpool @@ -111,20 +118,23 @@ foreach(lib ${lib_list}) add_library(${lib} STATIC IMPORTED) endif() set_target_properties(${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}") - target_include_directories(${lib} INTERFACE ${_root}/include ${_root}/include/executorch/runtime/core/portable_type/c10 ${_root}/lib) + target_include_directories( + ${lib} + INTERFACE ${_root}/include + ${_root}/include/executorch/runtime/core/portable_type/c10 + ${_root}/lib + ) list(APPEND EXECUTORCH_LIBRARIES ${lib}) endif() endforeach() # TODO: investigate use of install(EXPORT) to cleanly handle # target_compile_options/target_compile_definitions for everything. -if(TARGET extension_parallel) - set_target_properties( - extension_parallel PROPERTIES INTERFACE_LINK_LIBRARIES extension_threadpool - ) -endif() if(TARGET cpublas) set_target_properties( - cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_parallel + cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_threadpool ) endif() +if(TARGET extension_threadpool) + target_compile_definitions(extension_threadpool INTERFACE ET_USE_THREADPOOL) +endif() diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt index 5f49581ea25..96ff28d8f49 100644 --- a/examples/models/llama/CMakeLists.txt +++ b/examples/models/llama/CMakeLists.txt @@ -15,7 +15,7 @@ # ~~~ # It should also be cmake-lint clean. # -cmake_minimum_required(VERSION 3.19) +cmake_minimum_required(VERSION 3.24) # 3.24 is required for WHOLE_ARCHIVE project(llama_runner) # Duplicating options as root CMakeLists.txt @@ -84,14 +84,6 @@ if(CMAKE_TOOLCHAIN_IOS OR ANDROID) target_link_options_shared_lib(executorch) endif() -# custom ops library -if(EXECUTORCH_BUILD_KERNELS_CUSTOM) - add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/custom_ops - ${CMAKE_CURRENT_BINARY_DIR}/../../../extension/llm/custom_ops - ) -endif() - # llama_runner library add_subdirectory(runner) @@ -119,8 +111,7 @@ target_link_options_shared_lib(quantized_ops_lib) list(APPEND link_libraries quantized_kernels quantized_ops_lib) if(EXECUTORCH_BUILD_KERNELS_CUSTOM) - target_link_options_shared_lib(custom_ops) - list(APPEND link_libraries custom_ops) + list(APPEND link_libraries $) endif() if(EXECUTORCH_BUILD_TORCHAO) @@ -140,7 +131,6 @@ endif() set(XNNPACK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack) # Extra compile option and include dir for pthreadpool if(EXECUTORCH_BUILD_PTHREADPOOL) - list(APPEND _common_compile_options -DET_USE_THREADPOOL) list(APPEND link_libraries extension_threadpool pthreadpool) list(APPEND _common_include_directories ${XNNPACK_ROOT}/third-party/pthreadpool/include diff --git a/examples/models/llava/CMakeLists.txt b/examples/models/llava/CMakeLists.txt index ecd00809fdb..5d5857dd5af 100644 --- a/examples/models/llava/CMakeLists.txt +++ b/examples/models/llava/CMakeLists.txt @@ -93,14 +93,6 @@ if(CMAKE_TOOLCHAIN_IOS OR ANDROID) target_link_options_shared_lib(executorch) endif() -# custom ops library -if(EXECUTORCH_BUILD_KERNELS_CUSTOM) - add_subdirectory( - ${EXECUTORCH_ROOT}/extension/llm/custom_ops - ${CMAKE_CURRENT_BINARY_DIR}/../../../extension/llm/custom_ops - ) -endif() - # llava_runner library add_subdirectory(runner) @@ -132,14 +124,12 @@ target_link_options_shared_lib(quantized_ops_lib) list(APPEND link_libraries quantized_kernels quantized_ops_lib) if(EXECUTORCH_BUILD_KERNELS_CUSTOM) - target_link_options_shared_lib(custom_ops) list(APPEND link_libraries custom_ops) endif() set(XNNPACK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack) # Extra compile option and include dir for pthreadpool if(EXECUTORCH_BUILD_PTHREADPOOL) - list(APPEND _common_compile_options -DET_USE_THREADPOOL) list(APPEND link_libraries extension_threadpool pthreadpool) list(APPEND _common_include_directories ${XNNPACK_ROOT}/third-party/pthreadpool/include diff --git a/examples/models/llava/targets.bzl b/examples/models/llava/targets.bzl index 5efb099f06f..6f3a370acf4 100644 --- a/examples/models/llava/targets.bzl +++ b/examples/models/llava/targets.bzl @@ -7,9 +7,6 @@ def define_common_targets(): "main.cpp", ], compiler_flags = ["-Wno-global-constructors"], - preprocessor_flags = [ - "-DET_USE_THREADPOOL", - ], deps = [ "//executorch/examples/models/llava/runner:runner", "//executorch/extension/evalue_util:print_evalue", diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index 70f21f2751c..ba722d9c791 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -cmake_minimum_required(VERSION 3.19) +cmake_minimum_required(VERSION 3.24) # 3.24 is required for WHOLE_ARCHIVE project(executorch_jni) @@ -115,16 +115,10 @@ if(TARGET vulkan_backend) endif() if(EXECUTORCH_BUILD_KERNELS_CUSTOM) - add_subdirectory( - ${EXECUTORCH_ROOT}/extension/llm/custom_ops - ${CMAKE_CURRENT_BINARY_DIR}/../../extension/llm/custom_ops - ) - list(APPEND link_libraries custom_ops) - target_link_options_shared_lib(custom_ops) + list(APPEND link_libraries $) endif() if(TARGET pthreadpool) - target_compile_definitions(executorch_jni PRIVATE ET_USE_THREADPOOL=1) target_include_directories( executorch_jni PUBLIC diff --git a/extension/llm/custom_ops/CMakeLists.txt b/extension/llm/custom_ops/CMakeLists.txt index c3969e6f9bf..eeb118d4344 100644 --- a/extension/llm/custom_ops/CMakeLists.txt +++ b/extension/llm/custom_ops/CMakeLists.txt @@ -78,7 +78,7 @@ target_include_directories( target_link_libraries(custom_ops PUBLIC ${custom_ops_libs} executorch_core) target_compile_options( - custom_ops PUBLIC ${_common_compile_options} -DET_USE_THREADPOOL + custom_ops PUBLIC ${_common_compile_options} ) install(TARGETS custom_ops DESTINATION lib) @@ -130,7 +130,7 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT) target_compile_options( custom_ops_aot_lib PUBLIC -Wno-deprecated-declarations -fPIC -frtti -fexceptions - ${_common_compile_options} -DET_USE_THREADPOOL + ${_common_compile_options} ) install(TARGETS custom_ops_aot_lib diff --git a/extension/llm/custom_ops/op_sdpa.cpp b/extension/llm/custom_ops/op_sdpa.cpp index f0a7775e803..371fcf38a24 100644 --- a/extension/llm/custom_ops/op_sdpa.cpp +++ b/extension/llm/custom_ops/op_sdpa.cpp @@ -19,8 +19,8 @@ #include #ifdef ET_USE_THREADPOOL -#include #include +#include #endif #include diff --git a/extension/llm/custom_ops/targets.bzl b/extension/llm/custom_ops/targets.bzl index e3e8b30520f..1c4686fe3d0 100644 --- a/extension/llm/custom_ops/targets.bzl +++ b/extension/llm/custom_ops/targets.bzl @@ -37,7 +37,6 @@ def define_common_targets(): "//executorch/kernels/optimized:libblas{}".format(mkl_dep), "//executorch/kernels/optimized:libvec", "//executorch/extension/kernel_util:kernel_util", - "//executorch/extension/parallel:thread_parallel", "//executorch/extension/threadpool:threadpool", ], deps = [ diff --git a/extension/parallel/CMakeLists.txt b/extension/parallel/CMakeLists.txt deleted file mode 100644 index 7f727aafe46..00000000000 --- a/extension/parallel/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# Please keep this file formatted by running: -# ~~~ -# cmake-format -i CMakeLists.txt -# ~~~ - -if(NOT (EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO)) - message(FATAL_ERROR "extension/parallel requires extension/threadpool") -endif() - -add_library(extension_parallel thread_parallel.cpp) - -target_link_libraries(extension_parallel PUBLIC executorch_core extension_threadpool) -target_compile_options(extension_parallel PUBLIC ${_common_compile_options}) - -install( - TARGETS extension_parallel - DESTINATION lib - INCLUDES - DESTINATION ${_common_include_directories}) diff --git a/extension/parallel/TARGETS b/extension/parallel/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/parallel/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/parallel/targets.bzl b/extension/parallel/targets.bzl deleted file mode 100644 index 82b3deab129..00000000000 --- a/extension/parallel/targets.bzl +++ /dev/null @@ -1,30 +0,0 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_aten_mode_options", "runtime") - -def define_common_targets(): - """Defines targets that should be shared between fbcode and xplat. - - The directory containing this targets.bzl file should also contain both - TARGETS and BUCK files that call this function. - """ - - for aten_mode in get_aten_mode_options(): - aten_suffix = ("_aten" if aten_mode else "") - - runtime.cxx_library( - name = "thread_parallel" + aten_suffix, - srcs = [ - "thread_parallel.cpp", - ], - exported_headers = [ - "thread_parallel.h", - ], - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - "//executorch/extension/threadpool:threadpool", - "//executorch/runtime/core:core", - "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, - ], - ) diff --git a/extension/parallel/test/TARGETS b/extension/parallel/test/TARGETS deleted file mode 100644 index 2341af9282f..00000000000 --- a/extension/parallel/test/TARGETS +++ /dev/null @@ -1,8 +0,0 @@ -# Any targets that should be shared between fbcode and xplat must be defined in -# targets.bzl. This file can contain fbcode-only targets. - -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets() diff --git a/extension/parallel/test/targets.bzl b/extension/parallel/test/targets.bzl deleted file mode 100644 index 791c0727471..00000000000 --- a/extension/parallel/test/targets.bzl +++ /dev/null @@ -1,19 +0,0 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") - -def define_common_targets(): - """Defines targets that should be shared between fbcode and xplat. - - The directory containing this targets.bzl file should also contain both - TARGETS and BUCK files that call this function. - """ - - runtime.cxx_test( - name = "thread_parallel_test", - srcs = [ - "thread_parallel_test.cpp", - ], - deps = [ - "//executorch/extension/parallel:thread_parallel", - "//executorch/runtime/platform:platform", - ], - ) diff --git a/extension/parallel/thread_parallel.h b/extension/parallel/thread_parallel.h index 8b174075ae9..8bd1a572cd7 100644 --- a/extension/parallel/thread_parallel.h +++ b/extension/parallel/thread_parallel.h @@ -8,46 +8,7 @@ #pragma once -#include -#include - -namespace executorch { -namespace extension { - -/** - * A helper to run function in parallel. - * - * begin, end: describe the extent of the workitems via first and last workitem - * to be processed - * grain_size: number of workitems processed by user callback which is - * described below - * f: user function applied in parallel to the chunks, signature: - * void f(int64_t begin, int64_t end) - * Returns true if all work items are processed successfully, false otherwise - * - * Warning: parallel_for does NOT copy thread local states from the current - * thread to the worker threads. Users need to protect the access to captured - * data if they mutate them in f. - */ -bool parallel_for( - const int64_t begin, - const int64_t end, - const int64_t grain_size, - const std::function& f); - -int64_t get_thread_num(); - -void set_thread_num(int64_t thread_num); - -} // namespace extension -} // namespace executorch - -namespace torch { -namespace executor { -// TODO(T197294990): Remove these deprecated aliases once all users have moved -// to the new `::executorch` namespaces. -using ::executorch::extension::get_thread_num; -using ::executorch::extension::parallel_for; -using ::executorch::extension::set_thread_num; -} // namespace executor -} // namespace torch +// This header is a stub left behind after the move to +// executorch/runtime/kernel. As such, it is deprecated; include and +// use the below header directly instead. +#include diff --git a/extension/threadpool/CMakeLists.txt b/extension/threadpool/CMakeLists.txt index 90288656674..6e107cb6634 100644 --- a/extension/threadpool/CMakeLists.txt +++ b/extension/threadpool/CMakeLists.txt @@ -21,7 +21,8 @@ if(NOT CMAKE_CXX_STANDARD) endif() add_library( - extension_threadpool threadpool.cpp threadpool_guard.cpp cpuinfo_utils.cpp + extension_threadpool threadpool.cpp threadpool_guard.cpp thread_parallel.cpp + cpuinfo_utils.cpp ) target_link_libraries( extension_threadpool PUBLIC executorch_core cpuinfo pthreadpool @@ -32,6 +33,7 @@ target_include_directories( PUBLIC ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include ) +target_compile_definitions(extension_threadpool PUBLIC ET_USE_THREADPOOL) target_compile_options(extension_threadpool PUBLIC ${_common_compile_options}) # Install libraries @@ -41,3 +43,7 @@ install( INCLUDES DESTINATION ${_common_include_directories} ) + +if(BUILD_TESTING) + add_subdirectory(test) +endif() diff --git a/extension/threadpool/targets.bzl b/extension/threadpool/targets.bzl index 8bb0398b385..1c34dbbc7d4 100644 --- a/extension/threadpool/targets.bzl +++ b/extension/threadpool/targets.bzl @@ -9,6 +9,7 @@ def define_common_targets(): """ _THREADPOOL_SRCS = [ + "thread_parallel.cpp", "threadpool.cpp", "threadpool_guard.cpp", ] + (["fb/threadpool_use_n_threads.cpp"] if not runtime.is_oss else []) @@ -29,6 +30,8 @@ def define_common_targets(): exported_deps = [ third_party_dep("pthreadpool"), third_party_dep("cpuinfo"), + # Allow users to use the header without an extra deps entry. + "//executorch/runtime/kernel:thread_parallel_interface", ], exported_preprocessor_flags = [ "-DET_USE_THREADPOOL", diff --git a/extension/parallel/test/CMakeLists.txt b/extension/threadpool/test/CMakeLists.txt similarity index 53% rename from extension/parallel/test/CMakeLists.txt rename to extension/threadpool/test/CMakeLists.txt index ab37f66c17d..3f9b13f2ab4 100644 --- a/extension/parallel/test/CMakeLists.txt +++ b/extension/threadpool/test/CMakeLists.txt @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# @generated by test/utils/generate_gtest_cmakelists.py +# # This file should be formatted with # ~~~ # cmake-format -i CMakeLists.txt @@ -12,28 +14,14 @@ # cmake_minimum_required(VERSION 3.19) -project(extension_parallel_test) - -# Use C++17 for test. -set(CMAKE_CXX_STANDARD 17) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) include(${EXECUTORCH_ROOT}/build/Test.cmake) -set(_test_srcs thread_parallel_test.cpp ../thread_parallel.cpp) +set(_test_srcs thread_parallel_test.cpp threadpool_test.cpp) et_cxx_test( - extension_parallel_test - SOURCES - ${_test_srcs} - EXTRA_LIBS - pthreadpool - cpuinfo + extension_threadpool_test SOURCES ${_test_srcs} EXTRA_LIBS extension_threadpool ) -target_include_directories( - extension_parallel_test - PRIVATE ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include - ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include -) diff --git a/extension/threadpool/test/targets.bzl b/extension/threadpool/test/targets.bzl index b8a39d8969a..8bdf776c825 100644 --- a/extension/threadpool/test/targets.bzl +++ b/extension/threadpool/test/targets.bzl @@ -18,3 +18,15 @@ def define_common_targets(): "//executorch/extension/threadpool:threadpool", ], ) + + runtime.cxx_test( + name = "thread_parallel_test", + srcs = [ + "thread_parallel_test.cpp", + ], + deps = [ + "//executorch/extension/threadpool:threadpool", + "//executorch/runtime/kernel:thread_parallel_interface", + "//executorch/runtime/platform:platform", + ], + ) diff --git a/extension/parallel/test/thread_parallel_test.cpp b/extension/threadpool/test/thread_parallel_test.cpp similarity index 76% rename from extension/parallel/test/thread_parallel_test.cpp rename to extension/threadpool/test/thread_parallel_test.cpp index d386429100d..e31f16eee22 100644 --- a/extension/parallel/test/thread_parallel_test.cpp +++ b/extension/threadpool/test/thread_parallel_test.cpp @@ -11,13 +11,16 @@ #include #include -#include +#include #include using namespace ::testing; using ::executorch::extension::parallel_for; -class ParallelTest : public ::testing::Test { +#ifndef ET_USE_THREADPOOL +#endif + +class ParallelTest : public ::testing::TestWithParam { protected: void SetUp() override { data_.fill(0); @@ -42,12 +45,25 @@ class ParallelTest : public ::testing::Test { } } + template + bool parallel_for( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const Func& func) { + if (GetParam()) { + return executorch::extension::parallel_for(begin, end, grain_size, func); + } + return executorch::extension::internal::parallel_for_no_threadpool( + begin, end, grain_size, func); + } + std::array data_; std::mutex mutex_; int sum_of_all_elements_; }; -TEST_F(ParallelTest, TestAllInvoked) { +TEST_P(ParallelTest, TestAllInvoked) { EXPECT_TRUE(parallel_for(0, 10, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -57,7 +73,7 @@ TEST_F(ParallelTest, TestAllInvoked) { } } -TEST_F(ParallelTest, TestAllInvokedWithMutex) { +TEST_P(ParallelTest, TestAllInvokedWithMutex) { EXPECT_TRUE(parallel_for(0, 10, 1, [this](int64_t begin, int64_t end) { this->RunExclusiveTask(begin, end); })); @@ -70,7 +86,7 @@ TEST_F(ParallelTest, TestAllInvokedWithMutex) { EXPECT_EQ(sum_of_all_elements_, expected_sum); } -TEST_F(ParallelTest, TestInvalidRange) { +TEST_P(ParallelTest, TestInvalidRange) { et_pal_init(); EXPECT_FALSE(parallel_for(10, 0, 1, [this](int64_t begin, int64_t end) { this->RunExclusiveTask(begin, end); @@ -82,7 +98,7 @@ TEST_F(ParallelTest, TestInvalidRange) { EXPECT_EQ(sum_of_all_elements_, 0); } -TEST_F(ParallelTest, TestInvalidRange2) { +TEST_P(ParallelTest, TestInvalidRange2) { et_pal_init(); EXPECT_FALSE(parallel_for(6, 5, 1, [this](int64_t begin, int64_t end) { this->RunExclusiveTask(begin, end); @@ -94,7 +110,7 @@ TEST_F(ParallelTest, TestInvalidRange2) { EXPECT_EQ(sum_of_all_elements_, 0); } -TEST_F(ParallelTest, TestInvokePartialFromBeginning) { +TEST_P(ParallelTest, TestInvokePartialFromBeginning) { EXPECT_TRUE(parallel_for(0, 5, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -107,7 +123,7 @@ TEST_F(ParallelTest, TestInvokePartialFromBeginning) { } } -TEST_F(ParallelTest, TestInvokePartialToEnd) { +TEST_P(ParallelTest, TestInvokePartialToEnd) { EXPECT_TRUE(parallel_for(5, 10, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -120,7 +136,7 @@ TEST_F(ParallelTest, TestInvokePartialToEnd) { } } -TEST_F(ParallelTest, TestInvokePartialMiddle) { +TEST_P(ParallelTest, TestInvokePartialMiddle) { EXPECT_TRUE(parallel_for(2, 8, 1, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -136,7 +152,7 @@ TEST_F(ParallelTest, TestInvokePartialMiddle) { } } -TEST_F(ParallelTest, TestChunkSize2) { +TEST_P(ParallelTest, TestChunkSize2) { EXPECT_TRUE(parallel_for(0, 10, 2, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -146,7 +162,7 @@ TEST_F(ParallelTest, TestChunkSize2) { } } -TEST_F(ParallelTest, TestChunkSize2Middle) { +TEST_P(ParallelTest, TestChunkSize2Middle) { EXPECT_TRUE(parallel_for(3, 8, 2, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -162,7 +178,7 @@ TEST_F(ParallelTest, TestChunkSize2Middle) { } } -TEST_F(ParallelTest, TestChunkSize3) { +TEST_P(ParallelTest, TestChunkSize3) { EXPECT_TRUE(parallel_for(0, 10, 3, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -172,7 +188,7 @@ TEST_F(ParallelTest, TestChunkSize3) { } } -TEST_F(ParallelTest, TestChunkSize6) { +TEST_P(ParallelTest, TestChunkSize6) { EXPECT_TRUE(parallel_for(0, 10, 6, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -182,7 +198,7 @@ TEST_F(ParallelTest, TestChunkSize6) { } } -TEST_F(ParallelTest, TestChunkSizeTooLarge) { +TEST_P(ParallelTest, TestChunkSizeTooLarge) { EXPECT_TRUE(parallel_for(0, 10, 11, [this](int64_t begin, int64_t end) { this->RunTask(begin, end); })); @@ -191,3 +207,8 @@ TEST_F(ParallelTest, TestChunkSizeTooLarge) { EXPECT_EQ(data_[i], i); } } + +INSTANTIATE_TEST_SUITE_P( + ParallelTestWithOrWithoutThreadpool, + ParallelTest, + ::testing::Values(true, false)); diff --git a/extension/parallel/thread_parallel.cpp b/extension/threadpool/thread_parallel.cpp similarity index 85% rename from extension/parallel/thread_parallel.cpp rename to extension/threadpool/thread_parallel.cpp index dfbb911d3a9..fa26742368f 100644 --- a/extension/parallel/thread_parallel.cpp +++ b/extension/threadpool/thread_parallel.cpp @@ -6,11 +6,12 @@ * LICENSE file in the root directory of this source tree. */ +#include #include -#include #include -#include +#include +#include #include namespace executorch { @@ -53,9 +54,12 @@ bool parallel_for( const int64_t end, const int64_t grain_size, const std::function& f) { - ET_LOG_AND_RETURN_IF_FALSE(begin >= 0 && end >= 0); - ET_LOG_AND_RETURN_IF_FALSE(end >= begin); - ET_LOG_AND_RETURN_IF_FALSE(grain_size > 0); + ET_CHECK_OR_RETURN_FALSE( + begin >= 0 && end >= 0 && end >= begin, + "begin = %" PRId64 ", end = %" PRId64, + begin, + end); + ET_CHECK_OR_RETURN_FALSE(grain_size > 0, "grain_size = %" PRId64, grain_size); int64_t num_tasks = 0, chunk_size = 0; std::tie(num_tasks, chunk_size) = calc_num_tasks_and_chunk_size(begin, end, grain_size); diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index c6d31c20263..23e26bfa72b 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -43,7 +43,7 @@ endif() list(TRANSFORM _optimized_cpublas__srcs PREPEND "${EXECUTORCH_ROOT}/") add_library(cpublas STATIC ${_optimized_cpublas__srcs}) target_link_libraries( - cpublas PUBLIC executorch_core eigen_blas extension_parallel extension_threadpool + cpublas PUBLIC executorch_core eigen_blas extension_threadpool ) target_compile_options(cpublas PUBLIC ${_common_compile_options}) diff --git a/kernels/optimized/blas/BlasKernel.h b/kernels/optimized/blas/BlasKernel.h index c2b03cfebdd..fc47b4482d6 100644 --- a/kernels/optimized/blas/BlasKernel.h +++ b/kernels/optimized/blas/BlasKernel.h @@ -11,8 +11,8 @@ #include #include -#include #include +#include #include diff --git a/kernels/optimized/lib_defs.bzl b/kernels/optimized/lib_defs.bzl index 659c7afe090..dd246f38984 100644 --- a/kernels/optimized/lib_defs.bzl +++ b/kernels/optimized/lib_defs.bzl @@ -186,7 +186,10 @@ def define_libs(is_fbcode=False): ], ) - LIBBLAS_DEPS = [third_party_dep("cpuinfo")] + LIBBLAS_DEPS = [ + third_party_dep("cpuinfo"), + "//executorch/extension/threadpool:threadpool", + ] for libblas_name, mkl_dep in [("libblas", "fbsource//third-party/mkl:mkl_lp64_omp"), ("libblas_mkl_noomp", "fbsource//third-party/mkl:mkl")]: runtime.cxx_library( @@ -229,7 +232,6 @@ def define_libs(is_fbcode=False): "DEFAULT": [], }) + LIBBLAS_DEPS, exported_deps = [ - "//executorch/extension/parallel:thread_parallel", "//executorch/kernels/optimized:libutils", "//executorch/runtime/core/exec_aten:lib", ], diff --git a/kernels/portable/cpu/util/reduce_util.h b/kernels/portable/cpu/util/reduce_util.h index 35cfdfbaa72..6d7b17443ee 100644 --- a/kernels/portable/cpu/util/reduce_util.h +++ b/kernels/portable/cpu/util/reduce_util.h @@ -8,8 +8,10 @@ #pragma once +#include #include #include +#include #include #include @@ -24,9 +26,12 @@ void apply_on_flat_ix_with_stride_and_base( const size_t base, const size_t start, const size_t end) { - for (size_t i = start; i <= end; i++) { - fn(base + i * stride); - } + executorch::extension::parallel_for( + start, end + 1, [&](auto start_, auto end_) { + for (const auto i : c10::irange(start_, end_)) { + fn(base + i * stride); + } + }); } template @@ -36,9 +41,12 @@ void apply_on_flat_and_dim_ix_with_stride_and_base( const size_t base, const size_t start, const size_t end) { - for (size_t i = start; i <= end; i++) { - fn(base + i * stride, i); - } + executorch::extension::parallel_for( + start, end + 1, [&](auto start_, auto end_) { + for (const auto i : c10::irange(start_, end_)) { + fn(base + i * stride, i); + } + }); } template diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index 739bc117fbf..eb48a6e59b1 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -302,8 +302,12 @@ def define_common_targets(): srcs = ["reduce_util.cpp"], exported_headers = ["reduce_util.h"], deps = [ - "//executorch/runtime/kernel:kernel_includes{}".format(suffix), "//executorch/runtime/core/exec_aten/util:tensor_util{}".format(suffix), + "//executorch/runtime/kernel:kernel_includes{}".format(suffix), + ], + exported_deps = [ + "//executorch/runtime/kernel:thread_parallel_interface", + "//executorch/runtime/core/portable_type/c10/c10:c10", ], exported_preprocessor_flags = ["-DUSE_ATEN_LIB"] if aten_mode else [], visibility = [ diff --git a/runtime/kernel/targets.bzl b/runtime/kernel/targets.bzl index d49435f2825..e67f76728b8 100644 --- a/runtime/kernel/targets.bzl +++ b/runtime/kernel/targets.bzl @@ -51,6 +51,19 @@ def define_common_targets(): preprocessor_flags = ["-DMAX_KERNEL_NUM=1"], ) + runtime.cxx_library( + name = "thread_parallel_interface", + exported_headers = ["thread_parallel_interface.h"], + exported_deps = [ + "//executorch/runtime/core:core", + "//executorch/runtime/platform:platform", + ], + visibility = [ + "//executorch/...", + "@EXECUTORCH_CLIENTS", + ], + ) + for aten_mode in get_aten_mode_options(): aten_suffix = "_aten" if aten_mode else "" diff --git a/runtime/kernel/thread_parallel_interface.h b/runtime/kernel/thread_parallel_interface.h new file mode 100644 index 00000000000..cd6901e48a6 --- /dev/null +++ b/runtime/kernel/thread_parallel_interface.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include +#include + +namespace executorch { +namespace extension { +namespace internal { +template +inline bool parallel_for_no_threadpool( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const Func& f) { + ET_CHECK_OR_RETURN_FALSE( + begin >= 0 && end >= 0 && end >= begin, + "begin = %" PRId64 ", end = %" PRId64, + begin, + end); + ET_CHECK_OR_RETURN_FALSE(grain_size > 0, "grain_size = %" PRId64, grain_size); + f(begin, end); + return true; +} + +// Match GRAIN_SIZE from PyTorch core. +// https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/TensorIterator.h#L78 +constexpr int64_t GRAIN_SIZE = 32768; + +} // namespace internal + +#ifdef ET_USE_THREADPOOL +/** + * A helper to run a function in parallel. + * + * begin, end: describe the extent of the workitems via first and last workitem + * to be processed + * grain_size: number of workitems processed by user callback which is + * described below + * f: user function applied in parallel to the chunks, signature: + * void f(int64_t begin, int64_t end) + * Returns true if all work items are processed successfully, false otherwise + * + * Warning: parallel_for does NOT copy thread local states from the current + * thread to the worker threads. Users need to protect the access to captured + * data if they mutate them in f. + */ +bool parallel_for( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const std::function& f); + +int64_t get_thread_num(); + +void set_thread_num(int64_t thread_num); +#else // ET_USE_THREADPOOL +template +bool parallel_for( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const Func& func) { + return internal::parallel_for_no_threadpool(begin, end, grain_size, func); +} + +inline int64_t get_thread_num() { + return 0; +} + +inline void set_thread_num(int64_t thread_num) { + ET_DCHECK_MSG(false, "cannot set_thread_num without threading support!"); +} +#endif // ET_USE_THREADPOOL + +/** + * Convenience version of parallel_for that sets the grain size to + * internal::GRAIN_SIZE. + */ +template +bool parallel_for(const int64_t begin, const int64_t end, const Func& func) { + return parallel_for(begin, end, internal::GRAIN_SIZE, func); +} +} // namespace extension +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::extension::get_thread_num; +using ::executorch::extension::parallel_for; +using ::executorch::extension::set_thread_num; +} // namespace executor +} // namespace torch diff --git a/test/utils/OSSTestConfig.json b/test/utils/OSSTestConfig.json index cc5e625f1e8..be594f9d5f4 100644 --- a/test/utils/OSSTestConfig.json +++ b/test/utils/OSSTestConfig.json @@ -59,6 +59,16 @@ "extension_tensor" ] }, + { + "directory": "extension/threadpool/test", + "sources": [ + "thread_parallel_test.cpp", + "threadpool_test.cpp" + ], + "additional_libs": [ + "extension_threadpool" + ] + }, { "directory": "kernels/portable/cpu/util/test", "sources": [