diff --git a/hwy/contrib/thread_pool/topology.cc b/hwy/contrib/thread_pool/topology.cc
index 3d24f4f4..17f35630 100644
--- a/hwy/contrib/thread_pool/topology.cc
+++ b/hwy/contrib/thread_pool/topology.cc
@@ -64,7 +64,7 @@
 
 namespace hwy {
 
-HWY_DLLEXPORT bool HaveThreadingSupport() {
+HWY_CONTRIB_DLLEXPORT bool HaveThreadingSupport() {
 #if HWY_ARCH_WASM
   return emscripten_has_threading_support() != 0;
 #else
@@ -72,7 +72,7 @@ HWY_DLLEXPORT bool HaveThreadingSupport() {
 #endif
 }
 
-HWY_DLLEXPORT size_t TotalLogicalProcessors() {
+HWY_CONTRIB_DLLEXPORT size_t TotalLogicalProcessors() {
   size_t lp = 0;
 #if HWY_ARCH_WASM
   const int num_cores = emscripten_num_logical_cores();
@@ -111,7 +111,7 @@ HWY_DLLEXPORT size_t TotalLogicalProcessors() {
 #include <sys/syscall.h>
 #endif
 
-HWY_DLLEXPORT bool GetThreadAffinity(LogicalProcessorSet& lps) {
+HWY_CONTRIB_DLLEXPORT bool GetThreadAffinity(LogicalProcessorSet& lps) {
 #if HWY_OS_WIN
   // Only support the first 64 because WINE does not support processor groups.
   const HANDLE hThread = GetCurrentThread();
@@ -173,7 +173,7 @@ HWY_DLLEXPORT bool GetThreadAffinity(LogicalProcessorSet& lps) {
 #endif
 }
 
-HWY_DLLEXPORT bool SetThreadAffinity(const LogicalProcessorSet& lps) {
+HWY_CONTRIB_DLLEXPORT bool SetThreadAffinity(const LogicalProcessorSet& lps) {
 #if HWY_OS_WIN
   const HANDLE hThread = GetCurrentThread();
   const DWORD_PTR prev = SetThreadAffinityMask(hThread, lps.Get64());
@@ -385,7 +385,7 @@ std::vector<PerPackage> DetectPackages(std::vector<Topology::LP>& lps) {
 }  // namespace
 #endif  // HWY_OS_LINUX
 
-HWY_DLLEXPORT Topology::Topology() {
+HWY_CONTRIB_DLLEXPORT Topology::Topology() {
 #if HWY_OS_LINUX
   lps.resize(TotalLogicalProcessors());
   const std::vector<PerPackage>& per_package = DetectPackages(lps);
diff --git a/hwy/contrib/thread_pool/topology.h b/hwy/contrib/thread_pool/topology.h
index 95b0835b..f80fc47c 100644
--- a/hwy/contrib/thread_pool/topology.h
+++ b/hwy/contrib/thread_pool/topology.h
@@ -28,7 +28,7 @@
 namespace hwy {
 
 // Returns false if std::thread should not be used.
-HWY_DLLEXPORT bool HaveThreadingSupport();
+HWY_CONTRIB_DLLEXPORT bool HaveThreadingSupport();
 
 // Upper bound on logical processors, including hyperthreads.
 static constexpr size_t kMaxLogicalProcessors = 1024;  // matches glibc
@@ -38,12 +38,12 @@ using LogicalProcessorSet = BitSet4096<kMaxLogicalProcessors>;
 
 // Returns false, or sets `lps` to all logical processors which are online and
 // available to the current thread.
-HWY_DLLEXPORT bool GetThreadAffinity(LogicalProcessorSet& lps);
+HWY_CONTRIB_DLLEXPORT bool GetThreadAffinity(LogicalProcessorSet& lps);
 
 // Ensures the current thread can only run on the logical processors in `lps`.
 // Returns false if not supported (in particular on Apple), or if the
 // intersection between `lps` and `GetThreadAffinity` is the empty set.
-HWY_DLLEXPORT bool SetThreadAffinity(const LogicalProcessorSet& lps);
+HWY_CONTRIB_DLLEXPORT bool SetThreadAffinity(const LogicalProcessorSet& lps);
 
 // Returns false, or ensures the current thread will only run on `lp`, which
 // must not exceed `TotalLogicalProcessors`. Note that this merely calls
@@ -58,11 +58,11 @@ static inline bool PinThreadToLogicalProcessor(size_t lp) {
 // provided by the hardware clamped to `kMaxLogicalProcessors`.
 // These processors are not necessarily all usable; you can determine which are
 // via GetThreadAffinity().
-HWY_DLLEXPORT size_t TotalLogicalProcessors();
+HWY_CONTRIB_DLLEXPORT size_t TotalLogicalProcessors();
 
 struct Topology {
   // Caller must check packages.empty(); if so, do not use any fields.
-  HWY_DLLEXPORT Topology();
+  HWY_CONTRIB_DLLEXPORT Topology();
 
   // Clique of cores with lower latency to each other. On Apple M1 these are
   // four cores sharing an L2. On Zen4 these 'CCX' are up to eight cores sharing