From ecd46d5ea083aa5a46573b6cf973d9c82e4e5664 Mon Sep 17 00:00:00 2001 From: Yangqing Jia Date: Thu, 3 Sep 2015 20:31:48 -0700 Subject: [PATCH] A memory pool implementation based on cnmem. Added cnmem license to LICENSE. --- LICENSE | 33 + caffe2/core/BREW | 15 + caffe2/core/context_gpu.h | 22 +- caffe2/core/cuda_memorypool.cc | 118 +++ caffe2/core/cuda_memorypool.h | 74 ++ caffe2/core/cuda_memorypool_test.cc | 64 ++ third_party/cnmem/BREW | 9 + third_party/cnmem/cnmem.cpp | 1287 +++++++++++++++++++++++++++ third_party/cnmem/cnmem.h | 263 ++++++ 9 files changed, 1868 insertions(+), 17 deletions(-) create mode 100644 caffe2/core/cuda_memorypool.cc create mode 100644 caffe2/core/cuda_memorypool.h create mode 100644 caffe2/core/cuda_memorypool_test.cc create mode 100644 third_party/cnmem/BREW create mode 100644 third_party/cnmem/cnmem.cpp create mode 100644 third_party/cnmem/cnmem.h diff --git a/LICENSE b/LICENSE index 03141bd5da2e6..9e3fe9c342721 100644 --- a/LICENSE +++ b/LICENSE @@ -124,3 +124,36 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. *** end zmqhpp license *** + +Some part of the caffe2 code (specifically, third_party/cnmem) comes from the +open-source cnmem code under the 2-clause BSD license. The cnmem license is +as follows: +*** begin cnmem license *** +/* ********************************************************************** + * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * ********************************************************************** */
+*** end cnmem license ***
diff --git a/caffe2/core/BREW b/caffe2/core/BREW
index 8ca5e9363bcfb..074fe32d87200 100644
--- a/caffe2/core/BREW
+++ b/caffe2/core/BREW
@@ -38,13 +38,16 @@ cuda_library(
   srcs = [
       "blob_serialization_gpu.cc",
       "common_gpu.cc",
+      "cuda_memorypool.cc",
   ],
   hdrs = [
       "common_gpu.h",
       "context_gpu.h",
+      "cuda_memorypool.h",
   ],
   deps = [
       ":core",
+      "//third_party/cnmem:cnmem",
   ],
   whole_archive = True,
 )
@@ -87,6 +90,18 @@ cc_test(
   ],
 )
 
+cc_test(
+  name = "cuda_memorypool_test",
+  srcs = [
+    "cuda_memorypool_test.cc",
+  ],
+  deps = [
+    ":core_gpu",
+    "//gtest:gtest",
+    "//gtest:gtest_main",
+  ],
+)
+
 cc_test(
   name = "registry_test",
   srcs = ["registry_test.cc"],
diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h
index ee0d84d071058..4d87b2c790435 100644
--- a/caffe2/core/context_gpu.h
+++ b/caffe2/core/context_gpu.h
@@ -3,6 +3,7 @@
 #include "caffe2/core/common_gpu.h"
 #include "caffe2/core/context.h"
+#include "caffe2/core/cuda_memorypool.h"
 #include "caffe2/core/types.h"
 #include "caffe2/proto/caffe2.pb.h"
 #include "glog/logging.h"
@@ -85,25 +86,12 @@ class CUDAContext {
     return curand_generator_;
   }
 
-  static void* New(size_t nbytes) {
-    void* dev_ptr;
-    CUDA_CHECK(cudaMalloc(&dev_ptr, nbytes));
-    // CUDA_CHECK(cudaMemset(dev_ptr, 0, nbytes));
-    return dev_ptr;
+  static inline void* New(size_t nbytes) {
+    return CudaMemoryPool::New(nbytes);
   }
 
-  static void Delete(void* data) {
-    cudaError_t error = cudaFree(data);
-    // For some reason, in Python runtime we sometimes delete a data pointer
-    // after the cuda runtime exits - this is odd but is probably caused by
-    // a static workspace that pycaffe2 uses, and the destruction got entangled
-    // in some race condition. Anyway, since cuda runtime is exiting anyway, we
-    // will not need to worry about memory leak, so we basically ignore it.
-    // This is definitely not ideal but works for now.
-    if (error != cudaSuccess && error != cudaErrorCudartUnloading) {
-      LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": "
-                 << cudaGetErrorString(error);
-    }
+  static inline void Delete(void* data) {
+    CudaMemoryPool::Delete(data);
   }
 
   template
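With this change, every CUDAContext allocation goes through CudaMemoryPool, which quietly falls back to raw cudaMalloc until the pool has been initialized. A sketch of how a caller might opt in, assuming one visible GPU (the device list and reserve fraction here are illustrative, not part of the patch):

    #include "caffe2/core/context_gpu.h"

    int main() {
      // Reserve 50% of the currently free memory on GPU 0 for the pool.
      CHECK(caffe2::CudaMemoryPool::InitializeMemoryPool({0}, 0.5));
      void* ptr = caffe2::CUDAContext::New(1 << 20);  // served by cnmem
      caffe2::CUDAContext::Delete(ptr);               // returned to the pool
      return caffe2::CudaMemoryPool::FinalizeMemoryPool() ? 0 : 1;
    }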
diff --git a/caffe2/core/cuda_memorypool.cc b/caffe2/core/cuda_memorypool.cc
new file mode 100644
index 0000000000000..ad407307774b7
--- /dev/null
+++ b/caffe2/core/cuda_memorypool.cc
@@ -0,0 +1,118 @@
+#include "third_party/cnmem/cnmem.h"
+#include "caffe2/core/cuda_memorypool.h"
+
+namespace caffe2 {
+
+#define CNMEM_CHECK(condition) \
+  do { \
+    cnmemStatus_t error = condition; \
+    CHECK_EQ(error, CNMEM_STATUS_SUCCESS) << cnmemGetErrorString(error); \
+  } while (0)
+
+bool CudaMemoryPool::is_memory_pool_setup_ = false;
+bool CudaMemoryPool::memory_allocated_before_setup_ = false;
+vector<bool> CudaMemoryPool::memory_pool_available_for_device_(0);
+vector<cudaStream_t> CudaMemoryPool::per_device_streams_(0);
+
+bool CudaMemoryPool::InitializeMemoryPool(
+    const vector<int>& device_ids,
+    const float proportion_of_memory_to_reserve) {
+  if (memory_allocated_before_setup_) {
+    LOG(ERROR) << "There is cuda memory allocated before we initialize the "
+                  "memory pool. This should not happen: you should either "
+                  "use raw cudaMalloc and cudaFree and not initialize the "
+                  "pool at all, or initialize the pool before you allocate "
+                  "anything.";
+    return false;
+  }
+  if (is_memory_pool_setup_) {
+    LOG(ERROR) << "Memory pool is already set up. It cannot be set up twice.";
+    return false;
+  }
+
+  // The actual initialization.
+  int device_count;
+  CUDA_CHECK(cudaGetDeviceCount(&device_count));
+  // Initialize the flags for the memory pool.
+  memory_pool_available_for_device_.resize(device_count, false);
+  per_device_streams_.resize(device_count, nullptr);
+  // Push the current device so we can recover later.
+  int initial_device;
+  CUDA_CHECK(cudaGetDevice(&initial_device));
+
+  vector<cnmemDevice_t> cnmem_devs(device_ids.size());
+  for (int i = 0; i < device_ids.size(); ++i) {
+    const int device_id = device_ids[i];
+    CHECK_GE(device_id, 0);
+    CHECK_LT(device_id, device_count);
+    // This ensures we do not specify the same device twice.
+    CHECK(!memory_pool_available_for_device_[device_id]);
+    CUDA_CHECK(cudaSetDevice(device_id));
+    size_t free_memory, total_memory;
+    CUDA_CHECK(cudaMemGetInfo(&free_memory, &total_memory));
+    LOG(INFO) << "Reserving " << proportion_of_memory_to_reserve * 100
+              << " percent of the free memory (total " << free_memory
+              << ") on device " << device_id;
+    // Note: we create a dummy non-null stream for memory allocations, so that
+    // any malloc can be called from any cuda stream, since caffe2 uses a lot of
+    // non-default streams for computation. We will allocate all the reserved
+    // memory to that non-null stream.
+    cnmem_devs[i].device = device_id;
+    cnmem_devs[i].size = size_t(proportion_of_memory_to_reserve * free_memory);
+    CUDA_CHECK(cudaStreamCreate(&per_device_streams_[device_id]));
+    cnmem_devs[i].numStreams = 1;
+    cnmem_devs[i].streams = &per_device_streams_[device_id];
+    cnmem_devs[i].streamSizes = &cnmem_devs[i].size;
+    memory_pool_available_for_device_[device_id] = true;
+  }
+  CNMEM_CHECK(
+      cnmemInit(cnmem_devs.size(), cnmem_devs.data(), CNMEM_FLAGS_DEFAULT));
+  // After initialization, let's set back the device.
+  CUDA_CHECK(cudaSetDevice(initial_device));
+  LOG(INFO) << "Set up memory pool.";
+  is_memory_pool_setup_ = true;
+  return true;
+}
+
+bool CudaMemoryPool::FinalizeMemoryPool() {
+  // If it has not been set up yet, we have nothing to do.
+  if (!is_memory_pool_setup_) {
+    return true;
+  }
+  CNMEM_CHECK(cnmemFinalize());
+  for (int i = 0; i < per_device_streams_.size(); ++i) {
+    if (per_device_streams_[i]) {
+      CUDA_CHECK(cudaStreamDestroy(per_device_streams_[i]));
+    }
+  }
+  // Reset all the static variables
+  per_device_streams_.resize(0);
+  memory_pool_available_for_device_.resize(0);
+  memory_allocated_before_setup_ = false;
+  is_memory_pool_setup_ = false;
+  return true;
+}
+
+void* CudaMemoryPool::NewWithMemoryPool(size_t nbytes) {
+  int device_id;
+  CUDA_CHECK(cudaGetDevice(&device_id));
+  CHECK(memory_pool_available_for_device_[device_id])
+      << "Trying to allocate on device " << device_id
+      << ", but memory pool is not initialized on that device.";
+  void* ptr;
+  CNMEM_CHECK(cnmemMalloc(&ptr, nbytes, per_device_streams_[device_id]));
+  return ptr;
+}
+
+void CudaMemoryPool::DeleteWithMemoryPool(void* data) {
+  cudaPointerAttributes attr;
+  CUDA_CHECK(cudaPointerGetAttributes(&attr, data));
+  DCHECK_EQ(attr.memoryType, cudaMemoryTypeDevice);
+  CHECK(memory_pool_available_for_device_[attr.device])
+      << "Current pointer belongs to " << attr.device
+      << ", but memory pool is not initialized on that device. "
+      << "Was your pointer allocated using the memory pool?";
+  CNMEM_CHECK(cnmemFree(data, per_device_streams_[attr.device]));
+}
+
+}  // namespace caffe2
\ No newline at end of file
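Delete takes no device argument: the pool recovers the owning device from the pointer itself through cudaPointerGetAttributes, as DeleteWithMemoryPool does above. A standalone sketch of that lookup, using the same pre-CUDA-8 attribute fields as the patch (the helper name is illustrative):

    #include <cuda_runtime.h>

    // Returns the device that owns a pointer, or -1 if it is not a device
    // pointer. Mirrors the check in CudaMemoryPool::DeleteWithMemoryPool.
    int DeviceOfPointer(const void* ptr) {
      cudaPointerAttributes attr;
      if (cudaPointerGetAttributes(&attr, ptr) != cudaSuccess ||
          attr.memoryType != cudaMemoryTypeDevice) {
        return -1;
      }
      return attr.device;
    }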
" + << "Was your pointer allocated using the memory pool?"; + CNMEM_CHECK(cnmemFree(data, per_device_streams_[attr.device])); +} + +} // namespace caffe2 \ No newline at end of file diff --git a/caffe2/core/cuda_memorypool.h b/caffe2/core/cuda_memorypool.h new file mode 100644 index 0000000000000..b62d2e428c1b9 --- /dev/null +++ b/caffe2/core/cuda_memorypool.h @@ -0,0 +1,74 @@ +#ifndef CAFFE2_CORE_CUDA_MEMORYPOOL_H_ +#define CAFFE2_CORE_CUDA_MEMORYPOOL_H_ + +#include + +#include "caffe2/core/common_gpu.h" +#include "glog/logging.h" + +namespace caffe2 { + +class CudaMemoryPool { + public: + // Initializes the memory pool on the device ids, and pre-preserves the given + // proportion of the currently free memory on the device. + static bool InitializeMemoryPool( + const vector& device_ids, + const float proportion_of_memory_to_reserve); + + // Finalizes the memory pool. This has to be called after all memory allocated + // by the memory pool has been freed. + static bool FinalizeMemoryPool(); + + static inline bool MemoryPoolInitialized() { return is_memory_pool_setup_; } + static inline bool MemoryPoolAvailableForDevice(int device_id) { + return (device_id < memory_pool_available_for_device_.size() && + memory_pool_available_for_device_[device_id]); + } + + static inline void* New(size_t nbytes) { + if (is_memory_pool_setup_) { + return NewWithMemoryPool(nbytes); + } else { + // If memory pool is not set up, use simple cudaMalloc. + void* dev_ptr; + CUDA_CHECK(cudaMalloc(&dev_ptr, nbytes)); + memory_allocated_before_setup_ = true; + return dev_ptr; + } + } + + static inline void Delete(void* data) { + if (is_memory_pool_setup_) { + DeleteWithMemoryPool(data); + } else { + // If memory pool is not set up, use simple cudaFree. + cudaError_t error = cudaFree(data); + // For some reason, in Python runtime we sometimes delete a data pointer + // after the cuda runtime exits - this is odd but is probably caused by + // a static workspace that pycaffe2 uses, and the destruction got entangled + // in some race condition. Anyway, since cuda runtime is exiting anyway, we + // will not need to worry about memory leak, so we basically ignore it. + // This is definitely not ideal but works for now. + if (error != cudaSuccess && error != cudaErrorCudartUnloading) { + LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": " + << cudaGetErrorString(error); + } + } + } + + private: + // CudaMemoryPool is a singleton, so it should not be instantiated. + CudaMemoryPool() {}; + static void* NewWithMemoryPool(size_t nbytes); + static void DeleteWithMemoryPool(void* data); + + static bool is_memory_pool_setup_; + static bool memory_allocated_before_setup_; + static vector memory_pool_available_for_device_; + static vector per_device_streams_; +}; + +} // namespace caffe2 + +#endif // CAFFE2_CORE_CUDA_MEMORYPOOL_H_ diff --git a/caffe2/core/cuda_memorypool_test.cc b/caffe2/core/cuda_memorypool_test.cc new file mode 100644 index 0000000000000..d36c210f7d634 --- /dev/null +++ b/caffe2/core/cuda_memorypool_test.cc @@ -0,0 +1,64 @@ +#include "caffe2/core/cuda_memorypool.h" +#include "caffe2/core/context_gpu.h" +#include "gtest/gtest.h" +#include "glog/logging.h" + +namespace caffe2 { + +struct UseMemoryPool { static const bool value = true; }; +struct NotUseMemoryPool { static const bool value = false; }; + +template +class MemoryPoolTest : public ::testing::Test { + protected: + MemoryPoolTest() : device_count_(0) {} + // virtual void SetUp() will be called before each test is run. 
diff --git a/caffe2/core/cuda_memorypool_test.cc b/caffe2/core/cuda_memorypool_test.cc
new file mode 100644
index 0000000000000..d36c210f7d634
--- /dev/null
+++ b/caffe2/core/cuda_memorypool_test.cc
@@ -0,0 +1,64 @@
+#include "caffe2/core/cuda_memorypool.h"
+#include "caffe2/core/context_gpu.h"
+#include "gtest/gtest.h"
+#include "glog/logging.h"
+
+namespace caffe2 {
+
+struct UseMemoryPool { static const bool value = true; };
+struct NotUseMemoryPool { static const bool value = false; };
+
+template <class UsePoolOrNot>
+class MemoryPoolTest : public ::testing::Test {
+ protected:
+  MemoryPoolTest() : device_count_(0) {}
+  // virtual void SetUp() will be called before each test is run. You
+  // should define it if you need to initialize the variables.
+  // Otherwise, this can be skipped.
+  void SetUp() override {
+    CUDA_CHECK(cudaGetDeviceCount(&device_count_));
+    // If we test with the memory pool, initialize the memory pool.
+    if (UsePoolOrNot::value) {
+      vector<int> device_ids(device_count_);
+      for (int i = 0; i < device_count_; ++i) {
+        device_ids[i] = i;
+      }
+      CHECK(CudaMemoryPool::InitializeMemoryPool(device_ids, 0.8));
+    }
+  }
+
+  void TearDown() override {
+    if (UsePoolOrNot::value) {
+      CHECK(CudaMemoryPool::FinalizeMemoryPool());
+    }
+  }
+
+  // Declares the variables your tests want to use.
+  int device_count_;
+};
+
+typedef ::testing::Types<UseMemoryPool, NotUseMemoryPool> MemoryPoolTestTypes;
+TYPED_TEST_CASE(MemoryPoolTest, MemoryPoolTestTypes);
+
+// This just tests that setup and teardown works.
+TYPED_TEST(MemoryPoolTest, InitializeAndFinalizeWorks) {
+  EXPECT_TRUE(true);
+}
+
+TYPED_TEST(MemoryPoolTest, AllocateAndDeallocate) {
+  const int nbytes = 1048576;
+  for (int i = 0; i < this->device_count_; ++i) {
+    LOG(INFO) << "Device " << i << " of " << this->device_count_;
+    CUDA_CHECK(cudaSetDevice(i));
+    void* allocated = CUDAContext::New(nbytes);
+    EXPECT_NE(allocated, nullptr);
+    cudaPointerAttributes attr;
+    CUDA_CHECK(cudaPointerGetAttributes(&attr, allocated));
+    EXPECT_EQ(attr.memoryType, cudaMemoryTypeDevice);
+    EXPECT_EQ(attr.device, i);
+    CUDAContext::Delete(allocated);
+  }
+}
+
+}  // namespace caffe2
diff --git a/third_party/cnmem/BREW b/third_party/cnmem/BREW
new file mode 100644
index 0000000000000..0937ac7e960ee
--- /dev/null
+++ b/third_party/cnmem/BREW
@@ -0,0 +1,9 @@
+cuda_library(
+  name = "cnmem",
+  srcs = [
+    "cnmem.cpp",
+  ],
+  hdrs = [
+    "cnmem.h",
+  ],
+)
\ No newline at end of file
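One detail worth highlighting before the vendored cnmem sources: CudaMemoryPool::New silently falls back to raw cudaMalloc when the pool is not set up, and records that fact, which permanently blocks later initialization in that process. An illustrative sequence (not part of the patch):

    void* early = caffe2::CUDAContext::New(64);  // no pool yet: raw cudaMalloc,
                                                 // sets memory_allocated_before_setup_
    caffe2::CUDAContext::Delete(early);          // raw cudaFree
    // Initialization now fails with an error log, because an allocation
    // already happened outside the pool:
    CHECK(!caffe2::CudaMemoryPool::InitializeMemoryPool({0}, 0.5));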
diff --git a/third_party/cnmem/cnmem.cpp b/third_party/cnmem/cnmem.cpp
new file mode 100644
index 0000000000000..f0bc54374ce76
--- /dev/null
+++ b/third_party/cnmem/cnmem.cpp
@@ -0,0 +1,1287 @@
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "cnmem.h"
+#include <cstddef>
+#include <vector>
+#include <cuda_runtime_api.h>
+
+#if !defined(WIN32) && defined(_MSC_VER)
+#define WIN32
+#endif
+
+#ifdef WIN32
+#include <windows.h>
+#else
+#include <pthread.h>
+#endif
+
+#define CNMEM_GRANULARITY 512
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+extern "C" const char* cnmemGetErrorString(cnmemStatus_t status) {
+    switch(status) {
+    case CNMEM_STATUS_SUCCESS: return "CNMEM_STATUS_SUCCESS";
+    case CNMEM_STATUS_CUDA_ERROR: return "CNMEM_STATUS_CUDA_ERROR";
+    case CNMEM_STATUS_INVALID_ARGUMENT: return "CNMEM_STATUS_INVALID_ARGUMENT";
+    case CNMEM_STATUS_NOT_INITIALIZED: return "CNMEM_STATUS_NOT_INITIALIZED";
+    case CNMEM_STATUS_OUT_OF_MEMORY: return "CNMEM_STATUS_OUT_OF_MEMORY";
+    default: return "CNMEM_STATUS_UNKNOWN_ERROR";
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if 0
+#ifdef WIN32
+#define CNMEM_DEBUG_ERROR(...) do { \
+    fprintf(stderr, "Error at line: %d\n", __LINE__); \
+    fprintf(stderr, __VA_ARGS__); \
+} while(0)
+#else
+#include <execinfo.h>
+static inline void printBacktrace() {
+    void *stackBuffer[64];
+    int numAddresses = backtrace((void**) &stackBuffer, 64);
+    char **addresses = backtrace_symbols(stackBuffer, numAddresses);
+    for( int i = 0 ; i < numAddresses ; ++i ) {
+        fprintf(stderr, "[%2d]: %s\n", i, addresses[i]);
+    }
+    free(addresses);
+}
+#define CNMEM_DEBUG_ERROR(...) do { \
+    fprintf(stderr, "Error at line: %d\n", __LINE__); \
+    fprintf(stderr, __VA_ARGS__); \
+    fprintf(stderr, "Backtrace:\n"); \
+    printBacktrace(); \
+} while(0)
+#endif
+#else
+#define CNMEM_DEBUG_ERROR(...)
+#endif
+
+#if 0
+#define CNMEM_DEBUG_INFO printf
+#else
+#define CNMEM_DEBUG_INFO(...)
+#endif
+
+#if 0 // Enable/disable assertions
+#include <assert.h>
+#define CNMEM_ASSERT assert
+#else
+#define CNMEM_ASSERT(...)
+#endif + +#define CNMEM_CHECK_TRUE(cond, error) do { \ + if( !(cond) ) { \ + CNMEM_DEBUG_ERROR("CNMEM_CHECK_TRUE evaluates to false\n"); \ + return error; \ + } \ +} while(0) + +#define CNMEM_CHECK(call) do { \ + cnmemStatus_t status = (call); \ + if( status != CNMEM_STATUS_SUCCESS ) { \ + CNMEM_DEBUG_ERROR("CNMEM_CHECK failed with status \"%s\"\n", \ + cnmemGetErrorString(status)); \ + return status; \ + } \ +} while(0) + +#define CNMEM_CHECK_OR_UNLOCK(call, mutex) do { \ + cnmemStatus_t status = (call); \ + if( status != CNMEM_STATUS_SUCCESS ) { \ + CNMEM_DEBUG_ERROR("CNMEM_CHECK_OR_UNLOCK failed with status \"%s\"\n", \ + cnmemGetErrorString(status)); \ + (mutex).unlock(); \ + return status; \ + } \ +} while(0) + +#define CNMEM_CHECK_CUDA(call) do { \ + cudaError_t cudaError = (call); \ + if( cudaError == cudaErrorMemoryAllocation ) { \ + CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA failed with CUDA error \"%s\"\n", \ + cudaGetErrorString(cudaError)); \ + return CNMEM_STATUS_OUT_OF_MEMORY; \ + } \ + else if( cudaError != cudaSuccess ) { \ + CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA failed with CUDA error \"%s\"\n", \ + cudaGetErrorString(cudaError)); \ + return CNMEM_STATUS_CUDA_ERROR; \ + } \ +} while(0) + +#define CNMEM_CHECK_CUDA_OR_UNLOCK(call, mutex) do { \ + cudaError_t cudaError = (call); \ + if( cudaError == cudaErrorMemoryAllocation ) { \ + CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA_OR_UNLOCK failed with CUDA error \"%s\"\n", \ + cudaGetErrorString(cudaError)); \ + (mutex).unlock(); \ + return CNMEM_STATUS_OUT_OF_MEMORY; \ + } \ + else if( cudaError != cudaSuccess ) { \ + CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA_OR_UNLOCK failed with CUDA error \"%s\"\n", \ + cudaGetErrorString(cudaError)); \ + (mutex).unlock(); \ + return CNMEM_STATUS_CUDA_ERROR; \ + } \ +} while(0) + +#ifdef WIN32 +#define CNMEM_CHECK_WIN32(call, error_code) do { \ + SetLastError(0); /* Clean the flag. */ \ + call; \ + DWORD status = GetLastError(); \ + if( status ) \ + return error_code; \ +} while(0) +#else +#define CNMEM_CHECK_PTHREAD(call, error_code) do { \ + int status = call; \ + if( status ) { \ + CNMEM_DEBUG_ERROR("CNMEM_CHECK_PTHREAD failed with status %d\n", status); \ + return error_code; \ + } \ +} while(0) +#endif + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cnmem { + +static inline std::size_t ceilInt(std::size_t m, std::size_t n) { + CNMEM_ASSERT(n > 0); + return (m + n-1) / n * n; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +class Mutex { +#ifdef WIN32 + mutable CRITICAL_SECTION mCriticalSection; +#else + pthread_mutex_t mMutex; +#endif + +public: + /// Initialize the mutex. + cnmemStatus_t initialize(); + /// Finalize the mutex. + cnmemStatus_t finalize(); + /// Lock the mutex. + cnmemStatus_t lock() const; + /// Unlock the mutex. 
+ cnmemStatus_t unlock() const; +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +cnmemStatus_t Mutex::initialize() { +#ifdef WIN32 + CNMEM_CHECK_WIN32(InitializeCriticalSection((CRITICAL_SECTION*) &mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR); +#else +#if 0 + pthread_mutexattr_t attr; + CNMEM_CHECK_PTHREAD(pthread_mutexattr_init(&attr), CNMEM_STATUS_UNKNOWN_ERROR); + CNMEM_CHECK_PTHREAD(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE), CNMEM_STATUS_UNKNOWN_ERROR); + CNMEM_CHECK_PTHREAD(pthread_mutex_init(&mMutex, &attr), CNMEM_STATUS_UNKNOWN_ERROR); +#else + CNMEM_CHECK_PTHREAD(pthread_mutex_init(&mMutex, NULL), CNMEM_STATUS_UNKNOWN_ERROR); +#endif +#endif + return CNMEM_STATUS_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +cnmemStatus_t Mutex::finalize() { +#ifdef WIN32 + CNMEM_CHECK_WIN32(DeleteCriticalSection((CRITICAL_SECTION*) &mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR); +#else + CNMEM_CHECK_PTHREAD(pthread_mutex_destroy(&mMutex), CNMEM_STATUS_UNKNOWN_ERROR); +#endif + return CNMEM_STATUS_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +cnmemStatus_t Mutex::lock() const { +#ifdef WIN32 + CNMEM_CHECK_WIN32(EnterCriticalSection(&mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR); +#else + CNMEM_CHECK_PTHREAD(pthread_mutex_lock((pthread_mutex_t*) &mMutex), CNMEM_STATUS_UNKNOWN_ERROR); +#endif + return CNMEM_STATUS_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +cnmemStatus_t Mutex::unlock() const { +#ifdef WIN32 + CNMEM_CHECK_WIN32(LeaveCriticalSection(&mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR); +#else + CNMEM_CHECK_PTHREAD(pthread_mutex_unlock((pthread_mutex_t*) &mMutex), CNMEM_STATUS_UNKNOWN_ERROR); +#endif + return CNMEM_STATUS_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +class Block { + /// The pointer to the memory region on the device. + char *mData; + /// The size of the memory buffer. + std::size_t mSize; + /// The prev/next blocks in the linked list of blocks. + Block *mNext; + /// Is it a head node (i.e. a node obtained from parent->allocate or cudaMalloc). + bool mIsHead; + +public: + /// Create a block. + Block(char *data, std::size_t size, Block *next, bool isHead) + : mData(data) + , mSize(size) + , mNext(next) + , mIsHead(isHead) { + } + + /// The data. + inline const char* getData() const { return mData; } + /// The data (mutable). + inline char* getData() { return mData; } + + /// The size of the block. + inline std::size_t getSize() const { return mSize; } + + /// The next block in the linked list. + inline const Block* getNext() const { return mNext; } + /// The next block in the linked list (mutable). + inline Block* getNext() { return mNext; } + + /// Is it a head block. + inline bool isHead() const { return mIsHead; } + + /// Change the next block. + inline void setNext(Block *next) { mNext = next; } + /// Change the size of the block. + inline void setSize(std::size_t size) { mSize = size; } + /// Set the head flag. + inline void setHeadFlag(bool isHead) { mIsHead = isHead; } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +class Manager { + + /// The parent manager. + Manager *mParent; + /// The children managers. 
+    std::vector<Manager*> mChildren;
+    /// The GPU device where the memory is allocated.
+    int mDevice;
+    /// The stream this manager is associated with. It could be NULL.
+    cudaStream_t mStream;
+    /// Is the stream blocking?
+    bool mIsStreamBlocking;
+    /// The list of used blocks.
+    Block *mUsedBlocks;
+    /// The list of free blocks.
+    Block *mFreeBlocks;
+    /// The managed memory size.
+    std::size_t mSize;
+    /// The flags.
+    unsigned mFlags;
+    /// To support multi-threading. Each manager has its own mutex.
+    Mutex mMutex;
+
+public:
+    /// Create an uninitialized manager.
+    Manager();
+    /// Dtor.
+    ~Manager();
+
+    /// Allocate a block of memory.
+    cnmemStatus_t allocate(void *&ptr, std::size_t size, bool isBlocking = true);
+    /// Release a block of memory.
+    cnmemStatus_t release(void *ptr);
+    /// Release memory. It returns true if we have no memory leak.
+    cnmemStatus_t releaseAllUnsafe();
+    /// Reserve memory for a manager.
+    cnmemStatus_t reserve(std::size_t size);
+    /// Steal memory from another manager.
+    cnmemStatus_t stealUnsafe(void *&ptr, std::size_t size);
+
+    /// Print the full memory state.
+    cnmemStatus_t printMemoryState(FILE *file) const;
+
+    /// The amount of used memory.
+    inline cnmemStatus_t getUsedMemoryUnsafe(std::size_t &usedMemory) const {
+        return getMemoryUnsafe(usedMemory, mUsedBlocks);
+    }
+    /// The amount of free memory.
+    inline cnmemStatus_t getFreeMemoryUnsafe(std::size_t &freeMemory) const {
+        return getMemoryUnsafe(freeMemory, mFreeBlocks);
+    }
+
+    /// Get a specific child based on the stream id.
+    cnmemStatus_t getChildFromStream(Manager *&manager, cudaStream_t stream) const;
+    /// Get a specific child based on its index.
+    cnmemStatus_t getChild(Manager *&manager, std::size_t i) const;
+    /// Add a new child.
+    cnmemStatus_t addChild(Manager *manager);
+    /// The number of children.
+    cnmemStatus_t getNumChildren(std::size_t &numChildren) const;
+
+    /// The associated device.
+    inline int getDevice() const { return mDevice; }
+    /// The flags.
+    inline unsigned getFlags() const { return mFlags; }
+    /// Get the mutex.
+    inline const Mutex* getMutex() const { return &mMutex; }
+    /// The size allocated to that manager.
+    inline std::size_t getSize() const { return mSize; }
+    /// The CUDA stream.
+    inline cudaStream_t getStream() const { return mStream; }
+
+    /// Define the parent.
+    inline void setParent(Manager *parent) { mParent = parent; }
+    /// Define the device.
+    inline void setDevice(int device) { mDevice = device; }
+    /// Define the stream.
+    inline cnmemStatus_t setStream(cudaStream_t stream) {
+        mStream = stream;
+#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM
+        mIsStreamBlocking = false;
+#elif CUDART_VERSION < 5050
+        mIsStreamBlocking = true;
+#else
+        unsigned flags = 0;
+        CNMEM_CHECK_CUDA(cudaStreamGetFlags(mStream, &flags));
+        mIsStreamBlocking = !mStream || !(flags & cudaStreamNonBlocking);
+#endif
+        return CNMEM_STATUS_SUCCESS;
+    }
+    /// Define the flags.
+    inline void setFlags(unsigned flags) { mFlags = flags; }
+
+private:
+    /// The member functions below which are marked "Unsafe" are not thread-safe when called on
+    /// the same Manager object. Make sure they are called by a single thread in that case.
+
+    /// Allocate a new block and add it to the free list.
+    cnmemStatus_t allocateBlockUnsafe(Block *&curr, Block *&prev, std::size_t size);
+    /// Release a block from the active list.
+    cnmemStatus_t releaseBlockUnsafe(Block *curr, Block *prev);
+    /// Find the best free node based on the size.
+ cnmemStatus_t findBestBlockUnsafe(Block *&curr, Block *&prev, std::size_t size); + /// Extract a node from the list of free blocks. + cnmemStatus_t extractBlockUnsafe(Block *curr, Block *prev, std::size_t size, bool stolen); + + /// Give a free block from that manager. + cnmemStatus_t giveBlockUnsafe(void *&data, std::size_t &dataSize, std::size_t size); + /// Steal a block from another manager. + cnmemStatus_t stealBlockUnsafe(void *&data, std::size_t &dataSize, std::size_t size); + + /// The memory consumption of a list. + cnmemStatus_t getMemoryUnsafe(std::size_t &memSize, const Block *head) const; + /// Print an internal linked list. + cnmemStatus_t printListUnsafe(FILE *file, const char *name, const Block *head) const; +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +Manager::Manager() + : mParent(NULL) + , mChildren() + , mDevice(-1) + , mStream(NULL) + , mIsStreamBlocking(false) + , mUsedBlocks(NULL) + , mFreeBlocks(NULL) + , mSize(0) + , mFlags(CNMEM_FLAGS_DEFAULT) + , mMutex() { + + mMutex.initialize(); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +Manager::~Manager() { + if( mDevice == -1 || cudaSetDevice(mDevice) != cudaSuccess ) { // Invalid device, skip it. + return; + } + releaseAllUnsafe(); + mMutex.finalize(); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +cnmemStatus_t Manager::addChild(Manager *manager) { + CNMEM_CHECK(mMutex.lock()); + mChildren.push_back(manager); + CNMEM_CHECK(mMutex.unlock()); + return CNMEM_STATUS_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +cnmemStatus_t Manager::allocate(void *&ptr, std::size_t size, bool isBlocking) { + CNMEM_CHECK(mMutex.lock()); + + // If the client is not blocking, we have to explicitly synchronize before giving one buffer. + if( !isBlocking ) { + CNMEM_CHECK_CUDA_OR_UNLOCK(cudaStreamSynchronize(mStream), mMutex); + } + + // Find the best fit. + Block *best = NULL, *prev = NULL; + CNMEM_CHECK_OR_UNLOCK(findBestBlockUnsafe(best, prev, size), mMutex); + + // If there's no block left in the list of free blocks (with a sufficient size). Request a new block. + if( best == NULL && !(mFlags & CNMEM_FLAGS_CANNOT_GROW) ) { + CNMEM_CHECK_OR_UNLOCK(allocateBlockUnsafe(best, prev, size), mMutex); + } + + // Make sure we do have a block or quit. + if( !best ) { + ptr = NULL; + CNMEM_CHECK(mMutex.unlock()); + return CNMEM_STATUS_OUT_OF_MEMORY; + } + + // Split the free block if needed. + CNMEM_CHECK_OR_UNLOCK(extractBlockUnsafe(best, prev, size, false), mMutex); + + // Push the node to the list of used nodes. + best->setNext(mUsedBlocks); + mUsedBlocks = best; + + // Return the new pointer into memory. + ptr = mUsedBlocks->getData(); + CNMEM_CHECK(mMutex.unlock()); + return CNMEM_STATUS_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +cnmemStatus_t Manager::allocateBlockUnsafe(Block *&curr, Block *&prev, std::size_t size) { + // Reset the outputs. + curr = prev = NULL; + + // Try to allocate data from the parent or the device. 
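+    // A child manager (one per named stream) forwards the request to its
+    // parent, the per-device root manager; only the root ends up calling
+    // cudaMalloc. The new block is then inserted into the free list, which
+    // is kept sorted by device address so neighboring blocks can be merged
+    // when they are released.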
+ void *data = NULL; + if( mParent ) { + CNMEM_CHECK(mParent->allocate(data, size, mIsStreamBlocking)); + } + else { + CNMEM_DEBUG_INFO("cudaMalloc(%lu)\n", size); + CNMEM_CHECK_CUDA(cudaMalloc(&data, size)); + CNMEM_DEBUG_INFO(">> returned address=0x%016lx\n", (size_t) data); + } + + // If it failed, there's an unexpected issue. + CNMEM_ASSERT(data); + + // We have data, we now need to add it to the list of free nodes. We keep the list sorted. + Block *next = mFreeBlocks; + for( ; next && next->getData() < data ; next = next->getNext() ) { + prev = next; + } + curr = new Block((char*) data, size, next, true); + if( !curr ) { + return CNMEM_STATUS_OUT_OF_MEMORY; + } + if( prev ) { + prev->setNext(curr); + } + else { + mFreeBlocks = curr; + } + + return CNMEM_STATUS_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +cnmemStatus_t Manager::extractBlockUnsafe(Block *curr, Block *prev, std::size_t size, bool stolen) { + // We have two cases: 1/ It is the right size so we keep it or 2/ it is too large and we split the node. + Block *next; + if( curr->getSize() == size ) { + next = curr->getNext(); + } + else { + std::size_t remaining = curr->getSize()-size; + Block *newBlock = new Block(curr->getData() + size, remaining, curr->getNext(), stolen); + if( !newBlock ) { + return CNMEM_STATUS_OUT_OF_MEMORY; + } + next = newBlock; + curr->setSize(size); + } + + // Redo the "branching" in the nodes. + if( prev ) { + prev->setNext(next); + } + else { + mFreeBlocks = next; + } + return CNMEM_STATUS_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +cnmemStatus_t Manager::findBestBlockUnsafe(Block *&best, Block *&prev, std::size_t size) { + best = NULL, prev = NULL; + for( Block *temp = mFreeBlocks, *tempPrev = NULL ; temp ; temp = temp->getNext() ) { + if( temp->getSize() >= size && (!best || temp->getSize() < best->getSize()) ) { + best = temp; + prev = tempPrev; + } + tempPrev = temp; + } + return CNMEM_STATUS_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +cnmemStatus_t Manager::getChildFromStream(Manager *&manager, cudaStream_t stream) const { + CNMEM_CHECK(mMutex.lock()); + std::size_t i = 0, numChildren = mChildren.size(); + for( ; i < numChildren ; ++i ) { + if( mChildren[i]->mStream == stream ) { + manager = mChildren[i]; + break; + } + } + CNMEM_CHECK(mMutex.unlock()); + return i < numChildren ? 
CNMEM_STATUS_SUCCESS : CNMEM_STATUS_INVALID_ARGUMENT; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +cnmemStatus_t Manager::getChild(Manager *&manager, std::size_t i) const { + CNMEM_CHECK(mMutex.lock()); + if( i >= mChildren.size() ) { + CNMEM_CHECK(mMutex.unlock()); + return CNMEM_STATUS_INVALID_ARGUMENT; + } + manager = mChildren[i]; + + CNMEM_CHECK(mMutex.unlock()); + return CNMEM_STATUS_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +cnmemStatus_t Manager::getMemoryUnsafe(std::size_t &size, const Block *head) const { + size = 0; + for( Block *curr = (Block*) head ; curr ; curr = curr->getNext() ) { + size += curr->getSize(); + } + return CNMEM_STATUS_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#if 0 +cnmemStatus_t Manager::getMemory(std::size_t &size, const Block *head) const { + CNMEM_CHECK(mMutex.lock()); + CNMEM_CHECK_OR_UNLOCK(getMemoryUnsafe(size, head)); + CNMEM_CHECK(mMutex.unlock()); + return status; +} +#endif + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +cnmemStatus_t Manager::getNumChildren(std::size_t &numChildren) const { + CNMEM_CHECK(mMutex.lock()); + numChildren = mChildren.size(); + CNMEM_CHECK(mMutex.unlock()); + return CNMEM_STATUS_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +cnmemStatus_t Manager::giveBlockUnsafe(void *&blockData, std::size_t &blockSize, std::size_t size) { + // Make sure the block is not in use any more. It could be too coarse grain and we may change + // it in the future. + CNMEM_CHECK_CUDA(cudaStreamSynchronize(mStream)); + + // Init the returned values to 0. + blockData = NULL; + blockSize = 0; + + // Find the best node to steal and reserve it. + Block *best = NULL, *prev = NULL; + CNMEM_CHECK(findBestBlockUnsafe(best, prev, size)); + if( !best ) { + return CNMEM_STATUS_OUT_OF_MEMORY; + } + CNMEM_CHECK(extractBlockUnsafe(best, prev, size, true)); + blockData = best->getData(); + blockSize = best->getSize(); + + // Release the memory used by that block. 
+ delete best; + return CNMEM_STATUS_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +cnmemStatus_t Manager::printListUnsafe(FILE *file, const char *name, const Block *head) const { + std::size_t size = 0; + for( Block *curr = (Block*) head; curr; curr = curr->getNext() ) { + size += curr->getSize(); + } + fprintf(file, "| list=\"%s\", size=%lu\n", name, size); + for( Block *curr = (Block*) head ; curr ; curr = curr->getNext() ) { + fprintf(file, "| | node=0x%016lx, data=0x%016lx, size=%lu, next=0x%016lx, head=%2lu\n", + (std::size_t) curr, + (std::size_t) curr->getData(), + (std::size_t) curr->getSize(), + (std::size_t) curr->getNext(), + (std::size_t) curr->isHead ()); + } + fprintf(file, "|\n"); + return CNMEM_STATUS_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +cnmemStatus_t Manager::printMemoryState(FILE *file) const { + CNMEM_CHECK(mMutex.lock()); + std::size_t streamCode = (std::size_t) mStream; + std::size_t usedMemory, freeMemory; + CNMEM_CHECK_OR_UNLOCK(getUsedMemoryUnsafe(usedMemory), mMutex); + CNMEM_CHECK_OR_UNLOCK(getFreeMemoryUnsafe(freeMemory), mMutex); + + fprintf(file, ">> [%s] device=%d, stream=0x%016lx, used=%luB, free=%luB\n", + mParent ? "child" : "root", + mDevice, + streamCode, + usedMemory, + freeMemory); + CNMEM_CHECK_OR_UNLOCK(printListUnsafe(file, "used", mUsedBlocks), mMutex); + CNMEM_CHECK_OR_UNLOCK(printListUnsafe(file, "free", mFreeBlocks), mMutex); + fprintf(file, "\n"); + CNMEM_CHECK(mMutex.unlock()); + + if( mParent ) { + CNMEM_CHECK(mParent->printMemoryState(file)); + } + return CNMEM_STATUS_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +cnmemStatus_t Manager::release(void *ptr) { + // Skip if ptr is NULL. + if( ptr == NULL ) { + return CNMEM_STATUS_SUCCESS; + } + + // Lock to make sure only one thread execute that fragment of code. + CNMEM_CHECK(mMutex.lock()); + + // Find the node in the list of used blocks. + Block *curr = mUsedBlocks, *prev = NULL; + for( ; curr && curr->getData() != ptr ; curr = curr->getNext() ) { + prev = curr; + } + + // Make sure we have found a node. + if( curr == NULL ) { + CNMEM_CHECK(mMutex.unlock()); + return CNMEM_STATUS_INVALID_ARGUMENT; + } + + // We have the node so release it. + cnmemStatus_t result = releaseBlockUnsafe(curr, prev); + CNMEM_CHECK(mMutex.unlock()); + return result; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +cnmemStatus_t Manager::releaseAllUnsafe() { + // Destroy the children if any. + for( std::size_t i = 0; i < mChildren.size(); ++i ) { + Manager *child = mChildren[i]; + CNMEM_CHECK(child->releaseAllUnsafe()); + delete child; + } + mChildren.clear(); + + // Destroy used blocks. It's a kind of panic mode to avoid leaks. NOTE: Do that only with roots!!! + if( !mParent ) { + while( mUsedBlocks ) { + CNMEM_CHECK(releaseBlockUnsafe(mUsedBlocks, NULL)); + } + } + + // We should be having only free blocks that are head blocks. Release those blocks. 
+ while( mFreeBlocks ) { + if( mParent ) { + CNMEM_CHECK(mParent->release(mFreeBlocks->getData())); + } + else if( mFreeBlocks->isHead() ) { + void *data = mFreeBlocks->getData(); + CNMEM_DEBUG_INFO("cudaFree(%lu, 0x%016lx)\n", mFreeBlocks->getSize(), (size_t) data); + CNMEM_CHECK_CUDA(cudaFree(data)); + CNMEM_DEBUG_INFO(">> success\n"); + } + Block *block = mFreeBlocks; + mFreeBlocks = mFreeBlocks->getNext(); + delete block; + } + + // We shouldn't have any used block left. Or, it means the user is causing memory leaks! + return CNMEM_STATUS_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +cnmemStatus_t Manager::releaseBlockUnsafe(Block *curr, Block *prev) { + // The current node cannot be NULL! + CNMEM_ASSERT(curr != NULL); + + // Change the connection of the node. + if( prev ) { + prev->setNext(curr->getNext()); + } + else { + mUsedBlocks = curr->getNext(); + } + + // Find the location where this block should be added to the free list. + prev = NULL; + Block *iter = mFreeBlocks; + for( ; iter && iter->getData() < curr->getData() ; iter = iter->getNext() ) { + prev = iter; + } + + // Keep track of the successor of pred. We may lose track of it in the following "else". + Block *next = prev ? prev->getNext() : mFreeBlocks; + + // We first check if we can merge the block with its predecessor in the list and curr can be merged. + if( prev && prev->getData() + prev->getSize() == curr->getData() && !curr->isHead() ) { + prev->setSize(prev->getSize() + curr->getSize()); + delete curr; + curr = prev; + } + else if( prev ) { + prev->setNext(curr); + } + else { + mFreeBlocks = curr; + } + + // Check if we can merge curr and next. We can't merge over "cudaMalloc" boundaries. + if( next && curr->getData() + curr->getSize() == next->getData() && !next->isHead() ) { + curr->setSize(curr->getSize() + next->getSize()); + curr->setNext(next->getNext()); + delete next; + } + else { + curr->setNext(next); + } + return CNMEM_STATUS_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +cnmemStatus_t Manager::reserve(std::size_t size) { + CNMEM_CHECK(mMutex.lock()); + Block *curr, *prev; + CNMEM_CHECK_OR_UNLOCK(allocateBlockUnsafe(curr, prev, size), mMutex); + mSize = size; + CNMEM_CHECK(mMutex.unlock()); + return CNMEM_STATUS_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +cnmemStatus_t Manager::stealUnsafe(void *&stolen, std::size_t size) { + // If we cannot steal, don't even try. + if( mFlags & CNMEM_FLAGS_CANNOT_STEAL ) { + stolen = NULL; + return CNMEM_STATUS_INVALID_ARGUMENT; + } + + // The stolen block. + void *data = NULL; std::size_t dataSize = 0; + if( !mChildren.empty() ) { + CNMEM_CHECK(stealBlockUnsafe(data, dataSize, size)); + } + else if( mParent ) { + CNMEM_CHECK(mParent->stealBlockUnsafe(data, dataSize, size)); + } + + // Make sure we do have a block of memory or quit. + if( !data ) { + stolen = NULL; + return CNMEM_STATUS_OUT_OF_MEMORY; + } + + // Push the block in the used list. + mUsedBlocks = new Block((char*) data, dataSize, mUsedBlocks, true); + if( !mUsedBlocks ) { + return CNMEM_STATUS_OUT_OF_MEMORY; + } + + // Return the new pointer into memory. 
+    stolen = data;
+    return CNMEM_STATUS_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+cnmemStatus_t Manager::stealBlockUnsafe(void *&data, std::size_t &dataSize, std::size_t size) {
+    // No block found and no room to grow. Try to steal from a child (if we have any).
+    data = NULL;
+    for( std::size_t i = 0 ; !data && i < mChildren.size() ; ++i ) {
+        Manager *child = mChildren[i];
+        if( child->giveBlockUnsafe(data, dataSize, size) == CNMEM_STATUS_SUCCESS ) {
+            break;
+        }
+    }
+
+    // If no memory space found, simply return NULL. We have failed to allocate. Quit miserably.
+    if( !data ) {
+        return CNMEM_STATUS_OUT_OF_MEMORY;
+    }
+
+    // We have got a node from a child. We need to update our "used" list before we can do
+    // anything with it.
+    Block *curr = mUsedBlocks, *prev = NULL;
+    for( ; curr ; curr = curr->getNext() ) {
+        if( curr->getData() <= data && data < curr->getData()+curr->getSize() ) {
+            break;
+        }
+        prev = curr;
+    }
+
+    // Curr points to the node which contains that memory region.
+    CNMEM_ASSERT(curr);
+
+    // If it is exactly the same memory region, we are done!!!
+    if( curr->getData() == data && curr->getSize() == dataSize ) {
+        return CNMEM_STATUS_SUCCESS;
+    }
+
+    // Track the blocks before and after curr.
+    Block *next = curr->getNext();
+
+    // We may have up to 3 blocks.
+    std::size_t sizeBefore = (std::size_t) ((char*) data - curr->getData());
+    std::size_t sizeAfter = (curr->getSize() - sizeBefore - dataSize);
+
+    // The resulting block.
+    Block *result = curr;
+
+    // If we have no space between curr->getData and block->getData.
+    if( sizeBefore == 0 ) {
+        curr->setSize(dataSize);
+    }
+    else {
+        curr->setSize(sizeBefore);
+        Block *block = new Block((char*) data, dataSize, next, false);
+        if( !block ) {
+            return CNMEM_STATUS_OUT_OF_MEMORY;
+        }
+        curr->setNext(block);
+        curr = block;
+        data = (char*) data + dataSize;
+        dataSize = sizeAfter;
+        result = block;
+    }
+
+    // We have space at the end so we may need to add a new node.
+    if( sizeAfter > 0 ) {
+        Block *block = new Block(curr->getData() + curr->getSize(), sizeAfter, next, false);
+        if( !block ) {
+            return CNMEM_STATUS_OUT_OF_MEMORY;
+        }
+        curr->setNext(block);
+        curr = block;
+    }
+    return CNMEM_STATUS_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+class Context {
+    /// Use a magic number to specify that the context is valid.
+    enum { CTX_VALID = 0x1f5632a3 };
+
+    /// The reference counting mechanism.
+    int mRefCount;
+    /// The mutex to increase/decrease the reference counter. TODO: Use atomics.
+    Mutex mMutex;
+    /// The memory managers.
+    std::vector<Manager> mManagers;
+    /// The global context.
+    static Context *sCtx;
+    /// Use a magic number to specify that the context was created.
+    static int sCtxCheck;
+
+public:
+    /// Ctor.
+    Context() : mRefCount(1) { mMutex.initialize(); }
+    /// Dtor.
+    ~Context();
+    /// Get the managers.
+    inline std::vector<Manager>& getManagers() { return mManagers; }
+    /// Get a single manager associated with a device.
+    inline Manager& getManager(int i) { return mManagers[i]; }
+
+    /// Create the global context.
+    static cnmemStatus_t create();
+    /// Check that the context was created.
+    static inline bool check() { return sCtxCheck == CTX_VALID && sCtx; }
+    /// Get the global context.
+    static Context* get();
+    /// Retain.
+    static cnmemStatus_t retain();
+    /// Release.
+    static cnmemStatus_t release();
+};
+
+Context *Context::sCtx;
+int Context::sCtxCheck;
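+// Lifetime note: cnmemInit() creates the context with a reference count of
+// one; cnmemRetain() and cnmemRelease() adjust that count, and the release
+// that drops it to zero deletes the context, so cnmemFinalize() amounts to
+// one release.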
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+Context::~Context() {
+    int oldDevice;
+    cudaGetDevice(&oldDevice);
+    for( std::size_t i = 0 ; i < mManagers.size() ; ++i ) {
+        if( mManagers[i].getDevice() != -1 ) { // Skip invalid managers.
+            cudaSetDevice(mManagers[i].getDevice());
+            mManagers[i].releaseAllUnsafe();
+        }
+    }
+    mManagers.clear();
+    mMutex.finalize();
+    cudaSetDevice(oldDevice);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+cnmemStatus_t Context::create() {
+    sCtx = new Context;
+    sCtxCheck = CTX_VALID;
+    return CNMEM_STATUS_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+Context* Context::get() {
+    CNMEM_ASSERT(Context::check());
+    return Context::sCtx;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+cnmemStatus_t Context::retain() {
+    CNMEM_CHECK(sCtx->mMutex.lock());
+    sCtx->mRefCount++;
+    CNMEM_CHECK(sCtx->mMutex.unlock());
+    return CNMEM_STATUS_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+cnmemStatus_t Context::release() {
+    CNMEM_CHECK(sCtx->mMutex.lock());
+    int refCount = --sCtx->mRefCount;
+    CNMEM_CHECK(sCtx->mMutex.unlock());
+
+    if( refCount == 0 ) { // Kill the context.
+        delete sCtx;
+        Context::sCtx = NULL;
+        Context::sCtxCheck = 0;
+    }
+    return CNMEM_STATUS_SUCCESS;
+}
+
+} // namespace cnmem
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+extern "C" {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+cnmemStatus_t cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags) {
+    // Make sure we have at least one device declared.
+    CNMEM_CHECK_TRUE(numDevices > 0, CNMEM_STATUS_INVALID_ARGUMENT);
+
+    // Find the largest ID of the device.
+    int maxDevice = 0;
+    for( int i = 0 ; i < numDevices ; ++i ) {
+        if( devices[i].device > maxDevice ) {
+            maxDevice = devices[i].device;
+        }
+    }
+
+    // Create the global context.
+    cnmem::Context::create();
+    cnmem::Context *ctx = cnmem::Context::get();
+
+    // Allocate enough managers.
+    CNMEM_CHECK_TRUE(maxDevice >= 0, CNMEM_STATUS_INVALID_ARGUMENT);
+    std::vector<cnmem::Manager> &managers = ctx->getManagers();
+    managers.resize(maxDevice+1);
+
+    // Create a root manager for each device and create the children.
+ int oldDevice; + CNMEM_CHECK_CUDA(cudaGetDevice(&oldDevice)); + for( int i = 0 ; i < numDevices ; ++i ) { + CNMEM_CHECK_CUDA(cudaSetDevice(devices[i].device)); + std::size_t size = devices[i].size; + if( size == 0 ) { + cudaDeviceProp props; + CNMEM_CHECK_CUDA(cudaGetDeviceProperties(&props, devices[i].device)); + size = props.totalGlobalMem / 2; + } + CNMEM_CHECK_TRUE(size > 0, CNMEM_STATUS_INVALID_ARGUMENT); + + cnmem::Manager &manager = ctx->getManager(devices[i].device); + manager.setDevice(devices[i].device); + manager.setFlags(flags); + + size = cnmem::ceilInt(size, CNMEM_GRANULARITY); + CNMEM_CHECK(manager.reserve(size)); + + for( int j = 0 ; j < devices[i].numStreams ; ++j ) { + cnmem::Manager *child = new cnmem::Manager; + child->setParent(&manager); + child->setDevice(devices[i].device); + child->setStream(devices[i].streams[j]); + child->setFlags(flags & ~CNMEM_FLAGS_CANNOT_GROW); + if( devices[i].streamSizes && devices[i].streamSizes[j] > 0 ) { + CNMEM_CHECK(child->reserve(devices[i].streamSizes[j])); + } + CNMEM_CHECK(manager.addChild(child)); + } + } + CNMEM_CHECK_CUDA(cudaSetDevice(oldDevice)); + return CNMEM_STATUS_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +cnmemStatus_t cnmemFinalize() { + CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED); + return cnmem::Context::release(); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +cnmemStatus_t cnmemRetain() { + CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED); + return cnmem::Context::retain(); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +cnmemStatus_t cnmemRelease() { + CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED); + return cnmem::Context::release(); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +cnmemStatus_t cnmemRegisterStream(cudaStream_t stream) { + CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED); + CNMEM_CHECK_TRUE(stream, CNMEM_STATUS_INVALID_ARGUMENT); + + int device; + CNMEM_CHECK_CUDA(cudaGetDevice(&device)); + + cnmem::Manager &root = cnmem::Context::get()->getManager(device); + cnmem::Manager *child = new cnmem::Manager; + child->setParent(&root); + child->setDevice(device); + child->setStream(stream); + child->setFlags(root.getFlags() & ~CNMEM_FLAGS_CANNOT_GROW); + root.addChild(child); + + return CNMEM_STATUS_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +cnmemStatus_t cnmemMalloc(void **ptr, std::size_t size, cudaStream_t stream) { + CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED); + if( !ptr && !size ) { + return CNMEM_STATUS_SUCCESS; + } + else if( !size ) { + ptr[0] = NULL; + return CNMEM_STATUS_SUCCESS; + } + CNMEM_CHECK_TRUE(ptr, CNMEM_STATUS_INVALID_ARGUMENT); + + int device; + CNMEM_CHECK_CUDA(cudaGetDevice(&device)); + + cnmem::Manager &root = cnmem::Context::get()->getManager(device); + cnmem::Manager *manager = &root; + if( stream ) { + CNMEM_CHECK(root.getChildFromStream(manager, stream)); + } + CNMEM_ASSERT(manager); + + size = cnmem::ceilInt(size, CNMEM_GRANULARITY); + cnmemStatus_t result = manager->allocate(ptr[0], size); + + // We failed to allocate but there might still be a buffer available in another manager. Try to + // steal it. 
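+    // Locking protocol for the steal path below: take every child mutex
+    // first, then the root mutex, steal under all locks, and unlock the
+    // children before the root. If any child lock fails, every lock taken so
+    // far is released and the call gives up with CNMEM_STATUS_UNKNOWN_ERROR
+    // rather than risk a deadlock.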
+    if( result == CNMEM_STATUS_OUT_OF_MEMORY ) {
+
+        // Try to acquire locks on all the children.
+        std::size_t numChildren;
+        CNMEM_CHECK(root.getNumChildren(numChildren));
+        std::vector<const cnmem::Mutex*> mutexes(numChildren);
+
+        std::size_t numLocked = 0;
+        for( size_t i = 0 ; i < numChildren ; ++i, ++numLocked ) {
+            cnmem::Manager *child;
+            CNMEM_CHECK(root.getChild(child, i));
+            mutexes[numLocked] = child->getMutex();
+            if( mutexes[numLocked]->lock() != CNMEM_STATUS_SUCCESS ) {
+                break;
+            }
+        }
+
+        // One lock failed, quit. Reduce the damage as much as possible, though.
+        if( numLocked != numChildren ) {
+            for( std::size_t i = 0 ; i < numLocked ; ++i ) {
+                cnmemStatus_t lockStatus = mutexes[i]->unlock();
+            }
+            return CNMEM_STATUS_UNKNOWN_ERROR;
+        }
+
+        // Grab the lock on the root, first.
+        const cnmem::Mutex *rootMutex = root.getMutex();
+        CNMEM_CHECK(rootMutex->lock());
+
+        // We acquired all the locks so we try to steal a node from another child.
+        if( numLocked == mutexes.size() ) {
+            result = manager->stealUnsafe(ptr[0], size);
+        }
+        for( std::size_t i = 0 ; i < numLocked ; ++i ) {
+            cnmemStatus_t lockStatus = mutexes[i]->unlock();
+            if( lockStatus != CNMEM_STATUS_SUCCESS ) {
+                // Starting from now we are panicking!!! One lock failed to be released, we try
+                // the others. We could also give up because we are already screwed. I don't know
+                // what's best! Comments are welcome.
+                result = lockStatus;
+            }
+        }
+        CNMEM_CHECK(rootMutex->unlock());
+    }
+    return result;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+cnmemStatus_t cnmemFree(void *ptr, cudaStream_t stream) {
+    CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
+    if( ptr == NULL ) {
+        return CNMEM_STATUS_SUCCESS;
+    }
+
+    int device;
+    CNMEM_CHECK_CUDA(cudaGetDevice(&device));
+
+    cnmem::Manager &root = cnmem::Context::get()->getManager(device);
+    cnmem::Manager *manager = &root;
+    if( stream ) {
+        CNMEM_CHECK(root.getChildFromStream(manager, stream));
+    }
+    CNMEM_ASSERT(manager);
+    return manager->release(ptr);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+cnmemStatus_t cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream) {
+    CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
+    CNMEM_CHECK_TRUE(totalMem && freeMem, CNMEM_STATUS_INVALID_ARGUMENT);
+
+    int device;
+    CNMEM_CHECK_CUDA(cudaGetDevice(&device));
+    cnmem::Manager &root = cnmem::Context::get()->getManager(device);
+    cnmem::Manager *manager = &root;
+    if( stream ) {
+        CNMEM_CHECK(root.getChildFromStream(manager, stream));
+    }
+    CNMEM_ASSERT(manager);
+
+    const cnmem::Mutex *mutex = manager->getMutex();
+    CNMEM_CHECK(mutex->lock());
+    CNMEM_CHECK_OR_UNLOCK(manager->getFreeMemoryUnsafe(*freeMem), *mutex);
+    size_t usedMem;
+    CNMEM_CHECK_OR_UNLOCK(manager->getUsedMemoryUnsafe(usedMem), *mutex);
+    CNMEM_CHECK(mutex->unlock());
+    totalMem[0] = usedMem + freeMem[0];
+    return CNMEM_STATUS_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+cnmemStatus_t cnmemPrintMemoryState(FILE *file, cudaStream_t stream) {
+    CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
+
+    int device;
+    CNMEM_CHECK_CUDA(cudaGetDevice(&device));
+    cnmem::Manager &root = cnmem::Context::get()->getManager(device);
+    cnmem::Manager *manager = &root;
+    if( stream ) {
+        CNMEM_CHECK(root.getChildFromStream(manager, stream));
+    }
+    CNMEM_ASSERT(manager);
+    return manager->printMemoryState(file);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // extern "C"
+
diff --git a/third_party/cnmem/cnmem.h b/third_party/cnmem/cnmem.h
new file mode 100644
index 0000000000000..19fd2f84022c0
--- /dev/null
+++ b/third_party/cnmem/cnmem.h
@@ -0,0 +1,263 @@
+/* **********************************************************************
+ * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ********************************************************************** */
+#pragma once
+
+#ifdef __cplusplus
+#include "cstdio"
+#else
+#include "stdio.h"
+#endif
+#include "cuda_runtime_api.h"
+
+#if defined(_MSC_VER) || defined(WIN32)
+#ifdef CNMEM_DLLEXPORT
+#define CNMEM_API __declspec(dllexport)
+#else
+#define CNMEM_API __declspec(dllimport)
+#endif
+#else
+#ifdef CNMEM_DLLEXPORT
+#define CNMEM_API __attribute__((visibility ("default")))
+#else
+#define CNMEM_API
+#endif
+#endif
+
+#define CNMEM_VERSION 100 // It corresponds to 1.0.0
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* ********************************************************************************************* */
+
+typedef enum
+{
+    CNMEM_STATUS_SUCCESS = 0,
+    CNMEM_STATUS_CUDA_ERROR,
+    CNMEM_STATUS_INVALID_ARGUMENT,
+    CNMEM_STATUS_NOT_INITIALIZED,
+    CNMEM_STATUS_OUT_OF_MEMORY,
+    CNMEM_STATUS_UNKNOWN_ERROR
+} cnmemStatus_t;
+
+/* ********************************************************************************************* */
+
+typedef enum
+{
+    CNMEM_FLAGS_DEFAULT      = 0,  /// Default flags.
+    CNMEM_FLAGS_CANNOT_GROW  = 1,  /// Prevent the manager from growing its memory consumption.
+    CNMEM_FLAGS_CANNOT_STEAL = 2,  /// Prevent the manager from stealing memory.
+} cnmemManagerFlags_t;
+
+/* ********************************************************************************************* */
+
+typedef struct cnmemDevice_t_
+{
+    /** The device number. */
+    int device;
+    /** The size to allocate for that device. If 0, the implementation chooses the size. */
+     */
+    size_t size;
+    /** The number of named streams associated with the device. The NULL stream is not counted. */
+    int numStreams;
+    /** The streams associated with the device. It can be NULL. The NULL stream is managed. */
+    cudaStream_t *streams;
+    /** The size reserved for each stream. It can be 0. */
+    size_t *streamSizes;
+
+} cnmemDevice_t;
+
+/**
+ * \brief Initialize the library and allocate memory on the listed devices.
+ *
+ * For each device, an internal memory manager is created and the specified amount of memory is
+ * allocated (it is the size defined in device[i].size). For each named stream, an additional
+ * memory manager is created. Currently, it is implemented as a tree of memory managers: a root
+ * manager for the device and a list of children, one for each named stream.
+ *
+ * This function must be called before any other function in the library. It has to be called
+ * by a single thread since it is not thread-safe.
+ *
+ * \return
+ *   CNMEM_STATUS_SUCCESS,          if everything goes fine,
+ *   CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid,
+ *   CNMEM_STATUS_OUT_OF_MEMORY,    if the requested size exceeds the available memory,
+ *   CNMEM_STATUS_CUDA_ERROR,       if an error happens in a CUDA function.
+ */
+cnmemStatus_t CNMEM_API cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags);
+
+/**
+ * \brief Release all the allocated memory.
+ *
+ * This function must be called by a single thread and after all threads that called
+ * cnmemMalloc/cnmemFree have joined. This function is not thread-safe.
+ *
+ * \return
+ *   CNMEM_STATUS_SUCCESS,         if everything goes fine,
+ *   CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
+ *   CNMEM_STATUS_CUDA_ERROR,      if an error happens in one of the CUDA functions.
+ */
+cnmemStatus_t CNMEM_API cnmemFinalize();
+
+/**
+ * \brief Increase the internal reference counter of the context object.
+ *
+ * This function increases the internal reference counter of the library. The purpose of this
+ * reference counting mechanism is to give the user more control over the lifetime of the
+ * library. It is useful with scoped memory allocations which may be destroyed in a final
+ * memory collection after the end of main(). This function is thread-safe.
+ *
+ * \return
+ *   CNMEM_STATUS_SUCCESS,         if everything goes fine,
+ *   CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called.
+ */
+cnmemStatus_t CNMEM_API cnmemRetain();
+
+/**
+ * \brief Decrease the internal reference counter of the context object.
+ *
+ * This function decreases the internal reference counter of the library. The purpose of this
+ * reference counting mechanism is to give the user more control over the lifetime of the
+ * library. It is useful with scoped memory allocations which may be destroyed in a final
+ * memory collection after the end of main(). This function is thread-safe.
+ *
+ * You can use \c cnmemRelease to explicitly finalize the library.
+ *
+ * \return
+ *   CNMEM_STATUS_SUCCESS,         if everything goes fine,
+ *   CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called.
+ */
+cnmemStatus_t CNMEM_API cnmemRelease();
+
+/**
+ * \brief Add a new stream to the pool of managed streams on a device.
+ *
+ * This function registers a new stream into a device memory manager. It is thread-safe.
+ *
+ * \return
+ *   CNMEM_STATUS_SUCCESS,          if everything goes fine,
+ *   CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid.
+ */
+cnmemStatus_t CNMEM_API cnmemRegisterStream(cudaStream_t stream);
+
+/**
+ * \brief Allocate memory.
+ *
+ * This function allocates memory and initializes a pointer to device memory. If no memory
+ * is available, it returns a CNMEM_STATUS_OUT_OF_MEMORY error. This function is thread-safe.
+ *
+ * The behavior of this function is the following:
+ *
+ * - If the stream is NULL, the root memory manager is asked to allocate a buffer of device
+ *   memory. If there is a buffer of size larger than or equal to the requested size in the
+ *   list of free blocks, it is returned. If there is no such buffer but the manager is allowed
+ *   to grow its memory usage (the CNMEM_FLAGS_CANNOT_GROW flag is not set), the memory manager
+ *   calls cudaMalloc. If cudaMalloc fails because no more memory is available or the manager
+ *   is not allowed to grow, the manager attempts to steal memory from one of its children
+ *   (unless CNMEM_FLAGS_CANNOT_STEAL is set). If that attempt also fails, the manager returns
+ *   CNMEM_STATUS_OUT_OF_MEMORY.
+ *
+ * - If the stream is a named stream, the initial request goes to the memory manager associated
+ *   with that stream. If a free node is available in the lists of that manager, it is returned.
+ *   Otherwise, the request is passed to the root node and works as if the request were made on
+ *   the NULL stream.
+ *
+ * The calls to cudaMalloc are potentially costly and may induce GPU synchronizations. The
+ * mechanism to steal memory from the children also induces GPU synchronizations (the manager
+ * has to make sure no kernel uses a given buffer before stealing it), and the execution is
+ * sequential (in a multi-threaded context, the code is executed in a critical section inside
+ * the cnmem library - there is no need for the user to wrap cnmemMalloc with locks).
+ *
+ * \return
+ *   CNMEM_STATUS_SUCCESS,          if everything goes fine,
+ *   CNMEM_STATUS_NOT_INITIALIZED,  if the ::cnmemInit function has not been called,
+ *   CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid. For example, ptr == 0,
+ *   CNMEM_STATUS_OUT_OF_MEMORY,    if there is not enough memory available,
+ *   CNMEM_STATUS_CUDA_ERROR,       if an error happens in one of the CUDA functions.
+ */
+cnmemStatus_t CNMEM_API cnmemMalloc(void **ptr, size_t size, cudaStream_t stream);
+
+/**
+ * \brief Release memory.
+ *
+ * This function releases memory and recycles a memory block in the manager. This function is
+ * thread-safe.
+ *
+ * \return
+ *   CNMEM_STATUS_SUCCESS,          if everything goes fine,
+ *   CNMEM_STATUS_NOT_INITIALIZED,  if the ::cnmemInit function has not been called,
+ *   CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid. For example, ptr == 0,
+ *   CNMEM_STATUS_CUDA_ERROR,       if an error happens in one of the CUDA functions.
+ */
+cnmemStatus_t CNMEM_API cnmemFree(void *ptr, cudaStream_t stream);
+
+/* ********************************************************************************************* */
+/* Utility functions. */
+/* ********************************************************************************************* */
+
+/**
+ * \brief Returns the amount of memory managed by the memory manager associated with a stream.
+ *
+ * The pointers totalMem and freeMem must be valid. At the moment, this function has a
+ * complexity linear in the number of allocated blocks, so do not call it in
+ * performance-critical sections.
+ *
+ * \return
+ *   CNMEM_STATUS_SUCCESS,          if everything goes fine,
+ *   CNMEM_STATUS_NOT_INITIALIZED,  if the ::cnmemInit function has not been called,
+ *   CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid,
+ *   CNMEM_STATUS_CUDA_ERROR,       if an error happens in one of the CUDA functions.
+ */
+cnmemStatus_t CNMEM_API cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream);
+
+/**
+ * \brief Print a list of nodes to a file.
+ *
+ * This function is intended to be used in complex scenarios to help understand the behaviour
+ * of the memory managers/application. It is thread-safe.
+ *
+ * \return
+ *   CNMEM_STATUS_SUCCESS,          if everything goes fine,
+ *   CNMEM_STATUS_NOT_INITIALIZED,  if the ::cnmemInit function has not been called,
+ *   CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid. For example, file == NULL,
+ *   CNMEM_STATUS_CUDA_ERROR,       if an error happens in one of the CUDA functions.
+ */
+cnmemStatus_t CNMEM_API cnmemPrintMemoryState(FILE *file, cudaStream_t stream);
+
+/**
+ * \brief Converts a cnmemStatus_t value to a string.
+ */
+const char CNMEM_API * cnmemGetErrorString(cnmemStatus_t status);
+
+/* ********************************************************************************************* */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
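
For reviewers unfamiliar with the cnmem C API vendored above, here is a minimal usage sketch
against the header in this patch. It is not part of the patch itself; the 512MB pool size and
the CHECK_CNMEM helper macro are illustrative assumptions, and only functions declared in
cnmem.h are used.

    #include <cstdio>
    #include <cstdlib>
    #include "third_party/cnmem/cnmem.h"

    // Illustrative helper: abort with a readable message on any non-success status.
    #define CHECK_CNMEM(call)                                                 \
      do {                                                                    \
        cnmemStatus_t status_ = (call);                                       \
        if (status_ != CNMEM_STATUS_SUCCESS) {                                \
          fprintf(stderr, "cnmem error: %s\n", cnmemGetErrorString(status_)); \
          exit(1);                                                            \
        }                                                                     \
      } while (0)

    int main() {
      // Reserve a pool on device 0. With numStreams == 0 and streams == NULL,
      // only the NULL stream is managed, served by the root manager.
      cnmemDevice_t device;
      device.device = 0;
      device.size = 512 * 1024 * 1024;  // Example pool size; 0 lets cnmem choose.
      device.numStreams = 0;
      device.streams = NULL;
      device.streamSizes = NULL;
      CHECK_CNMEM(cnmemInit(1, &device, CNMEM_FLAGS_DEFAULT));

      // Allocations on the NULL stream come out of the pre-reserved pool, so
      // no cudaMalloc is issued here unless the pool has to grow.
      void* ptr = NULL;
      CHECK_CNMEM(cnmemMalloc(&ptr, 1024 * 1024, NULL));

      size_t freeMem, totalMem;
      CHECK_CNMEM(cnmemMemGetInfo(&freeMem, &totalMem, NULL));
      printf("pool: %zu free / %zu total bytes\n", freeMem, totalMem);

      // Freeing recycles the block into the manager's free list; the device
      // memory stays reserved until cnmemFinalize().
      CHECK_CNMEM(cnmemFree(ptr, NULL));
      CHECK_CNMEM(cnmemFinalize());
      return 0;
    }

This mirrors how CudaMemoryPool wires New/Delete to the pool: initialization happens once up
front, and per-allocation calls go through cnmemMalloc/cnmemFree rather than raw
cudaMalloc/cudaFree.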