diff --git a/cmake/FindCUDA.cmake b/cmake/FindCUDA.cmake
index e7ece0e212..f5fe3de1a5 100644
--- a/cmake/FindCUDA.cmake
+++ b/cmake/FindCUDA.cmake
@@ -31,10 +31,8 @@
 # The following variables affect the behavior of the macros in the
 # script (in alphebetical order).  Note that any of these flags can be
 # changed multiple times in the same directory before calling
-# CUDA_ADD_EXECUTABLE, CUDA_ADD_LIBRARY, CUDA_COMPILE, CUDA_COMPILE_PTX
-# or CUDA_WRAP_SRCS.
-#
-# ::
+# CUDA_ADD_EXECUTABLE, CUDA_ADD_LIBRARY, CUDA_COMPILE, CUDA_COMPILE_PTX,
+# CUDA_COMPILE_FATBIN, CUDA_COMPILE_CUBIN or CUDA_WRAP_SRCS::
 #
 #   CUDA_64_BIT_DEVICE_CODE (Default matches host bit size)
 #   -- Set to ON to compile for 64 bit device code, OFF for 32 bit device code.
@@ -43,19 +41,11 @@
 #      nvcc in the generated source.  If you compile to PTX and then load the
 #      file yourself, you can mix bit sizes between device and host.
 #
-#
-#
-# ::
-#
 #   CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE (Default ON)
 #   -- Set to ON if you want the custom build rule to be attached to the source
 #      file in Visual Studio.  Turn OFF if you add the same cuda file to multiple
 #      targets.
 #
-#
-#
-# ::
-#
 #      This allows the user to build the target from the CUDA file; however, bad
 #      things can happen if the CUDA source file is added to multiple targets.
 #      When performing parallel builds it is possible for the custom build
@@ -68,44 +58,24 @@
 #      this script could detect the reuse of source files across multiple targets
 #      and turn the option off for the user, but no good solution could be found.
 #
-#
-#
-# ::
-#
 #   CUDA_BUILD_CUBIN (Default OFF)
 #   -- Set to ON to enable and extra compilation pass with the -cubin option in
 #      Device mode. The output is parsed and register, shared memory usage is
 #      printed during build.
 #
-#
-#
-# ::
-#
 #   CUDA_BUILD_EMULATION (Default OFF for device mode)
 #   -- Set to ON for Emulation mode. -D_DEVICEEMU is defined for CUDA C files
 #      when CUDA_BUILD_EMULATION is TRUE.
 #
-#
-#
-# ::
-#
 #   CUDA_GENERATED_OUTPUT_DIR (Default CMAKE_CURRENT_BINARY_DIR)
 #   -- Set to the path you wish to have the generated files placed.  If it is
 #      blank output files will be placed in CMAKE_CURRENT_BINARY_DIR.
 #      Intermediate files will always be placed in
 #      CMAKE_CURRENT_BINARY_DIR/CMakeFiles.
 #
-#
-#
-# ::
-#
 #   CUDA_HOST_COMPILATION_CPP (Default ON)
 #   -- Set to OFF for C compilation of host code.
 #
-#
-#
-# ::
-#
 #   CUDA_HOST_COMPILER (Default CMAKE_C_COMPILER, $(VCInstallDir)/bin for VS)
 #   -- Set the host compiler to be used by nvcc.  Ignored if -ccbin or
 #      --compiler-bindir is already present in the CUDA_NVCC_FLAGS or
@@ -113,19 +83,11 @@
 #      $(VCInstallDir)/bin is a special value that expands out to the path when
 #      the command is run from withing VS.
 #
-#
-#
-# ::
-#
 #   CUDA_NVCC_FLAGS
 #   CUDA_NVCC_FLAGS_<CONFIG>
 #   -- Additional NVCC command line arguments.  NOTE: multiple arguments must be
 #      semi-colon delimited (e.g. --compiler-options;-Wall)
 #
-#
-#
-# ::
-#
 #   CUDA_PROPAGATE_HOST_FLAGS (Default ON)
 #   -- Set to ON to propagate CMAKE_{C,CXX}_FLAGS and their configuration
 #      dependent counterparts (e.g. CMAKE_C_FLAGS_DEBUG) automatically to the
@@ -137,10 +99,6 @@
 #      CUDA_ADD_LIBRARY, CUDA_ADD_EXECUTABLE, or CUDA_WRAP_SRCS.  Flags used for
 #      shared library compilation are not affected by this flag.
 #
-#
-#
-# ::
-#
 #   CUDA_SEPARABLE_COMPILATION (Default OFF)
 #   -- If set this will enable separable compilation for all CUDA runtime object
 #      files.  If used outside of CUDA_ADD_EXECUTABLE and CUDA_ADD_LIBRARY
@@ -148,38 +106,22 @@
 #      CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME and
 #      CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS should be called.
 #
-#
-#
-# ::
-#
 #   CUDA_VERBOSE_BUILD (Default OFF)
 #   -- Set to ON to see all the commands used when building the CUDA file.  When
 #      using a Makefile generator the value defaults to VERBOSE (run make
 #      VERBOSE=1 to see output), although setting CUDA_VERBOSE_BUILD to ON will
 #      always print the output.
 #
-#
-#
-# The script creates the following macros (in alphebetical order):
-#
-# ::
+# The script creates the following macros (in alphabetical order)::
 #
 #   CUDA_ADD_CUFFT_TO_TARGET( cuda_target )
 #   -- Adds the cufft library to the target (can be any target).  Handles whether
 #      you are in emulation mode or not.
 #
-#
-#
-# ::
-#
 #   CUDA_ADD_CUBLAS_TO_TARGET( cuda_target )
 #   -- Adds the cublas library to the target (can be any target).  Handles
 #      whether you are in emulation mode or not.
 #
-#
-#
-# ::
-#
 #   CUDA_ADD_EXECUTABLE( cuda_target file0 file1 ...
 #                        [WIN32] [MACOSX_BUNDLE] [EXCLUDE_FROM_ALL] [OPTIONS ...] )
 #   -- Creates an executable "cuda_target" which is made up of the files
@@ -193,42 +135,28 @@
 #      nvcc.  Such flags should be modified before calling CUDA_ADD_EXECUTABLE,
 #      CUDA_ADD_LIBRARY or CUDA_WRAP_SRCS.
 #
-#
-#
-# ::
-#
 #   CUDA_ADD_LIBRARY( cuda_target file0 file1 ...
 #                     [STATIC | SHARED | MODULE] [EXCLUDE_FROM_ALL] [OPTIONS ...] )
 #   -- Same as CUDA_ADD_EXECUTABLE except that a library is created.
 #
-#
-#
-# ::
-#
 #   CUDA_BUILD_CLEAN_TARGET()
 #   -- Creates a convience target that deletes all the dependency files
 #      generated.  You should make clean after running this target to ensure the
 #      dependency files get regenerated.
 #
-#
-#
-# ::
-#
 #   CUDA_COMPILE( generated_files file0 file1 ... [STATIC | SHARED | MODULE]
 #                 [OPTIONS ...] )
 #   -- Returns a list of generated files from the input source files to be used
 #      with ADD_LIBRARY or ADD_EXECUTABLE.
 #
-#
-#
-# ::
-#
 #   CUDA_COMPILE_PTX( generated_files file0 file1 ... [OPTIONS ...] )
 #   -- Returns a list of PTX files generated from the input source files.
 #
+#   CUDA_COMPILE_FATBIN( generated_files file0 file1 ... [OPTIONS ...] )
+#   -- Returns a list of FATBIN files generated from the input source files.
 #
-#
-# ::
+#   CUDA_COMPILE_CUBIN( generated_files file0 file1 ... [OPTIONS ...] )
+#   -- Returns a list of CUBIN files generated from the input source files.
 #
 #   CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME( output_file_var
 #                                                        cuda_target
@@ -242,10 +170,6 @@
 #      automatically for CUDA_ADD_LIBRARY and CUDA_ADD_EXECUTABLE.  Note that
 #      this is a function and not a macro.
 #
-#
-#
-# ::
-#
 #   CUDA_INCLUDE_DIRECTORIES( path0 path1 ... )
 #   -- Sets the directories that should be passed to nvcc
 #      (e.g. nvcc -Ipath0 -Ipath1 ... ). These paths usually contain other .cu
@@ -253,17 +177,9 @@
 #
 #
 #
-#
-#
-# ::
-#
 #   CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS( output_file_var cuda_target
 #                                            nvcc_flags object_files)
 #
-#
-#
-# ::
-#
 #   -- Generates the link object required by separable compilation from the given
 #      object files.  This is called automatically for CUDA_ADD_EXECUTABLE and
 #      CUDA_ADD_LIBRARY, but can be called manually when using CUDA_WRAP_SRCS
@@ -273,91 +189,51 @@
 #      specified by CUDA_64_BIT_DEVICE_CODE.  Note that this is a function
 #      instead of a macro.
 #
-#
-#
-# ::
-#
 #   CUDA_WRAP_SRCS ( cuda_target format generated_files file0 file1 ...
 #                    [STATIC | SHARED | MODULE] [OPTIONS ...] )
 #   -- This is where all the magic happens.  CUDA_ADD_EXECUTABLE,
 #      CUDA_ADD_LIBRARY, CUDA_COMPILE, and CUDA_COMPILE_PTX all call this
 #      function under the hood.
 #
-#
-#
-# ::
-#
 #      Given the list of files (file0 file1 ... fileN) this macro generates
 #      custom commands that generate either PTX or linkable objects (use "PTX" or
 #      "OBJ" for the format argument to switch).  Files that don't end with .cu
 #      or have the HEADER_FILE_ONLY property are ignored.
 #
-#
-#
-# ::
-#
 #      The arguments passed in after OPTIONS are extra command line options to
 #      give to nvcc.  You can also specify per configuration options by
 #      specifying the name of the configuration followed by the options.  General
 #      options must preceed configuration specific options.  Not all
 #      configurations need to be specified, only the ones provided will be used.
 #
-#
-#
-# ::
-#
 #         OPTIONS -DFLAG=2 "-DFLAG_OTHER=space in flag"
 #         DEBUG -g
 #         RELEASE --use_fast_math
 #         RELWITHDEBINFO --use_fast_math;-g
 #         MINSIZEREL --use_fast_math
 #
-#
-#
-# ::
-#
 #      For certain configurations (namely VS generating object files with
 #      CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE set to ON), no generated file will
 #      be produced for the given cuda file.  This is because when you add the
 #      cuda file to Visual Studio it knows that this file produces an object file
 #      and will link in the resulting object file automatically.
 #
-#
-#
-# ::
-#
 #      This script will also generate a separate cmake script that is used at
 #      build time to invoke nvcc.  This is for several reasons.
 #
-#
-#
-# ::
-#
 #        1. nvcc can return negative numbers as return values which confuses
 #        Visual Studio into thinking that the command succeeded.  The script now
 #        checks the error codes and produces errors when there was a problem.
 #
-#
-#
-# ::
-#
 #        2. nvcc has been known to not delete incomplete results when it
 #        encounters problems.  This confuses build systems into thinking the
 #        target was generated when in fact an unusable file exists.  The script
 #        now deletes the output files if there was an error.
 #
-#
-#
-# ::
-#
 #        3. By putting all the options that affect the build into a file and then
 #        make the build rule dependent on the file, the output files will be
 #        regenerated when the options change.
 #
-#
-#
-# ::
-#
 #      This script also looks at optional arguments STATIC, SHARED, or MODULE to
 #      determine when to target the object compilation for a shared library.
 #      BUILD_SHARED_LIBS is ignored in CUDA_WRAP_SRCS, but it is respected in
@@ -366,27 +242,17 @@
 #      <target_name>_EXPORTS is defined when a shared library compilation is
 #      detected.
 #
-#
-#
-# ::
-#
 #      Flags passed into add_definitions with -D or /D are passed along to nvcc.
 #
 #
 #
-# The script defines the following variables:
-#
-# ::
+# The script defines the following variables::
 #
 #   CUDA_VERSION_MAJOR    -- The major version of cuda as reported by nvcc.
 #   CUDA_VERSION_MINOR    -- The minor version.
 #   CUDA_VERSION
 #   CUDA_VERSION_STRING   -- CUDA_VERSION_MAJOR.CUDA_VERSION_MINOR
 #
-#
-#
-# ::
-#
 #   CUDA_TOOLKIT_ROOT_DIR -- Path to the CUDA Toolkit (defined if not set).
 #   CUDA_SDK_ROOT_DIR     -- Path to the CUDA SDK.  Use this to find files in the
 #                            SDK.  This script will not directly support finding
@@ -412,13 +278,13 @@
 #                            Only available for CUDA version 3.2+.
 #   CUDA_cusparse_LIBRARY -- CUDA Sparse Matrix library.
 #                            Only available for CUDA version 3.2+.
-#   CUDA_npp_LIBRARY      -- NVIDIA Performance Primitives library.
+#   CUDA_npp_LIBRARY      -- NVIDIA Performance Primitives lib.
 #                            Only available for CUDA version 4.0+.
-#   CUDA_nppc_LIBRARY      -- NVIDIA Performance Primitives library (core).
+#   CUDA_nppc_LIBRARY     -- NVIDIA Performance Primitives lib (core).
 #                            Only available for CUDA version 5.5+.
-#   CUDA_nppi_LIBRARY      -- NVIDIA Performance Primitives library (image processing).
+#   CUDA_nppi_LIBRARY     -- NVIDIA Performance Primitives lib (image processing).
 #                            Only available for CUDA version 5.5+.
-#   CUDA_npps_LIBRARY      -- NVIDIA Performance Primitives library (signal processing).
+#   CUDA_npps_LIBRARY     -- NVIDIA Performance Primitives lib (signal processing).
 #                            Only available for CUDA version 5.5+.
 #   CUDA_nvcuvenc_LIBRARY -- CUDA Video Encoder library.
 #                            Only available for CUDA version 3.2+.
@@ -427,32 +293,15 @@
 #                            Only available for CUDA version 3.2+.
 #                            Windows only.
 #
-#
-#
-#
-#
-# ::
-#
+
 #   James Bigler, NVIDIA Corp (nvidia.com - jbigler)
 #   Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html
 #
-#
-#
-# ::
-#
 #   Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
 #
-#
-#
-# ::
-#
 #   Copyright (c) 2007-2009
 #   Scientific Computing and Imaging Institute, University of Utah
 #
-#
-#
-# ::
-#
 #   This code is licensed under the MIT License.  See the FindCUDA.cmake script
 #   for the text of the license.
 
@@ -481,11 +330,6 @@
 
 # FindCUDA.cmake
 
-# We need to have at least this version to support the VERSION_LESS argument to 'if' (2.6.2) and unset (2.6.3)
-cmake_policy(PUSH)
-cmake_minimum_required(VERSION 2.6.3)
-cmake_policy(POP)
-
 # This macro helps us find the location of helper files we will need the full path to
 macro(CUDA_FIND_HELPER_FILE _name _extension)
   set(_full_name "${_name}.${_extension}")
@@ -608,7 +452,17 @@ set(CUDA_NVCC_FLAGS "" CACHE STRING "Semi-colon delimit multiple arguments.")
 if(CMAKE_GENERATOR MATCHES "Visual Studio")
   set(CUDA_HOST_COMPILER "$(VCInstallDir)bin" CACHE FILEPATH "Host side compiler used by NVCC")
 else()
-  set(CUDA_HOST_COMPILER "${CMAKE_C_COMPILER}" CACHE FILEPATH "Host side compiler used by NVCC")
+  # When cc is a symlink to clang, NVCC may mistake it for GCC and pass the
+  # unsupported -dumpspecs option to clang, so resolve the real compiler path.
+  # If CMAKE_C_COMPILER is not defined (the project does not use the C
+  # language) and CUDA_HOST_COMPILER was not specified manually, skip -ccbin
+  # and let nvcc use its own default C compiler.
+  if(DEFINED CMAKE_C_COMPILER AND NOT DEFINED CUDA_HOST_COMPILER)
+    get_filename_component(c_compiler_realpath "${CMAKE_C_COMPILER}" REALPATH)
+  else()
+    set(c_compiler_realpath "")
+  endif()
+  set(CUDA_HOST_COMPILER "${c_compiler_realpath}" CACHE FILEPATH "Host side compiler used by NVCC")
 endif()
 
 # Propagate the host flags to the host compiler via -Xcompiler
@@ -759,15 +613,11 @@ endif()
 set(CUDA_VERSION_STRING "${CUDA_VERSION}")
 
 # Support for arm cross compilation with CUDA 5.5
-set(__cuda_toolkit_target_dir_initial "${CUDA_TOOLKIT_ROOT_DIR}")
-if(CUDA_VERSION VERSION_GREATER "5.0" AND CMAKE_CROSSCOMPILING AND ${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm")
-  if(ANDROID AND EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-androideabi")
-    set(__cuda_toolkit_target_dir_initial "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-androideabi")
-  elseif(EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabihf")
-    set(__cuda_toolkit_target_dir_initial "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabihf")
-  endif()
+if(CUDA_VERSION VERSION_GREATER "5.0" AND CMAKE_CROSSCOMPILING AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm" AND EXISTS "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabihf")
+  set(CUDA_TOOLKIT_TARGET_DIR "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabihf" CACHE PATH "Toolkit target location.")
+else()  # otherwise the target dir is the toolkit root itself
+  set(CUDA_TOOLKIT_TARGET_DIR "${CUDA_TOOLKIT_ROOT_DIR}" CACHE PATH "Toolkit target location.")
+endif()
-set(CUDA_TOOLKIT_TARGET_DIR "${__cuda_toolkit_target_dir_initial}" CACHE PATH "Toolkit target location.")
 mark_as_advanced(CUDA_TOOLKIT_TARGET_DIR)
 
 # Target CPU architecture
@@ -853,18 +703,6 @@ if(CUDA_BUILD_EMULATION AND CUDA_CUDARTEMU_LIBRARY)
 else()
   set(CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY})
 endif()
-if(APPLE)
-  # We need to add the path to cudart to the linker using rpath, since the
-  # library name for the cuda libraries is prepended with @rpath.
-  if(CUDA_BUILD_EMULATION AND CUDA_CUDARTEMU_LIBRARY)
-    get_filename_component(_cuda_path_to_cudart "${CUDA_CUDARTEMU_LIBRARY}" PATH)
-  else()
-    get_filename_component(_cuda_path_to_cudart "${CUDA_CUDART_LIBRARY}" PATH)
-  endif()
-  if(_cuda_path_to_cudart)
-    list(APPEND CUDA_LIBRARIES -Wl,-rpath "-Wl,${_cuda_path_to_cudart}")
-  endif()
-endif()
 
 # 1.1 toolkit on linux doesn't appear to have a separate library on
 # some platforms.
@@ -1044,15 +882,15 @@ macro(CUDA_GET_SOURCES_AND_OPTIONS _sources _cmake_options _options)
   set( ${_options} )
   set( _found_options FALSE )
   foreach(arg ${ARGN})
-    if(arg STREQUAL "OPTIONS")
+    if("x${arg}" STREQUAL "xOPTIONS")
       set( _found_options TRUE )
     elseif(
-        arg STREQUAL "WIN32" OR
-        arg STREQUAL "MACOSX_BUNDLE" OR
-        arg STREQUAL "EXCLUDE_FROM_ALL" OR
-        arg STREQUAL "STATIC" OR
-        arg STREQUAL "SHARED" OR
-        arg STREQUAL "MODULE"
+        "x${arg}" STREQUAL "xWIN32" OR
+        "x${arg}" STREQUAL "xMACOSX_BUNDLE" OR
+        "x${arg}" STREQUAL "xEXCLUDE_FROM_ALL" OR
+        "x${arg}" STREQUAL "xSTATIC" OR
+        "x${arg}" STREQUAL "xSHARED" OR
+        "x${arg}" STREQUAL "xMODULE"
         )
       list(APPEND ${_cmake_options} ${arg})
     else()
@@ -1148,7 +986,7 @@ function(CUDA_COMPUTE_BUILD_PATH path build_path)
     endif()
   endif()
 
-  # This recipie is from cmLocalGenerator::CreateSafeUniqueObjectFileName in the
+  # This recipe is from cmLocalGenerator::CreateSafeUniqueObjectFileName in the
   # CMake source.
 
   # Remove leading /
@@ -1177,7 +1015,7 @@ endfunction()
 # a .cpp or .ptx file.
 # INPUT:
 #   cuda_target         - Target name
-#   format              - PTX or OBJ
+#   format              - PTX, CUBIN, FATBIN or OBJ
 #   FILE1 .. FILEN      - The remaining arguments are the sources to be wrapped.
 #   OPTIONS             - Extra options to NVCC
 # OUTPUT:
@@ -1355,7 +1193,7 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
   foreach(file ${ARGN})
     # Ignore any file marked as a HEADER_FILE_ONLY
     get_source_file_property(_is_header ${file} HEADER_FILE_ONLY)
-    if(${file} MATCHES ".*\\.cu$" AND NOT _is_header)
+    if(${file} MATCHES "\\.cu$" AND NOT _is_header)
 
       # Allow per source file overrides of the format.
       get_source_file_property(_cuda_source_format ${file} CUDA_SOURCE_PROPERTY_FORMAT)
@@ -1363,16 +1201,22 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
         set(_cuda_source_format ${format})
       endif()
 
-      if( ${_cuda_source_format} MATCHES "PTX" )
-        set( compile_to_ptx ON )
-      elseif( ${_cuda_source_format} MATCHES "OBJ")
-        set( compile_to_ptx OFF )
+      if( ${_cuda_source_format} MATCHES "OBJ")
+        set( cuda_compile_to_external_module OFF )
       else()
-        message( FATAL_ERROR "Invalid format flag passed to CUDA_WRAP_SRCS for file '${file}': '${_cuda_source_format}'.  Use OBJ or PTX.")
+        set( cuda_compile_to_external_module ON )
+        if( ${_cuda_source_format} MATCHES "PTX" )
+          set( cuda_compile_to_external_module_type "ptx" )
+        elseif( ${_cuda_source_format} MATCHES "CUBIN")
+          set( cuda_compile_to_external_module_type "cubin" )
+        elseif( ${_cuda_source_format} MATCHES "FATBIN")
+          set( cuda_compile_to_external_module_type "fatbin" )
+        else()
+          message( FATAL_ERROR "Invalid format flag passed to CUDA_WRAP_SRCS for file '${file}': '${_cuda_source_format}'.  Use OBJ, PTX, CUBIN or FATBIN.")
+        endif()
       endif()
 
-
-      if(compile_to_ptx)
+      if(cuda_compile_to_external_module)
         # Don't use any of the host compilation flags for PTX targets.
         set(CUDA_HOST_FLAGS)
         set(CUDA_NVCC_FLAGS_CONFIG)
@@ -1387,7 +1231,7 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
       if(CUDA_GENERATED_OUTPUT_DIR)
         set(cuda_compile_output_dir "${CUDA_GENERATED_OUTPUT_DIR}")
       else()
-        if ( compile_to_ptx )
+        if ( cuda_compile_to_external_module )
           set(cuda_compile_output_dir "${CMAKE_CURRENT_BINARY_DIR}")
         else()
           set(cuda_compile_output_dir "${cuda_compile_intermediate_directory}")
@@ -1397,10 +1241,10 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
       # Add a custom target to generate a c or ptx file. ######################
 
       get_filename_component( basename ${file} NAME )
-      if( compile_to_ptx )
+      if( cuda_compile_to_external_module )
         set(generated_file_path "${cuda_compile_output_dir}")
-        set(generated_file_basename "${cuda_target}_generated_${basename}.ptx")
-        set(format_flag "-ptx")
+        set(generated_file_basename "${cuda_target}_generated_${basename}.${cuda_compile_to_external_module_type}")
+        set(format_flag "-${cuda_compile_to_external_module_type}")
         file(MAKE_DIRECTORY "${cuda_compile_output_dir}")
       else()
         set(generated_file_path "${cuda_compile_output_dir}/${CMAKE_CFG_INTDIR}")
@@ -1423,7 +1267,7 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
       set(custom_target_script "${cuda_compile_intermediate_directory}/${generated_file_basename}.cmake")
 
       # Setup properties for obj files:
-      if( NOT compile_to_ptx )
+      if( NOT cuda_compile_to_external_module )
         set_source_files_properties("${generated_file}"
           PROPERTIES
           EXTERNAL_OBJECT true # This is an object file not to be compiled, but only be linked.
@@ -1438,7 +1282,7 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
         set(source_file "${CMAKE_CURRENT_SOURCE_DIR}/${file}")
       endif()
 
-      if( NOT compile_to_ptx AND CUDA_SEPARABLE_COMPILATION)
+      if( NOT cuda_compile_to_external_module AND CUDA_SEPARABLE_COMPILATION)
         list(APPEND ${cuda_target}_SEPARABLE_COMPILATION_OBJECTS "${generated_file}")
       endif()
 
@@ -1455,7 +1299,7 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
       # Build the NVCC made dependency file ###################################
       set(build_cubin OFF)
       if ( NOT CUDA_BUILD_EMULATION AND CUDA_BUILD_CUBIN )
-         if ( NOT compile_to_ptx )
+         if ( NOT cuda_compile_to_external_module )
            set ( build_cubin ON )
          endif()
       endif()
@@ -1482,8 +1326,8 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files)
 
       # Create up the comment string
       file(RELATIVE_PATH generated_file_relative_path "${CMAKE_BINARY_DIR}" "${generated_file}")
-      if(compile_to_ptx)
-        set(cuda_build_comment_string "Building NVCC ptx file ${generated_file_relative_path}")
+      if(cuda_compile_to_external_module)
+        set(cuda_build_comment_string "Building NVCC ${cuda_compile_to_external_module_type} file ${generated_file_relative_path}")
       else()
         set(cuda_build_comment_string "Building NVCC (${cuda_build_type}) object ${generated_file_relative_path}")
       endif()
@@ -1576,18 +1420,27 @@ function(CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS output_file cuda_target options
     # If -ccbin, --compiler-bindir has been specified, don't do anything.  Otherwise add it here.
     list( FIND nvcc_flags "-ccbin" ccbin_found0 )
     list( FIND nvcc_flags "--compiler-bindir" ccbin_found1 )
-    if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+    if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 AND CUDA_HOST_COMPILER )
       list(APPEND nvcc_flags -ccbin "\"${CUDA_HOST_COMPILER}\"")
     endif()
+    # Create a list of flags specified by CUDA_NVCC_FLAGS_${CONFIG}
+    set(config_specific_flags)
     set(flags)
     foreach(config ${CUDA_configuration_types})
       string(TOUPPER ${config} config_upper)
+      # Add config specific flags
+      foreach(f ${CUDA_NVCC_FLAGS_${config_upper}})
+        list(APPEND config_specific_flags $<$<CONFIG:${config}>:${f}>)
+      endforeach()
       set(important_host_flags)
       _cuda_get_important_host_flags(important_host_flags ${CMAKE_${CUDA_C_OR_CXX}_FLAGS_${config_upper}})
       foreach(f ${important_host_flags})
         list(APPEND flags $<$<CONFIG:${config}>:-Xcompiler> $<$<CONFIG:${config}>:${f}>)
       endforeach()
     endforeach()
+    # Add our general CUDA_NVCC_FLAGS with the configuration specific flags
+    set(nvcc_flags ${CUDA_NVCC_FLAGS} ${config_specific_flags} ${nvcc_flags})
+
     file(RELATIVE_PATH output_file_relative_path "${CMAKE_BINARY_DIR}" "${output_file}")
 
     # Some generators don't handle the multiple levels of custom command
@@ -1713,21 +1566,29 @@ endmacro()
 
 ###############################################################################
 ###############################################################################
-# CUDA COMPILE
+# (Internal) helper behind CUDA_COMPILE*: wraps sources for a given target/format
 ###############################################################################
 ###############################################################################
-macro(CUDA_COMPILE generated_files)
+macro(cuda_compile_base cuda_target format generated_files)
 
   # Separate the sources from the options
   CUDA_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _options ${ARGN})
   # Create custom commands and targets for each file.
-  CUDA_WRAP_SRCS( cuda_compile OBJ _generated_files ${_sources} ${_cmake_options}
+  CUDA_WRAP_SRCS( ${cuda_target} ${format} _generated_files ${_sources} ${_cmake_options}
     OPTIONS ${_options} )
 
   set( ${generated_files} ${_generated_files})
 
 endmacro()
 
+###############################################################################
+###############################################################################
+# CUDA COMPILE -- sources to OBJ files, listed in <generated_files>
+###############################################################################
+###############################################################################
+macro(CUDA_COMPILE generated_files)
+  cuda_compile_base(cuda_compile OBJ ${generated_files} ${ARGN})
+endmacro()
 
 ###############################################################################
 ###############################################################################
@@ -1735,17 +1596,28 @@ endmacro()
 ###############################################################################
 ###############################################################################
 macro(CUDA_COMPILE_PTX generated_files)
-
-  # Separate the sources from the options
-  CUDA_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _options ${ARGN})
-  # Create custom commands and targets for each file.
-  CUDA_WRAP_SRCS( cuda_compile_ptx PTX _generated_files ${_sources} ${_cmake_options}
-    OPTIONS ${_options} )
-
-  set( ${generated_files} ${_generated_files})
-
+  cuda_compile_base(cuda_compile_ptx PTX ${generated_files} ${ARGN})
 endmacro()
 
+###############################################################################
+###############################################################################
+# CUDA COMPILE FATBIN -- sources to FATBIN files, listed in <generated_files>
+###############################################################################
+###############################################################################
+macro(CUDA_COMPILE_FATBIN generated_files)
+  cuda_compile_base(cuda_compile_fatbin FATBIN ${generated_files} ${ARGN})
+endmacro()
+
+###############################################################################
+###############################################################################
+# CUDA COMPILE CUBIN -- sources to CUBIN files, listed in <generated_files>
+###############################################################################
+###############################################################################
+macro(CUDA_COMPILE_CUBIN generated_files)
+  cuda_compile_base(cuda_compile_cubin CUBIN ${generated_files} ${ARGN})
+endmacro()
+
+
 ###############################################################################
 ###############################################################################
 # CUDA ADD CUFFT TO TARGET
diff --git a/cmake/FindCUDA/make2cmake.cmake b/cmake/FindCUDA/make2cmake.cmake
index 1b53d177d0..c433fa8ed4 100644
--- a/cmake/FindCUDA/make2cmake.cmake
+++ b/cmake/FindCUDA/make2cmake.cmake
@@ -37,12 +37,11 @@
 
 file(READ ${input_file} depend_text)
 
-if (${depend_text} MATCHES ".+")
+if (NOT "${depend_text}" STREQUAL "")
 
   # message("FOUND DEPENDS")
 
-  # Remember, four backslashes is escaped to one backslash in the string.
-  string(REGEX REPLACE "\\\\ " " " depend_text ${depend_text})
+  string(REPLACE "\\ " " " depend_text ${depend_text})
 
   # This works for the nvcc -M generated dependency files.
   string(REGEX REPLACE "^.* : " "" depend_text ${depend_text})
diff --git a/cmake/FindCUDA/parse_cubin.cmake b/cmake/FindCUDA/parse_cubin.cmake
index e1905cfc66..25ceb49f3d 100644
--- a/cmake/FindCUDA/parse_cubin.cmake
+++ b/cmake/FindCUDA/parse_cubin.cmake
@@ -37,11 +37,10 @@
 
 file(READ ${input_file} file_text)
 
-if (${file_text} MATCHES ".+")
+if (NOT "${file_text}" STREQUAL "")
 
-  # Remember, four backslashes is escaped to one backslash in the string.
-  string(REGEX REPLACE ";" "\\\\;" file_text ${file_text})
-  string(REGEX REPLACE "\ncode" ";code" file_text ${file_text})
+  string(REPLACE ";" "\\;" file_text ${file_text})
+  string(REPLACE "\ncode" ";code" file_text ${file_text})
 
   list(LENGTH file_text len)
 
@@ -57,7 +56,7 @@ if (${file_text} MATCHES ".+")
 
         # Extract kernel names.
         if (${entry} MATCHES "[^g]name = ([^ ]+)")
-          string(REGEX REPLACE ".* = ([^ ]+)" "\\1" entry ${entry})
+          set(entry "${CMAKE_MATCH_1}")
 
           # Check to see if the kernel name starts with "_"
           set(skip FALSE)
@@ -76,19 +75,19 @@ if (${file_text} MATCHES ".+")
 
           # Registers
           if (${entry} MATCHES "reg([ ]+)=([ ]+)([^ ]+)")
-            string(REGEX REPLACE ".*([ ]+)=([ ]+)([^ ]+)" "\\3" entry ${entry})
+            set(entry "${CMAKE_MATCH_3}")
             message("Registers: ${entry}")
           endif()
 
           # Local memory
           if (${entry} MATCHES "lmem([ ]+)=([ ]+)([^ ]+)")
-            string(REGEX REPLACE ".*([ ]+)=([ ]+)([^ ]+)" "\\3" entry ${entry})
+            set(entry "${CMAKE_MATCH_3}")
             message("Local:     ${entry}")
           endif()
 
           # Shared memory
           if (${entry} MATCHES "smem([ ]+)=([ ]+)([^ ]+)")
-            string(REGEX REPLACE ".*([ ]+)=([ ]+)([^ ]+)" "\\3" entry ${entry})
+            set(entry "${CMAKE_MATCH_3}")
             message("Shared:    ${entry}")
           endif()
 
diff --git a/cmake/FindCUDA/run_nvcc.cmake b/cmake/FindCUDA/run_nvcc.cmake
index f0aac8487a..abdd3079e1 100644
--- a/cmake/FindCUDA/run_nvcc.cmake
+++ b/cmake/FindCUDA/run_nvcc.cmake
@@ -62,7 +62,7 @@ set(cmake_dependency_file "@cmake_dependency_file@") # path
 set(CUDA_make2cmake "@CUDA_make2cmake@") # path
 set(CUDA_parse_cubin "@CUDA_parse_cubin@") # path
 set(build_cubin @build_cubin@) # bool
-set(CUDA_HOST_COMPILER "@CUDA_HOST_COMPILER@") # bool
+set(CUDA_HOST_COMPILER "@CUDA_HOST_COMPILER@") # path
 # We won't actually use these variables for now, but we need to set this, in
 # order to force this file to be run again if it changes.
 set(generated_file_path "@generated_file_path@") # path
@@ -106,7 +106,7 @@ list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}})
 # Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority
 list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 )
 list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 )
-if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 )
+if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 AND CUDA_HOST_COMPILER )
   if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN)
     set(CCBIN -ccbin "${CCBIN}")
   else()
@@ -126,7 +126,7 @@ endif()
 # and other return variables are present after executing the process.
 macro(cuda_execute_process status command)
   set(_command ${command})
-  if(NOT _command STREQUAL "COMMAND")
+  if(NOT "x${_command}" STREQUAL "xCOMMAND")
     message(FATAL_ERROR "Malformed call to cuda_execute_process.  Missing COMMAND as second argument. (command = ${command})")
   endif()
   if(verbose)