1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177
| #include "stdio.h" #include "unistd.h" #include "sys/syscall.h"
#include "cuda.h" #include "cuda_runtime_api.h"
extern "C" { cudaError_t __wrap_cudaFree ( void* devPtr ); cudaError_t __wrap_cudaFreeHost ( void* ptr ); cudaError_t __wrap_cudaMalloc ( void** devPtr, size_t size ); cudaError_t __wrap_cudaMallocManaged ( void** devPtr, size_t size, unsigned int flags); cudaError_t __wrap_cudaMallocHost ( void** ptr, size_t size ); cudaError_t __wrap_cudaHostAlloc ( void** pHost, size_t size, unsigned int flags ); cudaError_t __wrap_cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind ); cudaError_t __wrap_cudaMemset ( void* devPtr, int value, size_t count ); cudaError_t __wrap_cudaFreeAsync ( void* devPtr, cudaStream_t hStream ); cudaError_t __wrap_cudaMallocAsync ( void** devPtr, size_t size, cudaStream_t hStream ); cudaError_t __wrap_cudaMemcpyAsync ( void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream ); cudaError_t __wrap_cudaMemsetAsync ( void* devPtr, int value, size_t count, cudaStream_t stream ); cudaError_t __wrap_cudaLaunchKernel ( const void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, cudaStream_t stream ); cudaError_t __wrap_cudaFreeAsync_ptsz ( void* devPtr, cudaStream_t hStream ); cudaError_t __wrap_cudaMallocAsync_ptsz ( void** devPtr, size_t size, cudaStream_t hStream ); cudaError_t __wrap_cudaMemcpyAsync_ptsz ( void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream ); cudaError_t __wrap_cudaMemsetAsync_ptsz ( void* devPtr, int value, size_t count, cudaStream_t stream ); cudaError_t __wrap_cudaLaunchKernel_ptsz ( const void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, cudaStream_t stream );
cudaError_t __real_cudaFree ( void* devPtr ); cudaError_t __real_cudaFreeHost ( void* ptr ); cudaError_t __real_cudaMalloc ( void** devPtr, size_t size ); cudaError_t __real_cudaMallocHost ( void** ptr, size_t size ); cudaError_t __real_cudaMallocManaged ( void** devPtr, size_t size, unsigned int flags); cudaError_t __real_cudaHostAlloc ( void** pHost, size_t size, unsigned int flags ); cudaError_t __real_cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind ); cudaError_t __real_cudaMemset ( void* devPtr, int value, size_t count ); cudaError_t __real_cudaFreeAsync ( void* devPtr, cudaStream_t hStream ); cudaError_t __real_cudaMallocAsync ( void** devPtr, size_t size, cudaStream_t hStream ); cudaError_t __real_cudaMemcpyAsync ( void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream ); cudaError_t __real_cudaMemsetAsync ( void* devPtr, int value, size_t count, cudaStream_t stream ); cudaError_t __real_cudaLaunchKernel ( const void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, cudaStream_t stream ); cudaError_t __real_cudaMallocAsync_ptsz ( void** devPtr, size_t size, cudaStream_t hStream ); cudaError_t __real_cudaFreeAsync_ptsz ( void* devPtr, cudaStream_t hStream ); cudaError_t __real_cudaMemcpyAsync_ptsz ( void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream ); cudaError_t __real_cudaMemsetAsync_ptsz ( void* devPtr, int value, size_t count, cudaStream_t stream ); cudaError_t __real_cudaLaunchKernel_ptsz ( const void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, cudaStream_t stream );
};
cudaError_t __wrap_cudaFree ( void* devPtr ) { auto tid = (unsigned int)syscall(SYS_gettid); auto result = __real_cudaFree(devPtr); printf("TID:%d __real_cudaFree with devPtr:%p, result:%s\n", tid, devPtr, cudaGetErrorString(result ) ); return result; }
cudaError_t __wrap_cudaFreeHost ( void* ptr ) { auto tid = (unsigned int)syscall(SYS_gettid); auto result = __real_cudaFreeHost(ptr); printf("TID:%d __real_cudaFreeHost with ptr:%p, result:%s\n", tid, ptr, cudaGetErrorString(result ) ); return result; }
cudaError_t __wrap_cudaMalloc ( void** devPtr, size_t size ) { auto tid = (unsigned int)syscall(SYS_gettid); auto result = __real_cudaMalloc(devPtr, size); printf("TID:%d __real_cudaMalloc with *devPtr:%p size:%d, result:%s\n", tid, *devPtr, size, cudaGetErrorString(result ) ); return result; }
cudaError_t __wrap_cudaMallocHost ( void** devPtr, size_t size ) { auto tid = (unsigned int)syscall(SYS_gettid); auto result = __real_cudaMallocHost(devPtr, size); printf("TID:%d __real_cudaMallocHost with *devPtr:%p size:%d, result:%s\n", tid, *devPtr, size, cudaGetErrorString(result ) ); return result; }
cudaError_t __wrap_cudaMallocManaged ( void** devPtr, size_t size, unsigned int flags) { auto tid = (unsigned int)syscall(SYS_gettid); auto result = __real_cudaMallocManaged(devPtr, size, flags); printf("TID:%d __real_cudaMallocManaged with *devPtr:%p size:%d flags:%d, result:%s\n", tid, *devPtr, size, flags, cudaGetErrorString(result ) ); return result; }
cudaError_t __wrap_cudaHostAlloc ( void** pHost, size_t size, unsigned int flags ) { auto tid = (unsigned int)syscall(SYS_gettid); auto result = __real_cudaHostAlloc(pHost, size, flags); printf("TID:%d __real_cudaMallocHost with *pHost:%p size:%d flags%d, result:%s\n", tid, *pHost, size, flags, cudaGetErrorString(result ) ); return result; }
cudaError_t __wrap_cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind ) { auto tid = (unsigned int)syscall(SYS_gettid); auto result = __real_cudaMemcpy(dst, src, count, kind); printf("TID:%d __real_cudaMemcpy with dst:%p src:%p count:%d kind:%d, result:%s\n", tid, dst, src, count, kind, cudaGetErrorString(result ) ); return result; }
cudaError_t __wrap_cudaMemset ( void* devPtr, int value, size_t count ) { auto tid = (unsigned int)syscall(SYS_gettid); auto result = __real_cudaMemset(devPtr, value, count); printf("TID:%d __real_cudaMemset with devPtr:%p value:%d count:%d, result:%s\n", tid, devPtr, value, count, cudaGetErrorString(result ) ); return result; }
cudaError_t __wrap_cudaFreeAsync ( void* devPtr, cudaStream_t hStream ) { auto tid = (unsigned int)syscall(SYS_gettid); auto result = __real_cudaFreeAsync(devPtr, hStream); printf("TID:%d __real_cudaFreeAsync with devPtr:%p stream:%p, result:%s\n", tid, devPtr, hStream, cudaGetErrorString(result ) ); return result; }
cudaError_t __wrap_cudaMemcpyAsync ( void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream ) { auto tid = (unsigned int)syscall(SYS_gettid); auto result = __real_cudaMemcpyAsync(dst, src, count, kind, stream); printf("TID:%d __real_cudaMemcpyAsync with dst:%p src:%p count:%d kind:%d stream:%p, result:%s\n", tid, dst, src, count, kind, stream, cudaGetErrorString(result ) ); return result; }
cudaError_t __wrap_cudaMallocAsync ( void** devPtr, size_t size, cudaStream_t hStream ) { auto tid = (unsigned int)syscall(SYS_gettid); auto result = __real_cudaMallocAsync(devPtr, size, hStream); printf("TID:%d __real_cudaMallocAsync with *devPtr:%p size:%d hStream:%p, result:%s\n", tid, *devPtr, size, hStream, cudaGetErrorString(result ) ); return result; }
cudaError_t __wrap_cudaMemsetAsync ( void* devPtr, int value, size_t count, cudaStream_t stream ) { auto tid = (unsigned int)syscall(SYS_gettid); auto result = __real_cudaMemsetAsync(devPtr, value, count, stream); printf("TID:%d __real_cudaMemsetAsync with devPtr:%p value:%d count:%d stream:%p, result:%s\n", tid, devPtr, value, count, stream, cudaGetErrorString(result ) ); return result; }
cudaError_t __wrap_cudaLaunchKernel ( const void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, cudaStream_t stream ) { auto tid = (unsigned int)syscall(SYS_gettid); auto result = __real_cudaLaunchKernel(func, gridDim, blockDim, args, sharedMem, stream); printf("TID:%d __real_cudaLaunchKernel_ptsz with func:%p gridDim(%d,%d,%d) blockDim(%d,%d,%d) sharedMem:%d stream:%p, result:%s\n", tid, func, gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z, sharedMem, stream, cudaGetErrorString(result ) ); return result; }
cudaError_t __wrap_cudaFreeAsync_ptsz ( void* devPtr, cudaStream_t hStream ) { auto tid = (unsigned int)syscall(SYS_gettid); auto result = __real_cudaFreeAsync_ptsz(devPtr, hStream); printf("TID:%d __real_cudaFreeAsync_ptsz with devPtr:%p stream:%p, result:%s\n", tid, devPtr, hStream, cudaGetErrorString(result ) ); return result; }
cudaError_t __wrap_cudaMemcpyAsync_ptsz ( void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream ) { auto tid = (unsigned int)syscall(SYS_gettid); auto result = __real_cudaMemcpyAsync_ptsz(dst, src, count, kind, stream); printf("TID:%d __real_cudaMemcpyAsync_ptsz with dst:%p src:%p count:%d kind:%d stream:%p, result:%s\n", tid, dst, src, count, kind, stream, cudaGetErrorString(result ) ); return result; }
cudaError_t __wrap_cudaMallocAsync_ptsz ( void** devPtr, size_t size, cudaStream_t hStream ) { auto tid = (unsigned int)syscall(SYS_gettid); auto result = __real_cudaMallocAsync_ptsz(devPtr, size, hStream); printf("TID:%d __real_cudaMallocAsync_ptsz with *devPtr:%p size:%d hStream:%p, result:%s\n", tid, *devPtr, size, hStream, cudaGetErrorString(result ) ); return result; }
cudaError_t __wrap_cudaMemsetAsync_ptsz ( void* devPtr, int value, size_t count, cudaStream_t stream ) { auto tid = (unsigned int)syscall(SYS_gettid); auto result = __real_cudaMemsetAsync_ptsz(devPtr, value, count, stream); printf("TID:%d __real_cudaMemsetAsync_ptsz with devPtr:%p value:%d count:%d stream:%p, result:%s\n", tid, devPtr, value, count, stream, cudaGetErrorString(result ) ); return result; }
cudaError_t __wrap_cudaLaunchKernel_ptsz ( const void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, cudaStream_t stream ) { auto tid = (unsigned int)syscall(SYS_gettid); auto result = __real_cudaLaunchKernel_ptsz(func, gridDim, blockDim, args, sharedMem, stream); printf("TID:%d __real_cudaLaunchKernel_ptsz with func:%p gridDim(%d,%d,%d) blockDim(%d,%d,%d) sharedMem:%d stream:%p, result:%s\n", tid, func, gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z, sharedMem, stream, cudaGetErrorString(result ) ); return result; }
|