cuda中使用--wrap

1 前言

在调查cuda内存问题时可能有以下需求:

  1. 跟踪对应内存块的分配位置
  2. 跟踪cuda runtime调用参数及结果

2 原理

链接器 ld 支持名为 --wrap=symbol 的选项:对 symbol 的任何未定义引用都将解析为 __wrap_symbol,而对 __real_symbol 的任何未定义引用都将解析为原始的 symbol。这可用于为系统函数提供包装器:包装函数应命名为 __wrap_symbol;如果希望调用真正的系统函数,则应在包装函数中调用 __real_symbol。

以下是一个简单的示例:

1
2
3
4
/* The linker resolves __real_malloc to the genuine malloc when linking
 * with --wrap=malloc; it must be declared before use for the example to
 * compile. */
void* __real_malloc(size_t c);

/* Wrapper installed via -Wl,--wrap=malloc: logs the requested size, then
 * forwards to the real allocator and returns its result unchanged. */
void* __wrap_malloc(size_t c) {
    printf("malloc called with %zu\n", c);
    return __real_malloc(c);
}

如果使用 --wrap=malloc 选项将其他代码与此文件进行链接,那么对 malloc 的所有调用都将转而调用 __wrap_malloc 函数;而在 __wrap_malloc 中调用 __real_malloc,则会调用真正的 malloc 函数。

3 使用

3.1 添加编译选项

在CMakeLists.txt中添加以下选项

1
2
3
4
5
6
7
8
9
10
11
12
# nvcc does not understand -Wl,... itself; forward unknown options to the
# host linker so the --wrap requests below reach ld.
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --forward-unknown-to-host-linker")
# Wrap the synchronous allocation / copy entry points.
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wl,-wrap=cudaMalloc -Wl,-wrap=cudaMallocManaged -Wl,-wrap=cudaMallocHost -Wl,-wrap=cudaHostAlloc -Wl,-wrap=cudaMemcpy ")
# Wrap memset, the stream-ordered (Async) entry points and kernel launch.
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wl,-wrap=cudaMemset -Wl,-wrap=cudaMallocAsync -Wl,-wrap=cudaMemcpyAsync -Wl,-wrap=cudaMemsetAsync -Wl,-wrap=cudaLaunchKernel")
# Wrap the _ptsz variants emitted when compiling with
# "--default-stream per-thread" (see cuda_runtime_api.h).
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wl,-wrap=cudaMallocAsync_ptsz -Wl,-wrap=cudaMemcpyAsync_ptsz -Wl,-wrap=cudaMemsetAsync_ptsz -Wl,-wrap=cudaLaunchKernel_ptsz")
# Wrap the free entry points.
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wl,-wrap=cudaFree -Wl,-wrap=cudaFreeHost -Wl,-wrap=cudaFreeAsync -Wl,-wrap=cudaFreeAsync_ptsz ")


# Apply the same wrapping to host-only C++ translation units, so CUDA
# runtime calls made from plain .cpp files are intercepted as well.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-wrap=cudaMalloc -Wl,-wrap=cudaMallocManaged -Wl,-wrap=cudaMallocHost -Wl,-wrap=cudaHostAlloc -Wl,-wrap=cudaMemcpy ")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-wrap=cudaMemset -Wl,-wrap=cudaMallocAsync -Wl,-wrap=cudaMemcpyAsync -Wl,-wrap=cudaMemsetAsync -Wl,-wrap=cudaLaunchKernel")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-wrap=cudaMallocAsync_ptsz -Wl,-wrap=cudaMemcpyAsync_ptsz -Wl,-wrap=cudaMemsetAsync_ptsz -Wl,-wrap=cudaLaunchKernel_ptsz")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-wrap=cudaFree -Wl,-wrap=cudaFreeHost -Wl,-wrap=cudaFreeAsync -Wl,-wrap=cudaFreeAsync_ptsz ")

3.2 添加代码

在任意cpp代码中添加如下内容:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#include "stdio.h" // printf-based logging
#include "unistd.h" // syscall() to obtain the thread id
#include "sys/syscall.h" // SYS_gettid

#include "cuda.h" // CUDA driver API
#include "cuda_runtime_api.h" // CUDA runtime API types (cudaError_t, cudaStream_t, dim3, ...)


// Declarations for the ld --wrap mechanism. For every wrapped symbol S:
//   - calls to S in other objects resolve to __wrap_S (defined below);
//   - __real_S resolves to the genuine CUDA runtime function.
// C linkage is required so the symbol names match what the linker rewrites.
extern "C" {
// Wrappers we define in this file.
cudaError_t __wrap_cudaFree ( void* devPtr );
cudaError_t __wrap_cudaFreeHost ( void* ptr );
cudaError_t __wrap_cudaMalloc ( void** devPtr, size_t size );
cudaError_t __wrap_cudaMallocManaged ( void** devPtr, size_t size, unsigned int flags);
cudaError_t __wrap_cudaMallocHost ( void** ptr, size_t size );
cudaError_t __wrap_cudaHostAlloc ( void** pHost, size_t size, unsigned int flags );
cudaError_t __wrap_cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind );
cudaError_t __wrap_cudaMemset ( void* devPtr, int value, size_t count );
cudaError_t __wrap_cudaFreeAsync ( void* devPtr, cudaStream_t hStream );
cudaError_t __wrap_cudaMallocAsync ( void** devPtr, size_t size, cudaStream_t hStream );
cudaError_t __wrap_cudaMemcpyAsync ( void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream );
cudaError_t __wrap_cudaMemsetAsync ( void* devPtr, int value, size_t count, cudaStream_t stream );
cudaError_t __wrap_cudaLaunchKernel ( const void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, cudaStream_t stream );
// _ptsz variants: used when code is compiled with "--default-stream per-thread".
cudaError_t __wrap_cudaFreeAsync_ptsz ( void* devPtr, cudaStream_t hStream );
cudaError_t __wrap_cudaMallocAsync_ptsz ( void** devPtr, size_t size, cudaStream_t hStream );
cudaError_t __wrap_cudaMemcpyAsync_ptsz ( void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream );
cudaError_t __wrap_cudaMemsetAsync_ptsz ( void* devPtr, int value, size_t count, cudaStream_t stream );
cudaError_t __wrap_cudaLaunchKernel_ptsz ( const void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, cudaStream_t stream );

// Real CUDA runtime entry points, resolved by the linker.
cudaError_t __real_cudaFree ( void* devPtr );
cudaError_t __real_cudaFreeHost ( void* ptr );
cudaError_t __real_cudaMalloc ( void** devPtr, size_t size );
cudaError_t __real_cudaMallocHost ( void** ptr, size_t size );
cudaError_t __real_cudaMallocManaged ( void** devPtr, size_t size, unsigned int flags);
cudaError_t __real_cudaHostAlloc ( void** pHost, size_t size, unsigned int flags );
cudaError_t __real_cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind );
cudaError_t __real_cudaMemset ( void* devPtr, int value, size_t count );
cudaError_t __real_cudaFreeAsync ( void* devPtr, cudaStream_t hStream );
cudaError_t __real_cudaMallocAsync ( void** devPtr, size_t size, cudaStream_t hStream );
cudaError_t __real_cudaMemcpyAsync ( void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream );
cudaError_t __real_cudaMemsetAsync ( void* devPtr, int value, size_t count, cudaStream_t stream );
cudaError_t __real_cudaLaunchKernel ( const void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, cudaStream_t stream );
cudaError_t __real_cudaMallocAsync_ptsz ( void** devPtr, size_t size, cudaStream_t hStream );
cudaError_t __real_cudaFreeAsync_ptsz ( void* devPtr, cudaStream_t hStream );
cudaError_t __real_cudaMemcpyAsync_ptsz ( void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream );
cudaError_t __real_cudaMemsetAsync_ptsz ( void* devPtr, int value, size_t count, cudaStream_t stream );
cudaError_t __real_cudaLaunchKernel_ptsz ( const void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, cudaStream_t stream );

};

// Wrapper for cudaFree (installed via -Wl,--wrap=cudaFree): forwards to the
// real implementation, then logs thread id, pointer and result string.
cudaError_t __wrap_cudaFree ( void* devPtr ) {
    const unsigned int threadId = (unsigned int)syscall(SYS_gettid);
    const cudaError_t status = __real_cudaFree(devPtr);
    printf("TID:%d __real_cudaFree with devPtr:%p, result:%s\n", threadId, devPtr, cudaGetErrorString(status));
    return status;
}

// Wrapper for cudaFreeHost: forwards to the real implementation, then logs
// thread id, host pointer and result string.
cudaError_t __wrap_cudaFreeHost ( void* ptr ) {
    const unsigned int threadId = (unsigned int)syscall(SYS_gettid);
    const cudaError_t status = __real_cudaFreeHost(ptr);
    printf("TID:%d __real_cudaFreeHost with ptr:%p, result:%s\n", threadId, ptr, cudaGetErrorString(status));
    return status;
}

// Wrapper for cudaMalloc: forwards to the real allocator, then logs thread
// id, returned device pointer, requested size and result string.
cudaError_t __wrap_cudaMalloc ( void** devPtr, size_t size ) {
    unsigned int tid = (unsigned int)syscall(SYS_gettid);
    cudaError_t result = __real_cudaMalloc(devPtr, size);
    // %zu is required for size_t and %u for the unsigned tid
    // (the previous %d is undefined behavior on LP64 for size_t).
    // NOTE(review): *devPtr may be indeterminate when the call fails.
    printf("TID:%u __real_cudaMalloc with *devPtr:%p size:%zu, result:%s\n",
           tid, *devPtr, size, cudaGetErrorString(result));
    return result;
}

// Wrapper for cudaMallocHost (pinned host allocation): forwards to the real
// implementation, then logs thread id, returned pointer, size and result.
cudaError_t __wrap_cudaMallocHost ( void** devPtr, size_t size ) {
    unsigned int tid = (unsigned int)syscall(SYS_gettid);
    cudaError_t result = __real_cudaMallocHost(devPtr, size);
    // %zu for size_t, %u for unsigned tid (was %d: UB for size_t on LP64).
    printf("TID:%u __real_cudaMallocHost with *devPtr:%p size:%zu, result:%s\n",
           tid, *devPtr, size, cudaGetErrorString(result));
    return result;
}

// Wrapper for cudaMallocManaged (unified memory): forwards to the real
// implementation, then logs thread id, pointer, size, flags and result.
cudaError_t __wrap_cudaMallocManaged ( void** devPtr, size_t size, unsigned int flags) {
    unsigned int tid = (unsigned int)syscall(SYS_gettid);
    cudaError_t result = __real_cudaMallocManaged(devPtr, size, flags);
    // %zu for size_t; %u for the unsigned tid and flags (was %d).
    printf("TID:%u __real_cudaMallocManaged with *devPtr:%p size:%zu flags:%u, result:%s\n",
           tid, *devPtr, size, flags, cudaGetErrorString(result));
    return result;
}


// Wrapper for cudaHostAlloc: forwards to the real implementation, then logs
// thread id, returned host pointer, size, flags and result.
cudaError_t __wrap_cudaHostAlloc ( void** pHost, size_t size, unsigned int flags ) {
    unsigned int tid = (unsigned int)syscall(SYS_gettid);
    cudaError_t result = __real_cudaHostAlloc(pHost, size, flags);
    // Fixes: log said "__real_cudaMallocHost" (copy-paste error), "flags"
    // was missing its ':', and %d was used for size_t/unsigned values.
    printf("TID:%u __real_cudaHostAlloc with *pHost:%p size:%zu flags:%u, result:%s\n",
           tid, *pHost, size, flags, cudaGetErrorString(result));
    return result;
}

// Wrapper for cudaMemcpy: forwards to the real implementation, then logs
// thread id, source/destination pointers, byte count, direction and result.
cudaError_t __wrap_cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind ) {
    unsigned int tid = (unsigned int)syscall(SYS_gettid);
    cudaError_t result = __real_cudaMemcpy(dst, src, count, kind);
    // %zu for the size_t count (was %d: UB on LP64); kind cast to int for %d.
    printf("TID:%u __real_cudaMemcpy with dst:%p src:%p count:%zu kind:%d, result:%s\n",
           tid, dst, src, count, (int)kind, cudaGetErrorString(result));
    return result;
}

// Wrapper for cudaMemset: forwards to the real implementation, then logs
// thread id, pointer, fill byte, count and result.
cudaError_t __wrap_cudaMemset ( void* devPtr, int value, size_t count ) {
    unsigned int tid = (unsigned int)syscall(SYS_gettid);
    cudaError_t result = __real_cudaMemset(devPtr, value, count);
    // %zu for the size_t count (was %d: UB on LP64).
    printf("TID:%u __real_cudaMemset with devPtr:%p value:%d count:%zu, result:%s\n",
           tid, devPtr, value, count, cudaGetErrorString(result));
    return result;
}

// Wrapper for cudaFreeAsync (stream-ordered free): forwards to the real
// implementation, then logs thread id, pointer, stream and result.
cudaError_t __wrap_cudaFreeAsync ( void* devPtr, cudaStream_t hStream ) {
    const unsigned int threadId = (unsigned int)syscall(SYS_gettid);
    const cudaError_t status = __real_cudaFreeAsync(devPtr, hStream);
    printf("TID:%d __real_cudaFreeAsync with devPtr:%p stream:%p, result:%s\n", threadId, devPtr, hStream, cudaGetErrorString(status));
    return status;
}

// Wrapper for cudaMemcpyAsync: forwards to the real implementation, then
// logs thread id, pointers, byte count, direction, stream and result.
cudaError_t __wrap_cudaMemcpyAsync ( void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream ) {
    unsigned int tid = (unsigned int)syscall(SYS_gettid);
    cudaError_t result = __real_cudaMemcpyAsync(dst, src, count, kind, stream);
    // %zu for the size_t count (was %d); stream cast to void* for %p.
    printf("TID:%u __real_cudaMemcpyAsync with dst:%p src:%p count:%zu kind:%d stream:%p, result:%s\n",
           tid, dst, src, count, (int)kind, (void*)stream, cudaGetErrorString(result));
    return result;
}

// Wrapper for cudaMallocAsync (stream-ordered allocation): forwards to the
// real implementation, then logs thread id, pointer, size, stream, result.
cudaError_t __wrap_cudaMallocAsync ( void** devPtr, size_t size, cudaStream_t hStream ) {
    unsigned int tid = (unsigned int)syscall(SYS_gettid);
    cudaError_t result = __real_cudaMallocAsync(devPtr, size, hStream);
    // %zu for the size_t size (was %d); stream cast to void* for %p.
    // NOTE(review): *devPtr may be indeterminate when the call fails.
    printf("TID:%u __real_cudaMallocAsync with *devPtr:%p size:%zu hStream:%p, result:%s\n",
           tid, *devPtr, size, (void*)hStream, cudaGetErrorString(result));
    return result;
}

// Wrapper for cudaMemsetAsync: forwards to the real implementation, then
// logs thread id, pointer, fill byte, count, stream and result.
cudaError_t __wrap_cudaMemsetAsync ( void* devPtr, int value, size_t count, cudaStream_t stream ) {
    unsigned int tid = (unsigned int)syscall(SYS_gettid);
    cudaError_t result = __real_cudaMemsetAsync(devPtr, value, count, stream);
    // %zu for the size_t count (was %d); stream cast to void* for %p.
    printf("TID:%u __real_cudaMemsetAsync with devPtr:%p value:%d count:%zu stream:%p, result:%s\n",
           tid, devPtr, value, count, (void*)stream, cudaGetErrorString(result));
    return result;
}

// Wrapper for cudaLaunchKernel: forwards to the real implementation, then
// logs thread id, function pointer, launch geometry, shared memory, stream
// and result.
cudaError_t __wrap_cudaLaunchKernel ( const void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, cudaStream_t stream ) {
    unsigned int tid = (unsigned int)syscall(SYS_gettid);
    cudaError_t result = __real_cudaLaunchKernel(func, gridDim, blockDim, args, sharedMem, stream);
    // Fixes: log wrongly said "__real_cudaLaunchKernel_ptsz" (copy-paste from
    // the _ptsz variant); %zu for size_t sharedMem; stream cast for %p.
    printf("TID:%u __real_cudaLaunchKernel with func:%p gridDim(%d,%d,%d) blockDim(%d,%d,%d) sharedMem:%zu stream:%p, result:%s\n",
           tid, func, gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z, sharedMem, (void*)stream, cudaGetErrorString(result));
    return result;
}

// Wrapper for cudaFreeAsync_ptsz (per-thread default-stream variant):
// forwards to the real implementation, then logs pointer, stream and result.
cudaError_t __wrap_cudaFreeAsync_ptsz ( void* devPtr, cudaStream_t hStream ) {
    const unsigned int threadId = (unsigned int)syscall(SYS_gettid);
    const cudaError_t status = __real_cudaFreeAsync_ptsz(devPtr, hStream);
    printf("TID:%d __real_cudaFreeAsync_ptsz with devPtr:%p stream:%p, result:%s\n", threadId, devPtr, hStream, cudaGetErrorString(status));
    return status;
}

// Wrapper for cudaMemcpyAsync_ptsz (per-thread default-stream variant):
// forwards to the real implementation, then logs the call parameters.
cudaError_t __wrap_cudaMemcpyAsync_ptsz ( void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream ) {
    unsigned int tid = (unsigned int)syscall(SYS_gettid);
    cudaError_t result = __real_cudaMemcpyAsync_ptsz(dst, src, count, kind, stream);
    // %zu for the size_t count (was %d); stream cast to void* for %p.
    printf("TID:%u __real_cudaMemcpyAsync_ptsz with dst:%p src:%p count:%zu kind:%d stream:%p, result:%s\n",
           tid, dst, src, count, (int)kind, (void*)stream, cudaGetErrorString(result));
    return result;
}

// Wrapper for cudaMallocAsync_ptsz (per-thread default-stream variant):
// forwards to the real implementation, then logs the call parameters.
cudaError_t __wrap_cudaMallocAsync_ptsz ( void** devPtr, size_t size, cudaStream_t hStream ) {
    unsigned int tid = (unsigned int)syscall(SYS_gettid);
    cudaError_t result = __real_cudaMallocAsync_ptsz(devPtr, size, hStream);
    // %zu for the size_t size (was %d); stream cast to void* for %p.
    // NOTE(review): *devPtr may be indeterminate when the call fails.
    printf("TID:%u __real_cudaMallocAsync_ptsz with *devPtr:%p size:%zu hStream:%p, result:%s\n",
           tid, *devPtr, size, (void*)hStream, cudaGetErrorString(result));
    return result;
}

// Wrapper for cudaMemsetAsync_ptsz (per-thread default-stream variant):
// forwards to the real implementation, then logs the call parameters.
cudaError_t __wrap_cudaMemsetAsync_ptsz ( void* devPtr, int value, size_t count, cudaStream_t stream ) {
    unsigned int tid = (unsigned int)syscall(SYS_gettid);
    cudaError_t result = __real_cudaMemsetAsync_ptsz(devPtr, value, count, stream);
    // %zu for the size_t count (was %d); stream cast to void* for %p.
    printf("TID:%u __real_cudaMemsetAsync_ptsz with devPtr:%p value:%d count:%zu stream:%p, result:%s\n",
           tid, devPtr, value, count, (void*)stream, cudaGetErrorString(result));
    return result;
}

// Wrapper for cudaLaunchKernel_ptsz (per-thread default-stream variant):
// forwards to the real implementation, then logs thread id, function
// pointer, launch geometry, shared memory, stream and result.
cudaError_t __wrap_cudaLaunchKernel_ptsz ( const void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, cudaStream_t stream ) {
    unsigned int tid = (unsigned int)syscall(SYS_gettid);
    cudaError_t result = __real_cudaLaunchKernel_ptsz(func, gridDim, blockDim, args, sharedMem, stream);
    // %zu for the size_t sharedMem (was %d); stream cast to void* for %p.
    printf("TID:%u __real_cudaLaunchKernel_ptsz with func:%p gridDim(%d,%d,%d) blockDim(%d,%d,%d) sharedMem:%zu stream:%p, result:%s\n",
           tid, func, gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z, sharedMem, (void*)stream, cudaGetErrorString(result));
    return result;
}
  1. 关于 "_ptsz" 函数后缀:该行为与 "--default-stream per-thread" 编译选项有关。启用该选项后,Async 类函数在编译时会被替换成带 "_ptsz" 后缀的版本,详情请参考 cuda_runtime_api.h。
  2. 根据需要可以添加新的内容,但注意编译选项也需要做相应修改。