Commit 58e6e38e authored by Recolic Keghart

adjust cpu/gpu switch scripts

parent 0d2e7121
@@ -5,20 +5,18 @@ NVCC ?= /usr/local/cuda-10.2/bin/nvcc
# requires: cuda 10.2
SM ?= 50
+TEST_DEV ?= GPU
OMP_FLAGS = -Xcompiler -fopenmp
-NVFLAGS = -I. -O3 -std=c++14 -arch sm_$(SM) --relocatable-device-code=true --extended-lambda $(OMP_FLAGS) -g
# --cudart static
# --gpu-architecture=sm_50' is equivalent to 'nvcc --gpu-architecture=compute_50
# --gpu-code=sm_50,compute_50'.
+NVFLAGS = -I. -O3 -std=c++14 -DCUDA_SM=$(SM) -arch sm_$(SM) --relocatable-device-code=true --extended-lambda $(OMP_FLAGS) -g -DTEST_DEV=$(TEST_DEV)
default:
$(NVCC) $(NVFLAGS) gpma_bfs_demo.cu -o gpma_bfs_demo -lgomp
-MINI_TEST_DEV ?= GPU
mini:
-$(NVCC) $(NVFLAGS) mini.cu -o mini -lgomp -DDEBUG -DTEST_DEV=$(MINI_TEST_DEV)
+$(NVCC) $(NVFLAGS) mini.cu -o mini -lgomp -DDEBUG
format:
clang-format --style=file -i *.cuh *.cu *.hpp
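
For context on the new flags: TEST_DEV and CUDA_SM reach the sources as plain preprocessor defines (-DTEST_DEV=$(TEST_DEV), -DCUDA_SM=$(SM)). A minimal sketch of how such defines can be consumed, assuming a dev_type_t enum like the one used elsewhere in this diff (the defaults below are our illustration, not the project's code):

enum dev_type_t { CPU, GPU };

#ifndef TEST_DEV
#define TEST_DEV GPU // mirrors `TEST_DEV ?= GPU` in the Makefile
#endif
#ifndef CUDA_SM
#define CUDA_SM 50 // mirrors `SM ?= 50` in the Makefile
#endif

constexpr dev_type_t test_dev = TEST_DEV; // usable as `if (test_dev == GPU) ...`

One caveat worth knowing: a preprocessor test such as `#if TEST_DEV == CPU` (used later in this diff) compares macro tokens, not enum values, so it only distinguishes the two devices if CPU and GPU are themselves visible to the preprocessor; bare identifiers evaluate to 0 inside `#if`.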
@@ -762,9 +762,8 @@ __global__ void rebalancing_kernel(SIZE_TYPE unique_update_size, SIZE_TYPE seg_l
BLOCKS_NUM = CALC_BLOCKS_NUM(THREADS_NUM, interval_size);
memcpy_kernel<KEY_TYPE><<<BLOCKS_NUM, THREADS_NUM>>>(tmp_keys + (*compacted_size), update_keys + interval_a, interval_size);
memcpy_kernel<VALUE_TYPE><<<BLOCKS_NUM, THREADS_NUM>>>(tmp_values + (*compacted_size), update_values + interval_a, interval_size);
//printf("DETAIL DEBUG 4: values[0:6] = %llu %llu %llu %llu %llu %llu\n", values[0], values[1], values[2], values[3], values[4], values[5]);
//printf("DETAIL DEBUG 4: update_values[0:6] = %llu %llu %llu %llu %llu %llu\n", update_values[0], update_values[1], update_values[2], update_values[3], update_values[4], update_values[5]);
//printf("DETAIL DEBUG 4: values[0:6] = %llu %llu %llu %llu %llu %llu\n", values[0], values[1], values[2], values[3], values[4], values[5]);
//printf("DETAIL DEBUG 4: update_values[0:6] = %llu %llu %llu %llu %llu %llu\n", update_values[0], update_values[1], update_values[2], update_values[3], update_values[4], update_values[5]);
// set SIZE_NONE for executed updates
memset_kernel<SIZE_TYPE><<<BLOCKS_NUM, THREADS_NUM>>>(update_nodes + interval_a, SIZE_NONE, interval_size);
@@ -772,9 +771,8 @@ __global__ void rebalancing_kernel(SIZE_TYPE unique_update_size, SIZE_TYPE seg_l
// warning: confusing naming by the original GPU author: the sub-function's `keys` parameter is tmp_keys and its `values` parameter is tmp_values; i.e., this sorts tmp_keys+tmp_values and copies the result into key+value.
cub_sort_key_value(tmp_keys, tmp_values, merge_size, key, value);
//printf("DETAIL DEBUG 3: values[0:6] = %llu %llu %llu %llu %llu %llu\n", values[0], values[1], values[2], values[3], values[4], values[5]);
//printf("DETAIL DEBUG 3: tmp_values[0:6] = %llu %llu %llu %llu %llu %llu\n", tmp_values[0], tmp_values[1], tmp_values[2], tmp_values[3], tmp_values[4], tmp_values[5]);
//printf("DETAIL DEBUG 3: values[0:6] = %llu %llu %llu %llu %llu %llu\n", values[0], values[1], values[2], values[3], values[4], values[5]);
//printf("DETAIL DEBUG 3: tmp_values[0:6] = %llu %llu %llu %llu %llu %llu\n", tmp_values[0], tmp_values[1], tmp_values[2], tmp_values[3], tmp_values[4], tmp_values[5]);
// re-dispatch
BLOCKS_NUM = CALC_BLOCKS_NUM(THREADS_NUM, update_width);
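
The warning above is purely about parameter naming: the sort's inputs are the tmp_* buffers and its outputs are key/value. A minimal sketch of that sort-then-copy contract using CUB's DeviceRadixSort (the wrapper below is our illustration, not necessarily how cub_sort_key_value is implemented):

#include <cub/cub.cuh>

// Sort (in_keys, in_values) as key-value pairs and write the sorted result
// into (out_keys, out_values); the input buffers are left untouched.
template <typename K, typename V>
void sort_pairs_into(const K *in_keys, const V *in_values, int n, K *out_keys, V *out_values) {
    void *d_temp = nullptr;
    size_t temp_bytes = 0;
    // First call only computes the required scratch size; second call sorts.
    cub::DeviceRadixSort::SortPairs(d_temp, temp_bytes, in_keys, out_keys, in_values, out_values, n);
    cudaMalloc(&d_temp, temp_bytes);
    cub::DeviceRadixSort::SortPairs(d_temp, temp_bytes, in_keys, out_keys, in_values, out_values, n);
    cudaFree(d_temp);
}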
@@ -798,9 +796,9 @@ template <dev_type_t DEV>
void rebalance_batch(SIZE_TYPE level, SIZE_TYPE seg_length, KEY_TYPE *keys, VALUE_TYPE *values, SIZE_TYPE *update_nodes, KEY_TYPE *update_keys, VALUE_TYPE *update_values, SIZE_TYPE update_size, SIZE_TYPE *unique_update_nodes, SIZE_TYPE *update_offset, SIZE_TYPE unique_update_size, SIZE_TYPE lower_bound, SIZE_TYPE upper_bound, SIZE_TYPE *row_offset) {
// TryInsert+ is this function.
SIZE_TYPE update_width = seg_length << level; // real seg_length of this level
-if (false && update_width <= 1024) {
-assert(IsPowerOfTwo(update_width));
-if (DEV == GPU) {
+if (DEV == GPU) {
+if (update_width <= 1024) {
+assert(IsPowerOfTwo(update_width));
// func pointer for each template
decltype(&block_rebalancing_kernel<1, 1>) func_arr[10];
func_arr[0] = block_rebalancing_kernel<2, 1>;
@@ -821,33 +819,14 @@ void rebalance_batch(SIZE_TYPE level, SIZE_TYPE seg_length, KEY_TYPE *keys, VALU
DEBUG_PRINTFLN("ARG-=DEBUG: block_rebalancing_kernel calling func_arr[{}]<<<{}, {}>>>, update_width={}", fls(update_width) - 2, BLOCKS_NUM, THREADS_NUM, update_width);
func_arr[fls(update_width) - 2]<<<BLOCKS_NUM, THREADS_NUM>>>(seg_length, level, keys, values, update_nodes, update_keys, update_values, unique_update_nodes, update_offset, lower_bound, upper_bound, row_offset);
} else {
-// func pointer for each template
-decltype(&block_rebalancing_cpu<1>) func_arr[10];
-func_arr[0] = block_rebalancing_cpu<2>;
-func_arr[1] = block_rebalancing_cpu<4>;
-func_arr[2] = block_rebalancing_cpu<8>;
-func_arr[3] = block_rebalancing_cpu<16>;
-func_arr[4] = block_rebalancing_cpu<32>;
-func_arr[5] = block_rebalancing_cpu<64>;
-func_arr[6] = block_rebalancing_cpu<128>;
-func_arr[7] = block_rebalancing_cpu<256>;
-func_arr[8] = block_rebalancing_cpu<512>;
-func_arr[9] = block_rebalancing_cpu<1024>;
-DEBUG_PRINTFLN("ARG-=DEBUG: CPU block_rebalancing_cpu calling func_arr[{}], update_width={}", fls(update_width) - 2, update_width);
-func_arr[fls(update_width) - 2](unique_update_size, seg_length, level, keys, values, update_nodes, update_keys, update_values, unique_update_nodes, update_offset, lower_bound, upper_bound, row_offset);
-}
-} else {
-if (DEV == GPU) {
// operate each tree node by cub-kernel (dynamic parallelism)
SIZE_TYPE BLOCKS_NUM = min(2048, unique_update_size);
DEBUG_PRINTFLN("ARG-=DEBUG: rebalance_batch calling rebalancing_kernel<<<{}, {}>>>, update_width={}", BLOCKS_NUM, 1, update_width);
rebalancing_kernel<<<BLOCKS_NUM, 1>>>(unique_update_size, seg_length, level, keys, values, update_nodes, update_keys, update_values, unique_update_nodes, update_offset, lower_bound, upper_bound, row_offset);
-} else {
-DEBUG_PRINTFLN("ARG-=DEBUG: CPU rebalance_batch calling rebalancing_impl_cpu, blocks={}, update_width={}", unique_update_size, update_width);
-rebalancing_impl_cpu(unique_update_size, seg_length, level, keys, values, update_nodes, update_keys, update_values, unique_update_nodes, update_offset, lower_bound, upper_bound, row_offset);
-}
+} else { // DEV == CPU
+DEBUG_PRINTFLN("ARG-=DEBUG: CPU rebalance_batch calling rebalancing_impl_cpu, blocks={}, update_width={}", unique_update_size, update_width);
+rebalancing_impl_cpu(unique_update_size, seg_length, level, keys, values, update_nodes, update_keys, update_values, unique_update_nodes, update_offset, lower_bound, upper_bound, row_offset);
+}
anySync<DEV>(); // after previous kernel launch
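
A note on the func_arr indexing used above: update_width is a power of two in [2, 1024] at this point, and fls ("find last set") returns the 1-based position of the highest set bit, so fls(update_width) - 2 maps 2, 4, ..., 1024 onto slots 0..9. A self-contained illustration, assuming the usual fls semantics:

// fls(x): 1-based index of the highest set bit; fls(2) == 2, fls(1024) == 11.
static int fls(unsigned x) {
    int i = 0;
    while (x) { ++i; x >>= 1; }
    return i;
}
// For power-of-two update_width: fls(2) - 2 == 0 selects the <2, ...> instance,
// fls(4) - 2 == 1 selects <4, ...>, ..., fls(1024) - 2 == 9 selects <1024, ...>.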
@@ -253,16 +253,16 @@ __host__ void gpma_bfs(KEY_TYPE *keys, VALUE_TYPE *values, SIZE_TYPE *row_offset
anyMemset<DEV>(results, 0, sizeof(SIZE_TYPE) * node_size);
SIZE_TYPE *bitmap;
-anyMalloc<DEV>((void**)&bitmap, sizeof(SIZE_TYPE) * ((node_size - 1) / 32 + 1));
+anyMalloc<DEV>((void **)&bitmap, sizeof(SIZE_TYPE) * ((node_size - 1) / 32 + 1));
anyMemset<DEV>(bitmap, 0, sizeof(SIZE_TYPE) * ((node_size - 1) / 32 + 1));
SIZE_TYPE *node_queue;
-anyMalloc<DEV>((void**)&node_queue, sizeof(SIZE_TYPE) * node_size);
+anyMalloc<DEV>((void **)&node_queue, sizeof(SIZE_TYPE) * node_size);
SIZE_TYPE *node_queue_offset;
-anyMalloc<DEV>((void**)&node_queue_offset, sizeof(SIZE_TYPE));
+anyMalloc<DEV>((void **)&node_queue_offset, sizeof(SIZE_TYPE));
SIZE_TYPE *edge_queue;
-anyMalloc<DEV>((void**)&edge_queue, sizeof(SIZE_TYPE) * edge_size);
+anyMalloc<DEV>((void **)&edge_queue, sizeof(SIZE_TYPE) * edge_size);
SIZE_TYPE *edge_queue_offset;
-anyMalloc<DEV>((void**)&edge_queue_offset, sizeof(SIZE_TYPE));
+anyMalloc<DEV>((void **)&edge_queue_offset, sizeof(SIZE_TYPE));
// init
SIZE_TYPE host_num[1];
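
Regarding the bitmap allocation above: it packs one visited bit per node into SIZE_TYPE words (32 bits wide, as the division by 32 implies), so it needs ceil(node_size / 32) words, which is what (node_size - 1) / 32 + 1 computes for node_size >= 1. As a standalone helper (the name is ours):

// ceil(node_size / 32) in integer arithmetic; valid for node_size >= 1.
static inline size_t bfs_bitmap_words(size_t node_size) {
    return (node_size - 1) / 32 + 1;
}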
@@ -281,11 +281,9 @@ __host__ void gpma_bfs(KEY_TYPE *keys, VALUE_TYPE *values, SIZE_TYPE *row_offset
SIZE_TYPE BLOCKS_NUM = CALC_BLOCKS_NUM(THREADS_NUM, host_num[0]);
host_num[0] = 0;
anyMemcpy<CPU, DEV>(edge_queue_offset, host_num, sizeof(SIZE_TYPE));
-if(DEV == GPU) {
+if (DEV == GPU) {
gpma_bfs_gather_kernel<THREADS_NUM><<<BLOCKS_NUM, THREADS_NUM>>>(node_queue, node_queue_offset, edge_queue, edge_queue_offset, keys, values, row_offsets);
-}
-else {
+} else {
}
// contract
@@ -294,11 +292,9 @@ __host__ void gpma_bfs(KEY_TYPE *keys, VALUE_TYPE *values, SIZE_TYPE *row_offset
anyMemcpy<DEV, CPU>(host_num, edge_queue_offset, sizeof(SIZE_TYPE));
BLOCKS_NUM = CALC_BLOCKS_NUM(THREADS_NUM, host_num[0]);
-if(DEV == GPU) {
+if (DEV == GPU) {
gpma_bfs_contract_kernel<THREADS_NUM><<<BLOCKS_NUM, THREADS_NUM>>>(edge_queue, edge_queue_offset, node_queue, node_queue_offset, level, results, bitmap);
-}
-else {
+} else {
}
anyMemcpy<DEV, CPU>(host_num, node_queue_offset, sizeof(SIZE_TYPE));
@@ -38,8 +38,11 @@ int main(int argc, char **argv) {
char *data_path = argv[1];
int bfs_start_node = std::atoi(argv[2]);
-// cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1024ll * 1024 * 1024);
-// cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, 5);
+#if CUDA_SM >= 60
+// The heap-size limit is KNOWN to be required on SM_75 (Tesla T4) and SM_61 (Tesla P4), and KNOWN to be forbidden on SM_50 (GeForce 750).
+cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1024ll * 1024 * 1024);
+cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, 5);
+#endif
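
Since CUDA_SM is baked in at compile time from the Makefile's -DCUDA_SM=$(SM), a binary built for SM 50 but run on a newer card would skip these limits. A hedged alternative sketch that decides from the device actually present, using only standard CUDA runtime calls:

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
if (prop.major * 10 + prop.minor >= 60) {
    // same limits as above, applied only when the running GPU is SM >= 60
    cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1024ll * 1024 * 1024);
    cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, 5);
}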
thrust::host_vector<int> host_x;
thrust::host_vector<int> host_y;
@@ -53,15 +56,15 @@
h_base_keys[i] = ((KEY_TYPE)host_x[i] << 32) + host_y[i];
}
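
The loop above packs each edge (x, y) into a single 64-bit key: source vertex in the high 32 bits, destination in the low 32 bits. For illustration, the inverse (helper names are ours; KEY_TYPE is assumed to be a 64-bit unsigned integer):

static inline unsigned key_src(KEY_TYPE k) { return (unsigned)(k >> 32); }         // high 32 bits
static inline unsigned key_dst(KEY_TYPE k) { return (unsigned)(k & 0xffffffffu); } // low 32 bits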
-NATIVE_VEC_KEY<CPU> base_keys = h_base_keys;
-NATIVE_VEC_VALUE<CPU> base_values(half, 1);
+NATIVE_VEC_KEY<TEST_DEV> base_keys = h_base_keys;
+NATIVE_VEC_VALUE<TEST_DEV> base_values(half, 1);
cudaDeviceSynchronize();
int num_slide = 100;
int step = half / num_slide;
LOG_TIME("before init_csr_gpma")
-GPMA<CPU> gpma(node_size);
+GPMA<TEST_DEV> gpma(node_size);
cudaDeviceSynchronize();
LOG_TIME("before update_gpma 1")
@@ -69,9 +72,12 @@
thrust::device_vector<SIZE_TYPE> bfs_result(node_size);
cudaDeviceSynchronize();
LOG_TIME("before first bfs")
{
auto gpma_mirror = gpma.mirror();
LOG_TIME("before first bfs") {
auto gpma_mirror = gpma
#if TEST_DEV == CPU
.mirror()
#endif
;
gpma_bfs<GPU>(RAW_PTR(gpma_mirror.keys), RAW_PTR(gpma_mirror.values), RAW_PTR(gpma_mirror.row_offset), node_size, edge_size, bfs_start_node, RAW_PTR(bfs_result));
}
int reach_nodes = node_size - thrust::count(bfs_result.begin(), bfs_result.end(), 0);
@@ -89,10 +95,10 @@ int main(int argc, char **argv) {
hk[j + step] = ((KEY_TYPE)host_x[idx] << 32) + host_y[idx];
}
-NATIVE_VEC_VALUE<CPU> update_values(step * 2);
+NATIVE_VEC_VALUE<TEST_DEV> update_values(step * 2);
thrust::fill(update_values.begin(), update_values.begin() + step, 1);
thrust::fill(update_values.begin() + step, update_values.end(), VALUE_NONE);
-NATIVE_VEC_KEY<CPU> update_keys = hk;
+NATIVE_VEC_KEY<TEST_DEV> update_keys = hk;
cudaDeviceSynchronize();
update_gpma(gpma, update_keys, update_values);
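
For orientation, the batch built above implements one sliding-window step: the first `step` entries insert new edges (value 1) and the second `step` entries remove expired ones, with VALUE_NONE inferred from this demo to be the deletion sentinel. A sketch of the value-side construction (the sentinel's concrete value is an assumption):

#include <thrust/host_vector.h>
#include <thrust/fill.h>

typedef unsigned long long VALUE_TYPE;        // assumption: 64-bit value type
const VALUE_TYPE VALUE_NONE = (VALUE_TYPE)-1; // assumption: the library's sentinel

// values = [ 1 ... 1 | VALUE_NONE ... VALUE_NONE ], `step` entries each
thrust::host_vector<VALUE_TYPE> make_update_values(int step) {
    thrust::host_vector<VALUE_TYPE> v(step * 2);
    thrust::fill(v.begin(), v.begin() + step, 1);        // insertions
    thrust::fill(v.begin() + step, v.end(), VALUE_NONE); // deletions
    return v;
}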
@@ -102,7 +108,11 @@ int main(int argc, char **argv) {
LOG_TIME("before second bfs")
{
-auto gpma_mirror = gpma.mirror();
+auto gpma_mirror = gpma
+#if TEST_DEV == CPU
+.mirror()
+#endif
+;
gpma_bfs<GPU>(RAW_PTR(gpma_mirror.keys), RAW_PTR(gpma_mirror.values), RAW_PTR(gpma_mirror.row_offset), node_size, edge_size, bfs_start_node, RAW_PTR(bfs_result));
}
reach_nodes = node_size - thrust::count(bfs_result.begin(), bfs_result.end(), 0);
#!/usr/bin/fish
# Smoke-test both backends: rebuild gpma_bfs_demo with TEST_DEV set to CPU or
# GPU, run the BFS demo, and check that the expected result value 1334630
# appears in the output.
function t
    make TEST_DEV=$argv[1]
    and ./gpma_bfs_demo /dataset/999999999.pokec.txt 0 | tee /dev/fd/2 2>| grep 1334630
    set ret $status
    test $ret = 0 ; and echo $argv[1] OK ; or echo $argv[1] FAILED
    return $ret
end
t CPU
and t GPU
exit $status
@@ -87,7 +87,7 @@ __host__ __device__ void anyFree<GPU>(void *ptr) {
template <dev_type_t DEV>
void anyMemset(void *dst, int value, size_t count) {
-if(DEV == GPU)
+if (DEV == GPU)
cErr(cudaMemset(dst, value, count));
else
memset(dst, value, count);
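
The if (DEV == GPU) above is a plain runtime branch on a template parameter, so both arms must compile for every instantiation; under the project's -std=c++14 that is the available idiom. With C++17 the same dispatch is often written with if constexpr; a sketch (ours, not the project's code):

#include <cstring>
#include <cuda_runtime.h>

enum dev_type_t { CPU, GPU }; // assumed to mirror the project's dev_type_t

template <dev_type_t DEV>
void any_memset_cxx17(void *dst, int value, size_t count) {
    if constexpr (DEV == GPU)
        cudaMemset(dst, value, count); // device memory
    else
        std::memset(dst, value, count); // host memory
}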
@@ -117,8 +117,6 @@ void anyRunLengthEncoding(const SIZE_TYPE *inputVec, SIZE_TYPE inputLen, SIZE_TY
} else {
*outputLen = rlib::cpu_rle_simple(inputVec, inputLen, outputVec, outputLenVec);
}
}
// Sometimes we need to call exsum from gpu code...