Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
G
gpma_bfs
Manage
Activity
Members
Labels
Plan
Issues
2
Issue boards
Milestones
Wiki
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
This is an archived project. Repository and other project resources are read-only.
Show more breadcrumbs
Recolic
gpma_bfs
Commits
58e6e38e
There was an error fetching the commit references. Please try again later.
Commit
58e6e38e
authored
5 years ago
by
Recolic Keghart
Browse files
Options
Downloads
Patches
Plain Diff
adjust cpu/gpu switch scripts
parent
0d2e7121
No related branches found
No related tags found
No related merge requests found
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
Makefile
+4
-6
4 additions, 6 deletions
Makefile
gpma.cuh
+10
-31
10 additions, 31 deletions
gpma.cuh
gpma_bfs.cuh
+9
-13
9 additions, 13 deletions
gpma_bfs.cuh
gpma_bfs_demo.cu
+21
-11
21 additions, 11 deletions
gpma_bfs_demo.cu
test.fish
+15
-0
15 additions, 0 deletions
test.fish
utils.cuh
+1
-3
1 addition, 3 deletions
utils.cuh
with
60 additions
and
64 deletions
Makefile
+
4
−
6
View file @
58e6e38e
...
...
@@ -5,20 +5,18 @@ NVCC ?= /usr/local/cuda-10.2/bin/nvcc
# requires: cuda 10.2
SM
?=
50
TEST_DEV
?=
GPU
OMP_FLAGS
=
-Xcompiler
-fopenmp
NVFLAGS
=
-I
.
-O3
-std
=
c++14
-arch
sm_
$(
SM
)
--relocatable-device-code
=
true
--extended-lambda
$(
OMP_FLAGS
)
-g
# --cudart static
# --gpu-architecture=sm_50' is equivalent to 'nvcc --gpu-architecture=compute_50
# --gpu-code=sm_50,compute_50'.
NVFLAGS
=
-I
.
-O3
-std
=
c++14
-DCUDA_SM
=
$(
SM
)
-arch
sm_
$(
SM
)
--relocatable-device-code
=
true
--extended-lambda
$(
OMP_FLAGS
)
-g
-DTEST_DEV
=
$(
TEST_DEV
)
default
:
$(
NVCC
)
$(
NVFLAGS
)
gpma_bfs_demo.cu
-o
gpma_bfs_demo
-lgomp
MINI_TEST_DEV
?=
GPU
mini
:
$(
NVCC
)
$(
NVFLAGS
)
mini.cu
-o
mini
-lgomp
-DDEBUG
-DTEST_DEV
=
$(
MINI_TEST_DEV
)
$(
NVCC
)
$(
NVFLAGS
)
mini.cu
-o
mini
-lgomp
-DDEBUG
format
:
clang-format
--style
=
file
-i
*
.cuh
*
.cu
*
.hpp
...
...
This diff is collapsed.
Click to expand it.
gpma.cuh
+
10
−
31
View file @
58e6e38e
...
...
@@ -762,9 +762,8 @@ __global__ void rebalancing_kernel(SIZE_TYPE unique_update_size, SIZE_TYPE seg_l
BLOCKS_NUM
=
CALC_BLOCKS_NUM
(
THREADS_NUM
,
interval_size
);
memcpy_kernel
<
KEY_TYPE
><<<
BLOCKS_NUM
,
THREADS_NUM
>>>
(
tmp_keys
+
(
*
compacted_size
),
update_keys
+
interval_a
,
interval_size
);
memcpy_kernel
<
VALUE_TYPE
><<<
BLOCKS_NUM
,
THREADS_NUM
>>>
(
tmp_values
+
(
*
compacted_size
),
update_values
+
interval_a
,
interval_size
);
//printf("DETAIL DEBUG 4: values[0:6] = %llu %llu %llu %llu %llu %llu\n", values[0], values[1], values[2], values[3], values[4], values[5]);
//printf("DETAIL DEBUG 4: update_values[0:6] = %llu %llu %llu %llu %llu %llu\n", update_values[0], update_values[1], update_values[2], update_values[3], update_values[4], update_values[5]);
//printf("DETAIL DEBUG 4: values[0:6] = %llu %llu %llu %llu %llu %llu\n", values[0], values[1], values[2], values[3], values[4], values[5]);
//printf("DETAIL DEBUG 4: update_values[0:6] = %llu %llu %llu %llu %llu %llu\n", update_values[0], update_values[1], update_values[2], update_values[3], update_values[4], update_values[5]);
// set SIZE_NONE for executed updates
memset_kernel
<
SIZE_TYPE
><<<
BLOCKS_NUM
,
THREADS_NUM
>>>
(
update_nodes
+
interval_a
,
SIZE_NONE
,
interval_size
);
...
...
@@ -772,9 +771,8 @@ __global__ void rebalancing_kernel(SIZE_TYPE unique_update_size, SIZE_TYPE seg_l
// warning: the silly original GPU coder: sub-func::keys is tmp_keys, and sub-func::values is tmp_values. which means, sort tmp_keys+tmp_values and copy to key+value.
cub_sort_key_value
(
tmp_keys
,
tmp_values
,
merge_size
,
key
,
value
);
//printf("DETAIL DEBUG 3: values[0:6] = %llu %llu %llu %llu %llu %llu\n", values[0], values[1], values[2], values[3], values[4], values[5]);
//printf("DETAIL DEBUG 3: tmp_values[0:6] = %llu %llu %llu %llu %llu %llu\n", tmp_values[0], tmp_values[1], tmp_values[2], tmp_values[3], tmp_values[4], tmp_values[5]);
//printf("DETAIL DEBUG 3: values[0:6] = %llu %llu %llu %llu %llu %llu\n", values[0], values[1], values[2], values[3], values[4], values[5]);
//printf("DETAIL DEBUG 3: tmp_values[0:6] = %llu %llu %llu %llu %llu %llu\n", tmp_values[0], tmp_values[1], tmp_values[2], tmp_values[3], tmp_values[4], tmp_values[5]);
// re-dispatch
BLOCKS_NUM
=
CALC_BLOCKS_NUM
(
THREADS_NUM
,
update_width
);
...
...
@@ -798,9 +796,9 @@ template <dev_type_t DEV>
void
rebalance_batch
(
SIZE_TYPE
level
,
SIZE_TYPE
seg_length
,
KEY_TYPE
*
keys
,
VALUE_TYPE
*
values
,
SIZE_TYPE
*
update_nodes
,
KEY_TYPE
*
update_keys
,
VALUE_TYPE
*
update_values
,
SIZE_TYPE
update_size
,
SIZE_TYPE
*
unique_update_nodes
,
SIZE_TYPE
*
update_offset
,
SIZE_TYPE
unique_update_size
,
SIZE_TYPE
lower_bound
,
SIZE_TYPE
upper_bound
,
SIZE_TYPE
*
row_offset
)
{
// TryInsert+ is this function.
SIZE_TYPE
update_width
=
seg_length
<<
level
;
// real seg_length of this level
if
(
false
&&
update_width
<=
1024
)
{
assert
(
IsPowerOfTwo
(
update_width
));
if
(
DEV
==
GPU
)
{
if
(
DEV
==
GPU
)
{
if
(
update_width
<=
1024
)
{
assert
(
IsPowerOfTwo
(
update_width
));
// func pointer for each template
decltype
(
&
block_rebalancing_kernel
<
1
,
1
>
)
func_arr
[
10
];
func_arr
[
0
]
=
block_rebalancing_kernel
<
2
,
1
>
;
...
...
@@ -821,33 +819,14 @@ void rebalance_batch(SIZE_TYPE level, SIZE_TYPE seg_length, KEY_TYPE *keys, VALU
DEBUG_PRINTFLN
(
"ARG-=DEBUG: block_rebalancing_kernel calling func_arr[{}]<<<{}, {}>>>, update_width={}"
,
fls
(
update_width
)
-
2
,
BLOCKS_NUM
,
THREADS_NUM
,
update_width
);
func_arr
[
fls
(
update_width
)
-
2
]
<<<
BLOCKS_NUM
,
THREADS_NUM
>>>
(
seg_length
,
level
,
keys
,
values
,
update_nodes
,
update_keys
,
update_values
,
unique_update_nodes
,
update_offset
,
lower_bound
,
upper_bound
,
row_offset
);
}
else
{
// func pointer for each template
decltype
(
&
block_rebalancing_cpu
<
1
>
)
func_arr
[
10
];
func_arr
[
0
]
=
block_rebalancing_cpu
<
2
>
;
func_arr
[
1
]
=
block_rebalancing_cpu
<
4
>
;
func_arr
[
2
]
=
block_rebalancing_cpu
<
8
>
;
func_arr
[
3
]
=
block_rebalancing_cpu
<
16
>
;
func_arr
[
4
]
=
block_rebalancing_cpu
<
32
>
;
func_arr
[
5
]
=
block_rebalancing_cpu
<
64
>
;
func_arr
[
6
]
=
block_rebalancing_cpu
<
128
>
;
func_arr
[
7
]
=
block_rebalancing_cpu
<
256
>
;
func_arr
[
8
]
=
block_rebalancing_cpu
<
512
>
;
func_arr
[
9
]
=
block_rebalancing_cpu
<
1024
>
;
DEBUG_PRINTFLN
(
"ARG-=DEBUG: CPU block_rebalancing_cpu calling func_arr[{}], update_width={}"
,
fls
(
update_width
)
-
2
,
update_width
);
func_arr
[
fls
(
update_width
)
-
2
](
unique_update_size
,
seg_length
,
level
,
keys
,
values
,
update_nodes
,
update_keys
,
update_values
,
unique_update_nodes
,
update_offset
,
lower_bound
,
upper_bound
,
row_offset
);
}
}
else
{
if
(
DEV
==
GPU
)
{
// operate each tree node by cub-kernel (dynamic parallelsim)
SIZE_TYPE
BLOCKS_NUM
=
min
(
2048
,
unique_update_size
);
DEBUG_PRINTFLN
(
"ARG-=DEBUG: rebalance_batch calling rebalancing_kernel<<<{}, {}>>>, update_width={}"
,
BLOCKS_NUM
,
1
,
update_width
);
rebalancing_kernel
<<<
BLOCKS_NUM
,
1
>>>
(
unique_update_size
,
seg_length
,
level
,
keys
,
values
,
update_nodes
,
update_keys
,
update_values
,
unique_update_nodes
,
update_offset
,
lower_bound
,
upper_bound
,
row_offset
);
}
else
{
DEBUG_PRINTFLN
(
"ARG-=DEBUG: CPU rebalance_batch calling rebalancing_impl_cpu, blocks={}, update_width={}"
,
unique_update_size
,
update_width
);
rebalancing_impl_cpu
(
unique_update_size
,
seg_length
,
level
,
keys
,
values
,
update_nodes
,
update_keys
,
update_values
,
unique_update_nodes
,
update_offset
,
lower_bound
,
upper_bound
,
row_offset
);
}
}
else
{
// DEV == CPU
DEBUG_PRINTFLN
(
"ARG-=DEBUG: CPU rebalance_batch calling rebalancing_impl_cpu, blocks={}, update_width={}"
,
unique_update_size
,
update_width
);
rebalancing_impl_cpu
(
unique_update_size
,
seg_length
,
level
,
keys
,
values
,
update_nodes
,
update_keys
,
update_values
,
unique_update_nodes
,
update_offset
,
lower_bound
,
upper_bound
,
row_offset
);
}
anySync
<
DEV
>
();
// after previous kernel launch
...
...
This diff is collapsed.
Click to expand it.
gpma_bfs.cuh
+
9
−
13
View file @
58e6e38e
...
...
@@ -253,16 +253,16 @@ __host__ void gpma_bfs(KEY_TYPE *keys, VALUE_TYPE *values, SIZE_TYPE *row_offset
anyMemset
<
DEV
>
(
results
,
0
,
sizeof
(
SIZE_TYPE
)
*
node_size
);
SIZE_TYPE
*
bitmap
;
anyMalloc
<
DEV
>
((
void
**
)
&
bitmap
,
sizeof
(
SIZE_TYPE
)
*
((
node_size
-
1
)
/
32
+
1
));
anyMalloc
<
DEV
>
((
void
**
)
&
bitmap
,
sizeof
(
SIZE_TYPE
)
*
((
node_size
-
1
)
/
32
+
1
));
anyMemset
<
DEV
>
(
bitmap
,
0
,
sizeof
(
SIZE_TYPE
)
*
((
node_size
-
1
)
/
32
+
1
));
SIZE_TYPE
*
node_queue
;
anyMalloc
<
DEV
>
((
void
**
)
&
node_queue
,
sizeof
(
SIZE_TYPE
)
*
node_size
);
anyMalloc
<
DEV
>
((
void
**
)
&
node_queue
,
sizeof
(
SIZE_TYPE
)
*
node_size
);
SIZE_TYPE
*
node_queue_offset
;
anyMalloc
<
DEV
>
((
void
**
)
&
node_queue_offset
,
sizeof
(
SIZE_TYPE
));
anyMalloc
<
DEV
>
((
void
**
)
&
node_queue_offset
,
sizeof
(
SIZE_TYPE
));
SIZE_TYPE
*
edge_queue
;
anyMalloc
<
DEV
>
((
void
**
)
&
edge_queue
,
sizeof
(
SIZE_TYPE
)
*
edge_size
);
anyMalloc
<
DEV
>
((
void
**
)
&
edge_queue
,
sizeof
(
SIZE_TYPE
)
*
edge_size
);
SIZE_TYPE
*
edge_queue_offset
;
anyMalloc
<
DEV
>
((
void
**
)
&
edge_queue_offset
,
sizeof
(
SIZE_TYPE
));
anyMalloc
<
DEV
>
((
void
**
)
&
edge_queue_offset
,
sizeof
(
SIZE_TYPE
));
// init
SIZE_TYPE
host_num
[
1
];
...
...
@@ -281,11 +281,9 @@ __host__ void gpma_bfs(KEY_TYPE *keys, VALUE_TYPE *values, SIZE_TYPE *row_offset
SIZE_TYPE
BLOCKS_NUM
=
CALC_BLOCKS_NUM
(
THREADS_NUM
,
host_num
[
0
]);
host_num
[
0
]
=
0
;
anyMemcpy
<
CPU
,
DEV
>
(
edge_queue_offset
,
host_num
,
sizeof
(
SIZE_TYPE
));
if
(
DEV
==
GPU
)
{
if
(
DEV
==
GPU
)
{
gpma_bfs_gather_kernel
<
THREADS_NUM
><<<
BLOCKS_NUM
,
THREADS_NUM
>>>
(
node_queue
,
node_queue_offset
,
edge_queue
,
edge_queue_offset
,
keys
,
values
,
row_offsets
);
}
else
{
}
else
{
}
// contract
...
...
@@ -294,11 +292,9 @@ __host__ void gpma_bfs(KEY_TYPE *keys, VALUE_TYPE *values, SIZE_TYPE *row_offset
anyMemcpy
<
DEV
,
CPU
>
(
host_num
,
edge_queue_offset
,
sizeof
(
SIZE_TYPE
));
BLOCKS_NUM
=
CALC_BLOCKS_NUM
(
THREADS_NUM
,
host_num
[
0
]);
if
(
DEV
==
GPU
)
{
if
(
DEV
==
GPU
)
{
gpma_bfs_contract_kernel
<
THREADS_NUM
><<<
BLOCKS_NUM
,
THREADS_NUM
>>>
(
edge_queue
,
edge_queue_offset
,
node_queue
,
node_queue_offset
,
level
,
results
,
bitmap
);
}
else
{
}
else
{
}
anyMemcpy
<
DEV
,
CPU
>
(
host_num
,
node_queue_offset
,
sizeof
(
SIZE_TYPE
));
...
...
This diff is collapsed.
Click to expand it.
gpma_bfs_demo.cu
+
21
−
11
View file @
58e6e38e
...
...
@@ -38,8 +38,11 @@ int main(int argc, char **argv) {
char
*
data_path
=
argv
[
1
];
int
bfs_start_node
=
std
::
atoi
(
argv
[
2
]);
// cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1024ll * 1024 * 1024);
// cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, 5);
#if CUDA_SM >= 60
// heap size limit is KNOWN to be required at SM_75(Tesla T4),SM_61(Tesla P4), and KNOWN to be forbidden at SM_50(GEForce 750).
cudaDeviceSetLimit
(
cudaLimitMallocHeapSize
,
1024ll
*
1024
*
1024
);
cudaDeviceSetLimit
(
cudaLimitDevRuntimeSyncDepth
,
5
);
#endif
thrust
::
host_vector
<
int
>
host_x
;
thrust
::
host_vector
<
int
>
host_y
;
...
...
@@ -53,15 +56,15 @@ int main(int argc, char **argv) {
h_base_keys
[
i
]
=
((
KEY_TYPE
)
host_x
[
i
]
<<
32
)
+
host_y
[
i
];
}
NATIVE_VEC_KEY
<
CPU
>
base_keys
=
h_base_keys
;
NATIVE_VEC_VALUE
<
CPU
>
base_values
(
half
,
1
);
NATIVE_VEC_KEY
<
TEST_DEV
>
base_keys
=
h_base_keys
;
NATIVE_VEC_VALUE
<
TEST_DEV
>
base_values
(
half
,
1
);
cudaDeviceSynchronize
();
int
num_slide
=
100
;
int
step
=
half
/
num_slide
;
LOG_TIME
(
"before init_csr_gpma"
)
GPMA
<
CPU
>
gpma
(
node_size
);
GPMA
<
TEST_DEV
>
gpma
(
node_size
);
cudaDeviceSynchronize
();
LOG_TIME
(
"before update_gpma 1"
)
...
...
@@ -69,9 +72,12 @@ int main(int argc, char **argv) {
thrust
::
device_vector
<
SIZE_TYPE
>
bfs_result
(
node_size
);
cudaDeviceSynchronize
();
LOG_TIME
(
"before first bfs"
)
{
auto
gpma_mirror
=
gpma
.
mirror
();
LOG_TIME
(
"before first bfs"
)
{
auto
gpma_mirror
=
gpma
#if TEST_DEV == CPU
.
mirror
()
#endif
;
gpma_bfs
<
GPU
>
(
RAW_PTR
(
gpma_mirror
.
keys
),
RAW_PTR
(
gpma_mirror
.
values
),
RAW_PTR
(
gpma_mirror
.
row_offset
),
node_size
,
edge_size
,
bfs_start_node
,
RAW_PTR
(
bfs_result
));
}
int
reach_nodes
=
node_size
-
thrust
::
count
(
bfs_result
.
begin
(),
bfs_result
.
end
(),
0
);
...
...
@@ -89,10 +95,10 @@ int main(int argc, char **argv) {
hk
[
j
+
step
]
=
((
KEY_TYPE
)
host_x
[
idx
]
<<
32
)
+
host_y
[
idx
];
}
NATIVE_VEC_VALUE
<
CPU
>
update_values
(
step
*
2
);
NATIVE_VEC_VALUE
<
TEST_DEV
>
update_values
(
step
*
2
);
thrust
::
fill
(
update_values
.
begin
(),
update_values
.
begin
()
+
step
,
1
);
thrust
::
fill
(
update_values
.
begin
()
+
step
,
update_values
.
end
(),
VALUE_NONE
);
NATIVE_VEC_KEY
<
CPU
>
update_keys
=
hk
;
NATIVE_VEC_KEY
<
TEST_DEV
>
update_keys
=
hk
;
cudaDeviceSynchronize
();
update_gpma
(
gpma
,
update_keys
,
update_values
);
...
...
@@ -102,7 +108,11 @@ int main(int argc, char **argv) {
LOG_TIME
(
"before second bfs"
)
{
auto
gpma_mirror
=
gpma
.
mirror
();
auto
gpma_mirror
=
gpma
#if TEST_DEV == CPU
.
mirror
()
#endif
;
gpma_bfs
<
GPU
>
(
RAW_PTR
(
gpma_mirror
.
keys
),
RAW_PTR
(
gpma_mirror
.
values
),
RAW_PTR
(
gpma_mirror
.
row_offset
),
node_size
,
edge_size
,
bfs_start_node
,
RAW_PTR
(
bfs_result
));
}
reach_nodes
=
node_size
-
thrust
::
count
(
bfs_result
.
begin
(),
bfs_result
.
end
(),
0
);
...
...
This diff is collapsed.
Click to expand it.
test.fish
0 → 100755
+
15
−
0
View file @
58e6e38e
#!/usr/bin/fish
function t
make TEST_DEV=$argv[1]
and ./gpma_bfs_demo /dataset/999999999.pokec.txt 0 | tee /dev/fd/2 2>| grep 1334630
set ret $status
test $ret = 0 ; and echo $argv[1] OK ; or echo $argv[1] FAILED
return $ret
end
t CPU
and t GPU
exit $status
This diff is collapsed.
Click to expand it.
utils.cuh
+
1
−
3
View file @
58e6e38e
...
...
@@ -87,7 +87,7 @@ __host__ __device__ void anyFree<GPU>(void *ptr) {
template
<
dev_type_t
DEV
>
void
anyMemset
(
void
*
dst
,
int
value
,
size_t
count
)
{
if
(
DEV
==
GPU
)
if
(
DEV
==
GPU
)
cErr
(
cudaMemset
(
dst
,
value
,
count
));
else
memset
(
dst
,
value
,
count
);
...
...
@@ -117,8 +117,6 @@ void anyRunLengthEncoding(const SIZE_TYPE *inputVec, SIZE_TYPE inputLen, SIZE_TY
}
else
{
*
outputLen
=
rlib
::
cpu_rle_simple
(
inputVec
,
inputLen
,
outputVec
,
outputLenVec
);
}
}
// Sometimes we need to call exsum from gpu code...
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment