Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
G
gpma_bfs
Manage
Activity
Members
Labels
Plan
Issues
2
Issue boards
Milestones
Wiki
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
This is an archived project. Repository and other project resources are read-only.
Show more breadcrumbs
Recolic
gpma_bfs
Commits
876686a3
There was an error fetching the commit references. Please try again later.
Commit
876686a3
authored
5 years ago
by
Recolic Keghart
Browse files
Options
Downloads
Patches
Plain Diff
fix cpu bug, but cpu result seems still incorrect, maybe value error
parent
4a91599e
No related branches found
No related tags found
1 merge request
!2
Bfs cpu
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
gpma.cuh
+23
-10
23 additions, 10 deletions
gpma.cuh
with
23 additions
and
10 deletions
gpma.cuh
+
23
−
10
View file @
876686a3
...
...
@@ -633,6 +633,7 @@ __forceinline__ __host__ __device__ void redispatch_loop_body(size_t i, KEY_TYPE
VALUE_TYPE
cur_value
=
tmp_values
[
i
];
keys
[
proj_location
]
=
cur_key
;
values
[
proj_location
]
=
cur_value
;
//printf("DETAIL DEBUG: i=%lu, KEYS[%d]<=%llu, values[%d]<=%lu\n", i, proj_location, cur_key, proj_location, cur_value);
// addition for csr
if
((
cur_key
&
COL_IDX_NONE
)
==
COL_IDX_NONE
)
{
...
...
@@ -654,7 +655,7 @@ void rebalancing_impl_cpu(SIZE_TYPE unique_update_size, SIZE_TYPE seg_length, SI
// COPY_PASTED from rebalancing_kernel BEGIN
SIZE_TYPE
update_width
=
seg_length
<<
level
;
#pragma omp parallel
//
#pragma omp parallel
{
// private variables.
SIZE_TYPE
*
compacted_size
;
...
...
@@ -669,7 +670,7 @@ void rebalancing_impl_cpu(SIZE_TYPE unique_update_size, SIZE_TYPE seg_length, SI
anyMalloc
<
CPU
>
((
void
**
)
&
tmp_exscan
,
update_width
*
sizeof
(
SIZE_TYPE
));
anyMalloc
<
CPU
>
((
void
**
)
&
tmp_label
,
update_width
*
sizeof
(
SIZE_TYPE
));
#pragma omp for schedule(dynamic, 8) // this loop is heavy...
//
#pragma omp for schedule(dynamic, 8) // this loop is heavy...
for
(
SIZE_TYPE
i
=
0
;
i
<
unique_update_size
;
++
i
)
{
SIZE_TYPE
update_node
=
unique_update_nodes
[
i
];
KEY_TYPE
*
key
=
keys
+
update_node
;
...
...
@@ -686,19 +687,24 @@ void rebalancing_impl_cpu(SIZE_TYPE unique_update_size, SIZE_TYPE seg_length, SI
if
(
lower_bound
<=
merge_size
&&
merge_size
<=
upper_bound
)
{
// move
memcpy
(
tmp_keys
+
(
*
compacted_size
),
update_keys
+
interval_a
,
interval_size
);
memcpy
(
tmp_values
+
(
*
compacted_size
),
update_values
+
interval_a
,
interval_size
);
memcpy
(
tmp_keys
+
(
*
compacted_size
),
update_keys
+
interval_a
,
interval_size
*
sizeof
(
*
update_keys
));
memcpy
(
tmp_values
+
(
*
compacted_size
),
update_values
+
interval_a
,
interval_size
*
sizeof
(
*
update_values
));
//printf("DETAIL DEBUG 4: values[0:6] = %llu %llu %llu %llu %llu %llu\n", values[0], values[1], values[2], values[3], values[4], values[5]);
//printf("DETAIL DEBUG 4: update_values[0:6] = %llu %llu %llu %llu %llu %llu\n", update_values[0], update_values[1], update_values[2], update_values[3], update_values[4], update_values[5]);
// set SIZE_NONE for executed updates
// std::fill(...); mXeXmXsXeXt(update_nodes + interval_a, SIZE_NONE, interval_size);
std
::
fill
(
update_nodes
+
interval_a
,
update_nodes
+
interval_a
+
interval_size
,
SIZE_NONE
);
thrust
::
sort_by_key
(
thrust
::
host
,
key
,
key
+
merge_size
,
value
);
thrust
::
sort_by_key
(
thrust
::
host
,
tmp_
key
s
,
tmp_
key
s
+
merge_size
,
tmp_
value
s
);
// In original cub_sort_by_key, tmp_key should be equal to key(sorted), tmp_value should equal to value(sorted).
// TODO: conflict|| std::copy(key, key + merge_size, tmp_key);
std
::
copy
(
tmp_keys
,
tmp_keys
+
merge_size
,
key
);
// TODO: thread conflict!
// TODO: conflict|| std::copy(value, value + merge_size, tmp_value);
std
::
copy
(
tmp_values
,
tmp_values
+
merge_size
,
value
);
//printf("DETAIL DEBUG 3: values[0:6] = %llu %llu %llu %llu %llu %llu\n", values[0], values[1], values[2], values[3], values[4], values[5]);
//printf("DETAIL DEBUG 3: tmp_values[0:6] = %llu %llu %llu %llu %llu %llu\n", tmp_values[0], tmp_values[1], tmp_values[2], tmp_values[3], tmp_values[4], tmp_values[5]);
// re-dispatch
// std::fill(); mXeXmXsXeXt(key, KEY_NONE, update_width);
std
::
fill
(
key
,
key
+
update_width
,
KEY_NONE
);
// redispatch_kernel<<<BLOCKS_NUM, THREADS_NUM>>>(tmp_keys, tmp_values, key, value, update_width, seg_length, merge_size, row_offset, update_node);
...
...
@@ -752,17 +758,23 @@ __global__ void rebalancing_kernel(SIZE_TYPE unique_update_size, SIZE_TYPE seg_l
if
(
lower_bound
<=
merge_size
&&
merge_size
<=
upper_bound
)
{
SIZE_TYPE
THREADS_NUM
=
32
;
SIZE_TYPE
BLOCKS_NUM
;
// move
BLOCKS_NUM
=
CALC_BLOCKS_NUM
(
THREADS_NUM
,
interval_size
);
memcpy_kernel
<
KEY_TYPE
><<<
BLOCKS_NUM
,
THREADS_NUM
>>>
(
tmp_keys
+
(
*
compacted_size
),
update_keys
+
interval_a
,
interval_size
);
memcpy_kernel
<
VALUE_TYPE
><<<
BLOCKS_NUM
,
THREADS_NUM
>>>
(
tmp_values
+
(
*
compacted_size
),
update_values
+
interval_a
,
interval_size
);
//printf("DETAIL DEBUG 4: values[0:6] = %llu %llu %llu %llu %llu %llu\n", values[0], values[1], values[2], values[3], values[4], values[5]);
//printf("DETAIL DEBUG 4: update_values[0:6] = %llu %llu %llu %llu %llu %llu\n", update_values[0], update_values[1], update_values[2], update_values[3], update_values[4], update_values[5]);
// set SIZE_NONE for executed updates
memset_kernel
<
SIZE_TYPE
><<<
BLOCKS_NUM
,
THREADS_NUM
>>>
(
update_nodes
+
interval_a
,
SIZE_NONE
,
interval_size
);
anySync
<
GPU
>
();
// Necessary here, since there's multiple CUDA stream.
// warning: the silly original GPU coder: sub-func::keys is tmp_keys, and sub-func::values is tmp_values. which means, sort tmp_keys+tmp_values and copy to key+value.
cub_sort_key_value
(
tmp_keys
,
tmp_values
,
merge_size
,
key
,
value
);
//printf("DETAIL DEBUG 3: values[0:6] = %llu %llu %llu %llu %llu %llu\n", values[0], values[1], values[2], values[3], values[4], values[5]);
//printf("DETAIL DEBUG 3: tmp_values[0:6] = %llu %llu %llu %llu %llu %llu\n", tmp_values[0], tmp_values[1], tmp_values[2], tmp_values[3], tmp_values[4], tmp_values[5]);
// re-dispatch
BLOCKS_NUM
=
CALC_BLOCKS_NUM
(
THREADS_NUM
,
update_width
);
...
...
@@ -786,7 +798,7 @@ template <dev_type_t DEV>
void
rebalance_batch
(
SIZE_TYPE
level
,
SIZE_TYPE
seg_length
,
KEY_TYPE
*
keys
,
VALUE_TYPE
*
values
,
SIZE_TYPE
*
update_nodes
,
KEY_TYPE
*
update_keys
,
VALUE_TYPE
*
update_values
,
SIZE_TYPE
update_size
,
SIZE_TYPE
*
unique_update_nodes
,
SIZE_TYPE
*
update_offset
,
SIZE_TYPE
unique_update_size
,
SIZE_TYPE
lower_bound
,
SIZE_TYPE
upper_bound
,
SIZE_TYPE
*
row_offset
)
{
// TryInsert+ is this function.
SIZE_TYPE
update_width
=
seg_length
<<
level
;
// real seg_length of this level
if
(
update_width
<=
1024
)
{
if
(
false
&&
update_width
<=
1024
)
{
assert
(
IsPowerOfTwo
(
update_width
));
if
(
DEV
==
GPU
)
{
// func pointer for each template
...
...
@@ -826,6 +838,7 @@ void rebalance_batch(SIZE_TYPE level, SIZE_TYPE seg_length, KEY_TYPE *keys, VALU
func_arr
[
fls
(
update_width
)
-
2
](
unique_update_size
,
seg_length
,
level
,
keys
,
values
,
update_nodes
,
update_keys
,
update_values
,
unique_update_nodes
,
update_offset
,
lower_bound
,
upper_bound
,
row_offset
);
}
}
else
{
if
(
DEV
==
GPU
)
{
// operate each tree node by cub-kernel (dynamic parallelsim)
SIZE_TYPE
BLOCKS_NUM
=
min
(
2048
,
unique_update_size
);
...
...
@@ -1021,7 +1034,7 @@ __host__ void update_gpma(GPMA<DEV> &gpma, NATIVE_VEC_KEY<DEV> &update_keys, NAT
SIZE_TYPE
upper_bound
=
gpma
.
upper_element
[
level
];
// re-balance
//rlib::println("REBAL ARGS: ",
level, gpma.segment_length,
rlib::printable_iter(gpma.keys), rlib::printable_iter(gpma.values),
rlib::printable_iter(update_nodes)
, rlib::printable_iter(update_keys), rlib::printable_iter(update_values), update_size, "|||", rlib::printable_iter(unique_update_nodes), "|", rlib::printable_iter(update_offset), "|", unique_node_size, lower_bound, upper_bound, rlib::printable_iter(gpma.row_offset));
//
rlib::println("REBAL ARGS: ", rlib::printable_iter(gpma.keys),
"|",
rlib::printable_iter(gpma.values),
"|"
, rlib::printable_iter(update_keys),
"|",
rlib::printable_iter(update_values), update_size, "|||", rlib::printable_iter(unique_update_nodes), "|", rlib::printable_iter(update_offset), "|", unique_node_size, lower_bound, upper_bound, rlib::printable_iter(gpma.row_offset));
rebalance_batch
<
DEV
>
(
level
,
gpma
.
segment_length
,
RAW_PTR
(
gpma
.
keys
),
RAW_PTR
(
gpma
.
values
),
RAW_PTR
(
update_nodes
),
RAW_PTR
(
update_keys
),
RAW_PTR
(
update_values
),
update_size
,
RAW_PTR
(
unique_update_nodes
),
RAW_PTR
(
update_offset
),
unique_node_size
,
lower_bound
,
upper_bound
,
RAW_PTR
(
gpma
.
row_offset
));
gpma
.
print_status
(
"IN 5, after REBAL"
);
...
...
This diff is collapsed.
Click to expand it.
Recolic
@root
mentioned in issue
#1 (closed)
·
5 years ago
mentioned in issue
#1 (closed)
mentioned in issue #1
Toggle commit list
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment