particle_methods / hysop / Commits

Commit a023fbd9
Authored 8 years ago by Jean-Baptiste Keck
directional stretching autotuning ok
Parent: 83819765
No related branches, tags, or merge requests found.
Changes: 2 changed files (103 additions, 53 deletions)

  examples/advection_gpu.py       +4  −4
  hysop/gpu/kernel_autotuner.py   +99 −49
examples/advection_gpu.py  (+4 −4)

@@ -27,10 +27,10 @@ if __name__=='__main__':
     NSCALARS = 0
     DIM = 3
-    f_resolution = (33,33,33)[:DIM]
-    #f_resolution = (65,65,65)[:DIM]
-    #f_resolution = (129,129,129)[:DIM]
-    #f_resolution = (257,257,257)[:DIM]
+    #f_resolution = (33,33,33)[:DIM]
+    #f_resolution = (65,65,65)[:DIM]
+    f_resolution = (129,129,129)[:DIM]
+    #f_resolution = (257,257,257)[:DIM]
     #f_resolution = (513,513,257)[:DIM]
     #DIM = 2
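The resolution presets above rely on plain Python tuple slicing: (129,129,129)[:DIM] keeps the first DIM entries, so the same literal serves the 3D default and the commented-out DIM = 2 configuration. A minimal illustration, independent of HySoP:

    # Tuple slicing selects the per-axis resolution for the chosen dimension.
    DIM = 3
    f_resolution = (129, 129, 129)[:DIM]   # -> (129, 129, 129)
    DIM = 2
    f_resolution = (129, 129, 129)[:DIM]   # -> (129, 129)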
hysop/gpu/kernel_autotuner.py  (+99 −49)
@@ -201,11 +201,12 @@ class KernelAutotuner(object):
         self.enable_variable_workload = True
         if np.isscalar(max_workitem_workload):
             workloads = _compute_pows(max_workitem_workload)
-            self.workloads = itertools.product(workloads, repeat=3)
+            workloads = itertools.product(workloads, repeat=3)
         else:
             workloads = [_compute_pows(mww) for mww in max_workitem_workload]
-            self.workloads = itertools.product(*workloads)
+            workloads = itertools.product(*workloads)
+        self.workloads = [w for w in workloads]

     def register_extra_parameter(self, name, values):
         if self.extra_parameters is None:
             self.extra_parameters = {}
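The workload candidates are now materialized into a list instead of being left as a raw itertools.product iterator, presumably so that they can be traversed once per extra-parameter set in the tuning loop below (an iterator would be exhausted after the first pass). A minimal sketch of the candidate generation, assuming _compute_pows returns the powers of two up to its argument (that helper is not shown in this diff):

    import itertools
    import numpy as np

    def _compute_pows(n):
        # hypothetical stand-in for the helper used above
        return [2**k for k in range(int(np.log2(n)) + 1)]

    max_workitem_workload = 8
    pows = _compute_pows(max_workitem_workload)                 # [1, 2, 4, 8]
    workloads = [w for w in itertools.product(pows, repeat=3)]
    # 64 (x, y, z) workload candidates; unlike the bare iterator, the list can
    # be traversed again for every extra-parameter configuration.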
@@ -257,7 +258,9 @@ class KernelAutotuner(object):
         platform = typegen.platform
         device = typegen.device
         ctx = typegen.context
-        if self.autotuner_config.verbose:
+        verbose = self.autotuner_config.verbose
+        if verbose:
             print '== Kernel {} Autotuning =='.format(self.name)
             print 'platform: {}'.format(platform.name)
             print 'device: {}'.format(device.name)
@@ -280,20 +283,33 @@ class KernelAutotuner(object):
         best_global_size = None
         best_local_size = None
         best_stats = None
-        best_extra_params = {'is_cached': True}
+        best_extra_params = None
+        dump_cache = False
+
+        separator = '_'*100
+        indent = lambda i: '  '*i

         for extra_parameters in self.get_extra_parameters():
             if self.extra_parameters is None:
                 extra_parameters = {}
             else:
                 extra_parameters = dict(zip(self.extra_parameters.keys(), extra_parameters))
+            params_hash = hashlib.sha256(str(hash(frozenset(sorted(extra_parameters.items()))))).hexdigest()
+            params_hash = params_hash[:8]
+
+            best_params_workload = None
+            best_params_global_size = None
+            best_params_local_size = None
+            best_params_stats = None
+
+            if verbose:
+                print separator
+                print '::Current tuning parameters:: {}'.format(extra_parameters)
+
-            for workload in self.get_workloads(work_size):
+            workloads = self.get_workloads(work_size)
+            for workload in workloads:
                 workload = np.asarray(workload)
                 max_global_size = get_max_global_size(work_size, workload)
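Each combination of extra tuning parameters is now condensed into a short key (params_hash) before the workload loop starts. A standalone sketch of that hashing with made-up parameter values; note that it goes through the built-in hash() of a frozenset, which is stable under Python 2 (the print statements in this file are Python 2) but varies between runs on Python 3, where string hashing is randomized by default:

    import hashlib

    extra_parameters = {'nparticles': 2, 'use_atomics': True}   # hypothetical values
    params_hash = hashlib.sha256(str(hash(frozenset(sorted(extra_parameters.items()))))).hexdigest()
    params_hash = params_hash[:8]   # truncated to an 8-character hex key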
@@ -303,11 +319,10 @@ class KernelAutotuner(object):
                 best_workload_candidate = None
                 best_workload_ids = None
-                dump_cache = False

                 if verbose:
-                    print '_'*100
-                    print '::Current workload {}::'.format(workload)
-                    print ' -> global_size is set to {}'.format(max_global_size)
+                    print separator
+                    print indent(1)+'::Current workload {}::'.format(workload)
+                    print indent(2)+' -> global_size is set to {}'.format(max_global_size)

                 candidates = self.get_candidates(ctx=ctx, device=device,
                         max_global_size=max_global_size, **kargs)
@@ -321,8 +336,8 @@ class KernelAutotuner(object):
                     pruned_count = 0
                     unpruned_count = 0
-                    if self.autotuner_config.verbose:
-                        msg = '\nStep {} :: running {} candidates over {} runs:'
+                    if verbose:
+                        msg = '\n'+indent(2)+'Step {} :: running {} candidates over {} runs:'
                         print msg.format(step, candidates.shape[0], nruns)
                     for local_work_size in candidates:
@@ -338,31 +353,45 @@ class KernelAutotuner(object):
                                     extra_parameters=extra_parameters,
                                     **kargs)
                         except KernelGenerationError:
+                            pruned_count += 1
                             continue

                         global_size = np.asarray(global_size)
                         gwi = tuple(global_size)
+                        pms = params_hash

                         update = False
                         if (not self.autotuner_config.override_cache) \
                                 and src_hash in results.keys() \
-                                and ws in results[src_hash].keys() \
-                                and gwi in results[src_hash][ws].keys() \
-                                and lwi in results[src_hash][ws][gwi].keys():
-                            stat = self.results[config][src_hash][ws][gwi][lwi]
+                                and pms in results[src_hash].keys() \
+                                and ws in results[src_hash][pms].keys() \
+                                and gwi in results[src_hash][pms][ws].keys() \
+                                and lwi in results[src_hash][pms][ws][gwi].keys():
+                            stat = self.results[config][src_hash][pms][ws][gwi][lwi]
                             if stat.nruns >= nruns:
                                 if self.autotuner_config.verbose:
-                                    print '{} {} => {} (cached)'.format(gwi, lwi, stat)
+                                    print indent(3)+'{} {} => {} (cached)'.format(gwi, lwi, stat)
                                 unpruned_candidates[unpruned_count,:] = local_work_size
                                 unpruned_count += 1
                                 stats.append(stat)
                                 continue
                             else:
                                 update = True

+                        if (best_stats is not None):
+                            current_best_stats = best_stats
+                        elif (best_params_stats is not None):
+                            current_best_stats = best_params_stats
+                        elif (best_workload_stats is not None):
+                            current_best_stats = best_workload_stats
+                        else:
+                            current_best_stats = None
+
                         (stat, pruned) = self._bench_one(ctx, device, gwi, lwi, _kernel, _kernel_args, nruns,
-                                best_stats if (best_stats is not None) else best_workload_stats)
+                                current_best_stats)
                         if not pruned:
                             unpruned_candidates[unpruned_count,:] = local_work_size
                             unpruned_count += 1
@@ -372,17 +401,19 @@ class KernelAutotuner(object):
                             pruned_count += 1
                             status = 'pruned'
-                        if self.autotuner_config.verbose:
-                            print '{} {} => {} ({})'.format(gwi, lwi, stat, status)
+                        if verbose:
+                            print indent(3)+'{} {} => {} ({})'.format(gwi, lwi, stat, status)

                         if not pruned:
                             if src_hash not in results.keys():
                                 results[src_hash] = {}
-                            if ws not in results[src_hash].keys():
-                                results[src_hash][ws] = {}
-                            if gwi not in results[src_hash][ws].keys():
-                                results[src_hash][ws][gwi] = {}
-                            results[src_hash][ws][gwi][lwi] = stat
+                            if pms not in results[src_hash].keys():
+                                results[src_hash][pms] = {}
+                            if ws not in results[src_hash][pms].keys():
+                                results[src_hash][pms][ws] = {}
+                            if gwi not in results[src_hash][pms][ws].keys():
+                                results[src_hash][pms][ws][gwi] = {}
+                            results[src_hash][pms][ws][gwi][lwi] = stat
                             dump_cache = True

                     all_pruned = (pruned_count == candidates.shape[0])
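The autotuning cache gains one nesting level: benchmark statistics are now keyed by kernel source hash, then by the extra-parameter key pms, then by workload, global size and local size. A standalone sketch of that layout with placeholder keys, using dict.setdefault instead of the explicit membership tests above:

    results = {}

    src_hash = 'deadbeef'        # hash of the generated kernel source (placeholder)
    pms      = '83f0a9c1'        # params_hash of the extra-parameter set (placeholder)
    ws       = (4, 4, 1)         # per-work-item workload
    gwi      = (256, 256, 64)    # global work size
    lwi      = (32, 8, 1)        # local work size
    stat     = 1.234e-3          # stands in for the benchmark statistics object

    results.setdefault(src_hash, {}) \
           .setdefault(pms, {}) \
           .setdefault(ws, {}) \
           .setdefault(gwi, {})[lwi] = stat

    # Lookup mirrors the insertion order used in the diff:
    cached = results[src_hash][pms][ws][gwi][lwi]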
@@ -399,10 +430,10 @@ class KernelAutotuner(object):
                     step += 1

                 if all_pruned:
-                    if self.autotuner_config.verbose:
-                        print '_'*100
-                        print 'Workload {} winner for kernel {}:'.format(workload, self.name)
-                        print 'no winner (all candidates were prunes)'
+                    if verbose:
+                        print separator
+                        print indent(1)+'Workload {} winner for kernel {}:'.format(workload, self.name)
+                        print indent(2)+'no winner (all candidates were pruned)'
                     continue

                 if (candidates.shape[0] != 1 or len(best_workload_ids) != 1):
@@ -427,22 +458,41 @@ class KernelAutotuner(object):
                         extra_parameters=extra_parameters,
                         **kargs)

-                if self.autotuner_config.verbose:
-                    print '_'*100
-                    print 'Workload {} winner for kernel {}:'.format(workload, self.name)
-                    print '{} {} => {}'.format(best_workload_global_size, best_workload_local_size, best_workload_stats)
+                if verbose:
+                    print separator
+                    print indent(1)+'Workload {} winner for kernel {}:'.format(workload, self.name)
+                    print indent(2)+'{} {} => {}'.format(best_workload_global_size, best_workload_local_size, best_workload_stats)

-                if (best_stats is None) or (best_workload_stats < best_stats):
-                    best_workload = workload
-                    best_stats = best_workload_stats
-                    best_global_size = best_workload_global_size
-                    best_local_size = best_workload_local_size
+                if (best_params_stats is None) or (best_workload_stats < best_workload_stats):
+                    best_params_workload = workload
+                    best_params_stats = best_workload_stats
+                    best_params_global_size = best_workload_global_size
+                    best_params_local_size = best_workload_local_size
+
+            if verbose:
+                print separator
+                print 'Current parameters winner for kernel {}:'.format(self.name)
+                if (best_params_stats is None):
+                    print indent(1)+'no winner (all candidates were pruned)'
+                else:
+                    print indent(1)+'{} {} => {}'.format(best_params_global_size, best_params_local_size, best_params_stats)
+
+            if (best_params_stats is not None) and \
+                    ((best_stats is None) or (best_params_stats < best_stats)):
+                best_workload = best_params_workload
+                best_stats = best_params_stats
+                best_global_size = best_params_global_size
+                best_local_size = best_params_local_size
+                best_extra_params = extra_parameters

-        if self.autotuner_config.verbose:
-            print '_'*100
+        if verbose:
+            print separator
             print 'BEST OVERALL RESULT for kernel {}:'.format(self.name)
-            print '=> WL={} G={} L={} => {}'.format(best_workload, best_global_size, best_local_size, best_stats)
-            print '_'*100
+            print '=> WL={} G={} L={} => {}'.format(best_workload, best_global_size, best_local_size,
+                    best_stats)
+            print separator
             print

         return (best_global_size, best_local_size, best_stats, best_workload, best_extra_params)
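The winner selection now works at three levels: each workload updates a per-parameter-set winner (best_params_*), and each parameter set is then compared against the overall winner (best_*, returned at the end). Note that the committed per-workload comparison reads best_workload_stats < best_workload_stats, which can never hold, so only the first non-pruned workload of a parameter set updates best_params_*; the intended comparison is presumably against best_params_stats, which is what the sketch below uses (timings and parameter sets are illustrative only):

    # Three-level reduction: per-workload winner -> parameter-set winner -> overall winner.
    best_stats = None
    best_extra_params = None
    for extra_parameters in ({'p': 1}, {'p': 2}):                  # hypothetical parameter sets
        best_params_stats = None
        for best_workload_stats in (3.2e-3, 2.9e-3, 3.5e-3):       # hypothetical per-workload timings
            if (best_params_stats is None) or (best_workload_stats < best_params_stats):
                best_params_stats = best_workload_stats
        if (best_params_stats is not None) and \
                ((best_stats is None) or (best_params_stats < best_stats)):
            best_stats = best_params_stats
            best_extra_params = extra_parameters
    # best_stats == 2.9e-3 and best_extra_params == {'p': 1}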
@@ -523,7 +573,8 @@ class KernelAutotuner(object):
     #user available filters
     def ordering_filter(self, **kargs):
-        return lambda local_work_size: (local_work_size[2] <= local_work_size[1]) and (local_work_size[1] <= local_work_size[0])
+        return lambda local_work_size: (local_work_size[2] <= local_work_size[1]) \
+                                   and (local_work_size[1] <= local_work_size[0])
     def min_workitems_per_direction(self, min_local_size, **kargs):
         if np.isscalar(min_local_size):
             min_local_size = [min_local_size]
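These filters return predicates over candidate local work sizes: ordering_filter keeps only sizes whose extent does not increase from x to z, and the min/max work-item filters bound each direction. A small sketch of how such predicates prune a candidate list, with illustrative sizes chosen outside the autotuner:

    import numpy as np

    ordering = lambda lws: (lws[2] <= lws[1]) and (lws[1] <= lws[0])
    max_local_size = np.asarray((64, 8, 1))
    fits = lambda lws, **kargs: (np.asarray(lws) <= max_local_size).all()

    candidates = [(32, 8, 1), (8, 32, 1), (128, 4, 1)]
    kept = [lws for lws in candidates if ordering(lws) and fits(lws)]
    # kept == [(32, 8, 1)]: the second candidate violates the ordering,
    # the third exceeds max_local_size along x.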
@@ -539,4 +590,3 @@ class KernelAutotuner(object):
         max_local_size = np.asarray(max_local_size)
         return lambda local_work_size, **kargs: (local_work_size <= max_local_size).all()