Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
hysop
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
particle_methods
hysop
Commits
c4d249b1
Commit
c4d249b1
authored
12 years ago
by
Christophe Picard
Browse files
Options
Downloads
Patches
Plain Diff
Test Img2d
parent
799eba8d
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
parmepy/kernel_benchmark/test_advec2d.py
+211
-0
211 additions, 0 deletions
parmepy/kernel_benchmark/test_advec2d.py
parmepy/kernel_benchmark/test_advec_img2d_kernel.cl
+82
-0
82 additions, 0 deletions
parmepy/kernel_benchmark/test_advec_img2d_kernel.cl
with
293 additions
and
0 deletions
parmepy/kernel_benchmark/test_advec2d.py
0 → 100755
+
211
−
0
View file @
c4d249b1
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import
pyopencl
as
cl
import
numpy
as
np
import
pylab
as
pl
import
signal
nb
=
8
grid_dim
=
2
dtype_integer
=
np
.
uint32
dtype_real
=
np
.
float32
grid_shape
=
(
nb
,
nb
,
2
)
grid_size
=
1.
/
np
.
array
(
grid_shape
[
0
:
grid_dim
],
dtype
=
dtype_real
)
times
=
None
flops
=
None
bw
=
None
#Get platform.
platform
=
cl
.
get_platforms
()[
0
]
#Get device.
device
=
platform
.
get_devices
(
cl
.
device_type
.
GPU
)[
0
]
print
"
Running on
"
,
device
.
name
,
"
of
"
,
platform
.
name
,
"
platform.
"
#Creates GPU Context
ctx
=
cl
.
Context
([
device
])
#Create CommandQueue on the GPU Context
queue
=
cl
.
CommandQueue
(
ctx
,
properties
=
cl
.
command_queue_properties
.
PROFILING_ENABLE
)
def
main
(
sdir
,
nb_run
):
print
"
sDir
"
,
sdir
,
"
nbPart
"
,
nb
,
"
**3
"
# Creation des tableaux numpy
ppos
=
np
.
empty
((
nb
,
nb
),
dtype
=
dtype_real
)
pscal
=
np
.
empty
((
nb
,
nb
),
dtype
=
dtype_real
)
gvelo
=
np
.
empty
((
nb
,
nb
,
grid_dim
),
dtype
=
dtype_real
)
## Initialisation aléatoire des vitesses et scalaires
gvelo
=
np
.
random
.
random_sample
((
nb
,
nb
,
grid_dim
)).
astype
(
dtype_real
)
/
(
nb
*
10.
)
gscal
=
np
.
random
.
random_sample
((
nb
,
nb
)).
astype
(
dtype_real
)
# Création des buffer pour le kernel de base
ppos_buf
=
cl
.
Buffer
(
ctx
,
cl
.
mem_flags
.
WRITE_ONLY
,
size
=
ppos
.
nbytes
)
pscal_buf
=
cl
.
Buffer
(
ctx
,
cl
.
mem_flags
.
WRITE_ONLY
,
size
=
pscal
.
nbytes
)
gvelo_buf
=
cl
.
Buffer
(
ctx
,
cl
.
mem_flags
.
READ_ONLY
,
size
=
gvelo
.
nbytes
)
gscal_buf
=
cl
.
Buffer
(
ctx
,
cl
.
mem_flags
.
READ_ONLY
,
size
=
gscal
.
nbytes
)
# Création des images3d pour le nouveau kernel
## Image grid_img : R : vitesse.x
## G : vitesse.y
## B : vitesse.z
## A : scalaire
## Image part_img : R : position (dans la direction de splitting)
## G : -
## B : distance au point de grille de gauche
## A : scalaire
grid_rgba
=
np
.
zeros
((
nb
,
nb
,
4
),
dtype
=
dtype_real
)
grid_rgba
[...,
0
:
2
]
=
gvelo
grid_rgba
[...,
3
]
=
gscal
part_rgba
=
np
.
zeros
((
nb
,
nb
,
4
),
dtype
=
dtype_real
)
print
gvelo
.
shape
print
grid_rgba
.
shape
img_fmt
=
cl
.
ImageFormat
(
cl
.
channel_order
.
RGBA
,
cl
.
channel_type
.
FLOAT
)
grid_img
=
cl
.
Image
(
ctx
,
cl
.
mem_flags
.
READ_ONLY
,
img_fmt
,
(
nb
,
nb
))
part_img
=
cl
.
Image
(
ctx
,
cl
.
mem_flags
.
WRITE_ONLY
,
img_fmt
,
(
nb
,
nb
))
# Envoi des données pour les buffer (mémoire organisée pour un acces contigu)
offset
=
0
cl
.
enqueue_copy
(
queue
,
gvelo_buf
,
gvelo
[...,
0
].
ravel
(),
device_offset
=
np
.
long
(
offset
))
offset
+=
gvelo
[...,
0
].
nbytes
cl
.
enqueue_copy
(
queue
,
gvelo_buf
,
gvelo
[...,
1
].
swapaxes
(
1
,
0
).
ravel
(),
device_offset
=
np
.
long
(
offset
))
offset
+=
gvelo
[...,
1
].
nbytes
#cl.enqueue_copy(queue, gvelo_buf, gvelo[..., 2].swapaxes(1, 0).swapaxes(2, 0).ravel(), device_offset=np.long(offset))
cl
.
enqueue_copy
(
queue
,
gscal_buf
,
gscal
)
# Envoi des données sur l'image3d
cl
.
enqueue_copy
(
queue
,
grid_img
,
grid_rgba
,
origin
=
(
0
,
0
),
region
=
(
nb
,
nb
))
gpu_src
=
""
f
=
open
(
"
./utils.h
"
,
'
r
'
)
gpu_src
+=
""
.
join
(
f
.
readlines
())
f
.
close
()
f
=
open
(
"
./test_advec_base_kernel.cl
"
,
'
r
'
)
gpu_src
+=
""
.
join
(
f
.
readlines
())
f
.
close
()
f
=
open
(
"
./test_advec_img2d_kernel.cl
"
,
'
r
'
)
gpu_src
+=
""
.
join
(
f
.
readlines
())
f
.
close
()
print
cl
.
device_info
prg
=
cl
.
Program
(
ctx
,
gpu_src
).
build
(
"
-D DIM=
"
+
str
(
grid_dim
)
+
"
-D MAX_LINE_POINTS=
"
+
str
(
nb
))
## Run du kernel de base (buffers) (résultat de référence)
queue
.
finish
()
t
=
0.
for
i
in
xrange
(
nb_run
):
evt
=
prg
.
advection
(
queue
,
tuple
((
nb
,)),
None
,
gvelo_buf
,
ppos_buf
,
dtype_real
(
0.
),
dtype_real
(
1.
),
dtype_integer
(
sdir
),
dtype_integer
(
nb
),
np
.
array
([
1.
,
2.
,
3.
,
4.
],
dtype
=
dtype_real
),
np
.
array
([
1.
,
1.
,
1.
,
1.
],
dtype
=
dtype_real
),
np
.
array
([
nb
,
nb
,
nb
,
0
],
dtype
=
dtype_integer
),
np
.
array
([
1.
/
(
1.
*
nb
),
1.
/
(
1.
*
nb
),
1.
/
(
1.
*
nb
),
1.
/
(
1.
*
nb
)],
dtype
=
dtype_real
)
)
queue
.
finish
()
t
+=
(
evt
.
profile
.
end
-
evt
.
profile
.
start
)
*
1e-9
evt
=
cl
.
enqueue_copy
(
queue
,
pscal_buf
,
gscal_buf
)
queue
.
finish
()
t
+=
(
evt
.
profile
.
end
-
evt
.
profile
.
start
)
*
1e-9
time_base
=
t
/
(
nb_run
*
1.
)
times
[
sdir
][
0
].
append
(
time_base
)
flops
[
sdir
][
0
].
append
(
1e-9
*
(
20
*
nb
**
grid_dim
)
/
time_base
)
bw
[
sdir
][
0
].
append
(
1e-9
*
(
2
*
4
*
nb
**
grid_dim
)
/
time_base
)
print
"
base
\t
"
,
time_base
cl
.
enqueue_copy
(
queue
,
ppos
,
ppos_buf
)
cl
.
enqueue_copy
(
queue
,
pscal
,
pscal_buf
)
queue
.
finish
()
## Run kernel avec image3d
queue
.
finish
()
t
=
0.
print
"
================= Direction is
"
,
sdir
for
i
in
xrange
(
nb_run
):
evt
=
prg
.
advection_img_v2
(
queue
,
tuple
((
nb
,)),
None
,
grid_img
,
part_img
,
dtype_real
(
0.
),
dtype_real
(
1.
),
dtype_integer
(
sdir
),
dtype_integer
(
nb
),
np
.
array
([
1.
,
2.
,
3.
,
4.
],
dtype
=
dtype_real
),
np
.
array
([
1.
,
1.
,
1.
,
1.
],
dtype
=
dtype_real
),
np
.
array
([
nb
,
nb
,
nb
,
0
],
dtype
=
dtype_integer
),
np
.
array
([
1.
/
(
1.
*
nb
),
1.
/
(
1.
*
nb
),
1.
/
(
1.
*
nb
),
1.
/
(
1.
*
nb
)],
dtype
=
dtype_real
)
)
queue
.
finish
()
t
+=
(
evt
.
profile
.
end
-
evt
.
profile
.
start
)
*
1e-9
time_img
=
t
/
(
nb_run
*
1.
)
times
[
sdir
][
1
].
append
(
time_img
)
flops
[
sdir
][
1
].
append
(
1e-9
*
(
21
*
nb
**
grid_dim
)
/
time_img
)
bw
[
sdir
][
1
].
append
(
1e-9
*
(
8
*
4
*
nb
**
grid_dim
)
/
time_img
)
print
"
img
\t
"
,
time_img
,
'
\t
'
,
cl
.
enqueue_copy
(
queue
,
part_rgba
,
part_img
,
origin
=
(
0
,
0
),
region
=
(
nb
,
nb
))
queue
.
finish
()
## Comparaison des résultats pour la position des particules (part.R)
if
sdir
==
0
:
print
np
.
max
(
np
.
abs
(
ppos
-
part_rgba
[...,
0
])),
#print part_rgba[..., 0]
elif
sdir
==
1
:
print
np
.
max
(
np
.
abs
(
ppos
-
part_rgba
[...,
0
].
swapaxes
(
0
,
1
))),
#print part_rgba[..., 0].swapaxes(0, 1)
elif
sdir
==
2
:
print
np
.
max
(
np
.
abs
(
ppos
-
part_rgba
[...,
0
].
swapaxes
(
0
,
1
).
swapaxes
(
0
,
2
))),
#print part_rgba[..., 0].swapaxes(0, 1).swapaxes(0, 2)
## Comparaison des résultats pour le scalaire des particules (part.A)
print
np
.
max
(
np
.
abs
(
pscal
-
part_rgba
[...,
3
])),
print
'
gain :
'
,
time_base
/
time_img
queue
.
finish
()
if
__name__
==
"
__main__
"
:
nb_run
=
1
nb_kernel
=
2
sizes
=
[
128
,
196
]
t_sizes
=
np
.
array
(
sizes
)
**
2
times
=
[[[]
for
k
in
xrange
(
nb_kernel
)]
for
d
in
xrange
(
grid_dim
)]
flops
=
[[[]
for
k
in
xrange
(
nb_kernel
)]
for
d
in
xrange
(
grid_dim
)]
bw
=
[[[]
for
k
in
xrange
(
nb_kernel
)]
for
d
in
xrange
(
grid_dim
)]
# main(0, nb_run)
# main(1, nb_run)
# main(2, nb_run)
for
nb
in
sizes
:
print
"
==== {0}**3 ====
"
.
format
(
nb
)
for
d
in
xrange
(
grid_dim
):
print
"
-- sDIR =
"
,
d
main
(
d
,
nb_run
)
pl
.
figure
(
1
)
for
d
in
xrange
(
grid_dim
):
pl
.
plot
(
t_sizes
,
times
[
d
][
0
],
'
-o
'
,
label
=
"
base d=
"
+
str
(
d
))
for
d
in
xrange
(
grid_dim
):
pl
.
plot
(
t_sizes
,
times
[
d
][
1
],
'
-o
'
,
label
=
"
img d=
"
+
str
(
d
))
pl
.
legend
((
"
base d=0
"
,
"
base d=1
"
,
"
base d=2
"
,
"
img d=0
"
,
"
img d=1
"
,
"
img d=2
"
),
loc
=
'
center left
'
)
#pl.axis([0, 13000000, 0, 0.08])
pl
.
xlabel
(
"
Nb particles
"
)
pl
.
ylabel
(
"
time (s)
"
)
pl
.
title
(
"
Advection times
"
)
pl
.
xscale
(
'
log
'
)
pl
.
figure
(
2
)
for
d
in
xrange
(
grid_dim
):
pl
.
plot
(
t_sizes
,
flops
[
d
][
0
],
'
-o
'
,
label
=
"
base d=
"
+
str
(
d
))
for
d
in
xrange
(
grid_dim
):
pl
.
plot
(
t_sizes
,
flops
[
d
][
1
],
'
-o
'
,
label
=
"
img d=
"
+
str
(
d
))
pl
.
legend
((
"
base d=0
"
,
"
base d=1
"
,
"
base d=2
"
,
"
img d=0
"
,
"
img d=1
"
,
"
img d=2
"
),
loc
=
'
center left
'
)
#pl.axis([0, 13000000, 0, 5.0])
pl
.
xlabel
(
"
Nb particles
"
)
pl
.
ylabel
(
"
GFlops
"
)
pl
.
title
(
"
Advection flops
"
)
pl
.
xscale
(
'
log
'
)
pl
.
figure
(
3
)
for
d
in
xrange
(
grid_dim
):
pl
.
plot
(
t_sizes
,
bw
[
d
][
0
],
'
-o
'
,
label
=
"
base d=
"
+
str
(
d
))
for
d
in
xrange
(
grid_dim
):
pl
.
plot
(
t_sizes
,
bw
[
d
][
1
],
'
-o
'
,
label
=
"
img d=
"
+
str
(
d
))
pl
.
legend
((
"
base d=0
"
,
"
base d=1
"
,
"
base d=2
"
,
"
img d=0
"
,
"
img d=1
"
,
"
img d=2
"
),
loc
=
'
upper left
'
)
pl
.
xlabel
(
"
Nb particles
"
)
pl
.
ylabel
(
"
GB/s
"
)
pl
.
title
(
"
Advection bw
"
)
pl
.
xscale
(
'
log
'
)
signal
.
signal
(
signal
.
SIGINT
,
signal
.
SIG_DFL
)
pl
.
show
()
This diff is collapsed.
Click to expand it.
parmepy/kernel_benchmark/test_advec_img2d_kernel.cl
0 → 100644
+
82
−
0
View file @
c4d249b1
//
Integration
Kernel
(
RK2
method
)
__kernel
void
advection_img_v2
(
__read_only
image2d_t
grid,
__write_only
image2d_t
part,
__private
const
float
t
,
__private
const
float
dt,
__private
const
uint
dir,
__private
const
uint
line_nb_pts,
__private
const
float4
min_v,
__private
const
float4
length_v,
__private
const
uint4
nb,
__private
const
float4
size
)
{
const
sampler_t
sampler
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP_TO_EDGE |
CLK_FILTER_NEAREST
;
__private
int2
coord
;
__private
int
coord_t[2]
;
__private
float4
part_px
;
__private
float4
grid_px
;
__private
float
dx_dir,
min_dir
;
__private
int
nb_dir,
indice
;
__private
float
velo_dir
;
__private
size_t
p
;
__private
float2
buffer_loc[MAX_LINE_POINTS]__attribute__
((
aligned
))
,
buff
;
if
(
dir
==
0
)
{
dx_dir
=
size.x
;
nb_dir
=
nb.x
;
min_dir
=
min_v.x
;
coord_t[1]
=
get_global_id
(
0
)
;
for
(
p=0
;p<line_nb_pts;p++)
{
//lecture
vitesse
coord
=
(
int2
)(
coord_t[1],
p
)
;
grid_px
=
read_imagef
(
grid,
sampler,
coord
)
; //Read 1 pixel (4floats) //TODO lire directement grid_px[dir]
buffer_loc[p].x
=
grid_px.x
;
buffer_loc[p].y
=
grid_px.w
;
}
}
else
{
dx_dir
=
size.y
;
nb_dir
=
nb.y
;
min_dir
=
min_v.y
;
coord_t[1]
=
get_global_id
(
0
)
;
for
(
p=0
;p<line_nb_pts;p++)
{
//lecture
vitesse
coord
=
(
int2
)(
p,
coord_t[0]
)
;
grid_px
=
read_imagef
(
grid,
sampler,
coord
)
; //Read 1 pixel (4floats) //TODO lire directement grid_px[dir]
buffer_loc[p].x
=
grid_px.y
;
buffer_loc[p].y
=
grid_px.w
;
}
}
for
(
p=0
;p<line_nb_pts;p++)
{
buff
=
buffer_loc[p]
;
//Lecture
Vitesse
velo_dir
=
buff.x
;
//calcul
position
part_px.x
=
fma
(
velo_dir,
0.5f*dt,
p*dx_dir
)
; //4flop OpenCL : fma(a,b,c) = c+a*b
//calcul
indice
left
grid
point
indice
=
(
convert_int_rtn
(
part_px.x/dx_dir
)
+
nb_dir
)
%
nb_dir
; //1flop
//calcul
distance
(
particle
to
left
grid
point
)
part_px.z
=
fma
(
-indice,
dx_dir,
part_px.x
)
/dx_dir
; //4flop OpenCL : fma(a,b,c) = c+a*b
//interpolation
velo_dir
=
mix
(
buffer_loc[indice].x,
buffer_loc[
(
indice
+
1
)
%
nb_dir].x,
part_px.z
)
;// 3flop OpenCL mix(x,y,a)=x+(y-x)*a
//calcul
position
part_px.x
=
fma
(
velo_dir,
dt,
p*dx_dir
)
; //3flop OpenCL : fma(a,b,c) = c+a*b
//calcul
indice
left
grid
point
indice
=
(
convert_int_rtn
(
part_px.x/dx_dir
)
+
nb_dir
)
%
nb_dir
; //1flop
//calcul
distance
(
particle
to
left
grid
point
)
part_px.z
=
fma
(
-indice,
dx_dir,
part_px.x
)
/dx_dir
; //4flop OpenCL : fma(a,b,c) = c+a*b
//scalaire
part_px.w
=
buff.y
;
part_px.x
+=
min_dir
; // 1flop
coord_t[dir]
=
p
;
coord
=
(
int2
)(
coord_t[1],
coord_t[0]
)
;
write_imagef
(
part,
coord,
part_px
)
; //Write 1 pixel (4floats)
}
}
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment