performance of vasp on BlueGene/P
Posted: Mon Nov 21, 2011 2:40 pm
Hi,
I compiled VASP 5.2.12 on a BlueGene/P (see the makefile below). I did not use ScaLAPACK because I read about problems with its use.
Now, I was doing some test calculations to test the performance of the machine, and I found that the same job running in Virtual Node mode (using all 4 cores of each processor) takes 4 times the wall-clock time of the same job running in Symmetric Multi-Processing mode (only one task per node). Is this expected? Is it possible to improve the performance of VN mode somehow (e.g. by using different libraries)? Does anyone have experience in setting the NPAR parameter for this kind of machine?
Maybe the answers to those questions are trivial, but I'm a newbie..
Thanks a lot for any help!!
Here is the makefile:
#-----------------------------------------------------------------------
# suffixes known to the suffix rules further down
#-----------------------------------------------------------------------
.SUFFIXES: .inc .f .F

#-----------------------------------------------------------------------
# extension of the Fortran files that the compile lines read
#-----------------------------------------------------------------------
SUFFIX = .f

#-----------------------------------------------------------------------
# Fortran compiler; the linker reuses the same driver and options
#-----------------------------------------------------------------------
FC  = mpixlf90_r -g -qfree=f90
FCL = $(FC)
#-----------------------------------------------------------------------
# C-preprocessor command line; define any of the flags given below
# MPI          generate parallel version
# NGZhalf      charge density reduced in Z direction
# wNGZhalf     gamma point only reduced in Z direction
# CACHE_SIZE   5001 for SP3 and Power 3
#              32768 for 550,590,3CT
#              8001 595/397 quad word systems
# scaLAPACK    use scaLAPACK
#
# CPP must stay a recursively expanded variable (=, not :=): $* only has
# a value while a recipe runs. The output name is built from $(SUFFIX)
# so that it always matches the file the compile lines read.
#-----------------------------------------------------------------------
# Add -DwNGZhalf for gamma point only
CPP = /usr/bin/cpp -P -C -DHOST=\"BlueGene\" -DMPI -DNGZhalf \
      -Duse_collective -Davoidalloc \
      -DCACHE_SIZE=4000 -DMPI_BLOCK=50000 $*.F >$*$(SUFFIX)
#-----------------------------------------------------------------------
# flags handed to every Fortran compile line
#-----------------------------------------------------------------------
FFLAGS = -qarch=450d -qtune=450 -qmaxmem=-1 -qsource

#-----------------------------------------------------------------------
# optimization levels and related knobs
#   OFLAG       optimization used by the generic compile rules
#   OFLAG_HIGH  optimization for the objects named in OBJ_HIGH
#   OBJ_HIGH    objects compiled with OFLAG_HIGH (set to "none")
#   OBJ_NOOPT   objects compiled without an optimization flag (set to "none")
#   DEBUG       flags used for main.o and makeparam.o
#   INCS        extra include options, empty here
#   INLINE      flags used for xcgrad.o and xcspin.o
#-----------------------------------------------------------------------
OFLAG      = -O3 -qstrict
OFLAG_HIGH = $(OFLAG)
OBJ_HIGH   = none
OBJ_NOOPT  = none
DEBUG      = -g
INCS       =
INLINE     = $(OFLAG)
#-----------------------------------------------------------------------
# libraries and search paths used at link time
#-----------------------------------------------------------------------
LIBLOC = /usr/local/lib
ESSL   = -L/bgsys/ibm_essl/sles10/prod/opt/ibmmath/essl/4.4/lib -lesslbg
LIB    = -L../../VASP5.2/vasp.5.lib -ldmy -L$(LIBLOC) -llapack_bgp $(ESSL)

#-----------------------------------------------------------------------
# objects providing the 3d-fft used with VASP
#-----------------------------------------------------------------------
FFT3D = fftmpi.o fftmpi_map.o fft3dfurth.o fft3dlib.o
#-----------------------------------------------------------------------
# general rules and compile lines
#
# BASIC  : objects pulled into SOURCE through $(BASIC)
# SOURCE : objects consumed by the vasp.bgp, makeparam and kpoints
#          link lines
# INC    : extra prerequisites of the link targets, empty here
# NOTE(review): the order of the list is presumably significant
# (Fortran module dependencies) - confirm before reordering.
#-----------------------------------------------------------------------
BASIC= symmetry.o symlib.o lattlib.o random.o
SOURCE= base.o mpi.o smart_allocate.o xml.o \
constant.o jacobi.o main_mpi.o scala.o \
asa.o lattice.o poscar.o ini.o mgrid.o xclib.o vdw_nl.o xclib_grad.o \
radial.o pseudo.o gridq.o ebs.o \
mkpoints.o wave.o wave_mpi.o wave_high.o \
$(BASIC) nonl.o nonlr.o nonl_high.o dfast.o choleski2.o \
mix.o hamil.o xcgrad.o xcspin.o potex1.o potex2.o \
constrmag.o cl_shift.o relativistic.o LDApU.o \
paw_base.o metagga.o egrad.o pawsym.o pawfock.o pawlhf.o rhfatm.o paw.o \
mkpoints_full.o charge.o Lebedev-Laikov.o stockholder.o dipol.o pot.o \
dos.o elf.o tet.o tetweight.o hamil_rot.o \
steep.o chain.o dyna.o sphpro.o us.o core_rel.o \
aedens.o wavpre.o wavpre_noio.o broyden.o \
dynbr.o rmm-diis.o reader.o writer.o tutor.o xml_writer.o \
brent.o stufak.o fileio.o opergrid.o stepver.o \
chgloc.o fast_aug.o fock.o mkpoints_change.o sym_grad.o \
mymath.o internals.o dynconstr.o dimer_heyden.o dvvtrajectory.o vdwforcefield.o \
hamil_high.o nmr.o pead.o mlwf.o subrot.o subrot_scf.o \
force.o pwlhf.o gw_model.o optreal.o davidson.o david_inner.o \
electron.o rot.o electron_all.o shm.o pardens.o paircorrection.o \
optics.o constr_cell_relax.o stm.o finite_diff.o elpol.o \
hamil_lr.o rmm-diis_lr.o subrot_cluster.o subrot_lr.o \
lr_helper.o hamil_lrf.o elinear_response.o ilinear_response.o \
linear_optics.o linear_response.o \
setlocalpp.o wannier.o electron_OEP.o electron_lhf.o twoelectron4o.o \
ratpol.o screened_2e.o wave_cacher.o chi_base.o wpot.o local_field.o \
ump2.o bse_te.o bse.o acfdt.o chi.o sydmat.o dmft.o \
rmm-diis_mlr.o linear_response_NMR.o
INC=
# Link the main executable from main.o, $(SOURCE) and $(FFT3D).
# Recipe lines must begin with a TAB; without it make stops with
# "missing separator". The old binary is removed before relinking.
# NOTE(review): LINK is not assigned in the part of the makefile shown
# here, so it presumably expands to nothing.
vasp.bgp: $(SOURCE) $(FFT3D) $(INC) main.o
	rm -f $@
	$(FCL) -o $@ main.o $(SOURCE) $(FFT3D) $(LIB) $(LINK)
# Additional executables, each linked with $(FCL) against $(LIB).
# Recipe lines must begin with a TAB; without it make stops with
# "missing separator". $@ is the name of the target being linked.
makeparam: $(SOURCE) $(FFT3D) makeparam.o main.F $(INC)
	$(FCL) -o $@ $(LINK) makeparam.o $(SOURCE) $(FFT3D) $(LIB)

zgemmtest: zgemmtest.o base.o random.o $(INC)
	$(FCL) -o $@ $(LINK) zgemmtest.o random.o base.o $(LIB)

dgemmtest: dgemmtest.o base.o random.o $(INC)
	$(FCL) -o $@ $(LINK) dgemmtest.o random.o base.o $(LIB)

ffttest: base.o smart_allocate.o mpi.o mgrid.o random.o ffttest.o $(FFT3D) $(INC)
	$(FCL) -o $@ $(LINK) ffttest.o mpi.o mgrid.o random.o smart_allocate.o base.o $(FFT3D) $(LIB)

kpoints: $(SOURCE) $(FFT3D) makekpoints.o main.F $(INC)
	$(FCL) -o $@ $(LINK) makekpoints.o $(SOURCE) $(FFT3D) $(LIB)
# Remove generated files, then touch every .F source.
# clean does not name a file: it is declared phony so that a stray file
# called "clean" cannot make the target look up to date.
# The leading "-" keeps make going even if the command line fails.
.PHONY: clean
clean:
	-rm -f *.g *.f *.o *.L *.mod ; touch *.F
# Objects with dedicated compile lines.
# main.o and makeparam.o are compiled with $(DEBUG) instead of $(OFLAG);
# xcgrad.o and xcspin.o are compiled with $(INLINE).
# $(FFLAGS) and $(DEBUG) are separated by a blank: glued together, the
# last FFLAGS option and the first DEBUG option merge into one bogus
# word ("-qsource-g" with the values assigned in this makefile).
# Recipe lines must begin with a TAB.
main.o: main$(SUFFIX)
	$(FC) $(FFLAGS) $(DEBUG) $(INCS) -c main$(SUFFIX)
xcgrad.o: xcgrad$(SUFFIX)
	$(FC) $(FFLAGS) $(INLINE) $(INCS) -c xcgrad$(SUFFIX)
xcspin.o: xcspin$(SUFFIX)
	$(FC) $(FFLAGS) $(INLINE) $(INCS) -c xcspin$(SUFFIX)
makeparam.o: makeparam$(SUFFIX)
	$(FC) $(FFLAGS) $(DEBUG) $(INCS) -c makeparam$(SUFFIX)
# no recipe here: only extra prerequisites for the preprocessed file
makeparam$(SUFFIX): makeparam.F main.F
#-----------------------------------------------------------------------
# extra prerequisites: the listed objects are rebuilt when the named
# .inc file or .F source changes (no recipes here)
#-----------------------------------------------------------------------
base.o:     base.inc     base.F
mgrid.o:    mgrid.inc    mgrid.F
constant.o: constant.inc constant.F
lattice.o:  lattice.inc  lattice.F
setex.o:    setexm.inc   setex.F
pseudo.o:   pseudo.inc   pseudo.F
poscar.o:   poscar.inc   poscar.F
mkpoints.o: mkpoints.inc mkpoints.F
wave.o:     wave.inc     wave.F
nonl.o:     nonl.inc     nonl.F
nonlr.o:    nonlr.inc    nonlr.F
# Per-object overrides of the generic compile rule: preprocess with
# $(CPP), then compile the preprocessed $*$(SUFFIX) file.
# Recipe lines must begin with a TAB; without it make stops with
# "missing separator".
# NOTE(review): OBJ_HIGH and OBJ_NOOPT are both set to "none" in this
# makefile, so the first two rules name the same target; make is
# expected to warn about an overridden recipe and use the second one.
$(OBJ_HIGH):
	$(CPP)
	$(FC) $(FFLAGS) $(OFLAG_HIGH) $(INCS) -c $*$(SUFFIX)
$(OBJ_NOOPT):
	$(CPP)
	$(FC) $(FFLAGS) $(INCS) -c $*$(SUFFIX)
# NOTE(review): F77 and FFLAGS_F77 are not assigned in the part of the
# makefile shown here, and fft3dlib_f77.o is not listed in FFT3D.
fft3dlib_f77.o: fft3dlib_f77.F
	$(CPP)
	$(F77) $(FFLAGS_F77) -c $*$(SUFFIX)
# Generic suffix rules.
#   .F -> .o          preprocess with $(CPP), then compile with $(OFLAG)
#   .F -> $(SUFFIX)   preprocess only
#   $(SUFFIX) -> .o   compile an already preprocessed file
# Recipe lines must begin with a TAB; without it make stops with
# "missing separator".
.F.o:
	$(CPP)
	$(FC) $(FFLAGS) $(OFLAG) $(INCS) -c $*$(SUFFIX)
.F$(SUFFIX):
	$(CPP)
$(SUFFIX).o:
	$(FC) $(FFLAGS) $(OFLAG) $(INCS) -c $*$(SUFFIX)
#-----------------------------------------------------------------------
# special rules
# These objects do not use $(OFLAG): each one is preprocessed with
# $(CPP) and compiled with the explicit -O level given on its line.
# Recipe lines must begin with a TAB; without it make stops with
# "missing separator".
#-----------------------------------------------------------------------
radial.o: radial.F
	$(CPP)
	$(FC) $(FFLAGS) $(INCS) -O2 -c $*$(SUFFIX)
wave.o: wave.F
	$(CPP)
	$(FC) $(FFLAGS) $(INCS) -O2 -c $*$(SUFFIX)
metagga.o: metagga.F
	$(CPP)
	$(FC) $(FFLAGS) $(INCS) -O2 -c $*$(SUFFIX)
nonl.o: nonl.F
	$(CPP)
	$(FC) $(FFLAGS) $(INCS) -O -c $*$(SUFFIX)
paw.o: paw.F
	$(CPP)
	$(FC) $(FFLAGS) $(INCS) -O1 -c $*$(SUFFIX)
pseudo.o: pseudo.F
	$(CPP)
	$(FC) $(FFLAGS) $(INCS) -O1 -c $*$(SUFFIX)
I compiled VASP 5.2.12 on a BlueGene/P (see the makefile below). I did not use ScaLAPACK because I read about problems with its use.
Now, I was doing some test calculations to test the performance of the machine, and I found that the same job running in Virtual Node mode (using all 4 cores of each processor) takes 4 times the wall-clock time of the same job running in Symmetric Multi-Processing mode (only one task per node). Is this expected? Is it possible to improve the performance of VN mode somehow (e.g. by using different libraries)? Does anyone have experience in setting the NPAR parameter for this kind of machine?
Maybe the answers to those questions are trivial, but I'm a newbie..
Thanks a lot for any help!!
Here is the makefile:
#-----------------------------------------------------------------------
# suffixes known to the suffix rules further down
#-----------------------------------------------------------------------
.SUFFIXES: .inc .f .F

#-----------------------------------------------------------------------
# extension of the Fortran files that the compile lines read
#-----------------------------------------------------------------------
SUFFIX = .f

#-----------------------------------------------------------------------
# Fortran compiler; the linker reuses the same driver and options
#-----------------------------------------------------------------------
FC  = mpixlf90_r -g -qfree=f90
FCL = $(FC)
#-----------------------------------------------------------------------
# C-preprocessor command line; define any of the flags given below
# MPI          generate parallel version
# NGZhalf      charge density reduced in Z direction
# wNGZhalf     gamma point only reduced in Z direction
# CACHE_SIZE   5001 for SP3 and Power 3
#              32768 for 550,590,3CT
#              8001 595/397 quad word systems
# scaLAPACK    use scaLAPACK
#
# CPP must stay a recursively expanded variable (=, not :=): $* only has
# a value while a recipe runs. The output name is built from $(SUFFIX)
# so that it always matches the file the compile lines read.
#-----------------------------------------------------------------------
# Add -DwNGZhalf for gamma point only
CPP = /usr/bin/cpp -P -C -DHOST=\"BlueGene\" -DMPI -DNGZhalf \
      -Duse_collective -Davoidalloc \
      -DCACHE_SIZE=4000 -DMPI_BLOCK=50000 $*.F >$*$(SUFFIX)
#-----------------------------------------------------------------------
# flags handed to every Fortran compile line
#-----------------------------------------------------------------------
FFLAGS = -qarch=450d -qtune=450 -qmaxmem=-1 -qsource

#-----------------------------------------------------------------------
# optimization levels and related knobs
#   OFLAG       optimization used by the generic compile rules
#   OFLAG_HIGH  optimization for the objects named in OBJ_HIGH
#   OBJ_HIGH    objects compiled with OFLAG_HIGH (set to "none")
#   OBJ_NOOPT   objects compiled without an optimization flag (set to "none")
#   DEBUG       flags used for main.o and makeparam.o
#   INCS        extra include options, empty here
#   INLINE      flags used for xcgrad.o and xcspin.o
#-----------------------------------------------------------------------
OFLAG      = -O3 -qstrict
OFLAG_HIGH = $(OFLAG)
OBJ_HIGH   = none
OBJ_NOOPT  = none
DEBUG      = -g
INCS       =
INLINE     = $(OFLAG)
#-----------------------------------------------------------------------
# libraries and search paths used at link time
#-----------------------------------------------------------------------
LIBLOC = /usr/local/lib
ESSL   = -L/bgsys/ibm_essl/sles10/prod/opt/ibmmath/essl/4.4/lib -lesslbg
LIB    = -L../../VASP5.2/vasp.5.lib -ldmy -L$(LIBLOC) -llapack_bgp $(ESSL)

#-----------------------------------------------------------------------
# objects providing the 3d-fft used with VASP
#-----------------------------------------------------------------------
FFT3D = fftmpi.o fftmpi_map.o fft3dfurth.o fft3dlib.o
#-----------------------------------------------------------------------
# general rules and compile lines
#
# BASIC  : objects pulled into SOURCE through $(BASIC)
# SOURCE : objects consumed by the vasp.bgp, makeparam and kpoints
#          link lines
# INC    : extra prerequisites of the link targets, empty here
# NOTE(review): the order of the list is presumably significant
# (Fortran module dependencies) - confirm before reordering.
#-----------------------------------------------------------------------
BASIC= symmetry.o symlib.o lattlib.o random.o
SOURCE= base.o mpi.o smart_allocate.o xml.o \
constant.o jacobi.o main_mpi.o scala.o \
asa.o lattice.o poscar.o ini.o mgrid.o xclib.o vdw_nl.o xclib_grad.o \
radial.o pseudo.o gridq.o ebs.o \
mkpoints.o wave.o wave_mpi.o wave_high.o \
$(BASIC) nonl.o nonlr.o nonl_high.o dfast.o choleski2.o \
mix.o hamil.o xcgrad.o xcspin.o potex1.o potex2.o \
constrmag.o cl_shift.o relativistic.o LDApU.o \
paw_base.o metagga.o egrad.o pawsym.o pawfock.o pawlhf.o rhfatm.o paw.o \
mkpoints_full.o charge.o Lebedev-Laikov.o stockholder.o dipol.o pot.o \
dos.o elf.o tet.o tetweight.o hamil_rot.o \
steep.o chain.o dyna.o sphpro.o us.o core_rel.o \
aedens.o wavpre.o wavpre_noio.o broyden.o \
dynbr.o rmm-diis.o reader.o writer.o tutor.o xml_writer.o \
brent.o stufak.o fileio.o opergrid.o stepver.o \
chgloc.o fast_aug.o fock.o mkpoints_change.o sym_grad.o \
mymath.o internals.o dynconstr.o dimer_heyden.o dvvtrajectory.o vdwforcefield.o \
hamil_high.o nmr.o pead.o mlwf.o subrot.o subrot_scf.o \
force.o pwlhf.o gw_model.o optreal.o davidson.o david_inner.o \
electron.o rot.o electron_all.o shm.o pardens.o paircorrection.o \
optics.o constr_cell_relax.o stm.o finite_diff.o elpol.o \
hamil_lr.o rmm-diis_lr.o subrot_cluster.o subrot_lr.o \
lr_helper.o hamil_lrf.o elinear_response.o ilinear_response.o \
linear_optics.o linear_response.o \
setlocalpp.o wannier.o electron_OEP.o electron_lhf.o twoelectron4o.o \
ratpol.o screened_2e.o wave_cacher.o chi_base.o wpot.o local_field.o \
ump2.o bse_te.o bse.o acfdt.o chi.o sydmat.o dmft.o \
rmm-diis_mlr.o linear_response_NMR.o
INC=
# Link the main executable from main.o, $(SOURCE) and $(FFT3D).
# Recipe lines must begin with a TAB; without it make stops with
# "missing separator". The old binary is removed before relinking.
# NOTE(review): LINK is not assigned in the part of the makefile shown
# here, so it presumably expands to nothing.
vasp.bgp: $(SOURCE) $(FFT3D) $(INC) main.o
	rm -f $@
	$(FCL) -o $@ main.o $(SOURCE) $(FFT3D) $(LIB) $(LINK)
# Additional executables, each linked with $(FCL) against $(LIB).
# Recipe lines must begin with a TAB; without it make stops with
# "missing separator". $@ is the name of the target being linked.
makeparam: $(SOURCE) $(FFT3D) makeparam.o main.F $(INC)
	$(FCL) -o $@ $(LINK) makeparam.o $(SOURCE) $(FFT3D) $(LIB)

zgemmtest: zgemmtest.o base.o random.o $(INC)
	$(FCL) -o $@ $(LINK) zgemmtest.o random.o base.o $(LIB)

dgemmtest: dgemmtest.o base.o random.o $(INC)
	$(FCL) -o $@ $(LINK) dgemmtest.o random.o base.o $(LIB)

ffttest: base.o smart_allocate.o mpi.o mgrid.o random.o ffttest.o $(FFT3D) $(INC)
	$(FCL) -o $@ $(LINK) ffttest.o mpi.o mgrid.o random.o smart_allocate.o base.o $(FFT3D) $(LIB)

kpoints: $(SOURCE) $(FFT3D) makekpoints.o main.F $(INC)
	$(FCL) -o $@ $(LINK) makekpoints.o $(SOURCE) $(FFT3D) $(LIB)
# Remove generated files, then touch every .F source.
# clean does not name a file: it is declared phony so that a stray file
# called "clean" cannot make the target look up to date.
# The leading "-" keeps make going even if the command line fails.
.PHONY: clean
clean:
	-rm -f *.g *.f *.o *.L *.mod ; touch *.F
# Objects with dedicated compile lines.
# main.o and makeparam.o are compiled with $(DEBUG) instead of $(OFLAG);
# xcgrad.o and xcspin.o are compiled with $(INLINE).
# $(FFLAGS) and $(DEBUG) are separated by a blank: glued together, the
# last FFLAGS option and the first DEBUG option merge into one bogus
# word ("-qsource-g" with the values assigned in this makefile).
# Recipe lines must begin with a TAB.
main.o: main$(SUFFIX)
	$(FC) $(FFLAGS) $(DEBUG) $(INCS) -c main$(SUFFIX)
xcgrad.o: xcgrad$(SUFFIX)
	$(FC) $(FFLAGS) $(INLINE) $(INCS) -c xcgrad$(SUFFIX)
xcspin.o: xcspin$(SUFFIX)
	$(FC) $(FFLAGS) $(INLINE) $(INCS) -c xcspin$(SUFFIX)
makeparam.o: makeparam$(SUFFIX)
	$(FC) $(FFLAGS) $(DEBUG) $(INCS) -c makeparam$(SUFFIX)
# no recipe here: only extra prerequisites for the preprocessed file
makeparam$(SUFFIX): makeparam.F main.F
#-----------------------------------------------------------------------
# extra prerequisites: the listed objects are rebuilt when the named
# .inc file or .F source changes (no recipes here)
#-----------------------------------------------------------------------
base.o:     base.inc     base.F
mgrid.o:    mgrid.inc    mgrid.F
constant.o: constant.inc constant.F
lattice.o:  lattice.inc  lattice.F
setex.o:    setexm.inc   setex.F
pseudo.o:   pseudo.inc   pseudo.F
poscar.o:   poscar.inc   poscar.F
mkpoints.o: mkpoints.inc mkpoints.F
wave.o:     wave.inc     wave.F
nonl.o:     nonl.inc     nonl.F
nonlr.o:    nonlr.inc    nonlr.F
# Per-object overrides of the generic compile rule: preprocess with
# $(CPP), then compile the preprocessed $*$(SUFFIX) file.
# Recipe lines must begin with a TAB; without it make stops with
# "missing separator".
# NOTE(review): OBJ_HIGH and OBJ_NOOPT are both set to "none" in this
# makefile, so the first two rules name the same target; make is
# expected to warn about an overridden recipe and use the second one.
$(OBJ_HIGH):
	$(CPP)
	$(FC) $(FFLAGS) $(OFLAG_HIGH) $(INCS) -c $*$(SUFFIX)
$(OBJ_NOOPT):
	$(CPP)
	$(FC) $(FFLAGS) $(INCS) -c $*$(SUFFIX)
# NOTE(review): F77 and FFLAGS_F77 are not assigned in the part of the
# makefile shown here, and fft3dlib_f77.o is not listed in FFT3D.
fft3dlib_f77.o: fft3dlib_f77.F
	$(CPP)
	$(F77) $(FFLAGS_F77) -c $*$(SUFFIX)
# Generic suffix rules.
#   .F -> .o          preprocess with $(CPP), then compile with $(OFLAG)
#   .F -> $(SUFFIX)   preprocess only
#   $(SUFFIX) -> .o   compile an already preprocessed file
# Recipe lines must begin with a TAB; without it make stops with
# "missing separator".
.F.o:
	$(CPP)
	$(FC) $(FFLAGS) $(OFLAG) $(INCS) -c $*$(SUFFIX)
.F$(SUFFIX):
	$(CPP)
$(SUFFIX).o:
	$(FC) $(FFLAGS) $(OFLAG) $(INCS) -c $*$(SUFFIX)
#-----------------------------------------------------------------------
# special rules
# These objects do not use $(OFLAG): each one is preprocessed with
# $(CPP) and compiled with the explicit -O level given on its line.
# Recipe lines must begin with a TAB; without it make stops with
# "missing separator".
#-----------------------------------------------------------------------
radial.o: radial.F
	$(CPP)
	$(FC) $(FFLAGS) $(INCS) -O2 -c $*$(SUFFIX)
wave.o: wave.F
	$(CPP)
	$(FC) $(FFLAGS) $(INCS) -O2 -c $*$(SUFFIX)
metagga.o: metagga.F
	$(CPP)
	$(FC) $(FFLAGS) $(INCS) -O2 -c $*$(SUFFIX)
nonl.o: nonl.F
	$(CPP)
	$(FC) $(FFLAGS) $(INCS) -O -c $*$(SUFFIX)
paw.o: paw.F
	$(CPP)
	$(FC) $(FFLAGS) $(INCS) -O1 -c $*$(SUFFIX)
pseudo.o: pseudo.F
	$(CPP)
	$(FC) $(FFLAGS) $(INCS) -O1 -c $*$(SUFFIX)