cndaqiang Web Linux DFT

单GPU节点配置计算环境

2024-05-01
cndaqiang
RSS

系统安装

  • 刻录ubuntu-22.04-desktop到U盘
  • 插入U盘到服务器
  • idrac启动到u盘,远程安装系统

网络配置

  • 网口1: 互联网访问ip
  • 网口2: 内网ip

InfiniBand 配置

安装驱动

cd /home/source/MLNX_OFED_LINUX-23.10-2.1.3.1-ubuntu22.04-x86_64
./mlnxofedinstall --force
/etc/init.d/openibd restart

配置ip

注:装完docker等更改网络的软件,可能需要重新配置

#查看ib网卡信息,名称为ibp75s0
root@BBBBgpu:# ip link show
4: ibp75s0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 2044 qdisc mq state UP mode DEFAULT group default qlen 256
    link/infiniband 00:00:0e:e8:fe:80:00:00:00:00:00:00:94:6d:ae:03:00:3c:83:4a brd 00:ff:ff:ff:ff:12:40:1b:ff:ff:00:00:00:00:00:00:ff:ff:ff:ff
#修改ib网卡ibp75s0配置
root@BBBBgpu:#  nmcli connection edit type infiniband con-name  ibp75s0
nmcli> set ipv4.addresses  10.10.10.120/24
nmcli> set ipv4.gateway  10.10.10.1
nmcli> save
Saving the connection with 'autoconnect=yes'. That might result in an immediate activation of the connection.
Do you still want to save? (yes/no) [yes] yes
Connection 'ibp75s0' (34a3e7f4-2cde-4e2e-9e04-a0390711f825) successfully saved.
nmcli> quit

用户管理

adduser --home /home/cndaqiang cndaqiang
#使用之前的home目录,并满足不同服务器之间uid/gid相同的要求
usermod -u 1043 cndaqiang
groupmod -g 1043 cndaqiang
#使用之前的home目录
chown -R cndaqiang:cndaqiang /home/cndaqiang
#添加sudo权限
usermod -aG sudo cndaqiang

常用软件安装

基本软件

apt install openssh-server
systemctl start ssh
systemctl enable ssh
apt install vim
apt install htop

module

sudo apt-get install environment-modules

添加下面内容到/etc/profile

# cndaqiang 20240430
MODULEPATH=/home/apps/module_files

docker

apt install apt-transport-https ca-certificates curl software-properties-common gnupg lsb-release
#
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
#
add-apt-repository \
   "deb [arch=amd64] https://download.docker.com/linux/ubuntu \
   $(lsb_release -cs) \
   stable"
#
apt-get update
apt-get install docker-ce

RDP远程桌面

# Install new packages
sudo apt-get install xrdp xorg

# Add xrdp user to ssl-cert group and reboot
sudo adduser xrdp ssl-cert
sudo reboot

nfs ib共享存储

apt install nfs-common

添加挂载信息到/etc/fstab

10.10.10.101:/home           /home/suanpan                   nfs            defaults     0 0

重启之后可以看到

cndaqiang@BBBBgpu:~$ df -h
Filesystem          Size  Used Avail Use% Mounted on
10.10.10.101:/home  145T  106T   33T  77% /home/suanpan

oneapi

  • 注: 新版本oneapi的编译器变了
  • icc -> icx, ifort -> ifx, icpc -> icpx
  • 相应的mpi编译器名字也跟着变了

安装

安装方法以官方链接为准,先安装Base Toolkit再安装HPC Toolkit

module配置

cd /opt/intel/oneapi
./modulefiles-setup.sh --output-dir=/home/apps/module_files/oneapi.2024

精简配置

oneapi.2024/tbb/latest oneapi.2024/compiler-rt/2024.1.0 oneapi.2024/oclfpga/2024.1.0 oneapi.2024/compiler/2024.1.0 oneapi.2024/mkl/2024.1 oneapi.2024/mpi/2021.12

其他的module文件可以删除

管理员推荐配置

创建一个我们需要使用的module文件到/home/apps/module_files/oneapi.2024/recommended-oneapi

#%Module1.0###########################################
# build by cndaqiang 20240501

## 提供模块的帮助信息
proc ModulesHelp { } {
    puts stderr "This module sets up the recommended oneAPI 2024 environment."
}

## 设置模块的描述信息
module-whatis "Sets up the recommended oneAPI 2024 environment"

set module_prefixname "oneapi.2024"

foreach depmodulename { "compiler/2024.1.0" "mkl/2024.1" "mpi/2021.12" } {
    set fullmodulename "${module_prefixname}/${depmodulename}"
    prereq $fullmodulename
}

GPU配置

禁用开源驱动

创建一个文件 /etc/modprobe.d/blacklist-nouveau.conf 并添加以下内容:

blacklist nouveau
options nouveau modeset=0

更新

sudo update-initramfs -u

重启

cuda

注,安装完cuda,重启一下

sh cuda_12.3.2_545.23.08_linux.run

hpc toolkit

注: HPC toolkit的版本要和已安装的cuda版本匹配

#curl https://developer.download.nvidia.com/hpc-sdk/ubuntu/DEB-GPG-KEY-NVIDIA-HPC-SDK | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-hpcsdk-archive-keyring.gpg
wget https://developer.download.nvidia.com/hpc-sdk/ubuntu/DEB-GPG-KEY-NVIDIA-HPC-SDK
cat DEB-GPG-KEY-NVIDIA-HPC-SDK | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-hpcsdk-archive-keyring.gpg
echo 'deb [signed-by=/usr/share/keyrings/nvidia-hpcsdk-archive-keyring.gpg] https://developer.download.nvidia.com/hpc-sdk/ubuntu/amd64 /' | sudo tee /etc/apt/sources.list.d/nvhpc.list
sudo apt-get update -y
sudo apt-get install -y nvhpc-24-3

module配置

cp -r /opt/nvidia/hpc_sdk/modulefiles/* /home/apps/module_files/

计算软件配置

HDF5

oneapi版本

并行版

./configure  CC=mpiicx FC=mpiifx CXX=mpiicpx --prefix=/home/apps/HDF5/oneapi/1.14.4-2.parallel --enable-fortran --enable-cxx --enable-parallel --enable-unsupported

串行版

./configure  CC=icx FC=ifx CXX=icpx --prefix=/home/apps/HDF5/oneapi/1.14.4-2 --enable-fortran --enable-cxx

安装:由于root用户下没有编译环境,所以make install时需要用sudo env PATH=$PATH传递当前用户的PATH

make
sudo env PATH=$PATH make install

创建module file,示例

cndaqiang@BBBBgpu:~$ cat /home/apps/module_files/HDF5/oneapi.2024/1.14.4-2
#%Module1.0
#
#  HDF5 module file
#
proc ModulesHelp { } {
    puts stderr "Provides HDF5 1.14.4-2"
}

module-whatis "Sets up the environment for HDF5 1.14.4-2"

# Set paths
set prefix /home/apps/HDF5/oneapi/1.14.4-2

prepend-path PATH            $prefix/bin
prepend-path LD_LIBRARY_PATH $prefix/lib
prepend-path CPATH           $prefix/include
prepend-path LIBRARY_PATH    $prefix/lib
prepend-path MANPATH         $prefix/share/man

cuda版本

module load nvhpc
./configure FC=nvfortran CC=nvc FCFLAGS=-fPIC --enable-fortran --prefix=/home/apps/HDF5/nvhpc/1.14.4-2
make
sudo env PATH=$PATH make install

module file与oneapi版本相同,只需把prefix改为

set prefix /home/apps/HDF5/nvhpc/1.14.4-2

QE

external包无法下载编译的解决办法

方法1:ssh科学上网

wannier等体积较大的包,即使科学上网也容易下载中断,此时建议使用方法2:把仓库转存到gitee

ssh  -f -N -D 127.0.0.1:42090 -p 2022 cndaqiang@科学的服务器ip
#编辑 ~/.ssh/config
Host github.com
    ProxyCommand nc -x 127.0.0.1:42090 %h %p
Host gitlab.com
    ProxyCommand nc -x 127.0.0.1:42090 %h %p

方法2:把仓库转存到gitee

编辑vi .gitmodules,替换git地址,例如:

https://github.com/wannier-developers/wannier90.git
变为
https://gitee.com/cndaqiang/wannier90.git

QE-oneapi

#支持HDF5的qe-7.2
./configure --with-hdf5=/home/apps/HDF5/oneapi/1.14.4-2/ FC=ifx CC=icx MPIF90=mpiifx
make pw
#普通的qe-7.3
./configure  FC=ifx CC=icx MPIF90=mpiifx

复制到公共目录

sudo cp bin/* /home/apps/QE/7.3.1.oneapi.2024/bin/

module file

cndaqiang@BBBBgpu:~$ cat /home/apps/module_files/QE/oneapi.2024/7.3.1
#%Module1.0
##
## QE 7.3.1 for oneAPI 2024 module file
##

module-whatis "Sets up the Quantum ESPRESSO 7.3.1 environment for oneAPI 2024"

set MODULEPATH /home/apps/QE/7.3.1.oneapi.2024
prepend-path PATH            $MODULEPATH/bin
#这里是依赖的编译环境,这样只用module load一次
prereq oneapi.2024/recommended-oneapi
mbd代码兼容性问题

2024.05.01 mbd和oneapi不兼容

修改代码

vi external/mbd/src/mbd_c_api.F90
!use iso_c_binding
use iso_c_binding, only: c_int, c_double, c_bool, c_char, c_ptr, c_double_complex, &
                         c_null_char, c_loc, c_f_pointer, c_associated
rm -rf MBD
make libmbd

重新编译

rm -rf MBD
make libmbd
make pw

QE-cuda

module load nvhpc
./configure --with-hdf5=/home/apps/HDF5/nvhpc/1.14.4-2 --with-cuda=/opt/nvidia/hpc_sdk/Linux_x86_64/24.3/cuda/ --with-cuda-cc=86 --with-cuda-runtime=12.3   --with-scalapack=no  --with-cuda-mpi=yes

复制

sudo cp bin/* /home/apps/QE.nvhpc.24.3/qe-AAAA-7.2-hdf5-1.14.4-2/bin/

module file

cndaqiang@BBBBgpu:~$ cat /home/apps/module_files/QE/nvhpc.24.3/qe-AAAA-7.2-hdf5-1.14.4-2
#%Module1.0
##
## QE 7.2 for nvhpc 24.3 module file
##

module-whatis "Sets up the Quantum ESPRESSO 7.2 environment for oneAPI 2024"

set MODULEPATH /home/apps/QE.nvhpc.24.3/qe-AAAA-7.2-hdf5-1.14.4-2
prepend-path PATH            $MODULEPATH/bin

prereq nvhpc/24.3
prereq HDF5/nvhpc.24.3/1.14.4-2

运行测试

oneapi版本:module load QE/oneapi.2024/qe-AAAA-7.2-hdf5-1.14.4-2

cd ~/work/tdpw/test/in
cndaqiang@BBBBgpu:~/work/tdpw/test/in$ module load QE/oneapi.2024/qe-AAAA-7.2-hdf5-1.14.4-2
Loading QE/oneapi.2024/qe-AAAA-7.2-hdf5-1.14.4-2
  Loading requirement: oneapi.2024/tbb/latest oneapi.2024/compiler-rt/2024.1.0 oneapi.2024/oclfpga/2024.1.0 oneapi.2024/compiler/2024.1.0
    oneapi.2024/mkl/2024.1 oneapi.2024/mpi/2021.12 oneapi.2024/recommended-oneapi HDF5/oneapi.2024/1.14.4-2

#运行
mpirun -np 4 pw.x -i input.in  | tee result
cndaqiang@BBBBgpu:~/work/tdpw/test/in$ grep ! result
!    total energy              =     -21.68049299 Ry
!    total energy              =     -21.68049301 Ry

cuda版本:module load QE/nvhpc.24.3/qe-AAAA-7.2-hdf5-1.14.4-2

cd ~/work/tdpw/test/in-nv
cndaqiang@BBBBgpu:~$ module load QE/nvhpc.24.3/qe-AAAA-7.2-hdf5-1.14.4-2
Loading QE/nvhpc.24.3/qe-AAAA-7.2-hdf5-1.14.4-2
  Loading requirement: nvhpc/24.3 HDF5/nvhpc.24.3/1.14.4-2
mpirun -np 2 pw.x -i input.in -npool 2 | tee result
cndaqiang@BBBBgpu:~/work/tdpw/test/in-nv$ grep ! result
!    total energy              =     -21.68049299 Ry
!    total energy              =     -21.68049301 Ry

docker配置

dokuwiki

docker run -dit -v /home/apps/doukuwiki:/var/www/html -p 810:80 --name dokuwiki --restart unless-stopped php:7.0-apache
cp -r /home/cndaqiang/wiki/wiki_backup/* /home/apps/doukuwiki/
#给权限
docker exec  -it dokuwiki bash
chown -R www-data:www-data *
exit

刷新网页即可


本文首发于我的博客@cndaqiang.
本博客所有文章除特别声明外,均采用 CC BY-SA 4.0 协议 ,转载请注明出处!


类似文章


评论


广告


广告
访客数据