系统安装
- 刻录ubuntu-22.04-desktop到U盘
- 插入U盘到服务器
- 通过iDRAC从U盘启动,远程安装系统
网络配置
- 网口1: 互联网访问ip
- 网口2: 内网ip
InfiniBand 配置
安装驱动
cd /home/source/MLNX_OFED_LINUX-23.10-2.1.3.1-ubuntu22.04-x86_64
./mlnxofedinstall --force
/etc/init.d/openibd restart
配置ip
注:装完docker等更改网络的软件,可能需要重新配置
#查看ib网卡信息,名称为ibp75s0
root@BBBBgpu:# ip link show
4: ibp75s0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 2044 qdisc mq state UP mode DEFAULT group default qlen 256
link/infiniband 00:00:0e:e8:fe:80:00:00:00:00:00:00:94:6d:ae:03:00:3c:83:4a brd 00:ff:ff:ff:ff:12:40:1b:ff:ff:00:00:00:00:00:00:ff:ff:ff:ff
#修改ib网卡ibp75s0配置
root@BBBBgpu:# nmcli connection edit type infiniband con-name ibp75s0
nmcli> set ipv4.addresses 10.10.10.120/24
nmcli> set ipv4.gateway 10.10.10.1
nmcli> save
Saving the connection with 'autoconnect=yes'. That might result in an immediate activation of the connection.
Do you still want to save? (yes/no) [yes] yes
Connection 'ibp75s0' (34a3e7f4-2cde-4e2e-9e04-a0390711f825) successfully saved.
nmcli> quit
用户管理
adduser --home /home/cndaqiang cndaqiang
#使用之前的home目录,并满足不同服务器之间uid/gid相同的要求
usermod -u 1043 cndaqiang
groupmod -g 1043 cndaqiang
#使用之前的home目录
chown -R cndaqiang:cndaqiang /home/cndaqiang
#添加sudo权限
usermod -aG sudo cndaqiang
常用软件安装
基本软件
apt install openssh-server
systemctl start ssh
systemctl enable ssh
apt install vim
apt install htop
module
sudo apt-get install environment-modules
添加下面内容到/etc/profile
# cndaqiang 20240430
MODULEPATH=/home/apps/module_files
docker
apt install apt-transport-https ca-certificates curl software-properties-common gnupg lsb-release
#
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
#
add-apt-repository \
"deb [arch=amd64] https://download.docker.com/linux/ubuntu \
$(lsb_release -cs) \
stable"
#
apt-get update
apt-get install docker-ce
RDP远程桌面
# Install new packages
sudo apt-get install xrdp xorg
# Add xrdp user to ssl-cert group and reboot
sudo adduser xrdp ssl-cert
sudo reboot
nfs ib共享存储
apt install nfs-common
添加挂载信息到/etc/fstab
10.10.10.101:/home /home/suanpan nfs defaults 0 0
重启之后可以看到
cndaqiang@BBBBgpu:~$ df -h
Filesystem Size Used Avail Use% Mounted on
10.10.10.101:/home 145T 106T 33T 77% /home/suanpan
oneapi
- 注: 新版本oneapi的编译器变了
icc -> icx, ifort -> ifx, icpc -> icpx
- 相应的mpi编译器名字也跟着变了
安装
安装方法以官方链接为准,先安装Base Toolkit再安装HPC Toolkit
module配置
cd /opt/intel/oneapi
./modulefiles-setup.sh --output-dir=/home/apps/module_files/oneapi.2024
精简配置
oneapi.2024/tbb/latest oneapi.2024/compiler-rt/2024.1.0 oneapi.2024/oclfpga/2024.1.0 oneapi.2024/compiler/2024.1.0 oneapi.2024/mkl/2024.1 oneapi.2024/mpi/2021.12
其他的module文件可以删除
管理员推荐配置
创建一个我们需要使用的module文件到/home/apps/module_files/oneapi.2024/recommended-oneapi
#%Module1.0###########################################
# build by cndaqiang 20240501
## 提供模块的帮助信息
proc ModulesHelp { } {
puts stderr "This module sets up the recommended oneAPI 2024 environment."
}
## 设置模块的描述信息
module-whatis "Sets up the recommended oneAPI 2024 environment"
set module_prefixname "oneapi.2024"
foreach depmodulename { "compiler/2024.1.0" "mkl/2024.1" "mpi/2021.12" } {
set fullmodulename "${module_prefixname}/${depmodulename}"
prereq $fullmodulename
}
GPU配置
禁用开源驱动
创建一个文件 /etc/modprobe.d/blacklist-nouveau.conf
并添加以下内容:
blacklist nouveau
options nouveau modeset=0
更新
sudo update-initramfs -u
重启
cuda
注,安装完cuda,重启一下
sh cuda_12.3.2_545.23.08_linux.run
hpc toolkit
注: hpc toolkit的版本要和cuda版本匹配
#curl https://developer.download.nvidia.com/hpc-sdk/ubuntu/DEB-GPG-KEY-NVIDIA-HPC-SDK | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-hpcsdk-archive-keyring.gpg
wget https://developer.download.nvidia.com/hpc-sdk/ubuntu/DEB-GPG-KEY-NVIDIA-HPC-SDK
cat DEB-GPG-KEY-NVIDIA-HPC-SDK | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-hpcsdk-archive-keyring.gpg
echo 'deb [signed-by=/usr/share/keyrings/nvidia-hpcsdk-archive-keyring.gpg] https://developer.download.nvidia.com/hpc-sdk/ubuntu/amd64 /' | sudo tee /etc/apt/sources.list.d/nvhpc.list
sudo apt-get update -y
sudo apt-get install -y nvhpc-24-3
module配置
cp -r /opt/nvidia/hpc_sdk/modulefiles/* /home/apps/module_files/
计算软件配置
HDF5
oneapi版本
并行版
./configure CC=mpiicx FC=mpiifx CXX=mpiicpx --prefix=/home/apps/HDF5/oneapi/1.14.4-2.parallel --enable-fortran --enable-cxx --enable-parallel --enable-unsupported
串行版
./configure CC=icx FC=ifx CXX=icpx --prefix=/home/apps/HDF5/oneapi/1.14.4-2 --enable-fortran --enable-cxx
安装,由于root没有编译环境,所以需要sudo env PATH=$PATH
make
sudo env PATH=$PATH make install
创建module file,示例
cndaqiang@BBBBgpu:~$ cat /home/apps/module_files/HDF5/oneapi.2024/1.14.4-2
#%Module1.0
#
# HDF5 module file
#
proc ModulesHelp { } {
puts stderr "Provides HDF5 1.14.4-2"
}
module-whatis "Sets up the environment for HDF5 1.14.4-2"
# Set paths
set prefix /home/apps/HDF5/oneapi/1.14.4-2
prepend-path PATH $prefix/bin
prepend-path LD_LIBRARY_PATH $prefix/lib
prepend-path CPATH $prefix/include
prepend-path LIBRARY_PATH $prefix/lib
prepend-path MANPATH $prefix/share/man
cuda版本
module load nvhpc
./configure FC=nvfortran CC=nvc FCFLAGS=-fPIC --enable-fortran --prefix=/home/apps/HDF5/nvhpc/1.14.4-2
make
sudo env PATH=$PATH make install
module file与oneapi版本相同,不过
set prefix /home/apps/HDF5/nvhpc/1.14.4-2
QE
external包无法下载编译的解决办法
方法1:ssh科学上网
注: wannier等体积较大的包,即使科学上网也容易下载中断,建议使用方法2:把仓库转存到gitee
ssh -f -N -D 127.0.0.1:42090 -p 2022 cndaqiang@科学的服务器ip
#编辑 ~/.ssh/config
Host github.com
ProxyCommand nc -x 127.0.0.1:42090 %h %p
Host gitlab.com
ProxyCommand nc -x 127.0.0.1:42090 %h %p
方法2:把仓库转存到gitee
编辑.gitmodules(vi .gitmodules),替换git地址,例如:
https://github.com/wannier-developers/wannier90.git
变为
https://gitee.com/cndaqiang/wannier90.git
QE-oneapi
#支持HDF5的qe-7.2
./configure --with-hdf5=/home/apps/HDF5/oneapi/1.14.4-2/ FC=ifx CC=icx MPIF90=mpiifx
make pw
#普通的qe-7.3
./configure FC=ifx CC=icx MPIF90=mpiifx
复制到公共目录
sudo cp bin/* /home/apps/QE/7.3.1.oneapi.2024/bin/
module file
cndaqiang@BBBBgpu:~$ cat /home/apps/module_files/QE/oneapi.2024/7.3.1
#%Module1.0
##
## QE 7.3.1 for oneAPI 2024 module file
##
module-whatis "Sets up the Quantum ESPRESSO 7.3.1 environment for oneAPI 2024"
set MODULEPATH /home/apps/QE/7.3.1.oneapi.2024
prepend-path PATH $MODULEPATH/bin
#这里是依赖的编译环境,这样只用module load一次
prereq oneapi.2024/recommended-oneapi
mbd代码兼容性问题
2024.05.01 mbd和oneapi不兼容
修改代码
vi external/mbd/src/mbd_c_api.F90
!use iso_c_binding
use iso_c_binding, only: c_int, c_double, c_bool, c_char, c_ptr, c_double_complex, &
c_null_char, c_loc, c_f_pointer, c_associated
rm -rf MBD
make libmbd
重新编译
rm -rf MBD
make libmbd
make pw
QE-cuda
module load nvhpc
./configure --with-hdf5=/home/apps/HDF5/nvhpc/1.14.4-2 --with-cuda=/opt/nvidia/hpc_sdk/Linux_x86_64/24.3/cuda/ --with-cuda-cc=86 --with-cuda-runtime=12.3 --with-scalapack=no --with-cuda-mpi=yes
复制
sudo cp bin/* /home/apps/QE.nvhpc.24.3/qe-AAAA-7.2-hdf5-1.14.4-2/bin/
module file
cndaqiang@BBBBgpu:~$ cat /home/apps/module_files/QE/nvhpc.24.3/qe-AAAA-7.2-hdf5-1.14.4-2
#%Module1.0
##
## QE 7.2 for oneAPI 2024 module file
##
module-whatis "Sets up the Quantum ESPRESSO 7.2 environment for oneAPI 2024"
set MODULEPATH /home/apps/QE.nvhpc.24.3/qe-AAAA-7.2-hdf5-1.14.4-2
prepend-path PATH $MODULEPATH/bin
prereq nvhpc/24.3
prereq HDF5/nvhpc.24.3/1.14.4-2
运行测试
oneapi版本:module load QE/oneapi.2024/qe-AAAA-7.2-hdf5-1.14.4-2
cd ~/work/tdpw/test/in
cndaqiang@BBBBgpu:~/work/tdpw/test/in$ module load QE/oneapi.2024/qe-AAAA-7.2-hdf5-1.14.4-2
Loading QE/oneapi.2024/qe-AAAA-7.2-hdf5-1.14.4-2
Loading requirement: oneapi.2024/tbb/latest oneapi.2024/compiler-rt/2024.1.0 oneapi.2024/oclfpga/2024.1.0 oneapi.2024/compiler/2024.1.0
oneapi.2024/mkl/2024.1 oneapi.2024/mpi/2021.12 oneapi.2024/recommended-oneapi HDF5/oneapi.2024/1.14.4-2
#运行
mpirun -np 4 pw.x -i input.in | tee result
cndaqiang@BBBBgpu:~/work/tdpw/test/in$ grep ! result
! total energy = -21.68049299 Ry
! total energy = -21.68049301 Ry
cuda版本:module load QE/nvhpc.24.3/qe-AAAA-7.2-hdf5-1.14.4-2
cd ~/work/tdpw/test/in-nv
cndaqiang@BBBBgpu:~$ module load QE/nvhpc.24.3/qe-AAAA-7.2-hdf5-1.14.4-2
Loading QE/nvhpc.24.3/qe-AAAA-7.2-hdf5-1.14.4-2
Loading requirement: nvhpc/24.3 HDF5/nvhpc.24.3/1.14.4-2
mpirun -np 2 pw.x -i input.in -npool 2 | tee result
cndaqiang@BBBBgpu:~/work/tdpw/test/in-nv$ grep ! result
! total energy = -21.68049299 Ry
! total energy = -21.68049301 Ry
docker配置
dokuwiki
docker run -dit -v /home/apps/doukuwiki:/var/www/html -p 810:80 --name dokuwiki --restart unless-stopped php:7.0-apache
cp -r /home/cndaqiang/wiki/wiki_backup/* /home/apps/doukuwiki/
#给权限
docker exec -it dokuwiki bash
chown -R www-data:www-data *
exit
刷新网页即可
本文首发于我的博客@cndaqiang.
本博客所有文章除特别声明外,均采用 CC BY-SA 4.0 协议 ,转载请注明出处!