Commit 4d53260b by Marek Blazewicz

initial commit

cache
*.retry
.*.swp
Copyright 2018 PSNC
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software within the M2DC project. Further use for purposes other than
the M2DC project should be discussed with, and granted by, the copyright
holder.
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Documentation
=============
Basic usage
===========
```bash
cd ansible
ansible-playbook -u your-user -i ../inventories/hpc_appliance@ubi hpc_appliance_ubi.yml
```

`your-user` needs to have sudo on the servers.
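
The plays are tagged (`nfs`, `packages`, `modules`, `recsdaemon`), so a subset can be run on its own:

```bash
# run only the NFS-related plays
ansible-playbook -u your-user -i ../inventories/hpc_appliance@ubi \
    hpc_appliance_ubi.yml --tags nfs
```

The inventory is expected to define the groups the plays target (`globals`, `ui`, `nfs-server`, `nfs-client`) and the variables they use (`nfs_server`, `nfs_export_network` and, optionally, `nfs_prefix`).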
Setup
=====
The `roles` directory is added to the role search path (see `roles_path` in `ansible.cfg`).
To create a new role, run:
```bash
cd roles
ansible-galaxy init <role>
```
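
`ansible-galaxy init` scaffolds the usual role skeleton, roughly as below (the exact layout varies between Ansible versions):

```
roles/<role>/
├── defaults/main.yml   # default role variables
├── handlers/main.yml   # handler definitions
├── meta/main.yml       # role metadata and dependencies
├── tasks/main.yml      # entry point for the role's tasks
├── templates/          # Jinja2 templates
└── vars/main.yml       # higher-precedence variables
```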
[defaults]
# cache gathered facts ("smart" skips gathering when valid cached facts exist)
gathering = smart
fact_caching = jsonfile
fact_caching_connection = ./cache
# do not verify SSH host keys of freshly provisioned nodes
host_key_checking = False
# disable cowsay output
nocows = True
# roles are resolved relative to the playbook dir, see "Setup" in the README
roles_path = ../roles/
---
# set rights to / and /etc
- hosts: globals
  become: True
  tasks:
    - name: change global rights
      file:
        path: "{{ item }}"
        owner: root
        group: root
        mode: 0755
      with_items:
        - /
        - /etc/

# common: create users, ssh config, install basic packages, hostname
- hosts: globals
  roles:
    - role: common

# set up LDAP server
- hosts: ui
  roles:
    - role: ldap-server

- hosts: nfs-server
  tags:
    - nfs
    - nfs-server
  tasks:
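    # one /etc/exports line is built per (network, export template) pair;
    # the N_E_T_W_O_R_K placeholder below is replaced with each network
    # from the space-separated nfs_export_network variable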
    - name: ensure exports
      lineinfile:
        path: /etc/exports
        line: "{{ item.1 | regex_replace('N_E_T_W_O_R_K', item.0) }}"
      become: True
      with_nested:
        - "{{ nfs_export_network.split(' ') }}"
        - - "{{ nfs_prefix | default('/') }}home N_E_T_W_O_R_K(rw,no_root_squash,no_subtree_check)"
          - "{{ nfs_prefix | default('/') }}opt N_E_T_W_O_R_K(ro,no_root_squash,no_subtree_check)"
          #- "{{ nfs_prefix | default('/') }}root {{ nfs_export_network }}(rw,no_root_squash,no_subtree_check)"
      notify: restart nfs
    - name: ensure services
      become: True
      service:
        name: "{{ item }}"
        state: started
        enabled: True
      with_items:
        - rpcbind
        - nfs-server
        - nfs-lock
        - nfs-idmap
  handlers:
    - name: restart nfs
      become: True
      service:
        name: nfs-server
        state: restarted

# attach hosts from the "globals" group to LDAP
- hosts: globals
  roles:
    - role: ldap-client

- hosts: nfs-client
  tags:
    - nfs
    - nfs-client
  tasks:
    - name: ensure packages
      package:
        name: nfs-utils
        state: present
      become: True
      when: ansible_os_family == "RedHat"
    - name: ensure packages
      package:
        name: nfs-common
        state: present
      become: True
      when: ansible_os_family == "Debian"
    - name: ensure dirs are mounted
      become: True
      mount:
        path: "{{ item.path }}"
        src: "{{ item.src }}"
        fstype: nfs
        opts: rw,nosuid,nodev
        state: mounted
      with_items:
        - { path: /home, src: "{{ nfs_server }}:{{ nfs_prefix | default('/') }}home" }
        #- { path: /root, src: "{{ nfs_server }}:{{ nfs_prefix | default('/') }}root" }
        - { path: /opt, src: "{{ nfs_server }}:{{ nfs_prefix | default('/') }}opt" }

- hosts: globals
  tags:
    - packages
  tasks:
    - name: install some more packages
      package:
        name: "{{ item }}"
        state: present
      become: True
      tags:
        - packages
      with_items:
        - python-virtualenv
        - python34-virtualenv
        - java-1.8.0-openjdk
        - tcsh
        - log4cxx-devel
        - openssl-devel
        - kernel-devel
        - fontconfig-devel
        - SDL2
        - SDL2_image-devel
        - SDL2_ttf-devel
      when: ansible_os_family == "RedHat"
    - name: install some more packages debian
      package:
        name: "{{ item }}"
        state: present
      become: True
      tags:
        - packages
      with_items:
        - python-virtualenv
        - python3-virtualenv
        - openjdk-8-jdk
        - tcsh
        - openssl
        - libssl-dev
        - liblog4cxx-dev
        - libsdl2-image-dev
        - libsdl2-ttf-dev
        - fontconfig
      when: ansible_os_family == "Debian"

- hosts: globals
  roles:
    - role: slurm
      slurm_conf_additional_files:
        - hpc_appliance/slurm.epilog.clean

- hosts: globals
  tags:
    - modules
  become: True
  tasks:
    - name: install environment-modules
      package:
        name: environment-modules
        state: present
    - name: set modulespath location (Debian)
      set_fact:
        modulespath: /etc/environment-modules/modulespath
      when: ansible_os_family == "Debian"
    - name: set modulespath location (RedHat)
      set_fact:
        modulespath: /usr/share/Modules/init/.modulespath
      when: ansible_os_family == "RedHat"
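    # '^/(?!opt)' matches a leading "/" not followed by "opt", so every
    # modulepath entry outside /opt gets prefixed with "#" (disabled)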
    - name: comment all old modulefile locations
      replace:
        path: "{{ modulespath }}"
        regexp: '^/(?!opt)'
        replace: '#/'
    - name: insert proper line
      lineinfile:
        path: "{{ modulespath }}"
        line: /opt/modules

- hosts: ui[0]
  tags:
    - modules
  become: True
  tasks:
    - name: create dirs
      file:
        state: directory
        path: "{{ item }}"
        owner: root
        mode: 0775
      with_items:
        - /opt/soft
        - /opt/modules

- hosts: globals
  become: True
  handlers:
    - name: restart systemd
      become: True
      systemd:
        daemon_reload: yes
  tags:
    - recsdaemon
  tasks:
    - name: create RECSDaemon config dir
      file:
        state: directory
        group: root
        owner: root
        mode: 0755
        path: /etc/RECSDaemon/
    - name: check if config file exists
      stat:
        path: /etc/RECSDaemon/recsdaemon.ini
      register: recsdaemon_file
    - name: create empty recsdaemon.ini
      copy:
        content: ""
        group: root
        owner: m2dc
        mode: 0755
        dest: /etc/RECSDaemon/recsdaemon.ini
      when: not recsdaemon_file.stat.exists
    - name: set RECSDir
      set_fact:
        RECSDir: /opt/RECSDaemon
      when: ansible_os_family == "RedHat"
    - name: set RECSDir
      set_fact:
        RECSDir: /opt/RECSDaemon.ubuntu
      when: ansible_os_family == "Debian"
    - name: create recs startup file
      copy:
        dest: /etc/systemd/system/RECSDaemon.service
        content: |
          [Unit]
          Description=Allows sending system monitoring data to the RECS|Box management system
          After=syslog.target network.target

          [Service]
          Type=simple
          WorkingDirectory={{ RECSDir }}
          ExecStart={{ RECSDir }}/RECSDaemon
          ExecReload=/bin/kill -HUP $MAINPID

          [Install]
          WantedBy=multi-user.target
        owner: root
        group: root
        mode: 0644
      notify: restart systemd
    # - name: m2dc user to restart recsdaemon
    #   copy:
    #     dest: /etc/sudoers.d/10_m2dc
    #     content: |
    #       m2dc ALL=NOPASSWD:/usr/bin/systemctl restart RECSDaemon
    #       m2dc ALL=NOPASSWD:/usr/bin/systemctl start RECSDaemon
    #       m2dc ALL=NOPASSWD:/usr/bin/systemctl stop RECSDaemon
    #       m2dc ALL=NOPASSWD:/usr/bin/systemctl status RECSDaemon
    #       m2dc ALL=NOPASSWD:/bin/systemctl restart RECSDaemon
    #       m2dc ALL=NOPASSWD:/bin/systemctl start RECSDaemon
    #       m2dc ALL=NOPASSWD:/bin/systemctl stop RECSDaemon
    #       m2dc ALL=NOPASSWD:/bin/systemctl status RECSDaemon

- hosts: globals
  become: True
  tags:
    - asplus
    - starpu
  tasks:
    - name: install some packages
      package:
        name: "{{ item }}"
        state: present
      when: ansible_os_family == "RedHat"
      with_items:
        - hwloc-libs
        - pkgconfig
        - hwloc
        - hwloc-devel
# slurm.conf file generated by configurator.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
ClusterName=hpc-appliance
ControlMachine={{ hostvars[groups["ui"][0]].inventory_hostname }}
ControlAddr={{ hostvars[groups["ui"][0]].ansible_host }}
SlurmUser=slurm
#SlurmdUser=root
AuthType=auth/munge
CacheGroups=0
CryptoType=crypto/munge
DisableRootJobs=YES
MpiDefault=none
ProctrackType=proctrack/pgid
ReturnToService=2
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPort=6818
SlurmdPidFile=/var/run/slurmd.pid
SlurmdSpoolDir=/tmp/slurmd
StateSaveLocation=/var/slurm
SwitchType=switch/none
TaskPlugin=task/none
# TIMERS
InactiveLimit=0
KillWait=30
MinJobAge=300
SlurmctldTimeout=120
SlurmdTimeout=300
Waittime=0
# SCHEDULING ===============================
FastSchedule=1
SchedulerType=sched/backfill
# for Moab, uncomment the following and the port
#SchedulerType=sched/wiki2
#SchedulerPort=7321
SelectType=select/cons_res
SelectTypeParameters=CR_CPU
#SelectType=select/linear
# LOGGING AND ACCOUNTING
AccountingStorageType=accounting_storage/none
AccountingStoreJobComment=YES
JobCompLoc=/var/log/slurm/slurm_comp.log
JobCompType=jobcomp/filetxt
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/linux
SlurmctldDebug=5
SlurmctldLogFile=/var/log/slurmctld.log
SlurmdDebug=5
SlurmdLogFile=/var/log/slurmd.log
SallocDefaultCommand="srun -n1 -N1 --pty --preserve-env --mpi=none $SHELL"
##POWER SAVE
#SuspendTime=900
#SuspendProgram=/etc/slurm/suspend/suspend
#ResumeProgram=/etc/slurm/suspend/resume
#SuspendTimeout=30
#ResumeTimeout=90
#SuspendExcNodes=xeon_[01-17],amd_f_[01-18],i7_[01-18],atom64_[01-18],taishan-2180-01
# GPUs
GresTypes=gpu,see
# RealMemory=15000
# RealMemory=3000
#!/bin/sh
#
# This script will kill any user processes on a node when the last
# SLURM job there ends. For example, if a user directly logs into
# an allocated node SLURM will not kill that process without this
# script being executed as an epilog.
#
# SLURM_BIN can be used for testing with private version of SLURM
#SLURM_BIN="/usr/bin/"
#
if [ x$SLURM_UID = "x" ] ; then
    exit 0
fi
if [ x$SLURM_JOB_ID = "x" ] ; then
    exit 0
fi
#
# Don't try to kill user root or system daemon jobs
#
if [ $SLURM_UID -lt 100 ] ; then
    exit 0
fi
job_list=`${SLURM_BIN}squeue --noheader --format=%A --user=$SLURM_UID --node=localhost`
for job_id in $job_list
do
    if [ $job_id -ne $SLURM_JOB_ID ] ; then
        exit 0
    fi
done
#
# No other SLURM jobs, purge all remaining processes of this user
#
pkill -KILL -U $SLURM_UID
exit 0
---
# set rights to / and /etc
- hosts: globals
  become: True
  tasks:
    - name: change global rights
      file:
        path: "{{ item }}"
        owner: root
        group: root
        mode: 0755
      with_items:
        - /
        - /etc/

# common: create users, ssh config, install basic packages, hostname
- hosts: globals
  roles:
    - role: common

# set up LDAP server
- hosts: ui
  roles:
    - role: ldap-server

- hosts: nfs-server
  tags:
    - nfs
    - nfs-server
  tasks:
    - name: ensure exports
      lineinfile:
        path: /etc/exports
        line: "{{ item.1 | regex_replace('N_E_T_W_O_R_K', item.0) }}"
      become: True
      with_nested:
        - "{{ nfs_export_network.split(' ') }}"
        - - "{{ nfs_prefix | default('/') }}home N_E_T_W_O_R_K(rw,no_root_squash,no_subtree_check)"
          - "{{ nfs_prefix | default('/') }}opt N_E_T_W_O_R_K(rw,no_root_squash,no_subtree_check)"
          #- "{{ nfs_prefix | default('/') }}root {{ nfs_export_network }}(rw,no_root_squash,no_subtree_check)"
      notify: restart nfs
    - name: ensure services
      become: True
      service:
        name: "{{ item }}"
        state: started
        enabled: True
      with_items:
        - rpcbind
        - nfs-server
        - nfs-lock
        - nfs-idmap
  handlers:
    - name: restart nfs
      become: True
      service:
        name: nfs-server
        state: restarted

# attach hosts from the "globals" group to LDAP
- hosts: globals
  roles:
    - role: ldap-client

- hosts: nfs-client
  tags:
    - nfs
    - nfs-client
  tasks:
    - name: ensure packages
      package:
        name: nfs-utils
        state: present
      become: True
      when: ansible_os_family == "RedHat"
    - name: ensure packages
      package:
        name: nfs-common
        state: present
      become: True
      when: ansible_os_family == "Debian"
    - name: ensure dirs are mounted
      become: True
      mount:
        path: "{{ item.path }}"
        src: "{{ item.src }}"
        fstype: nfs
        opts: rw,nosuid,nodev
        state: mounted
      with_items:
        - { path: /home, src: "{{ nfs_server }}:{{ nfs_prefix | default('/') }}home" }
        #- { path: /root, src: "{{ nfs_server }}:{{ nfs_prefix | default('/') }}root" }
        - { path: /opt, src: "{{ nfs_server }}:{{ nfs_prefix | default('/') }}opt" }

- hosts: globals
  tags:
    - packages
  tasks:
    - name: install some more packages
      package:
        name: "{{ item }}"
        state: present
      become: True
      tags:
        - packages
      with_items:
        - python-virtualenv
        - python34-virtualenv
        - java-1.8.0-openjdk
        - tcsh
        - log4cxx-devel
        - openssl-devel
        - kernel-devel
        - fontconfig-devel
        - SDL2
        - SDL2_image-devel
        - SDL2_ttf-devel
      when: ansible_os_family == "RedHat"
    - name: install some more packages debian
      package:
        name: "{{ item }}"
        state: present
      become: True
      tags:
        - packages
      with_items:
        - python-virtualenv
        - python3-virtualenv
        - openjdk-8-jdk
        - tcsh
        - openssl
        - libssl-dev
        - liblog4cxx-dev
        - libsdl2-image-dev
        - libsdl2-ttf-dev
        - fontconfig
      when: ansible_os_family == "Debian"

- hosts: globals
  roles:
    - role: slurm
      slurm_conf_additional_files:
        - hpc_appliance/slurm.epilog.clean

- hosts: globals
  tags:
    - modules
  become: True
  tasks:
    - name: install environment-modules
      package:
        name: environment-modules
        state: present
    - name: set modulespath location (Debian)
      set_fact:
        modulespath: /etc/environment-modules/modulespath
      when: ansible_os_family == "Debian"
    - name: set modulespath location (RedHat)
      set_fact:
        modulespath: /usr/share/Modules/init/.modulespath
      when: ansible_os_family == "RedHat"
    - name: comment all old modulefile locations
      replace:
        path: "{{ modulespath }}"
        regexp: '^/(?!opt)'
        replace: '#/'
    - name: insert proper line
      lineinfile:
        path: "{{ modulespath }}"
        line: /opt/modules

- hosts: ui[0]
  tags:
    - modules
  become: True
  tasks:
    - name: create dirs
      file:
        state: directory
        path: "{{ item }}"
        owner: root
        mode: 0775
      with_items:
        - /opt/soft
        - /opt/modules

- hosts: globals
  become: True
  handlers:
    - name: restart systemd
      become: True
      systemd:
        daemon_reload: yes
  tags:
    - recsdaemon
  tasks:
    - name: create RECSDaemon config dir
      file:
        state: directory
        group: root
        owner: root
        mode: 0755
        path: /etc/RECSDaemon/
    - name: check if config file exists
      stat:
        path: /etc/RECSDaemon/recsdaemon.ini
      register: recsdaemon_file
    - name: create empty recsdaemon.ini
      copy:
        content: ""
        group: root
        owner: m2dc
        mode: 0755
        dest: /etc/RECSDaemon/recsdaemon.ini
      when: not recsdaemon_file.stat.exists
    - name: set RECSDir
      set_fact:
        RECSDir: /opt/RECSDaemon
      when: ansible_os_family == "RedHat"
    - name: set RECSDir
      set_fact:
        RECSDir: /opt/RECSDaemon.ubuntu
      when: ansible_os_family == "Debian"
    - name: create recs startup file
      copy:
        dest: /etc/systemd/system/RECSDaemon.service
        content: |
          [Unit]
          Description=Allows sending system monitoring data to the RECS|Box management system
          After=syslog.target network.target

          [Service]
          Type=simple
          WorkingDirectory={{ RECSDir }}
          ExecStart={{ RECSDir }}/RECSDaemon
          ExecReload=/bin/kill -HUP $MAINPID

          [Install]
          WantedBy=multi-user.target
        owner: root
        group: root
        mode: 0644
      notify: restart systemd
    - name: m2dc user to restart recsdaemon
      copy:
        dest: /etc/sudoers.d/10_m2dc
        content: |
          m2dc ALL=(ALL) NOPASSWD: ALL