Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
b2ff7d1
init restic repo
dgoebel97 May 26, 2026
377abcd
add systemd trigger for backup
dgoebel97 May 27, 2026
cb1b410
fix errors
dgoebel97 May 27, 2026
66c6140
fix errors
dgoebel97 May 27, 2026
7f4399d
create regular backups
dgoebel97 Jun 15, 2026
32b1a6b
rename ansible vars
dgoebel97 Jun 15, 2026
c336340
split playbook for different hosts
dgoebel97 Jun 15, 2026
d2e2068
enable systemd units
dgoebel97 Jun 15, 2026
b45a8b9
start systemd timers
dgoebel97 Jun 15, 2026
4cc6ce3
fix systemd unit activation
dgoebel97 Jun 15, 2026
db219ed
run facts before full backup
dgoebel97 Jun 15, 2026
e9a2a80
refactor backup scripts and playbook
dgoebel97 Jun 25, 2026
e0a4d7e
Merge branch 'main' into mariadb-backup
dgoebel97 Jun 25, 2026
7c09997
add missing roles directory
dgoebel97 Jun 25, 2026
1ffe0b3
fix typos
dgoebel97 Jun 25, 2026
421e3ea
fix typo
dgoebel97 Jun 25, 2026
31631d0
fix gpg public key format
dgoebel97 Jun 25, 2026
110c7ae
fix gpg public key format
dgoebel97 Jun 25, 2026
7bf65a8
fix typo
dgoebel97 Jun 25, 2026
b8fbadb
fix typoes
dgoebel97 Jun 25, 2026
19b0bac
fix bugs
dgoebel97 Jun 25, 2026
c62dbb7
fix bug
dgoebel97 Jun 26, 2026
169e54b
fix typo
dgoebel97 Jun 26, 2026
1cb1b73
split s3 and sftp rclone operations
dgoebel97 Jun 26, 2026
fac7e54
fix typo
dgoebel97 Jun 26, 2026
eafed27
fix typo
dgoebel97 Jun 26, 2026
c1bd0af
fix typo
dgoebel97 Jun 26, 2026
e3af90e
fix GC
dgoebel97 Jun 29, 2026
13fb845
fix switch statement
dgoebel97 Jun 29, 2026
1a0840d
automatically deploy backup mechanism when deploying testbed
dgoebel97 Jun 29, 2026
eb61444
add documentation for vars and change user of systemd service
dgoebel97 Jun 30, 2026
9131eae
fix bug with s3 prefix
dgoebel97 Jun 30, 2026
867fa82
Fix typo
dgoebel97 Jul 1, 2026
c3adb2a
Fix unescaped variables
dgoebel97 Jul 1, 2026
ea342c6
Fix permission error
dgoebel97 Jul 1, 2026
8c5704b
Fix permission issue
dgoebel97 Jul 1, 2026
523266a
Fix gpg import error
dgoebel97 Jul 1, 2026
553c58a
Fix gpg import error
dgoebel97 Jul 1, 2026
6006ad2
Update role variable
dgoebel97 Jul 3, 2026
88d8b2a
Update config and s3 keys
dgoebel97 Jul 3, 2026
ae24b74
Fix broken config
dgoebel97 Jul 3, 2026
64cefef
Rename backup types
dgoebel97 Jul 3, 2026
ecdbb4d
Fix typo
dgoebel97 Jul 3, 2026
191d99e
update s3 credentials
dgoebel97 Jul 3, 2026
24d6e23
Deploy systemd exporter to monitor backup services
dgoebel97 Jul 3, 2026
f4436c4
Scrape systemd exporter
dgoebel97 Jul 3, 2026
95210f6
Fix systemd exporter container
dgoebel97 Jul 3, 2026
fd63d0b
Add scrape target for systemd exporter
dgoebel97 Jul 3, 2026
f414b71
Add inital alertrules for backups
dgoebel97 Jul 3, 2026
ebdd4c7
add alert for overdue weekly/monthly backup
dgoebel97 Jul 3, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion environments/custom/configuration.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
---
# Dummy variable: Ansible does not accept this file as a valid configuration
# file when it defines no variables at all.
dummy:
#
# ========== MariaDB Backup ============
restic_backup_s3_bucket: backup-testbed
restic_backup_docker_tag: "0.18.1@sha256:c1958a2a1c8614f5c317347c2aaddd9f426076f0521430b55509eba43d7516ee"
18 changes: 5 additions & 13 deletions environments/custom/secrets.yml
Original file line number Diff line number Diff line change
@@ -1,14 +1,6 @@
$ANSIBLE_VAULT;1.1;AES256
34653334613432613331383063306531376339613139313231363330396664343063303036666330
3361336266313038663133396135373430306662396261340a623634343533383930663963636466
65313662656461393031323230626135376533656130353765373035333230323037373630636233
3139356434663061350a333865333232623238316262366562383361653065666266323035666162
61353032636637373961653265356132303264633136373433323832366337366135343566626435
30643962343965656435323736373538623663646363343739613735626634653336626336313937
33656539623730646264376339333237366630376335333764376538363331363635623664333561
39386364326332633135343037353663636135346334323234313366303532303738613039363437
37363963353230643763643433633036643565393437323139346134396164643561373230643138
64303632363461393463613365633434646433633562303336666363386666316264376431316331
31616431633166363134373333396536343933383436626566326264663633623462336438313836
38623535353561386531386464313631303362366534626361663438373561653836333933383237
37646134613163383863373430366137646332666639336639643062373831316263
65636135663137643465613334633666306139386363306562626133383330343566646434346164
6237353632626231323464336435623263376237393638640a386461613230363637303864386335
62313765623437393535383239643632366363633038386661623734613031633439346662623861
6238343739343337620a326162383563363332393062666238366336633764336361616566663839
3339
27 changes: 27 additions & 0 deletions environments/custom/templates/restic-backup.service.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
[Unit]
Description=Trigger backup of MariaDB to S3
Documentation=https://restic.readthedocs.io/
# After= only orders this unit; Wants= is what actually pulls
# network-online.target into the transaction so the ordering has an effect.
Wants=network-online.target
After=docker.service network-online.target
Requires=docker.service

[Service]
# One-shot job: the unit is considered started once the script has exited.
Type=oneshot

User=dragon
Group=dragon

# The script itself selects a full or an incremental backup from the current hour.
ExecStart=/usr/local/bin/restic-backup.sh
# Abort the run if the script has not finished after 14400 s (4 hours).
TimeoutStartSec=14400

StandardOutput=journal
StandardError=journal
SyslogIdentifier=restic-db-backup

# Security hardening
# NOTE(review): ProtectSystem=strict mounts the file system hierarchy read-only
# for this service; confirm the script needs no write access outside /tmp.
PrivateTmp=true
NoNewPrivileges=true
ProtectSystem=strict

[Install]
WantedBy=multi-user.target
26 changes: 26 additions & 0 deletions environments/custom/templates/restic-backup.sh.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/usr/bin/env bash
# restic-backup.sh – managed by Ansible. Do NOT edit manually.
#
# Triggers a MariaDB backup through the OSISM CLI. Takes no arguments.
#
# Usage:
#   restic-backup.sh
set -euo pipefail

# ── Determine backup type ─────────────────────────────────────────────────────
# The backup type is derived from the current hour of the day:
#   - "full"        when the hour is a multiple of {{ restic_backup_full_interval_hours }}
#   - "incremental" for every other run
# %-H prints the hour without zero padding, so "08"/"09" are never parsed as
# invalid octal numbers inside the arithmetic expression.
current_hour=$(date +%-H)
if (( current_hour % {{ restic_backup_full_interval_hours }} == 0 )); then
  BACKUP_TYPE=full
else
  BACKUP_TYPE=incremental
fi

echo "Backup type: ${BACKUP_TYPE}"


# --- Backup Maria DB -----------------------------------------------------------
# Quote the expansion so the value is always passed as a single word.
/usr/local/bin/osism apply mariadb_backup -e "mariadb_backup_type=${BACKUP_TYPE}"

# --- Upload --------------------------------------------------------------------
# /usr/local/bin/osism apply upload-backup
16 changes: 16 additions & 0 deletions environments/custom/templates/restic-backup.timer.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# restic-backup.timer – managed by OSISM. Do NOT edit manually.
[Unit]
Description=Hourly restic backup timer
Documentation=https://restic.readthedocs.io/
# Deliberately no Requires=restic-backup.service: that would start the backup
# every time the timer unit itself is started (e.g. at boot), not only when
# the schedule elapses. A timer activates the service of the same name by
# default, so no explicit dependency or Unit= setting is needed.

[Timer]
# Schedule comes from restic_backup_oncalendar; each run is delayed by a
# random amount of up to 5 minutes to avoid thundering-herd problems on
# multiple hosts.
OnCalendar={{ restic_backup_oncalendar }}
RandomizedDelaySec=300
# catch up on a missed run after reboot/downtime
Persistent=true

[Install]
WantedBy=timers.target
15 changes: 15 additions & 0 deletions environments/kolla/configuration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -119,3 +119,18 @@ prometheus_instance_label: "{{ ansible_facts.hostname }}"

om_enable_rabbitmq_high_availability: false
om_enable_rabbitmq_quorum_queues: false

##########################################################
# mariadb backup

# Backup source and encryption recipient
mariadb_backup_node: "testbed-node-0"
mariadb_backup_gpg_recipient: "backup@testbed.osism.xyz"

# S3 target
mariadb_backup_s3_endpoint: "https://api.bfe2.teutostack.de:6780"
mariadb_backup_s3_region: ""
mariadb_backup_s3_bucket: "backup-testbed"
mariadb_backup_s3_prefix: "bfe2/"
mariadb_backup_s3_virtual_hosted_style_bucket: false

# Retention per backup type
mariadb_backup_retention_hourly: 3
mariadb_backup_retention_daily: 7
mariadb_backup_retention_weekly: 28
mariadb_backup_retention_monthly: 365
59 changes: 59 additions & 0 deletions environments/kolla/files/overlays/prometheus/backup.rules
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Alert rules for the MariaDB backup systemd units, based on metrics scraped
# from the "backup-systemd-exporter" job.
groups:
  - name: BackupSystemdExporter
    rules:
      # The exporter itself cannot be scraped.
      - alert: BackupProbeFailed
        expr: 'up{job="backup-systemd-exporter"} == 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          # "up" carries no "name" label, so identify the target by instance.
          summary: Backup probe failed (instance {{ $labels.instance }})
          description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # A timer unit is not in the "active" state.
      - alert: BackupTimerInactive
        expr: 'systemd_unit_state{job="backup-systemd-exporter", state="active", type="timer"} == 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Backup timer is inactive (service {{ $labels.name }})
          description: "Timer inactive\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Any monitored unit is in the "failed" state.
      - alert: BackupFailed
        expr: 'systemd_unit_state{job="backup-systemd-exporter", state="failed"} == 1'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: MariaDB Backup job failed (service {{ $labels.name }})
          description: "Backup failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # The "...Missed" rules divide the age of the last timer trigger by 60,
      # so both the value and the threshold on the right are in MINUTES.
      # NOTE(review): if the hourly timer skips an hour (e.g. a slot reserved
      # for the daily run), this rule fires during that gap — confirm against
      # the configured schedule.
      - alert: HourlyBackupMissed
        expr: '(time() - systemd_timer_last_trigger_seconds{job="backup-systemd-exporter", name="mariadb-backup-hourly.timer"}) / 60 > 60'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Hourly backup job is {{ $value }}m overdue
          description: "Hourly backup overdue\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 60*24 minutes = 1 day.
      - alert: DailyBackupMissed
        expr: '(time() - systemd_timer_last_trigger_seconds{job="backup-systemd-exporter", name="mariadb-backup-daily.timer"}) / 60 > 60*24'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Daily backup job is {{ $value }}m overdue
          description: "Daily backup overdue\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 60*24*7 minutes = 7 days.
      - alert: WeeklyBackupMissed
        expr: '(time() - systemd_timer_last_trigger_seconds{job="backup-systemd-exporter", name="mariadb-backup-weekly.timer"}) / 60 > 60*24*7'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Weekly backup job is {{ $value }}m overdue
          description: "Weekly backup overdue\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # 60*24*31 minutes = 31 days.
      - alert: MonthlyBackupMissed
        expr: '(time() - systemd_timer_last_trigger_seconds{job="backup-systemd-exporter", name="mariadb-backup-monthly.timer"}) / 60 > 60*24*31'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Monthly backup job is {{ $value }}m overdue
          description: "Monthly backup overdue\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Scrape job for the systemd exporter: one static target per host in the
# "manager" inventory group, addressed by the IPv4 address of that host's
# network_interface and the systemd_exporter_port (9558 when undefined).
# An "instance" label is only added for hosts that define a non-empty
# prometheus_instance_label.
scrape_configs:
- job_name: backup-systemd-exporter
scrape_interval: 60s
static_configs:
{% for host in groups['manager'] %}
- targets:
- "{{ hostvars[host]['ansible_facts'][hostvars[host]['network_interface']]['ipv4']['address'] }}:{{ systemd_exporter_port | default('9558') }}"
{% if hostvars[host].prometheus_instance_label | default(false, true) %}
labels:
instance: "{{ hostvars[host].prometheus_instance_label }}"
{% endif %}
{% endfor %}
16 changes: 16 additions & 0 deletions environments/kolla/playbook-deploy-mariadb-backup.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
---
# Applies the mariadb_backup role in two passes; mariadb_backup_component
# selects which task file the role includes for the targeted hosts.

- name: MariaDB backup — control node
  # default(..., true) also falls back to the first host of the "mariadb"
  # group when mariadb_backup_node is defined but empty, instead of producing
  # an empty host pattern.
  hosts: "{{ mariadb_backup_node | default('mariadb[0]', true) }}"
  become: true
  roles:
    - role: mariadb_backup
      vars:
        mariadb_backup_component: mariadb

- name: MariaDB backup — manager
  hosts: "manager"
  become: true
  roles:
    - role: mariadb_backup
      vars:
        mariadb_backup_component: manager
111 changes: 111 additions & 0 deletions environments/kolla/roles/mariadb_backup/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
---
# =============================================================================
# roles/mariadb_backup/defaults/main.yml
# Lowest precedence — override anything via group_vars / host_vars / -e.
# =============================================================================

# --- Manager -> mariadb remote execution ------------------------------------
# Inventory name of the mariadb node on which the backup is triggered
mariadb_backup_node: ""
# Location of the SSH key to access the mariadb node
mariadb_backup_ssh_key: "/opt/ansible/secrets/id_rsa.operator"
# Address of the mariadb node, read from its internal_address host variable
mariadb_backup_node_address: "{{ hostvars[mariadb_backup_node]['internal_address'] }}"
# SSH command used to reach the mariadb node as user dragon. The -i option is
# only emitted when mariadb_backup_ssh_key is non-empty.
# NOTE(review): StrictHostKeyChecking=no disables host key verification —
# confirm this is acceptable for the target environment.
mariadb_backup_remote_exec: >-
ssh {% if mariadb_backup_ssh_key %}-i {{ mariadb_backup_ssh_key }} {% endif %}-o BatchMode=yes
-o StrictHostKeyChecking=no dragon@{{ mariadb_backup_node_address }}

# --- Paths ------------------------------------------------------------------
# Path where to save the config for the backup script
mariadb_backup_config_dir: "/etc/kolla/mariadb-backup"
# Path where to save the backups on local disk
mariadb_backup_destination_dir: "/srv/mariadb-backup"
# Path to overwrite the location of backups on the manager
mariadb_backup_destination_dir_manager: "{{ mariadb_backup_destination_dir }}"
# Path to overwrite the location of backups on the mariadb node
mariadb_backup_destination_dir_mariadb: "{{ mariadb_backup_destination_dir }}"
# Location of the extract script on the mariadb node
mariadb_backup_extract_script: "/usr/local/bin/mariadb-extract.sh"
# Location of the backup script on the manager node
mariadb_backup_orchestrate_script: "/usr/local/bin/mariadb-backup.sh"
# Name of the docker volume that holds the mariadb backups created by kolla
mariadb_backup_volume: "mariadb_backup"
# Path to the docker volumes
mariadb_backup_docker_volume_dir: "/var/lib/docker/volumes"
# Full path to the backups in the docker volume. Derived from
# mariadb_backup_docker_volume_dir so that overriding the volume root is
# enough; the default value is unchanged.
mariadb_backup_backup_volume_dir: "{{ mariadb_backup_docker_volume_dir }}/{{ mariadb_backup_volume }}/_data"

# --- GPG (asymmetric; control node holds the PUBLIC key only) ---------------
# Email address identifying the GPG public key (encryption recipient)
mariadb_backup_gpg_recipient: ""
# Full GPG public key which is used for backup encryption
mariadb_backup_gpg_public_key: ""

# --- rclone (runs as a container; config + disk are bind mounts) ------------
# Docker registry of the rclone image
mariadb_backup_rclone_docker_registry: "ghcr.io"
# Name of the rclone image
mariadb_backup_rclone_docker_image: "rclone/rclone"
# Tag of the rclone image
mariadb_backup_rclone_docker_tag: "1.74.3"
# Full rclone image reference, assembled as <registry>/<image>:<tag>
mariadb_backup_rclone_image_full: "{{ mariadb_backup_rclone_docker_registry }}/{{ mariadb_backup_rclone_docker_image }}:{{ mariadb_backup_rclone_docker_tag }}"
# Path of the rclone config file, placed inside mariadb_backup_config_dir
mariadb_backup_rclone_config_file: "{{ mariadb_backup_config_dir }}/rclone.conf"

# --- rclone remotes ---------------------------------------------------------
# Name of the mariadb remote in rclone config
mariadb_backup_sftp_remote: "sftp-mariadb"
# Name of S3 remote in rclone config
mariadb_backup_s3_remote: "s3offsite"

# Off-site S3 remote
# Hostname of the S3 endpoint
mariadb_backup_s3_hostname: ""
# Port of the S3 endpoint
mariadb_backup_s3_port: 443
# Full HTTPS S3 endpoint, built from hostname and port above; override it
# directly to supply a complete URL instead
mariadb_backup_s3_endpoint: "https://{{ mariadb_backup_s3_hostname }}:{{ mariadb_backup_s3_port }}"
# S3 compatible rclone provider https://rclone.org/s3/
mariadb_backup_s3_rclone_provider: "Ceph"
# S3 bucket for backups
mariadb_backup_s3_bucket: "mariadb-backups"
# Flag whether to use the virtual hosted style to address buckets
mariadb_backup_s3_virtual_hosted_style_bucket: false
# Prefix to add before all S3 paths
mariadb_backup_s3_prefix: ""
# S3 region for bucket, not necessary for ceph
mariadb_backup_s3_region: ""
# S3 access key id
mariadb_backup_s3_access_key_id: ""
# S3 secret access key
mariadb_backup_s3_secret_access_key: ""

# --- Object Lock / WORM (off-site S3) ---------------------------------------
# S3 Object lock mode: GOVERNANCE | COMPLIANCE | "" (disable)
mariadb_backup_object_lock_mode: "GOVERNANCE"
# S3 lock retention time in days for hourly backups
mariadb_backup_retention_hourly: 3
# S3 lock retention time in days for daily backups
mariadb_backup_retention_daily: 7
# S3 lock retention time in days for weekly backups
mariadb_backup_retention_weekly: 28
# S3 lock retention time in days for monthly backups
mariadb_backup_retention_monthly: 365

# --- Schedules (systemd OnCalendar) -----------------------------------------
# Daily backup: every day at 00:00
mariadb_backup_schedule_daily: "*-*-* 00:00:00"
# Hourly backup: on the hour from 01:00 through 23:00
mariadb_backup_schedule_hourly: "*-*-* 01..23:00:00"
# Weekly backup: every Monday at 01:00
mariadb_backup_schedule_weekly: "Mon *-*-* 01:00:00"
# Monthly backup: on the first day of each month at 02:00
mariadb_backup_schedule_monthly: "*-*-01 02:00:00"

# One entry per backup type, pairing the type name with its timer schedule
mariadb_backup_jobs:
  - type: "daily"
    schedule: "{{ mariadb_backup_schedule_daily }}"
  - type: "hourly"
    schedule: "{{ mariadb_backup_schedule_hourly }}"
  - type: "weekly"
    schedule: "{{ mariadb_backup_schedule_weekly }}"
  - type: "monthly"
    schedule: "{{ mariadb_backup_schedule_monthly }}"
4 changes: 4 additions & 0 deletions environments/kolla/roles/mariadb_backup/handlers/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
# Handler that makes systemd re-read its unit files (daemon-reload); it runs
# when a task notifies "Reload systemd".
- name: Reload systemd
ansible.builtin.systemd:
daemon_reload: true
8 changes: 8 additions & 0 deletions environments/kolla/roles/mariadb_backup/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
# Role entry point: includes exactly one task file depending on
# mariadb_backup_component, which the calling play sets to 'mariadb' or
# 'manager'. Any other value includes nothing.
- name: Configure mariadb-node backup tasks
ansible.builtin.include_tasks: mariadb.yml
when: mariadb_backup_component == 'mariadb'

- name: Configure manager orchestration tasks
ansible.builtin.include_tasks: manager.yml
when: mariadb_backup_component == 'manager'
Loading