From be7baf65dae7378f6adb4cd93e6c4f80f2dd8b5c Mon Sep 17 00:00:00 2001 From: Sanya Varghese Date: Wed, 20 May 2026 12:30:57 -0400 Subject: [PATCH] feat: adding netapp-iscsi-helper script --- netapp-iscsi-audit/README.md | 461 +++++++++ netapp-iscsi-audit/pf9-storage-audit.py | 1149 +++++++++++++++++++++++ 2 files changed, 1610 insertions(+) create mode 100644 netapp-iscsi-audit/README.md create mode 100644 netapp-iscsi-audit/pf9-storage-audit.py diff --git a/netapp-iscsi-audit/README.md b/netapp-iscsi-audit/README.md new file mode 100644 index 0000000..26d4cf3 --- /dev/null +++ b/netapp-iscsi-audit/README.md @@ -0,0 +1,461 @@ +# pf9-storage-audit.py — iSCSI Live-Migration Remediation Tool + +## What This Script Does + +After a failed live migration, OpenStack can leave behind corrupted iSCSI +attachment state on the NetApp array. The VM may still be running, but its +disk is mapped to the wrong igroup — which can cause I/O errors, failed +rescans, or a complete loss of disk access on the next reboot. + +This script: +1. Queries Nova and Cinder to find every VM and its attached volumes +2. Queries NetApp ONTAP to check which igroup each LUN is mapped to +3. Compares the two and reports any mismatch +4. Optionally fixes the NetApp igroup mappings automatically + +--- + +## Failure Modes Detected + +### DUAL IGROUP (most common in production) + +`pre_live_migration` on the destination host creates a new LUN map entry +**before** the VM actually moves. If the migration fails, the source igroup +entry is never cleaned up — the same LUN ends up mapped to two igroups at once. + +``` +LUN /vol/vol1/cinder-volume-abc... + ✓ igroup-source → iqn....source-host ← correct, VM is here + ✗ igroup-dest → iqn....dest-host ← stale, must be removed +``` + +Nova's BDM `target_lun` may also point to the destination's LUN ID, requiring +a DB fix after the igroup is corrected. + +--- + +### SOURCE MISSING + +A failed `terminate_connection` call removes the source igroup entry entirely. 
+Only the destination's igroup entry remains — the source host loses access. + +``` +LUN /vol/vol1/cinder-volume-abc... + ✗ igroup-dest → iqn....dest-host ← wrong host, source entry is gone +``` + +--- + +## What `--remediate` Actually Does + +This is the most important thing to understand before running the script. + +**`--remediate` makes live changes to NetApp ONTAP immediately:** + +| Action | Automated? | +|--------|-----------| +| Remove stale LUN→igroup mapping on NetApp (DUAL IGROUP) | **Yes — executes immediately** | +| Re-add correct LUN→igroup mapping on NetApp (SOURCE MISSING) | **Yes — executes immediately** | +| iSCSI rescan commands (`iscsiadm`, `multipath -r`) | No — printed for you to run manually on the host | +| Nova BDM `target_lun` SQL fix | No — printed for you to review and run on the DB host | + +**Always run `--dry-run` first.** It shows you exactly what would happen +without touching anything. + +--- + +## Prerequisites + +| Requirement | How to verify | +|-------------|--------------| +| Python 3.8+ | `python3 --version` | +| OpenStack CLI | `pip3 install python-openstackclient` | +| RC file sourced | `echo $OS_AUTH_URL` — must return a URL | +| Network access to NetApp (port 443) | `curl -sk https:///api/cluster` | + +--- + +## Supplying IQNs (required for Ubuntu compute hosts) + +Ubuntu iSCSI IQNs encode a hardware ID (MAC suffix), not the hostname. +The script cannot infer them from the igroup name — you must supply them. + +Without IQNs the script can still **detect** dual mappings but cannot +classify which igroup entry is correct vs stale. + +**Option A — SSH (automatic, recommended):** + +```bash +python3 pf9-storage-audit.py \ + --netapp-host \ + --netapp-user admin \ + --svm \ + --ssh-user root \ + --ssh-key /path/to/key +``` + +The script SSHes to each compute host and reads +`/etc/iscsi/initiatorname.iscsi` automatically. 
+ +**Option B — Manual:** + +```bash +# On each compute host, run: +cat /etc/iscsi/initiatorname.iscsi +# → InitiatorName=iqn.2004-10.com.ubuntu:01:04cd37af9c9 + +# Then pass to the script (one --host-iqn per host): +python3 pf9-storage-audit.py \ + --netapp-host \ + --netapp-user admin \ + --svm \ + --host-iqn "compute-970-1=iqn.2004-10.com.ubuntu:01:04cd37af9c9" \ + --host-iqn "compute-970-2=iqn.2004-10.com.ubuntu:01:ef99ea7be46" +``` + +--- + +## Usage Examples + +### 1. Detect issues across all VMs (safe — no changes) + +```bash +python3 pf9-storage-audit.py \ + --netapp-host \ + --netapp-user admin \ + --svm \ + --host-iqn "compute-1=iqn.2004-10.com.ubuntu:01:04cd37af9c9" \ + --host-iqn "compute-2=iqn.2004-10.com.ubuntu:01:ef99ea7be46" +``` + +### 2. Detect issues for a single VM (safe — no changes) + +```bash +python3 pf9-storage-audit.py \ + --netapp-host \ + --netapp-user admin \ + --svm \ + --host-iqn "compute-1=iqn.2004-10.com.ubuntu:01:04cd37af9c9" \ + --host-iqn "compute-2=iqn.2004-10.com.ubuntu:01:ef99ea7be46" \ + --server +``` + +### 3. Preview what remediation would do (safe — no changes) + +```bash +python3 pf9-storage-audit.py \ + --netapp-host \ + --netapp-user admin \ + --svm \ + --host-iqn "compute-1=iqn.2004-10.com.ubuntu:01:04cd37af9c9" \ + --host-iqn "compute-2=iqn.2004-10.com.ubuntu:01:ef99ea7be46" \ + --server \ + --dry-run +``` + +### 4. Apply fixes — changes NetApp immediately + +```bash +python3 pf9-storage-audit.py \ + --netapp-host \ + --netapp-user admin \ + --svm \ + --host-iqn "compute-1=iqn.2004-10.com.ubuntu:01:04cd37af9c9" \ + --host-iqn "compute-2=iqn.2004-10.com.ubuntu:01:ef99ea7be46" \ + --server \ + --remediate +``` + +> **Recommended order:** run without flags first → then `--dry-run` → then `--remediate`. + +--- + +## Sample Output + +### Clean environment (no issues) + +``` +✓ No igroup mapping issues detected. 
+``` + +--- + +### DUAL IGROUP detected + +``` +================================================================================ +ISSUES FOUND: 1 +================================================================================ + + [DUAL IGROUP] + VM : prod-vm-07 (a1b2c3d4-...) status=ACTIVE + Volume : vol-uuid-... + Nova host : compute-970-1 ← VM is HERE + LUN maps : 2 igroup(s) ← DUAL MAPPING (most common production failure) + ✓ cinder-iqn-abc... → IQN hosts: compute-970-1 [correct — nova host] + ✗ cinder-iqn-def... → IQN hosts: compute-970-2 [stale — destination igroup] + +Summary: 1 VM(s), 1 volume(s) with issues [dual_mapping=1, source_missing=0] + +Run with --dry-run to preview remediation steps. +Run with --remediate to apply igroup fixes and print Cinder/iSCSI steps. +``` + +--- + +### SOURCE MISSING detected + +``` +================================================================================ +ISSUES FOUND: 1 +================================================================================ + + [SOURCE MISSING] + VM : prod-vm-07 (a1b2c3d4-...) status=ACTIVE + Volume : vol-uuid-... + Nova host : compute-970-1 ← VM is HERE + LUN maps : source igroup mapping is MISSING (removed by failed terminate_connection) + ✗ cinder-iqn-def... → IQN hosts: compute-970-2 [wrong host — destination only] + +Summary: 1 VM(s), 1 volume(s) with issues [dual_mapping=0, source_missing=1] +``` + +--- + +### Dry run — DUAL IGROUP case + +Command used: +```bash +python3 pf9-storage-audit.py ... --server --dry-run +``` + +Output: +``` +================================================================================ +[DRY-RUN] REMEDIATION +================================================================================ +Steps per finding: + 1. Fix NetApp LUN maps / igroup initiators (automated here) + 2. iSCSI rescan (commands to run on the correct host) + 3. Nova BDM target_lun fix (SQL to run — review before applying) + +── prod-vm-07 (a1b2c3d4-...) 
── + Nova host : compute-970-1 + + STEP 1: Fix NetApp LUN maps + + Volume : vol-uuid-... + LUN path: /vol/vol1/cinder-volume-vol-uuid-... + [DRY-RUN] NetApp: remove LUN map → igroup 'cinder-iqn-def...' (igroup-uuid-...) + Correct LUN ID (nova host's mapping): 3 ← use this for BDM fix in Step 3 + + STEP 2: iSCSI rescan — run on compute-970-1: + iscsiadm -m session -R + iscsiadm -m node --login + multipath -r + multipath -ll | grep -E 'failed|faulty|0 paths' + + STEP 3: Nova BDM target_lun fix + # target_lun / target_luns in Nova BDM may point to the destination LUN ID. + # First, check current values and path count: + mysql> SELECT instance_uuid, + JSON_EXTRACT(connection_info, '$.data.target_lun') AS lun, + JSON_EXTRACT(connection_info, '$.data.target_luns') AS luns + FROM block_device_mapping + WHERE volume_id = 'vol-uuid-...' + AND instance_uuid = 'a1b2c3d4-...' + AND deleted = 0; + + # Then update — adjust JSON_ARRAY() length to match the path count above: + # Volume vol-uuid-... → correct LUN ID = 3 + mysql> UPDATE block_device_mapping + SET connection_info = JSON_SET( + JSON_SET(connection_info, '$.data.target_lun', 3), + '$.data.target_luns', JSON_ARRAY(3, 3, 3, 3) + ) + WHERE volume_id = 'vol-uuid-...' + AND instance_uuid = 'a1b2c3d4-...' + AND deleted = 0; + +================================================================================ +After all steps, verify: + virsh list --all (on affected host — should not hang) + multipath -ll (no failed/faulty maps) + openstack volume list (volumes should be 'in-use') + openstack server list (VMs should be 'ACTIVE') +``` + +--- + +### Remediate — DUAL IGROUP case + +Command used: +```bash +python3 pf9-storage-audit.py ... --server --remediate +``` + +Same output as dry run above, except Step 1 lines read: +``` + NetApp: remove LUN map → igroup 'cinder-iqn-def...' (igroup-uuid-...) + Done. +``` +The `[DRY-RUN]` prefix is gone and `Done.` confirms the change was applied. 
+ +--- + +### Remediate — SOURCE MISSING case + +Command used: +```bash +python3 pf9-storage-audit.py ... --server --remediate +``` + +Output: +``` +── prod-vm-07 (a1b2c3d4-...) ── + Nova host : compute-970-1 + + STEP 1: Fix NetApp LUN maps + + Volume : vol-uuid-... + LUN path: /vol/vol1/cinder-volume-vol-uuid-... + NetApp: remove LUN map → igroup 'cinder-iqn-def...' (igroup-uuid-dest) + Done. + NetApp: map LUN '/vol/vol1/cinder-volume-vol-uuid-...' → igroup 'cinder-iqn-abc...' + Done. + + STEP 2: iSCSI rescan — run on compute-970-1: + iscsiadm -m session -R + iscsiadm -m node --login + multipath -r + multipath -ll | grep -E 'failed|faulty|0 paths' + + STEP 3: Nova BDM target_lun fix + # Volume vol-uuid-...: LUN ID unknown — check NetApp and set manually +``` + +> If the source igroup cannot be found (IQN unknown and SSH unreachable), +> Step 1 prints a warning and skips the `add_lun_map` call. +> Supply the IQN via `--host-iqn` or `--ssh-user` and re-run. + +--- + +### With host health checks (`--ssh-user root`) + +When `--ssh-user` is provided, the script also checks each compute host's +health over SSH and prints a summary: + +``` +================================================================================ +HOST HEALTH +================================================================================ + + compute-970-1 + libvirtd : active + multipath : 2 failed path(s) ← ATTENTION + D-state procs: none + virsh : 5 domain(s) visible + + compute-970-2 + libvirtd : active + multipath : OK + D-state procs: qemu-system-x86 ← ATTENTION + virsh : 4 domain(s) visible +``` + +--- + +### Ubuntu IQN warning (no `--host-iqn` supplied) + +If IQNs are not provided and SSH is not available, the script warns per host: + +``` + [WARN] compute-970-1: igroup check skipped — pass --host-iqn compute-970-1= + (get via: cat /etc/iscsi/initiatorname.iscsi) +✓ No igroup mapping issues detected. 
+``` + +The script can still report the LUN map state but cannot determine which +igroup entry is correct vs stale without knowing the host's IQN. + +--- + +## All Options + +| Option | Default | Description | +|--------|---------|-------------| +| `--netapp-host` | *(required)* | NetApp management IP or hostname | +| `--netapp-user` | `admin` | NetApp username | +| `--netapp-password` | *(prompted)* | NetApp password — prompted if omitted | +| `--svm` | *(all SVMs)* | Limit to a specific SVM (e.g. `cinder_svm`) — recommended | +| `--server` | *(all VMs)* | Check a single VM by UUID or name | +| `--ssh-user` | *(none)* | SSH user for compute hosts — enables IQN auto-fetch and health checks | +| `--ssh-key` | *(default key)* | Path to SSH private key | +| `--host-iqn` | *(none)* | Known IQN for a compute host: `--host-iqn hostname=iqn.xxx` — repeat per host | +| `--dry-run` | *(off)* | Show all remediation steps without making any changes | +| `--remediate` | *(off)* | Execute NetApp igroup fixes; print iSCSI rescan + BDM SQL steps | +| `--clean-mpath` | *(off)* | Scan compute hosts for mpath devices with all paths failed/ghost and flush them. Requires `--ssh-user`. Use with `--dry-run` to preview. 
| + +--- + +## What Gets Fixed Automatically + +| Step | What | How | +|------|------|-----| +| 1a | Remove stale destination LUN map (DUAL IGROUP) | **Automated** by `--remediate` | +| 1b | Re-add source LUN map (SOURCE MISSING, if IQN known) | **Automated** by `--remediate` | +| 2 | iSCSI rescan on correct compute host | **Printed** — run manually on the host | +| 3 | Fix Nova BDM `target_lun` + `target_luns` | **Printed** — SQL to review and run manually | +| 4 | Flush orphaned mpath devices (all paths failed) | **Automated** by `--clean-mpath --remediate` | + +--- + +## Common Errors + +**`Missing value auth-url required for auth plugin password`** +```bash +source openstack-rc.rc +``` + +**`'openstack' CLI not found`** +```bash +pip3 install python-openstackclient +``` + +**`Cannot reach NetApp at <netapp-host>: Network is unreachable`** +You are not on the correct network. Check VPN or run from inside the customer environment. + +**`Found 0 VM(s)`** +RC file is sourced for the wrong project, or there are no VMs in the project. +Verify with `openstack server list --all`. + +**`[WARN] <hostname>: igroup check skipped`** +No IQN is known for this host. The script detected a LUN map but cannot classify +it as correct or stale. Supply the IQN: +```bash +# On the compute host: +cat /etc/iscsi/initiatorname.iscsi + +# Then re-run with: +--host-iqn "hostname=iqn.2004-10.com.ubuntu:01:..." +``` + +**`[WARN] Cannot find igroup for <nova-host>`** +SOURCE MISSING case — the source igroup must be re-added but the host IQN is +unknown, so the script cannot locate the correct igroup. Supply the IQN as +above and re-run `--remediate`. + +**`[MANUAL ACTION REQUIRED] NetApp rejected automatic removal`** +Older ONTAP versions require LUN maps to be removed before igroup initiators +can be deleted. Follow the printed NetApp System Manager steps, then re-run +`--remediate`. 
+ +--- + +## Exit Codes + +| Code | Meaning | +|------|---------| +| `0` | Clean — no issues found | +| `1` | Issues detected, or fatal error | +| `130` | Interrupted (Ctrl+C) | diff --git a/netapp-iscsi-audit/pf9-storage-audit.py b/netapp-iscsi-audit/pf9-storage-audit.py new file mode 100644 index 0000000..2be35af --- /dev/null +++ b/netapp-iscsi-audit/pf9-storage-audit.py @@ -0,0 +1,1149 @@ +#!/usr/bin/env python3 +""" +pf9-storage-audit.py — Cluster-wide iSCSI live-migration BDM/igroup remediation. + +Cross-references Nova, Cinder, and NetApp ONTAP to find and fix stale +attachment state left behind by failed live migrations. + +Detects two failure modes: + DUAL IGROUP — LUN is mapped to both source and destination igroup simultaneously. + Root cause: pre_live_migration ran on destination but migration + failed and BDM rollback was skipped (libvirt monitor timeout). + This is the most common production failure. + SOURCE MISSING — LUN is mapped only to the destination igroup; source igroup + mapping was removed (e.g. failed terminate_connection call). + +Usage: + # Detect only + python3 pf9-storage-audit.py --netapp-host --netapp-user admin + + # With SSH for IQN resolution and host health + python3 pf9-storage-audit.py --netapp-host --netapp-user admin \\ + --ssh-user root --ssh-key /tmp/key + + # Supply known IQNs manually (alternative to SSH) + python3 pf9-storage-audit.py ... --host-iqn 970-1=iqn.2004-10.com.ubuntu:01:04cd37af9c9 + + # Preview remediation + python3 pf9-storage-audit.py ... --dry-run + + # Apply igroup fixes + python3 pf9-storage-audit.py ... 
def os_cmd(*args, allow_fail=False):
    """Run ``openstack <args> -f json`` and return the decoded JSON output.

    The process exits with status 1 when the ``openstack`` executable is
    missing (regardless of ``allow_fail``). On a 120 s timeout or a non-zero
    exit status the process also exits with status 1, unless ``allow_fail`` is
    true, in which case ``None`` is returned instead. Output that is not valid
    JSON yields an empty list.
    """
    shown = " ".join(args)
    argv = ["openstack"]
    argv.extend(args)
    argv.extend(["-f", "json"])

    try:
        proc = subprocess.run(argv, capture_output=True, text=True, timeout=120)
    except FileNotFoundError:
        print("[ERROR] 'openstack' CLI not found — install python-openstackclient and source your RC file.",
              file=sys.stderr)
        sys.exit(1)
    except subprocess.TimeoutExpired:
        print(f"[ERROR] openstack {shown} timed out after 120s.", file=sys.stderr)
        if not allow_fail:
            sys.exit(1)
        return None

    if proc.returncode:
        if not allow_fail:
            print(f"[ERROR] openstack {shown}\n{proc.stderr.strip()}", file=sys.stderr)
            sys.exit(1)
        return None

    try:
        decoded = json.loads(proc.stdout)
    except json.JSONDecodeError:
        decoded = []
    return decoded
def get_hypervisor_name_map():
    """Build a ``{hypervisor ID: hypervisor hostname}`` lookup table.

    Used to resolve Cinder host UUIDs to hostnames. Per the original author's
    note: in PF9, Cinder stores the nova-compute service UUID in
    ``attachment.host_name``, and that UUID is regenerated on every service
    restart, so older attachments may not resolve. The hypervisor list is a
    best-effort source; callers must treat an unresolved UUID as 'unknown'
    rather than 'stale'.

    A failed ``hypervisor list`` call yields an empty mapping.
    """
    rows = os_cmd("hypervisor", "list", "--long", allow_fail=True) or []
    mapping = {}
    for row in rows:
        # The CLI column names vary, so fall back to the lowercase API keys.
        hyp_id = str(row.get("ID", row.get("id", "")))
        hyp_name = row.get("Hypervisor Hostname", row.get("hypervisor_hostname", ""))
        if not (hyp_id and hyp_name):
            continue
        mapping[hyp_id] = hyp_name
    return mapping
def _netapp_get_all(host, user, password, path, params=None):
    """Fetch every record of a paginated ONTAP REST collection.

    Issues GET requests through ``_netapp_request`` and follows the
    ``_links.next.href`` link of each response until no further page is
    advertised.

    Args:
        host, user, password: NetApp endpoint and credentials, passed through.
        path: API path relative to ``/api/``.
        params: optional query parameters for the first request. The caller's
            dict is copied, and ``max_records`` defaults to ``"1000"``.

    Returns:
        A list with the ``records`` entries of all pages, in order.
    """
    query = dict(params or {})
    query.setdefault("max_records", "1000")
    records = []
    while True:
        page = _netapp_request(host, user, password, path, query)
        records.extend(page.get("records", []))
        next_href = page.get("_links", {}).get("next", {}).get("href")
        if not next_href:
            break
        # The next link already carries its own query string, so request it
        # verbatim with no extra parameters. Strip the leading "/api/" because
        # _netapp_request prepends it again. str.removeprefix() is avoided on
        # purpose: it needs Python 3.9 and the tool is documented for 3.8+.
        path = next_href.lstrip("/")
        if path.startswith("api/"):
            path = path[len("api/"):]
        query = {}
    return records
def get_lun_map(host, user, password, svm=None):
    """Return {lun_path: [list of mapping dicts]} — preserves dual mappings.

    Each mapping dict carries the keys ``igroup``, ``igroup_uuid``,
    ``lun_uuid``, ``lun_id`` and ``svm``. When ``svm`` is given, the query is
    restricted to that SVM name.
    """
    query = {"fields": "lun.name,lun.uuid,igroup.name,igroup.uuid,logical_unit_number,svm.name"}
    if svm:
        query["svm.name"] = svm

    by_path = {}
    for entry in _netapp_get_all(host, user, password, "protocols/san/lun-maps", query):
        lun = entry.get("lun", {})
        igroup = entry.get("igroup", {})
        mapping = {
            "igroup": igroup.get("name", ""),
            "igroup_uuid": igroup.get("uuid", ""),
            "lun_uuid": lun.get("uuid", ""),
            "lun_id": entry.get("logical_unit_number"),
            "svm": entry.get("svm", {}).get("name", ""),
        }
        # A LUN mapped to several igroups gets one list entry per mapping.
        by_path.setdefault(lun.get("name", ""), []).append(mapping)
    return by_path
def remove_lun_map(host, user, password, lun_uuid, igroup_uuid, igroup_name, dry_run=False):
    """Remove one LUN→igroup mapping (DELETE /api/protocols/san/lun-maps/{lun_uuid}/{igroup_uuid}).

    Prints the action first. In dry-run mode nothing is sent and ``True`` is
    returned. Otherwise returns ``True`` when the DELETE succeeds or ONTAP
    answers 404 (mapping already gone), and ``False`` on any other HTTP error
    or on a network error.
    """
    prefix = "[DRY-RUN] " if dry_run else ""
    print(f" {prefix}NetApp: remove LUN map → igroup '{igroup_name}' ({igroup_uuid})")
    if dry_run:
        return True

    endpoint = f"https://{host}/api/protocols/san/lun-maps/{lun_uuid}/{igroup_uuid}"
    try:
        _do_delete(host, user, password, endpoint)
    except urllib.error.HTTPError as err:
        detail = err.read().decode()
        if err.code != 404:
            print(f" [ERROR] {err.code}: {detail}", file=sys.stderr)
            return False
        print(" Done (mapping was already absent).")
        return True
    except urllib.error.URLError as err:
        print(f" [ERROR] Network error during LUN map DELETE: {err.reason}", file=sys.stderr)
        return False

    print(" Done.")
    return True
def create_igroup(host, user, password, ig_name, svm_name, iqn, dry_run=False):
    """Create a new igroup with one initiator (POST /api/protocols/san/igroups).

    The igroup is created with protocol ``iscsi`` and OS type ``linux``.

    Returns:
        ``ig_name`` in dry-run mode, on success, or when ONTAP answers 409
        (reported as "igroup already exists"); ``None`` on any other HTTP
        error or on a network error.
    """
    label = "[DRY-RUN] " if dry_run else ""
    print(f" {label}NetApp: create igroup '{ig_name}' (svm={svm_name}, iqn={iqn})")
    if dry_run:
        return ig_name

    body = {
        "name": ig_name,
        "protocol": "iscsi",
        "os_type": "linux",
        "svm": {"name": svm_name},
        "initiators": [{"name": iqn}],
    }
    try:
        # The response payload is not needed; only success/failure matters.
        _netapp_post(host, user, password, "protocols/san/igroups", body)
    except urllib.error.HTTPError as e:
        # HTTPError must be handled before URLError (it is a subclass).
        body_text = e.read().decode()
        if e.code == 409:
            print(" Done (igroup already exists).")
            return ig_name
        print(f" [ERROR] {e.code}: {body_text}", file=sys.stderr)
        return None
    except urllib.error.URLError as e:
        print(f" [ERROR] Network error during igroup POST: {e.reason}", file=sys.stderr)
        return None

    print(" Done.")
    return ig_name
def _parse_orphaned_mpath(output):
    """Parse `multipath -ll` output; return WWIDs where every path is failed/ghost.

    A device block starts at a line beginning with a 33-character lowercase
    hex WWID. Path lines have the form ``H:C:T:L sdX MAJ:MIN <state> <state>``.
    A path counts as bad when either of the two state words following the
    ``MAJ:MIN`` column is ``failed`` or ``ghost``. Inspecting only the first
    word would miss paths reported as ``active ghost``, so both are checked.

    Args:
        output: text printed by ``multipath -ll``; ``None`` or an empty
            string is treated as "no devices".

    Returns:
        WWIDs, in order of appearance, of devices that have at least one path
        and whose paths are all bad.
    """
    header_re = re.compile(r'^([0-9a-f]{33})\s')
    path_re = re.compile(r'\d+:\d+:\d+:\d+\s+\S+\s+\d+:\d+\s+(\w+)(?:\s+(\w+))?')
    bad_states = {'failed', 'ghost'}

    orphaned = []
    current = None
    total_paths = 0
    bad_paths = 0

    for line in (output or "").splitlines():
        header = header_re.match(line)
        if header:
            # A new device block starts: close out the previous one.
            if current is not None and total_paths > 0 and total_paths == bad_paths:
                orphaned.append(current)
            current = header.group(1)
            total_paths = 0
            bad_paths = 0
            continue
        if current is None:
            # Ignore anything that precedes the first recognised device.
            continue
        path = path_re.search(line)
        if path:
            total_paths += 1
            if bad_states.intersection(path.groups()):
                bad_paths += 1

    # Close out the final device block.
    if current is not None and total_paths > 0 and total_paths == bad_paths:
        orphaned.append(current)

    return orphaned
def flush_mpath_via_ssh(hostname, ssh_user, ssh_key, host_ip, wwid, dry_run=False):
    """Flush one multipath map on a compute host by running ``multipath -f`` over SSH.

    The command is printed first. In dry-run mode nothing is executed and
    ``True`` is returned. Otherwise the SSH target is ``host_ip`` when given,
    else the short form of ``hostname``. Returns ``True`` when the remote
    command exits 0. Otherwise a warning is written to stderr (with a specific
    hint when the output contains "map in use") and ``False`` is returned.
    """
    flush_cmd = f"multipath -f {wwid}"
    print(f" {'[DRY-RUN] ' if dry_run else ''}{flush_cmd}")
    if dry_run:
        return True

    ssh_target = host_ip if host_ip else hostname.split(".")[0]
    output, status = _ssh_cmd(ssh_user, ssh_key, ssh_target, f"{flush_cmd} 2>&1")
    if status == 0:
        print(" Done.")
        return True

    detail = (output or "").strip()
    if "map in use" not in detail:
        print(f" [WARN] rc={status}: {detail}", file=sys.stderr)
    else:
        print(f" [WARN] Cannot flush {wwid}: device is held open by a running process "
              f"(stop or live-migrate the VM first, then re-run --remediate).", file=sys.stderr)
    return False
+ """ + hyp_ip_map = hyp_ip_map or {} + prefix = "[DRY-RUN] " if dry_run else "" + + print(f"\n{'='*80}") + print(f"{prefix}ORPHANED MPATH CLEANUP") + print(f"{'='*80}") + print("Scanning compute hosts for mpath devices with all paths failed...\n") + + any_found = False + for host in sorted(nova_hosts): + short = host.split(".")[0] + wwids = get_orphaned_mpath_on_host(host, ssh_user, ssh_key, hyp_ip_map.get(host)) + if wwids is None: + print(f" {short}: SSH failed — skipping") + continue + if not wwids: + print(f" {short}: no orphaned mpath devices") + continue + any_found = True + print(f"\n {short}: {len(wwids)} orphaned mpath device(s)") + for wwid in wwids: + if remediate or dry_run: + flush_mpath_via_ssh(host, ssh_user, ssh_key, hyp_ip_map.get(host), + wwid, dry_run=dry_run) + else: + print(f" {wwid}") + + if not any_found: + print("\n✓ No orphaned mpath devices found across all hosts.") + elif not (remediate or dry_run): + print("\nRun with --dry-run to preview or --remediate to flush.") + + +# ── Host health checks ──────────────────────────────────────────────────── + +def check_host_health_via_ssh(hostname, ssh_user, ssh_key=None, host_ip=None): + cmd = ( + "printf 'MPFAIL:%s\\n' \"$(multipath -ll 2>/dev/null | grep -cE 'failed|faulty' || echo 0)\"; " + "printf 'DSTATE:%s\\n' \"$(ps -eo stat,comm 2>/dev/null | awk '$1~/^D/{print $2}' | sort -u | tr '\\n' ' ')\"; " + "printf 'LIBVIRTD:%s\\n' \"$(systemctl is-active libvirtd 2>/dev/null || echo unknown)\"; " + "printf 'VIRSH:%s\\n' \"$(timeout 5 virsh list --all --name 2>/dev/null | grep -vc '^$' || echo timeout)\"" + ) + target = host_ip or hostname.split(".")[0] + out, rc = _ssh_cmd(ssh_user, ssh_key, target, cmd) + try: + if rc != 0 or not out: + return None + health = {} + for line in out.splitlines(): + key, _, val = line.partition(":") + health[key.strip()] = val.strip() + return { + "libvirtd": health.get("LIBVIRTD", "unknown"), + "mp_failed": int(health.get("MPFAIL", "0") or "0"), + "dstate_procs": 
[p for p in health.get("DSTATE", "").split() if p], + "virsh_domains": health.get("VIRSH", "unknown"), + } + except Exception: + return None + + +def run_host_health_checks(nova_hosts, ssh_user, ssh_key=None, hyp_ip_map=None): + hyp_ip_map = hyp_ip_map or {} + results = {} + with ThreadPoolExecutor(max_workers=min(len(nova_hosts), 8)) as pool: + futures = { + pool.submit(check_host_health_via_ssh, h, ssh_user, ssh_key, hyp_ip_map.get(h)): h + for h in nova_hosts + } + for future in as_completed(futures): + results[futures[future]] = future.result() + return results + + +def print_health_report(health_results): + if not health_results: + return + print(f"\n{'='*80}") + print("HOST HEALTH") + print(f"{'='*80}") + for host, h in sorted(health_results.items()): + short = host.split(".")[0] + if h is None: + print(f"\n {short}: SSH failed — health check skipped") + continue + mp_str = f"{h['mp_failed']} failed path(s)" if h["mp_failed"] else "OK" + dstate_str = ", ".join(h["dstate_procs"]) if h["dstate_procs"] else "none" + print(f"\n {short}") + print(f" libvirtd : {h['libvirtd']}{' ← ATTENTION' if h['libvirtd'] != 'active' else ''}") + print(f" multipath : {mp_str}{' ← ATTENTION' if h['mp_failed'] else ''}") + print(f" D-state procs: {dstate_str}{' ← ATTENTION' if h['dstate_procs'] else ''}") + print(f" virsh : {h['virsh_domains']} domain(s) visible") + + +# ── Cross-reference helpers ──────────────────────────────────────────────── + +def iqn_to_hostname(iqn): + parts = iqn.rsplit(":", 1) + return parts[-1].lower() if len(parts) > 1 else iqn.lower() + + +def is_ubuntu_iqn(iqn): + return "com.ubuntu" in iqn.lower() + + +def hostname_matches_iqn(hostname, iqn): + # Only valid for RHEL/Rocky — Ubuntu IQNs encode a hardware ID, not a hostname. 
+ if is_ubuntu_iqn(iqn): + return False + return hostname.split(".")[0].lower() in iqn_to_hostname(iqn) + + +def find_lun_for_volume(volume_id, lun_map): + for lun_path in lun_map: + if volume_id in lun_path: + return lun_path + return None + + +def find_igroup_for_host(nova_host, host_iqn_map, igroups): + """Return (igroup_name, igroup_data) whose initiators match nova_host, or (None, None). + + Warns if multiple igroups match the same host IQN — this indicates duplicate igroups + (e.g. left behind by VMHA or a failed migration that created a second igroup). + """ + nova_s = nova_host.split(".")[0].lower() + known_iqns = host_iqn_map.get(nova_s, set()) + if not known_iqns: + return None, None + matches = [ + (name, data) for name, data in igroups.items() + if known_iqns & set(data.get("initiators", [])) + ] + if len(matches) > 1: + names = ", ".join(n for n, _ in matches) + print(f" [WARN] {nova_s}: multiple igroups share the same IQN — {names}", + file=sys.stderr) + print(f" Duplicate igroups indicate a prior VMHA or failed-migration side effect.", + file=sys.stderr) + print(f" Using '{matches[0][0]}' (first match). Verify this is correct.", + file=sys.stderr) + if matches: + return matches[0] + return None, None + + +# ── Detection ────────────────────────────────────────────────────────────── + +def _fetch_server_items(server, hyp_map, igroups, lun_map): + """Fetch volume/attachment data for one server. 
Runs in a worker thread.""" + nova_host = hyp_map.get(server["host"], server["host"]) + if not nova_host: + return [] + items = [] + for volume_id in get_server_volumes(server["id"]): + vol = get_volume_info(volume_id) + if not vol: + continue + attachments = vol.get("attachments", []) + if isinstance(attachments, str): + try: + attachments = json.loads(attachments) + except json.JSONDecodeError: + attachments = [] + cinder_host = attachment_id = "" + for att in attachments: + if att.get("server_id") == server["id"]: + raw = att.get("host_name", "") + cinder_host = hyp_map.get(raw, raw) + attachment_id = att.get("attachment_id", att.get("id", "")) + break + lun_path = find_lun_for_volume(volume_id, lun_map) + mappings = lun_map.get(lun_path, []) if lun_path else [] + items.append({ + "server": server, + "nova_host": nova_host, + "volume_id": volume_id, + "cinder_host": cinder_host, + "attachment_id": attachment_id, + "lun_path": lun_path, + "lun_maps": mappings, # list of {igroup, igroup_uuid, lun_uuid, lun_id, svm} + }) + return items + + +def _classify_lun_maps(lun_maps, igroups, nova_host, host_iqn_map): + """Split LUN map entries into nova_maps (correct) / stale_maps / unknown_maps.""" + nova_s = nova_host.split(".")[0].lower() + known_nova_iqns = host_iqn_map.get(nova_s, set()) + nova_maps = [] + stale_maps = [] + unknown_maps = [] + + for m in lun_maps: + ig_data = igroups.get(m["igroup"], {}) + ig_iqns = set(ig_data.get("initiators", [])) + enriched = {**m, "igroup_data": ig_data, "igroup_iqns": list(ig_iqns)} + + if known_nova_iqns: + if known_nova_iqns & ig_iqns: + nova_maps.append(enriched) + else: + stale_maps.append(enriched) + else: + matchable = [q for q in ig_iqns if not is_ubuntu_iqn(q)] + if matchable: + if any(hostname_matches_iqn(nova_host, q) for q in matchable): + nova_maps.append(enriched) + else: + stale_maps.append(enriched) + else: + unknown_maps.append(enriched) + + return nova_maps, stale_maps, unknown_maps + + +def detect(servers, 
netapp_host, netapp_user, netapp_password, svm, + ssh_user=None, ssh_key=None, hyp_ip_map=None, manual_iqns=None): + print("\nQuerying NetApp ONTAP...") + with ThreadPoolExecutor(max_workers=2) as pool: + ig_future = pool.submit(get_igroups, netapp_host, netapp_user, netapp_password, svm) + lun_future = pool.submit(get_lun_map, netapp_host, netapp_user, netapp_password, svm) + igroups = ig_future.result() + lun_map = lun_future.result() + hyp_map = get_hypervisor_name_map() + print(f"Checking {len(servers)} VM(s)...\n") + + # Pass 1: fetch per-VM data in parallel; print progress so screen isn't blank + collected = [] + with ThreadPoolExecutor(max_workers=8) as pool: + futures = {pool.submit(_fetch_server_items, s, hyp_map, igroups, lun_map): s for s in servers} + done = 0 + for future in as_completed(futures): + done += 1 + server = futures[future] + print(f" [{done}/{len(servers)}] {server['name']}", flush=True) + try: + collected.extend(future.result()) + except Exception as exc: + print(f" [WARN] {server['name']}: {exc}", file=sys.stderr) + + # Pass 2a: infer ground-truth IQNs from clean (non-stale) attachments. 
+ host_iqn_map = {} + for item in collected: + nova_s = item["nova_host"].split(".")[0].lower() + cinder_s = item["cinder_host"].split(".")[0].lower() if item["cinder_host"] else "" + if cinder_s and nova_s == cinder_s: + for m in item["lun_maps"]: + ig_data = igroups.get(m["igroup"], {}) + for iqn in ig_data.get("initiators", []): + host_iqn_map.setdefault(nova_s, set()).add(iqn) + + # Pass 2b: seed from --host-iqn entries (substring match on short hostname) + if manual_iqns: + nova_shorts = {item["nova_host"].split(".")[0].lower() for item in collected} + for key, iqn in manual_iqns.items(): + matched = [s for s in nova_shorts if key.lower() in s] + if matched: + for s in matched: + host_iqn_map[s] = {iqn} + print(f" --host-iqn: {s} → {iqn}", flush=True) + else: + print(f" [WARN] --host-iqn: no host matched '{key}' (known: {', '.join(nova_shorts)})", + file=sys.stderr) + + # Pass 2c: SSH to fetch IQNs for hosts not yet covered + if ssh_user: + ssh_hosts = {h for h in {item["nova_host"] for item in collected if item["nova_host"]} + if h.split(".")[0].lower() not in host_iqn_map} + if ssh_hosts: + hyp_ip_map = hyp_ip_map or {} + print(f"\nFetching IQNs via SSH ({ssh_user}@host)...") + with ThreadPoolExecutor(max_workers=min(len(ssh_hosts), 8)) as pool: + ssh_futures = { + pool.submit(get_host_iqn_via_ssh, h, ssh_user, ssh_key, hyp_ip_map.get(h)): h + for h in ssh_hosts + } + for future in as_completed(ssh_futures): + host = ssh_futures[future] + short = host.split(".")[0].lower() + iqn = future.result() + if iqn: + host_iqn_map[short] = {iqn} + print(f" {short}: {iqn}", flush=True) + else: + print(f" [WARN] {short}: SSH failed — igroup check will be skipped for Ubuntu hosts", + flush=True) + + # Pass 3: classify LUN maps per item and build findings + findings = [] + warned_hosts = set() + for item in collected: + server = item["server"] + nova_host = item["nova_host"] + cinder_host = item["cinder_host"] + nova_s = nova_host.split(".")[0].lower() + lun_maps = 
item["lun_maps"] + + nova_maps, stale_maps, unknown_maps = _classify_lun_maps( + lun_maps, igroups, nova_host, host_iqn_map + ) + + dual_mapping = bool(nova_maps and stale_maps) + source_missing = bool(not nova_maps and stale_maps) + igroup_stale = bool(stale_maps) + + # Warn about Ubuntu-IQN hosts we can't classify + if lun_maps and not nova_maps and not stale_maps and unknown_maps: + warned_hosts.add(nova_host) + + if not (dual_mapping or source_missing): + continue + + stale_iqns = [] + for m in stale_maps: + stale_iqns.extend(m["igroup_iqns"]) + + # Primary map for legacy display fields (prefer stale for backward compat) + primary = (stale_maps or nova_maps or unknown_maps or [{}])[0] + prim_ig_data = primary.get("igroup_data", {}) + + findings.append({ + "vm_id": server["id"], + "vm_name": server["name"], + "vm_status": server["status"], + "nova_host": nova_host, + "cinder_host": cinder_host, + "volume_id": item["volume_id"], + "attachment_id": item["attachment_id"], + "lun_path": item["lun_path"], + "lun_maps": lun_maps, + "nova_maps": nova_maps, + "stale_maps": stale_maps, + "unknown_maps": unknown_maps, + # Legacy display fields + "igroup": primary.get("igroup", ""), + "igroup_uuid": prim_ig_data.get("uuid", primary.get("igroup_uuid", "")), + "igroup_svm": prim_ig_data.get("svm", primary.get("svm", "")), + "igroup_iqns": list(prim_ig_data.get("initiators", [])), + "stale_iqns": stale_iqns, + "dual_mapping": dual_mapping, + "source_missing": source_missing, + "igroup_stale": igroup_stale, + }) + + for host in sorted(warned_hosts): + short = host.split(".")[0] + print(f" [WARN] {short}: igroup check skipped — " + f"pass --host-iqn {short}= (get via: cat /etc/iscsi/initiatorname.iscsi)", + flush=True) + + all_nova_hosts = {item["nova_host"] for item in collected if item["nova_host"]} + return findings, igroups, host_iqn_map, all_nova_hosts + + +# ── Reporting ────────────────────────────────────────────────────────────── + +def print_report(findings): + if not 
findings: + print("✓ No igroup mapping issues detected.") + return + + print(f"\n{'='*80}") + print(f"ISSUES FOUND: {len(findings)}") + print(f"{'='*80}") + + for f in findings: + tags = [] + if f.get("dual_mapping"): tags.append("DUAL IGROUP") + elif f.get("source_missing"): tags.append("SOURCE MISSING") + elif f.get("igroup_stale"): tags.append("STALE IGROUP") + + print(f"\n [{' + '.join(tags) or 'UNKNOWN'}]") + print(f" VM : {f['vm_name']} ({f['vm_id']}) status={f['vm_status']}") + print(f" Volume : {f['volume_id']}") + print(f" Nova host : {f['nova_host']}") + + if f.get("dual_mapping"): + print(f" LUN maps : {len(f['lun_maps'])} igroup(s) ← DUAL MAPPING (most common production failure)") + for m in f.get("nova_maps", []): + ig_iqns = m.get("igroup_iqns", []) + hosts = ", ".join(iqn_to_hostname(q) for q in ig_iqns) or "(none)" + print(f" ✓ {m['igroup']} → IQN hosts: {hosts} [correct — nova host]") + for m in f.get("stale_maps", []): + ig_iqns = m.get("igroup_iqns", []) + hosts = ", ".join(iqn_to_hostname(q) for q in ig_iqns) or "(none)" + print(f" ✗ {m['igroup']} → IQN hosts: {hosts} [stale — destination igroup]") + elif f.get("source_missing"): + print(f" LUN maps : source igroup mapping is MISSING (removed by failed terminate_connection)") + for m in f.get("stale_maps", []): + ig_iqns = m.get("igroup_iqns", []) + hosts = ", ".join(iqn_to_hostname(q) for q in ig_iqns) or "(none)" + print(f" ✗ {m['igroup']} → IQN hosts: {hosts} [wrong host — destination only]") + elif f["igroup"]: + hosts = ", ".join(iqn_to_hostname(q) for q in f["igroup_iqns"]) or "(none)" + print(f" igroup : {f['igroup']} → IQN hosts: {hosts}") + for iqn in f["stale_iqns"]: + print(f" Stale IQN : {iqn}") + + +# ── Remediation ──────────────────────────────────────────────────────────── + +def remediate(findings, igroups, host_iqn_map, netapp_host, netapp_user, netapp_password, dry_run): + if not findings: + return + + prefix = "[DRY-RUN] " if dry_run else "" + print(f"\n{'='*80}") + 
print(f"{prefix}REMEDIATION") + print(f"{'='*80}") + print("Steps per finding:") + print(" 1. Fix NetApp LUN maps / igroup initiators (automated here)") + print(" 2. iSCSI rescan (commands to run on the correct host)") + print(" 3. Nova BDM target_lun fix (SQL to run — review before applying)") + print() + + by_vm = {} + for f in findings: + by_vm.setdefault(f["vm_id"], []).append(f) + + for vm_id, vm_findings in by_vm.items(): + first = vm_findings[0] + nova_host = first["nova_host"] + nova_s = nova_host.split(".")[0].lower() + known_iqns = host_iqn_map.get(nova_s, set()) + correct_iqn = next(iter(known_iqns), None) + + print(f"\n── {first['vm_name']} ({vm_id}) ──") + print(f" Nova host : {nova_host}") + + # ── Step 1: Fix NetApp ──────────────────────────────────────────── + print(f"\n STEP 1: Fix NetApp LUN maps") + step1_ok = True + + for f in vm_findings: + print(f"\n Volume : {f['volume_id']}") + print(f" LUN path: {f['lun_path'] or '(not found on NetApp)'}") + if not f["lun_path"]: + print(f" (skip — LUN not found)") + continue + + lun_uuid = (f["nova_maps"] or f["stale_maps"] or [{}])[0].get("lun_uuid", "") + + if f.get("dual_mapping"): + # Remove the stale (destination) LUN map entries + for m in f["stale_maps"]: + ok = remove_lun_map( + netapp_host, netapp_user, netapp_password, + m.get("lun_uuid", lun_uuid), m["igroup_uuid"], m["igroup"], + dry_run=dry_run, + ) + if not ok: + step1_ok = False + + # Print the correct LUN ID from nova_maps for BDM fix reference + if f["nova_maps"]: + nm = f["nova_maps"][0] + print(f" Correct LUN ID (nova host's mapping): {nm.get('lun_id')} " + f"← use this for BDM fix in Step 3") + + elif f.get("source_missing"): + # Need to re-add the source (nova_host) LUN map. + # If the igroup was deleted entirely (e.g. by VMHA), create it first. 
+ nova_ig_name, nova_ig_data = find_igroup_for_host(nova_host, host_iqn_map, igroups) + svm_name = (f["stale_maps"] or [{}])[0].get("svm", "") + + if not nova_ig_name and correct_iqn: + # Igroup is gone — create a recovery igroup for this host. + recovery_ig_name = f"openstack-recovery-{nova_s}" + print(f" igroup for {nova_host} not found on NetApp — will create one.") + created = create_igroup( + netapp_host, netapp_user, netapp_password, + recovery_ig_name, svm_name, correct_iqn, + dry_run=dry_run, + ) + if created: + nova_ig_name = created + # Refresh local igroups cache so subsequent volumes find it + igroups[nova_ig_name] = { + "uuid": "", + "initiators": [correct_iqn], + "svm": svm_name, + } + else: + step1_ok = False + + if nova_ig_name: + # Add the correct (source) mapping first — if this fails the stale + # map is still in place and the VM retains disk access. + ok = add_lun_map( + netapp_host, netapp_user, netapp_password, + f["lun_path"], nova_ig_name, svm_name, + dry_run=dry_run, + ) + if not ok: + step1_ok = False + else: + # Only remove the stale destination mapping after the correct + # one is confirmed present — worst case is DUAL IGROUP, not data loss. + for m in f["stale_maps"]: + ok = remove_lun_map( + netapp_host, netapp_user, netapp_password, + m.get("lun_uuid", lun_uuid), m["igroup_uuid"], m["igroup"], + dry_run=dry_run, + ) + if not ok: + step1_ok = False + elif not step1_ok: + pass # create_igroup already printed the error + else: + print(f" [WARN] Cannot find or create igroup for {nova_host} — " + f"IQN unknown. 
Pass --host-iqn {nova_s}= or --ssh-user.") + print(f" Get IQN: ssh {nova_host} 'cat /etc/iscsi/initiatorname.iscsi'") + step1_ok = False + + elif f.get("igroup_stale") and f["stale_iqns"]: + # Legacy: igroup exists but has wrong initiator + print(f" igroup : {f['igroup']}") + for iqn in f["stale_iqns"]: + ok = remove_igroup_initiator( + netapp_host, netapp_user, netapp_password, + f["igroup_uuid"], iqn, dry_run=dry_run, + ) + if not ok: + step1_ok = False + + if correct_iqn: + print(f" Add correct IQN for {nova_host}: {correct_iqn}") + print(f" curl -sk -u {netapp_user}: -X POST \\") + print(f" https://{netapp_host}/api/protocols/san/igroups/{f['igroup_uuid']}/initiators \\") + print(f" -H 'Content-Type: application/json' -d '{{\"name\": \"{correct_iqn}\"}}'") + else: + print(f" IQN not found — retrieve manually:") + print(f" ssh {nova_host} 'cat /etc/iscsi/initiatorname.iscsi'") + print(f" Then: python3 pf9-storage-audit.py ... --host-iqn {nova_s}= --remediate") + else: + print(f" NetApp igroup OK — no changes needed") + + if not step1_ok and not dry_run: + print(f"\n ✗ STEP 1 FAILED — manual NetApp action required (see above).") + print(f" Re-run with --remediate after fixing NetApp manually.") + continue + + # ── Step 2: iSCSI rescan ────────────────────────────────────────── + print(f"\n STEP 2: iSCSI rescan — run on {nova_host}:") + print(f" iscsiadm -m session -R") + print(f" iscsiadm -m node --login") + print(f" multipath -r") + print(f" multipath -ll | grep -E 'failed|faulty|0 paths'") + + # ── Step 3: Nova BDM target_lun fix ────────────────────────────── + print(f"\n STEP 3: Nova BDM target_lun fix") + + any_dual = any(f.get("dual_mapping") for f in vm_findings) + any_missing = any(f.get("source_missing") for f in vm_findings) + + if any_dual or any_missing: + print(f" # target_lun / target_luns in Nova BDM may point to the destination LUN ID.") + print(f" # First, check current values and path count:") + for f in vm_findings: + print(f" mysql> SELECT 
instance_uuid,") + print(f" JSON_EXTRACT(connection_info, '$.data.target_lun') AS lun,") + print(f" JSON_EXTRACT(connection_info, '$.data.target_luns') AS luns") + print(f" FROM block_device_mapping") + print(f" WHERE volume_id = '{f['volume_id']}'") + print(f" AND instance_uuid = '{vm_id}'") + print(f" AND deleted = 0;") + print(f"") + print(f" # Then update — adjust JSON_ARRAY() length to match the path count above:") + for f in vm_findings: + correct_lun_id = (f["nova_maps"] or [{}])[0].get("lun_id") + if correct_lun_id is not None: + lun_array = ", ".join([str(correct_lun_id)] * 4) + print(f" # Volume {f['volume_id']} → correct LUN ID = {correct_lun_id}") + print(f" mysql> UPDATE block_device_mapping") + print(f" SET connection_info = JSON_SET(") + print(f" JSON_SET(connection_info, '$.data.target_lun', {correct_lun_id}),") + print(f" '$.data.target_luns', JSON_ARRAY({lun_array})") + print(f" )") + print(f" WHERE volume_id = '{f['volume_id']}'") + print(f" AND instance_uuid = '{vm_id}'") + print(f" AND deleted = 0;") + else: + print(f" # Volume {f['volume_id']}: LUN ID unknown — check NetApp and set manually") + else: + print(f" # Igroup was the only issue — no BDM change needed.") + + print(f"\n{'='*80}") + print("After all steps, verify:") + print(" virsh list --all (on affected host — should not hang)") + print(" multipath -ll (no failed/faulty maps)") + print(" openstack volume list (volumes should be 'in-use')") + print(" openstack server list (VMs should be 'ACTIVE')") + + +# ── Entry point ──────────────────────────────────────────────────────────── + +def main(): + parser = argparse.ArgumentParser( + description="Detect and remediate stale Cinder BDM/igroup state after failed migrations" + ) + parser.add_argument("--netapp-host", required=True) + parser.add_argument("--netapp-user", default="admin") + parser.add_argument("--netapp-password", help="Prompted if omitted") + parser.add_argument("--svm", help="Filter by SVM name (e.g. 
vs.5)") + parser.add_argument("--server", help="Check a single VM by ID or name") + parser.add_argument("--ssh-user", default=None, + help="SSH user for host health checks and IQN fetching (e.g. root)") + parser.add_argument("--ssh-key", default=None, metavar="PATH", + help="SSH private key file (optional if default key works)") + parser.add_argument("--host-iqn", action="append", default=[], metavar="HOST=IQN", + help="Known IQN for a compute host, e.g. 970-1=iqn.2004-10.com.ubuntu:01:04cd37af9c9. " + "Repeat for each host. Run: cat /etc/iscsi/initiatorname.iscsi on each host.") + parser.add_argument("--dry-run", action="store_true", + help="Preview all remediation steps without making changes") + parser.add_argument("--remediate", action="store_true", + help="Apply igroup fixes and print Cinder/iSCSI steps") + parser.add_argument("--clean-mpath", action="store_true", + help="Scan compute hosts for orphaned mpath devices (all paths failed) " + "and flush them. Requires --ssh-user. Use with --dry-run to preview.") + args = parser.parse_args() + + manual_iqns = {} + for entry in args.host_iqn: + if "=" not in entry: + print(f"[ERROR] --host-iqn must be HOST=IQN format, got: {entry}", file=sys.stderr) + sys.exit(1) + host, iqn = entry.split("=", 1) + manual_iqns[host.strip()] = iqn.strip() + + if not args.netapp_password: + args.netapp_password = getpass.getpass( + f"NetApp password for {args.netapp_user}@{args.netapp_host}: " + ) + + print("Querying OpenStack...") + servers = [get_server(args.server)] if args.server else get_all_servers() + print(f"Found {len(servers)} VM(s).") + + hyp_ip_map = get_hypervisor_ip_map() + + findings, igroups, host_iqn_map, all_nova_hosts = detect( + servers, args.netapp_host, args.netapp_user, args.netapp_password, args.svm, + ssh_user=args.ssh_user, ssh_key=args.ssh_key, hyp_ip_map=hyp_ip_map, + manual_iqns=manual_iqns or None, + ) + + if args.ssh_user and all_nova_hosts: + health_results = run_host_health_checks( + all_nova_hosts, 
args.ssh_user, args.ssh_key, hyp_ip_map) + print_health_report(health_results) + + print_report(findings) + + stale_vms = len({f["vm_id"] for f in findings}) + if findings: + dual = sum(1 for f in findings if f.get("dual_mapping")) + missing = sum(1 for f in findings if f.get("source_missing")) + print(f"\nSummary: {stale_vms} VM(s), {len(findings)} volume(s) with issues " + f"[dual_mapping={dual}, source_missing={missing}]") + + if args.remediate or args.dry_run: + remediate(findings, igroups, host_iqn_map, args.netapp_host, args.netapp_user, + args.netapp_password, dry_run=args.dry_run) + elif findings: + print("\nRun with --dry-run to preview remediation steps.") + print("Run with --remediate to apply igroup fixes and print Cinder/iSCSI steps.") + + if args.clean_mpath: + if not args.ssh_user: + print("[ERROR] --clean-mpath requires --ssh-user", file=sys.stderr) + sys.exit(1) + run_mpath_cleanup(all_nova_hosts, args.ssh_user, args.ssh_key, hyp_ip_map, + dry_run=args.dry_run, remediate=args.remediate) + + sys.exit(1 if findings else 0) + + +if __name__ == "__main__": + try: + main() + except KeyboardInterrupt: + print("\n[Interrupted]", file=sys.stderr) + sys.exit(130)