path-cc · biozit · Mar 30, 2026
diff --git a/campus-contributions/osdf.py b/campus-contributions/osdf.py
@@ -1,64 +1,112 @@
-#!/usr/bin/env python
+"""
+osdf.py
 
-from __future__ import print_function
+Lists unique institutions that have a registered cache or origin in OSDF.
 
-import collections
-import operator
+Sources:
+  Registry    : https://osdf-registry.osg-htc.org/api/v1.0/registry_ui/servers
+  Institutions: https://topology-institutions.osg-htc.org/api/institution_ids
+
+Usage:
+  python osdf.py               # all (cache + origin)
+  python osdf.py --type cache  # cache only
+  python osdf.py --type origin # origin only
+"""
+
+import argparse
+import json
 import sys
-import os
+import urllib.request
+import urllib.error
 
-try:
-    from urllib.request import urlopen
-except ImportError:
-    from urllib2 import urlopen
 
-import xml.etree.ElementTree as et
+REGISTRY_URL     = "https://osdf-registry.osg-htc.org/api/v1.0/registry_ui/servers"  # server registry source
+INSTITUTIONS_URL = "https://topology-institutions.osg-htc.org/api/institution_ids"  # institution id/name source
 
 
-_topology_host = "topology.opensciencegrid.org"
-_rgsummary_url = 'https://{host}/rgsummary/xml'.format(host=_topology_host)
-_active_params = [
-    ('active',        'on'),  # filter resources by "Active" field
-    ('active_value',  '1' ),  # require Active: true
-    ('disable',       'on'),  # filter resources by "Disable" field
-    ('disable_value', '0' ),  # require Disable: false
-]
-_xml_url = "%s?%s" % (_rgsummary_url, '&'.join(map('='.join, _active_params)))
+def fetch_json(url: str) -> list | dict:
+    """Fetch *url* and return its parsed JSON body; on any failure, report to stderr and exit 1."""
+    try:
+        req = urllib.request.Request(url, headers={"Accept": "application/json"})
+        with urllib.request.urlopen(req, timeout=30) as resp:
+            return json.loads(resp.read().decode())
+    except (OSError, ValueError) as e:  # URLError/timeouts are OSError; bad JSON or bytes are ValueError
+        sys.exit(f"ERROR: could not load JSON from {url}\n  {e}")  # str arg: printed to stderr, status 1
 
 
-_osdf_service_types = [
-    "XRootD cache server",
-    "XRootD origin server",
-    "Pelican cache",
-    "Pelican origin",
-]
+def build_institution_map(institutions: list) -> dict:
+    """Return a dict keyed by each institution's "id" with its "name" as the value.
+
+    Entries whose "id" or "name" is missing or falsy are left out.
+    """
+    pairs = ((entry.get("id"), entry.get("name")) for entry in institutions)
+    return {ident: label for ident, label in pairs if ident and label}
 
 
-def getxml():
-    return urlopen(_xml_url).read()
+def get_facilities(server_type: str | None) -> list[str]:
+    """
+    Return a deduplicated list of institution names, sorted case-insensitively,
+    that have at least one approved cache or origin registration.
 
+    server_type: None = both, "cache" = caches only, "origin" = origins only
+    """
+    registry     = fetch_json(REGISTRY_URL)
+    institutions = fetch_json(INSTITUTIONS_URL)
 
-def get_osdf_facilities(xmltxt=None):
-    if xmltxt is None:
-        xmltxt = getxml()
-    xmltree = et.fromstring(xmltxt)
-    return set(
-        rg.find("Facility").find("Name").text
-        for rg in xmltree.findall('ResourceGroup')
-        for r in rg.find('Resources').findall('Resource')
-        for s in r.find("Services").findall("Service")
-        if s.find("Name").text in _osdf_service_types
-    )
+    institution_map = build_institution_map(institutions)
+
+    seen = set()
+    for server in registry:
+        is_cache  = server.get("is_cache", False)
+        is_origin = server.get("is_origin", False)
+
+        # Apply type filter; servers that are neither cache nor origin never count
+        if server_type == "cache" and not is_cache:
+            continue
+        if server_type == "origin" and not is_origin:
+            continue
+        if not is_cache and not is_origin:
+            continue
+
+        # Walk registrations to find an approved one with a known institution
+        for reg in server.get("registration") or []:  # key may be present but null
+            meta   = reg.get("admin_metadata") or {}
+            status = meta.get("status") or ""  # a null status counts as not approved
+
+            if status.lower() != "approved":
+                continue
+
+            institution_id = meta.get("institution") or ""
+            institution    = institution_map.get(institution_id, "").strip()
+
+            if institution:  # IDs absent from the institution map are skipped
+                seen.add(institution)
+                break  # one match per server is enough
+
+    return sorted(seen, key=str.casefold)
 
 
 def main():
-    facilities = sorted(get_osdf_facilities())
-    n = len(facilities)
-    print("%d OSDF Facilit%s:" % (n, "y" if n == 1 else "ies"))
-    for f in sorted(facilities):
-        print(" - %s" % f)
+    """Parse the optional --type filter and print the matching facility names, one per line."""
+    parser = argparse.ArgumentParser(
+        description="List OSDF institutions with registered cache or origin servers."
+    )
+    parser.add_argument(
+        "--type",
+        choices=["cache", "origin"],
+        metavar="TYPE",
+        help="Filter by server type: 'cache' or 'origin' (default: all)",
+    )
+    args = parser.parse_args()  # args.type is None when --type is omitted
 
+    type_label = args.type.capitalize() if args.type else "Cache & Origin"
 
-if __name__ == '__main__':
-    main()
+    facilities = get_facilities(args.type)
+    noun = "Facility" if len(facilities) == 1 else "Facilities"  # keep the header grammatical for one match
+    print(f"{len(facilities)} OSDF {type_label} {noun}:")
+    for name in facilities:
+        print(f"- {name}")
 
+
+if __name__ == "__main__":  # run main() only when executed as a script, not when imported
+    main()