chicagopcdc · sonali-shaw · Jun 11, 2026 · grugna · Jun 11, 2026 · sgchoe
diff --git a/pfb_to_zip/pfb_to_zip.py b/pfb_to_zip/pfb_to_zip.py
@@ -15,6 +15,7 @@
 from pfb.reader import PFBReader
 from pfb.exporters import tsv
 from dictionaryutils.utils import node_values_to_codes
+from pfb.writer import PFBWriter
 
 
 def to_folder_name(value: str) -> str:
@@ -28,8 +29,6 @@ class Config():
     def __init__(self):
         self.reader = None
 
-
-
 class PFBExporter:
     def __init__(self, pfb_file_path:str, tmp_folder:str, output_path:str, config_file_path:str, ontology:str=None, extra_analysis:str=None) -> None:
         self.pfb_file_path = pfb_file_path
@@ -77,7 +76,6 @@ def __new__(cls, *args, **kwargs):
         if cls._validate(*args, **kwargs):
             return super().__new__(cls)
 
-
 
     def initialize(self):
         print("Initializing...")
@@ -316,7 +314,6 @@ def zip(self):
         self.zip_file_output_path = make_archive(output_filename, 'zip', self.tmp_folder, self.zip_subfolder)
         print(f"ZIP file created at {self.zip_file_output_path}")
 
-
     def clean_up(self):
         print(f"Removing all the resources used and the temporary data directory at {self.tmp_folder}")
 
@@ -335,18 +332,91 @@ def clean_up(self):
             print(e)
 
 
+class Difference:
+    def __init__(self, old_file_path: str, new_file_path: str):
+        self.old_file = old_file_path
+        self.new_file = new_file_path
+
+    def read_avro(self, file_path):
+        records_by_submitter = {}
+        submitter_ids_list = []
+
+        with open(file_path, 'rb') as f:
+            with PFBReader(f) as reader:
+                for record in reader:
+                    submitter_id = record['object'].get('submitter_id')
+                    if submitter_id: # program etc. don't have submitter_id
+                        submitter_ids_list.append(submitter_id)
+                        records_by_submitter[submitter_id] = record
+
+        # duplicates check
+        submitter_ids_set = set(submitter_ids_list)
+        if len(submitter_ids_list) != len(submitter_ids_set):
+            print("⚠️ Duplicates found!")
+
+        return records_by_submitter
+
+    def generate_diff(self, output_path: str):
+        old = self.read_avro(self.old_file)
+        new = self.read_avro(self.new_file)
+
+        deleted_submitter_ids = set(old.keys()) - set(new.keys())
+        diff_records = []
+        log_lines = []
+
+        if deleted_submitter_ids:
+            msg = f" {len(deleted_submitter_ids)} record(s) removed (consent withdrawn):"
+            print(msg)
+            log_lines.append(msg)
+            for sid in sorted(deleted_submitter_ids):
+                line = f"   - {sid}"
+                print(line)
+                log_lines.append(line)
+        else:
+            msg = "No records removed."
+            print(msg)
+            log_lines.append(msg)
+
+        for submitter_id, new_record in new.items():
+            if submitter_id not in old:
+                diff_records.append(new_record)
+            else:
+                if new_record['object'] != old[submitter_id]['object']:
+                    diff_records.append(new_record)
+
+        summary = f"\n Total changed/new records in diff: {len(diff_records)}"
+        print(summary)
+        log_lines.append(summary)
+
+        # Save log to markdown
+        log_path = output_path.replace('.avro', '_log.md')
+        with open(log_path, 'w') as f:
+            f.write("# Diff Log\n\n")
+            f.write("\n".join(log_lines))
+        print(f"Log saved to {log_path}")
+
+        with open(output_path, 'wb') as out_f:
+            with PFBWriter(out_f) as writer:
+                with open(self.new_file, 'rb') as schema_f:
+                    with PFBReader(schema_f) as reader:
+                        writer.copy_schema(reader)
+                writer.write(diff_records)
+
+        print(f"Written to {output_path} with {len(diff_records)} records")
 
 
 
 def main():
     # EXAMPLE: python pfb_to_zip.py -i ./export_2023-03-27T02_42_17.avro -o ./outputs/ -c ./config.py -d https://portal.pedscommons.org/api/v0/submission/_dictionary/_all -t ncit
+    # Example for diff: python3 pfb_to_zip.py -i ../new_data.avro -o ../outputs/ -c ./config.py -d ../old_data.avro
 
     parser = argparse.ArgumentParser(description="Build ZIP bundle for data delivery after project request has been approved")
     parser.add_argument('-c', '--config', help='The config file')
     parser.add_argument('-i', '--input', help='Input PFB file path')
     parser.add_argument('-o', '--output', help='Output ZIP directory')
     parser.add_argument('-t', '--terminology', help='The ontology you want to transform GEN3 values to.')
     parser.add_argument('-a', '--analysis', help='The consorti script you want to execute. Ex:INRG')
+    parser.add_argument('-d', '--diff', help='Path to the old avro file to diff against')
 
     try:
         args = parser.parse_args()
@@ -359,6 +429,11 @@ def main():
         print(err)
         parser.print_help()
 
+    if args.diff:
+        diff = Difference(args.diff, input_path)
+        diff_output = output_path + 'diff.avro'
+        diff.generate_diff(diff_output)
+        input_path = diff_output
 
     tmp_folder = "./tmp"
     pfb_export = PFBExporter(input_path, tmp_folder, output_path, config_file, ontology, True if analysis_script_consortia and analysis_script_consortia != "" else False)
@@ -371,14 +446,11 @@ def main():
     pfb_export.filter_attributes(is_black_list=False)
     # pfb_export.filter_attributes(is_black_list=True)
     pfb_export.setup_and_run_analysis(analysis_script_consortia) #TODO how to find consortium
-    pfb_export.to_ontology_code() 
+    pfb_export.to_ontology_code()
     pfb_export.add_external_references_material()
     pfb_export.zip()
     pfb_export.clean_up()
 
-
-
-
 
 if __name__ == "__main__":
     main()