diff --git a/pfb_to_zip/pfb_to_zip.py b/pfb_to_zip/pfb_to_zip.py index 97a45ec..753f907 100644 --- a/pfb_to_zip/pfb_to_zip.py +++ b/pfb_to_zip/pfb_to_zip.py @@ -15,6 +15,7 @@ from pfb.reader import PFBReader from pfb.exporters import tsv from dictionaryutils.utils import node_values_to_codes +from pfb.writer import PFBWriter def to_folder_name(value: str) -> str: @@ -28,8 +29,6 @@ class Config(): def __init__(self): self.reader = None - - class PFBExporter: def __init__(self, pfb_file_path:str, tmp_folder:str, output_path:str, config_file_path:str, ontology:str=None, extra_analysis:str=None) -> None: self.pfb_file_path = pfb_file_path @@ -77,7 +76,6 @@ def __new__(cls, *args, **kwargs): if cls._validate(*args, **kwargs): return super().__new__(cls) - def initialize(self): print("Initializing...") @@ -316,7 +314,6 @@ def zip(self): self.zip_file_output_path = make_archive(output_filename, 'zip', self.tmp_folder, self.zip_subfolder) print(f"ZIP file created at {self.zip_file_output_path}") - def clean_up(self): print(f"Removing all the resources used and the temporary data directory at {self.tmp_folder}") @@ -335,11 +332,83 @@ def clean_up(self): print(e) +class Difference: + def __init__(self, old_file_path: str, new_file_path: str): + self.old_file = old_file_path + self.new_file = new_file_path + + def read_avro(self, file_path): + records_by_submitter = {} + submitter_ids_list = [] + + with open(file_path, 'rb') as f: + with PFBReader(f) as reader: + for record in reader: + submitter_id = record['object'].get('submitter_id') + if submitter_id: # program etc. don't have submitter_id + submitter_ids_list.append(submitter_id) + records_by_submitter[submitter_id] = record + + # duplicates check + submitter_ids_set = set(submitter_ids_list) + if len(submitter_ids_list) != len(submitter_ids_set): + print("⚠️ Duplicates found!") + + return records_by_submitter + + def generate_diff(self, output_path: str): + old = self.read_avro(self.old_file) + new = self.read_avro(self.new_file) + + deleted_submitter_ids = set(old.keys()) - set(new.keys()) + diff_records = [] + log_lines = [] + + if deleted_submitter_ids: + msg = f" {len(deleted_submitter_ids)} record(s) removed (consent withdrawn):" + print(msg) + log_lines.append(msg) + for sid in sorted(deleted_submitter_ids): + line = f" - {sid}" + print(line) + log_lines.append(line) + else: + msg = "No records removed." + print(msg) + log_lines.append(msg) + + for submitter_id, new_record in new.items(): + if submitter_id not in old: + diff_records.append(new_record) + else: + if new_record['object'] != old[submitter_id]['object']: + diff_records.append(new_record) + + summary = f"\n Total changed/new records in diff: {len(diff_records)}" + print(summary) + log_lines.append(summary) + + # Save log to markdown + log_path = output_path.replace('.avro', '_log.md') + with open(log_path, 'w') as f: + f.write("# Diff Log\n\n") + f.write("\n".join(log_lines)) + print(f"Log saved to {log_path}") + + with open(output_path, 'wb') as out_f: + with PFBWriter(out_f) as writer: + with open(self.new_file, 'rb') as schema_f: + with PFBReader(schema_f) as reader: + writer.copy_schema(reader) + writer.write(diff_records) + + print(f"Written to {output_path} with {len(diff_records)} records") def main(): # EXAMPLE: python pfb_to_zip.py -i ./export_2023-03-27T02_42_17.avro -o ./outputs/ -c ./config.py -d https://portal.pedscommons.org/api/v0/submission/_dictionary/_all -t ncit + # Example for diff: python3 pfb_to_zip.py -i ../new_data.avro -o ../outputs/ -c ./config.py -d ../old_data.avro parser = argparse.ArgumentParser(description="Build ZIP bundle for data delivery after project request has been approved") parser.add_argument('-c', '--config', help='The config file') @@ -347,6 +416,7 @@ def main(): parser.add_argument('-o', '--output', help='Output ZIP directory') parser.add_argument('-t', '--terminology', help='The ontology you want to transform GEN3 values to.') parser.add_argument('-a', '--analysis', help='The consorti script you want to execute. Ex:INRG') + parser.add_argument('-d', '--diff', help='Path to the old avro file to diff against') try: args = parser.parse_args() @@ -359,6 +429,11 @@ def main(): print(err) parser.print_help() + if args.diff: + diff = Difference(args.diff, input_path) + diff_output = output_path + 'diff.avro' + diff.generate_diff(diff_output) + input_path = diff_output tmp_folder = "./tmp" pfb_export = PFBExporter(input_path, tmp_folder, output_path, config_file, ontology, True if analysis_script_consortia and analysis_script_consortia != "" else False) @@ -371,14 +446,11 @@ def main(): pfb_export.filter_attributes(is_black_list=False) # pfb_export.filter_attributes(is_black_list=True) pfb_export.setup_and_run_analysis(analysis_script_consortia) #TODO how to find consortium - pfb_export.to_ontology_code() + pfb_export.to_ontology_code() pfb_export.add_external_references_material() pfb_export.zip() pfb_export.clean_up() - - - if __name__ == "__main__": main()