-
Notifications
You must be signed in to change notification settings - Fork 0
PEDS-1257 Add Difference class with diff #9
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,6 +15,7 @@ | |
| from pfb.reader import PFBReader | ||
| from pfb.exporters import tsv | ||
| from dictionaryutils.utils import node_values_to_codes | ||
| from pfb.writer import PFBWriter | ||
|
|
||
|
|
||
| def to_folder_name(value: str) -> str: | ||
|
|
@@ -28,8 +29,6 @@ class Config(): | |
| def __init__(self): | ||
| self.reader = None | ||
|
|
||
|
|
||
|
|
||
| class PFBExporter: | ||
| def __init__(self, pfb_file_path:str, tmp_folder:str, output_path:str, config_file_path:str, ontology:str=None, extra_analysis:str=None) -> None: | ||
| self.pfb_file_path = pfb_file_path | ||
|
|
@@ -77,7 +76,6 @@ def __new__(cls, *args, **kwargs): | |
| if cls._validate(*args, **kwargs): | ||
| return super().__new__(cls) | ||
|
|
||
|
|
||
|
|
||
| def initialize(self): | ||
| print("Initializing...") | ||
|
|
@@ -316,7 +314,6 @@ def zip(self): | |
| self.zip_file_output_path = make_archive(output_filename, 'zip', self.tmp_folder, self.zip_subfolder) | ||
| print(f"ZIP file created at {self.zip_file_output_path}") | ||
|
|
||
|
|
||
| def clean_up(self): | ||
| print(f"Removing all the resources used and the temporary data directory at {self.tmp_folder}") | ||
|
|
||
|
|
@@ -335,18 +332,91 @@ def clean_up(self): | |
| print(e) | ||
|
|
||
|
|
||
| class Difference: | ||
| def __init__(self, old_file_path: str, new_file_path: str): | ||
| self.old_file = old_file_path | ||
| self.new_file = new_file_path | ||
|
|
||
| def read_avro(self, file_path): | ||
| records_by_submitter = {} | ||
| submitter_ids_list = [] | ||
|
|
||
| with open(file_path, 'rb') as f: | ||
| with PFBReader(f) as reader: | ||
| for record in reader: | ||
| submitter_id = record['object'].get('submitter_id') | ||
| if submitter_id: # program etc. don't have submitter_id | ||
| submitter_ids_list.append(submitter_id) | ||
| records_by_submitter[submitter_id] = record | ||
|
|
||
| # duplicates check | ||
| submitter_ids_set = set(submitter_ids_list) | ||
| if len(submitter_ids_list) != len(submitter_ids_set): | ||
| print("⚠️ Duplicates found!") | ||
|
|
||
| return records_by_submitter | ||
|
|
||
| def generate_diff(self, output_path: str): | ||
| old = self.read_avro(self.old_file) | ||
| new = self.read_avro(self.new_file) | ||
|
|
||
| deleted_submitter_ids = set(old.keys()) - set(new.keys()) | ||
| diff_records = [] | ||
| log_lines = [] | ||
|
|
||
| if deleted_submitter_ids: | ||
| msg = f" {len(deleted_submitter_ids)} record(s) removed (consent withdrawn):" | ||
| print(msg) | ||
| log_lines.append(msg) | ||
| for sid in sorted(deleted_submitter_ids): | ||
| line = f" - {sid}" | ||
| print(line) | ||
| log_lines.append(line) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it would be useful to capture and output the removed records similar to the way 'diff_records' contains added/changed records. |
||
| else: | ||
| msg = "No records removed." | ||
| print(msg) | ||
| log_lines.append(msg) | ||
|
|
||
| for submitter_id, new_record in new.items(): | ||
| if submitter_id not in old: | ||
| diff_records.append(new_record) | ||
| else: | ||
| if new_record['object'] != old[submitter_id]['object']: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would suggest not including the 'created_datetime' and 'updated_datetime' attributes within 'object' when checking to see if the record has changed. |
||
| diff_records.append(new_record) | ||
|
|
||
| summary = f"\n Total changed/new records in diff: {len(diff_records)}" | ||
| print(summary) | ||
| log_lines.append(summary) | ||
|
|
||
| # Save log to markdown | ||
| log_path = output_path.replace('.avro', '_log.md') | ||
| with open(log_path, 'w') as f: | ||
| f.write("# Diff Log\n\n") | ||
| f.write("\n".join(log_lines)) | ||
| print(f"Log saved to {log_path}") | ||
|
|
||
| with open(output_path, 'wb') as out_f: | ||
| with PFBWriter(out_f) as writer: | ||
| with open(self.new_file, 'rb') as schema_f: | ||
| with PFBReader(schema_f) as reader: | ||
| writer.copy_schema(reader) | ||
| writer.write(diff_records) | ||
|
|
||
| print(f"Written to {output_path} with {len(diff_records)} records") | ||
|
|
||
|
|
||
|
|
||
| def main(): | ||
| # EXAMPLE: python pfb_to_zip.py -i ./export_2023-03-27T02_42_17.avro -o ./outputs/ -c ./config.py -d https://portal.pedscommons.org/api/v0/submission/_dictionary/_all -t ncit | ||
| # Example for diff: python3 pfb_to_zip.py -i ../new_data.avro -o ../outputs/ -c ./config.py -d ../old_data.avro | ||
|
|
||
| parser = argparse.ArgumentParser(description="Build ZIP bundle for data delivery after project request has been approved") | ||
| parser.add_argument('-c', '--config', help='The config file') | ||
| parser.add_argument('-i', '--input', help='Input PFB file path') | ||
| parser.add_argument('-o', '--output', help='Output ZIP directory') | ||
| parser.add_argument('-t', '--terminology', help='The ontology you want to transform GEN3 values to.') | ||
| parser.add_argument('-a', '--analysis', help='The consorti script you want to execute. Ex:INRG') | ||
| parser.add_argument('-d', '--diff', help='Path to the old avro file to diff against') | ||
|
|
||
| try: | ||
| args = parser.parse_args() | ||
|
|
@@ -359,6 +429,11 @@ def main(): | |
| print(err) | ||
| parser.print_help() | ||
|
|
||
| if args.diff: | ||
| diff = Difference(args.diff, input_path) | ||
| diff_output = output_path + 'diff.avro' | ||
| diff.generate_diff(diff_output) | ||
| input_path = diff_output | ||
|
|
||
| tmp_folder = "./tmp" | ||
| pfb_export = PFBExporter(input_path, tmp_folder, output_path, config_file, ontology, True if analysis_script_consortia and analysis_script_consortia != "" else False) | ||
|
|
@@ -371,14 +446,11 @@ def main(): | |
| pfb_export.filter_attributes(is_black_list=False) | ||
| # pfb_export.filter_attributes(is_black_list=True) | ||
| pfb_export.setup_and_run_analysis(analysis_script_consortia) #TODO how to find consortium | ||
| pfb_export.to_ontology_code() | ||
| pfb_export.to_ontology_code() | ||
| pfb_export.add_external_references_material() | ||
| pfb_export.zip() | ||
| pfb_export.clean_up() | ||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| main() | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
the submitter_id is preserved across data version only for the subject node. Other node may change. so the diff should be made on all the values and the attribute
subjects.submitter_idwill be stable as a reference to the subject node.