diff --git a/stat_var_renaming/stat_var_renaming.py b/stat_var_renaming/stat_var_renaming.py index 3cf72068..b48be56c 100644 --- a/stat_var_renaming/stat_var_renaming.py +++ b/stat_var_renaming/stat_var_renaming.py @@ -83,7 +83,7 @@ _MAX_CONSTRAINTS = 3 _MAX_CONSTRAINTS_WITH_DPV = 6 # If true, no new statistical variables will be introduced. -ONLY_REGENERATE_OUTPUT = False +_ONLY_REGENERATE_EXISTING = False def authenticate_bq_client(): @@ -124,7 +124,7 @@ def download_stat_vars(client): # Dynamically create query for constraints in SQL query. constraint_string = "" pop_string = "" - for num in range(1, _MAX_CONSTRAINTS + 1): + for num in range(1, _MAX_CONSTRAINTS_WITH_DPV + 1): constraint_string += f"SP.v{num} as v{num},\n" pop_string += f"SP.p{num} as p{num},\n" @@ -148,6 +148,7 @@ def download_stat_vars(client): for c in range(1, _MAX_CONSTRAINTS_WITH_DPV + 1): stat_vars[f"orig_p{c}"] = stat_vars[f"p{c}"] stat_vars[f"orig_v{c}"] = stat_vars[f"v{c}"] + stat_vars["orig_populationType"] = stat_vars['populationType'] return stat_vars @@ -173,7 +174,7 @@ def remap_constraint_from_prop(row, prop_remap): row: Pandas row to apply function to. prop_remap: Dictionary of renaming functions for each property. """ - for constraint in range(1, 1 + row['numConstraints']): + for constraint in range(1, min(_MAX_CONSTRAINTS_WITH_DPV, 1 + row['numConstraints'])): prop = row[f"p{constraint}"] if prop in prop_remap: # May need to apply multiple functions for a single property. @@ -256,7 +257,7 @@ def remove_dependent_constraints(stat_vars): # Merge across common columns shared with dependent variable list. common_cols = (['measuredProp', 'populationType', 'statType'] + - ["orig_p" + x for x in range(_MAX_CONSTRAINTS_WITH_DPV)]) + [f"orig_p{x}" for x in range(1, 1 + _MAX_CONSTRAINTS_WITH_DPV)]) stat_vars = pd.merge(stat_vars, dpvs, on=common_cols, how='left') # Replace any dependent variables and their value with nan. @@ -271,8 +272,8 @@ def remove_dependent_constraints(stat_vars): stat_vars.loc[dpv_match.index, "numConstraints"] = ( stat_vars.loc[dpv_match.index, "numConstraints"].apply(lambda x: x - 1)) - # Left shift all imputed columns to remove holes. - stat_vars = stat_vars.apply(left_fill_columns, axis=1) + # TODO(REMOVE): Left shift all imputed columns to remove holes. + # stat_vars = stat_vars.apply(left_fill_columns, axis=1) # Rename constraints from merge. for c in range(1, _MAX_CONSTRAINTS + 1): @@ -399,9 +400,9 @@ def ensure_no_overlapping_stat_vars(stat_vars): # Print out error if there are real collisions. bad_overlaps = bad_overlaps.sort_values(['v1', 'v2', 'v3']) bad_overlaps_str = "" - for index, row in bad_overlaps.iterrows(): + for _, row in bad_overlaps.iterrows(): bad_overlaps_str += row_to_stat_var_mcf(row) + "," - assert bad_overlaps.shape[0] == 0, "Duplicate StatVars!: " + bad_overlaps + assert bad_overlaps.shape[0] == 0, f"Duplicate StatVars!: {list(bad_overlaps['HumanReadableName'])}" # No issues so remove these duplicate names. return stat_vars.drop_duplicates("HumanReadableName") @@ -507,20 +508,23 @@ def create_human_readable_names(stat_vars, client): svrf.prepend_and_append_text(prop_remap) svrf.misc_mappings(prop_remap) + # Apply constraint renamings. + stat_vars = stat_vars.apply( + lambda row: remap_constraint_from_prop(row, prop_remap), axis=1) + # Drop erroneous constraints. for c in range(1, _MAX_CONSTRAINTS + 1): stat_vars = stat_vars.query(f"v{c} != 'NAICSUnknown'") - # Apply constraint renamings. - stat_vars = stat_vars.apply( - lambda row: remap_constraint_from_prop(row, prop_remap), axis=1) + # Remap population + stat_vars = svrf.rename_populations(stat_vars) # Remove dependent constraints. stat_vars = remove_dependent_constraints(stat_vars) - stat_vars = left_fill_columns(stat_vars) + stat_vars = stat_vars.apply(left_fill_columns) # Generate human readable names. - stat_vars['HumanReadableName'] = stat_vars.apply(row_to_human_readable) + stat_vars['HumanReadableName'] = stat_vars.apply(row_to_human_readable, axis=1) # Manually rename special case. stat_vars.loc[stat_vars['HumanReadableName'] == 'Count_Death_Medicare', @@ -548,7 +552,7 @@ def output_stat_var_documentation(stat_vars): # (False, True) -> Only group disaster StatVar by populationType # if there are more than 1 statistical variables for that group. svrc.STAT_VAR_POPULATION_GROUPINGS.append( - svrc.SVPopGroup(("Disasters", natural_disasters, False, True))) + svrc.SVPopGroup("Disasters", natural_disasters, False, True)) # Assert that all population types belong to a category. used = [] @@ -619,7 +623,7 @@ def main(argv): stat_vars = create_human_readable_names(stat_vars, client) # Limit to only existing statistical variables if chosen. - if ONLY_REGENERATE_OUTPUT: + if _ONLY_REGENERATE_EXISTING: stat_vars = remove_new_stat_vars(stat_vars, client) stat_vars = stat_vars.query("statType != 'Unknown'") diff --git a/stat_var_renaming/stat_var_renaming_constants.py b/stat_var_renaming/stat_var_renaming_constants.py index 453a6abf..be919881 100644 --- a/stat_var_renaming/stat_var_renaming_constants.py +++ b/stat_var_renaming/stat_var_renaming_constants.py @@ -276,7 +276,8 @@ def range_to_array(read_code): 'ICD10/D50-D89': 'DiseasesOfBloodAndBloodFormingOrgansAndImmuneDisorders', 'ICD10/R00-R99': 'AbnormalNotClassfied', 'ICD10/U00-U99': 'SpecialCases', - 'ICD10/V01-Y89': 'ExternalCauses' + 'ICD10/V01-Y89': 'ExternalCauses', + 'IntentionalSelf-Harm(Suicide)': 'IntentionalSelfHarm' } # List of properties to perform a numerical quantity remap on. @@ -327,11 +328,14 @@ def range_to_array(read_code): True, False), SVPopGroup("Employment", ['Worker', 'Establishment', 'JobPosting', - 'UnemploymentInsuranceClaim'], + 'UnemploymentInsuranceClaim', 'Faculty'], True, False), SVPopGroup("Economic", ['EconomicActivity', 'Consumption', 'Debt', 'TreasuryBill', - 'TreasuryBond', 'TreasuryNote'], + 'TreasuryBond', 'TreasuryNote', 'ConsumerGoodsAndServices', 'Stock'], + True, False), + SVPopGroup("Government", + ['Legislation', 'PublicInformationCampaign', 'Remittance', 'School'], True, False), SVPopGroup("Environment", ['Emissions'], diff --git a/stat_var_renaming/stat_var_renaming_functions.py b/stat_var_renaming/stat_var_renaming_functions.py index 8dc5a1cf..90f04e65 100644 --- a/stat_var_renaming/stat_var_renaming_functions.py +++ b/stat_var_renaming/stat_var_renaming_functions.py @@ -142,7 +142,7 @@ def provide_consistent_naming(row): icd10_code = row['ICD10Code'] # Manually remap abnormally long names. if icd10_code in svrc.MANUAL_CAUSE_OF_DEATH_RENAMINGS: - return svrc.MANUAL_CAUSE_OF_DEATH_RENAMINGS[id] + return svrc.MANUAL_CAUSE_OF_DEATH_RENAMINGS[icd10_code] # Otherwise remove codes and camel case the name. row['id'] = icd10_code @@ -158,7 +158,12 @@ def provide_consistent_naming(row): # Add modification function. def death_id_to_name(_, death_id, _pop): - return cause_of_death_instances.loc[death_id]['name'] + if death_id in cause_of_death_instances or death_id.startswith("ICD10"): + return cause_of_death_instances.loc[death_id]['name'] + else: + # Some causeOfDeathCodes are not ICD10. Use these literally. + return death_id.replace("(", "").replace(")", "").replace("-", "") + svr.addPropertyRemapping(prop_remap, 'medicalCode', death_id_to_name) svr.addPropertyRemapping(prop_remap, 'causeOfDeath', death_id_to_name)