diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..684b139 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +__pycache__/ +*.pyc +.venv/ +.streamlit/ +*.png diff --git a/__pycache__/era5_data.cpython-312.pyc b/__pycache__/era5_data.cpython-312.pyc new file mode 100644 index 0000000..838b9c6 Binary files /dev/null and b/__pycache__/era5_data.cpython-312.pyc differ diff --git a/__pycache__/nasa_data.cpython-312.pyc b/__pycache__/nasa_data.cpython-312.pyc index 5bf9b2e..43c23a9 100644 Binary files a/__pycache__/nasa_data.cpython-312.pyc and b/__pycache__/nasa_data.cpython-312.pyc differ diff --git a/era5_data.py b/era5_data.py index 7cf793c..e673ca8 100644 --- a/era5_data.py +++ b/era5_data.py @@ -455,29 +455,24 @@ def get_era5_extreme_heat(lat, lon, year, percentile=95): np.exp((17.625 * df['Temperature (°C)']) / (243.04 + df['Temperature (°C)'])) ) - # Calculate heat index - def calculate_heat_index(row): - t = row['Temperature (°C)'] - rh = row['Relative Humidity (%)'] - - # Convert to Fahrenheit for the formula - t_f = t * 9/5 + 32 - - # Simple formula for lower temperatures - if t_f < 80: - return t - - # Full formula for higher temperatures - hi = -42.379 + 2.04901523 * t_f + 10.14333127 * rh - hi = hi - 0.22475541 * t_f * rh - 0.00683783 * t_f**2 - hi = hi - 0.05481717 * rh**2 + 0.00122874 * t_f**2 * rh - hi = hi + 0.00085282 * t_f * rh**2 - 0.00000199 * t_f**2 * rh**2 - - # Convert back to Celsius - hi_c = (hi - 32) * 5/9 - return hi_c - - df['Heat Index (°C)'] = df.apply(calculate_heat_index, axis=1) + # Calculate heat index using vectorized operations + t = df['Temperature (°C)'] + rh = df['Relative Humidity (%)'] + + # Convert to Fahrenheit for the formula + t_f = t * 9/5 + 32 + + # Full formula calculation for all points + hi = -42.379 + 2.04901523 * t_f + 10.14333127 * rh + hi = hi - 0.22475541 * t_f * rh - 0.00683783 * t_f**2 + hi = hi - 0.05481717 * rh**2 + 0.00122874 * t_f**2 * rh + hi = hi + 0.00085282 * t_f * rh**2 - 0.00000199 * t_f**2 * rh**2 + + # Convert back to Celsius + hi_c = (hi - 32) * 5/9 + + # Use simple formula (just temperature) where t_f < 80, else full formula + df['Heat Index (°C)'] = np.where(t_f < 80, t, hi_c) # Get daily maximum heat index daily_heat_index = df.groupby('Date')['Heat Index (°C)'].max().reset_index() diff --git a/nasa_data.py b/nasa_data.py index 419477f..567f243 100644 --- a/nasa_data.py +++ b/nasa_data.py @@ -350,30 +350,24 @@ def _get_extreme_heat_days_cached(lat, lon, year, percentile): df = fetch_nasa_power_data(lat, lon, start_date, end_date, parameters=["T2M_MAX", "RH2M"]) - # Calculate heat index - def calculate_heat_index(row): - t = row['T2M_MAX'] # Temperature in Celsius - rh = row['RH2M'] # Relative humidity in % - - # Simple formula for heat index - if t < 26: - return t # Below this temperature, heat index equals temperature - - # Full formula - hi = -8.78469475556 + \ - 1.61139411 * t + \ - 2.33854883889 * rh + \ - -0.14611605 * t * rh + \ - -0.012308094 * t**2 + \ - -0.0164248277778 * rh**2 + \ - 0.002211732 * t**2 * rh + \ - 0.00072546 * t * rh**2 + \ - -0.000003582 * t**2 * rh**2 - - return hi - - # Apply heat index calculation - df['Heat Index (°C)'] = df.apply(calculate_heat_index, axis=1) + # Calculate heat index using vectorized operations for performance + t = df['T2M_MAX'] # Temperature in Celsius + rh = df['RH2M'] # Relative humidity in % + + # Full formula calculation for all points + # This is much faster than applying a function row by row + hi_full = -8.78469475556 + \ + 1.61139411 * t + \ + 2.33854883889 * rh + \ + -0.14611605 * t * rh + \ + -0.012308094 * t**2 + \ + -0.0164248277778 * rh**2 + \ + 0.002211732 * t**2 * rh + \ + 0.00072546 * t * rh**2 + \ + -0.000003582 * t**2 * rh**2 + + # Use simple formula (just temperature) where t < 26 + df['Heat Index (°C)'] = np.where(t < 26, t, hi_full) # Determine thresholds temp_threshold = np.percentile(df['T2M_MAX'], percentile) diff --git a/test_heat_index_performance.py b/test_heat_index_performance.py new file mode 100644 index 0000000..3b6244c --- /dev/null +++ b/test_heat_index_performance.py @@ -0,0 +1,166 @@ + +import unittest +import pandas as pd +import numpy as np +from unittest.mock import MagicMock, patch +import nasa_data +import era5_data + +class TestHeatIndexLogic(unittest.TestCase): + + def test_nasa_heat_index_logic(self): + # Create a mock DataFrame with known values + # Case 1: Low temp (Heat Index = Temp) + # Case 2: High temp, Low RH + # Case 3: High temp, High RH + + # NASA data uses T2M_MAX (C) and RH2M (%) + mock_df = pd.DataFrame({ + 'Date': pd.date_range(start='2023-01-01', periods=3), + 'T2M_MAX': [20.0, 30.0, 35.0], + 'RH2M': [50.0, 40.0, 80.0], + 'T2M': [20.0, 30.0, 35.0], # Needed for other parts but not heat index + 'T2M_MIN': [15.0, 25.0, 30.0] # Needed for other parts but not heat index + }) + + # Mock fetch_nasa_power_data to return our mock_df + with patch('nasa_data.fetch_nasa_power_data', return_value=mock_df): + # We call the internal cached function directly or the wrapper. + # The wrapper calls the cached function. + # However, since the function is cached, we might hit the cache if not careful. + # But in a fresh test run, cache is empty. + + # The function calculates heat index internally + result_df, _, _ = nasa_data.get_extreme_heat_days(0, 0, 2023) + + # Verify calculations + + # Row 0: T=20 (<26), so HI should be 20 + self.assertAlmostEqual(result_df.loc[0, 'Heat Index (°C)'], 20.0) + + # Row 1: T=30, RH=40. + # Formula check: + # HI = -8.78469475556 + 1.61139411*30 + 2.33854883889*40 + ... + # Let's verify against the known formula value. + # I will use the old scalar implementation to verify the new vectorized one + # matches the expected logic. + + t = 30.0 + rh = 40.0 + hi_expected_1 = -8.78469475556 + \ + 1.61139411 * t + \ + 2.33854883889 * rh + \ + -0.14611605 * t * rh + \ + -0.012308094 * t**2 + \ + -0.0164248277778 * rh**2 + \ + 0.002211732 * t**2 * rh + \ + 0.00072546 * t * rh**2 + \ + -0.000003582 * t**2 * rh**2 + + self.assertAlmostEqual(result_df.loc[1, 'Heat Index (°C)'], hi_expected_1) + + # Row 2: T=35, RH=80 + t = 35.0 + rh = 80.0 + hi_expected_2 = -8.78469475556 + \ + 1.61139411 * t + \ + 2.33854883889 * rh + \ + -0.14611605 * t * rh + \ + -0.012308094 * t**2 + \ + -0.0164248277778 * rh**2 + \ + 0.002211732 * t**2 * rh + \ + 0.00072546 * t * rh**2 + \ + -0.000003582 * t**2 * rh**2 + + self.assertAlmostEqual(result_df.loc[2, 'Heat Index (°C)'], hi_expected_2) + + def test_era5_heat_index_logic(self): + # Mock ERA5 data + # It expects '2m_temperature' (K) and '2m_dewpoint_temperature' (K) + # or 'Temperature (°C)' and 'Dewpoint (°C)' if processed? + # fetch_era5_data returns processed data. + + # Let's see what get_era5_extreme_heat does. + # It calls fetch_era5_data, which returns a DF with 'Temperature (°C)' and 'Dewpoint (°C)' (if computed) + # But wait, fetch_era5_data processes raw data. + # So if we mock fetch_era5_data, we should return the processed format that get_era5_extreme_heat expects. + + # get_era5_extreme_heat calls: + # df = fetch_era5_data(..., variables=['2m_temperature', '2m_dewpoint_temperature']) + + # And expects: + # 'Temperature (°C)' + # '2m_dewpoint_temperature' (raw?) or 'd2m'? + + # Let's check era5_data.py again. + # inside get_era5_extreme_heat: + # if '2m_dewpoint_temperature' in df.columns: + # df['Dewpoint (°C)'] = df['2m_dewpoint_temperature'] - 273.15 + + # So fetch_era5_data returns whatever process_era5_data returns. + # process_era5_data converts '2m_temperature' to 'Temperature (°C)'. + # It does NOT seem to touch '2m_dewpoint_temperature' explicitly in the renames dict? + # Renames: {'t2m': 'Temperature (K)', '2m_temperature': 'Temperature (K)', ...} + # And then converts 'Temperature (K)' to 'Temperature (°C)'. + + # So '2m_dewpoint_temperature' likely remains as is in the dataframe returned by fetch_era5_data. + + # Let's verify correct mocking. + + # Create mock data with Temperature in C (already processed) and Dewpoint in K (raw) + # T = 30°C + # RH needs to be calculated to verify heat index. + # RH = 100 * (EXP((17.625 * TD)/(243.04 + TD)) / EXP((17.625 * T)/(243.04 + T))) + + # Let's just mock 'Temperature (°C)' and '2m_dewpoint_temperature' (K) + # Case 1: Low temp (20°C) -> 293.15 K + # Case 2: High temp (35°C) -> 308.15 K + + # To get a specific RH, we need to set Dewpoint appropriately. + # Approx T=35, RH=50% -> TD approx 23°C -> 296.15 K + + mock_df = pd.DataFrame({ + 'time': pd.to_datetime(['2023-01-01 12:00:00', '2023-01-02 12:00:00']), + 'Temperature (°C)': [20.0, 35.0], + '2m_dewpoint_temperature': [293.15 - 10, 296.15] # 283.15K (10°C), 296.15K (23°C) + }) + + with patch('era5_data.fetch_era5_data', return_value=mock_df): + result_df, _, _ = era5_data.get_era5_extreme_heat(0, 0, 2023) + + # Verify results + # Row 0: T=20C. F = 68F < 80F. Heat Index should be T. + # Note: get_era5_extreme_heat groups by Date and takes MAX. + # We have 2 dates, so 2 rows. + + row0 = result_df[result_df['Date'] == pd.to_datetime('2023-01-01').date()].iloc[0] + self.assertAlmostEqual(row0['Heat Index (°C)'], 20.0) + + row1 = result_df[result_df['Date'] == pd.to_datetime('2023-01-02').date()].iloc[0] + + # Verify calculation for row 1 + # Re-calculate inputs locally + t = 35.0 + td = 296.15 - 273.15 # 23.0 + + # RH calculation used in code + rh = 100 * ( + np.exp((17.625 * td) / (243.04 + td)) / + np.exp((17.625 * t) / (243.04 + t)) + ) + + # Heat Index calculation + t_f = t * 9/5 + 32 # 95F + + # Full formula + hi = -42.379 + 2.04901523 * t_f + 10.14333127 * rh + hi = hi - 0.22475541 * t_f * rh - 0.00683783 * t_f**2 + hi = hi - 0.05481717 * rh**2 + 0.00122874 * t_f**2 * rh + hi = hi + 0.00085282 * t_f * rh**2 - 0.00000199 * t_f**2 * rh**2 + + hi_c = (hi - 32) * 5/9 + + self.assertAlmostEqual(row1['Heat Index (°C)'], hi_c) + +if __name__ == '__main__': + unittest.main()