-
Notifications
You must be signed in to change notification settings - Fork 0
Fix CI, unpin versions, and improve Polars benchmark fairness #8
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,6 +5,49 @@ | |
| from typing import Any, Callable, Dict | ||
|
|
||
|
|
||
| def normalize_result(result: Any) -> Any: | ||
| """ | ||
| Normalize the result DataFrame to ensure consistent format across frameworks. | ||
| - Resets index (moving index to columns) | ||
| - Flattens MultiIndex columns | ||
| """ | ||
| # Check if it looks like a pandas/fireducks DataFrame | ||
| if ( | ||
| hasattr(result, "index") | ||
| and hasattr(result, "columns") | ||
| and hasattr(result, "reset_index") | ||
| ): | ||
| # 1. Reset index if it's not a RangeIndex | ||
| # This moves grouping keys from index to columns | ||
| is_range_index = False | ||
| # Check for RangeIndex (has start/stop/step attributes) | ||
| if ( | ||
| hasattr(result.index, "start") | ||
| and hasattr(result.index, "stop") | ||
| and hasattr(result.index, "step") | ||
| ): | ||
| is_range_index = True | ||
|
|
||
| if not is_range_index: | ||
| result = result.reset_index() | ||
|
|
||
| # 2. Flatten MultiIndex columns | ||
| # Check if columns is a MultiIndex (has nlevels > 1) | ||
| if hasattr(result.columns, "nlevels") and result.columns.nlevels > 1: | ||
| new_columns = [] | ||
| for col in result.columns.values: | ||
| if isinstance(col, tuple): | ||
| # Join non-empty parts with underscore | ||
| # E.g. ('total_amount', 'sum') -> 'total_amount_sum' | ||
| name = "_".join([str(c) for c in col if str(c) != ""]).strip("_") | ||
| new_columns.append(name) | ||
| else: | ||
| new_columns.append(str(col)) | ||
| result.columns = new_columns | ||
|
|
||
| return result | ||
|
|
||
|
|
||
| def time_operation( | ||
| operation_name: str, | ||
| df_lib: Any, | ||
|
|
@@ -49,6 +92,14 @@ def time_operation( | |
| if hasattr(result, "to_frame"): | ||
| result = result.to_frame(name=operation_name) | ||
|
|
||
| # Normalize result (reset index, flatten columns) before saving | ||
| # This ensures consistency between Pandas (which uses Index/MultiIndex) | ||
| # and Polars (which uses flat DataFrames) | ||
| try: | ||
| result = normalize_result(result) | ||
| except Exception as e: | ||
| print(f"Warning: Failed to normalize result for {operation_name}: {e}") | ||
|
Comment on lines
+98
to
+101
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Using a broad |
||
|
|
||
| if hasattr(result, "to_parquet"): | ||
| result.to_parquet(output_filename, index=False) | ||
| elif hasattr(result, "write_parquet"): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Mutating input DataFrame's columns in-place.
Line 46 directly assigns to
result.columns, which mutates the input DataFrame. If the caller holds a reference to the original DataFrame, they'll see modified column names unexpectedly.Proposed fix: operate on a copy
if hasattr(result.columns, "nlevels") and result.columns.nlevels > 1: + result = result.copy() new_columns = [] for col in result.columns.values: if isinstance(col, tuple):🤖 Prompt for AI Agents