diff --git a/.gitignore b/.gitignore
index 1f3dd3b..83644c1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -38,3 +38,6 @@ docs/_build
 
 # Cookiecutter
 output/
+
+# rck_jr dev files
+rosey/delme.py
\ No newline at end of file
diff --git a/rosey/README.md b/rosey/README.md
new file mode 100644
index 0000000..8d15f82
--- /dev/null
+++ b/rosey/README.md
@@ -0,0 +1,71 @@
+<!-- ruff: noqa -->
+<!-- linting: ignore -->
+# send_time_p13n
+
+# GOAL:
+
+Predict the ideal send time for each contact to maximize probability of interaction
+
+# HYPOTHESIS:
+
+The closer an email is to the top of a contact's inbox, the more probable it is that the contact interacts with it.
+
+# LEGACY MODEL:
+
+Creates a pdf of historical interactions for each contact. Essentially a count is performed of how often a contact has interacted (opens or clicks) with emails in the past, these counts populate a pdf that represents every possible hour of the week (168 hours). This count is normalized to create probabilities that a custom `argmax` function ingests and selects send times from.
+    
+    NOTE: If a contact does not meet a certain threshold of interactions in the lookback window, a default `mode` distribution is used for that contact.
+
+# NEW MODEL:
+
+A catboost classifier is trained on enriched historical interaction data for each contact. The features used from enrichment are provided by the customer or publicly available (ie H3 features).
+
+# WHY SHOULD THE NEW MODEL BE BETTER?
+
+- Generalized learning from `high` interaction users to `low` to `no` interaction users
+- Minimize lookback window due to learning from population level data
+- Go beyond interaction data to make predictions (ie demographic data)
+
+# RESULTS OF RECENT AB TEST
+
+We lost :-( (I told everyone that it was because of Stephen....)
+
+# Things I need help on:
+
+- Auto feature selection to replace manual EDA
+- How should I evaluate a model offline? Currently use LogLoss and ROC curves.
+- Model selection: is Catboost the correct model? I've been told the idea is to train one and then try to beat it.
+
+# AUTO FEATURE SELECTION PROCESS
+
+AIM FOR CAUSALLY UPSTREAM OF THE VARIABLE
+
+- Ingest contact features provided by customer
+- Filter out based on `feasible_dtypes`
+- Filter out based on `null_frac_threshold` (0.8 threshold)
+    - MAKE SURE THE FEATURES DROPPED ARE OBVIOUSLY BAD, WHAT ABOUT FEATURES CLOSE TO THE THRESHOLD?
+    - Rules of thumb can be misleading
+- Filter out highly correlated features (0.95 threshold)
+    - Tree based models are not unstable with highly correlated features, but feature importance can be messed up
+- Use `catboost.select_features()` functionality, basically trains a model and filters features using `feature_importance`
+- Select the `top_n` features
+
+# OFFLINE TESTING:
+
+- What is the AUC on the low interaction audience?
+- Actually check when the email was sent vs when we thought it should. (Plot of residual of when it was actually sent vs desired send time.)
+    CONFIRM IT IS ACTUALLY A MODEL PROBLEM! (Compliance Testing)
+- Customer Splitting (hold out customers that the model never sees during training)
+- Hold out the last week of historical data to test on. (Need to match up all the datetime features properly)
+
+# TODO:
+
+- Catboost Baseline functionality (see catboost website)
+- Predict click_hour?
+- Data Reduction?
+- Include Mode Distribution as an input feature to the model
+- Add conversion probability to the `send_to_interaction` distribution plots
+- Check on `data_drift` between training, test and live AB
+- Check on contacts that got emails from both the legacy and the new model.
+
+# REPORT BACK 2ish WEEKs
diff --git a/rosey/gbm.py b/rosey/gbm.py
new file mode 100644
index 0000000..68dc944
--- /dev/null
+++ b/rosey/gbm.py
@@ -0,0 +1,261 @@
+# mypy: disable-error-code="import-untyped"
+
+from pathlib import Path
+from typing import Dict, List, Literal, Optional
+
+import numpy as np
+import pandas as pd
+from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor, Pool
+from sklearn.base import BaseEstimator
+from sklearn.exceptions import NotFittedError
+from sklearn.model_selection import train_test_split
+
+FILEPATH_MODEL = "catboost_model_storage/catboost.cbm"  # relative model file path used by write_model and load_model
+
+
+def evaluate_model(
+    model: CatBoost,
+    x: pd.DataFrame,
+    y: pd.Series,
+    metrics: List[str],
+    verbose=False,
+    **kwargs,
+) -> Dict[str, float]:
+    """
+    Score a model on test data and collect the results in a flat dict.
+
+    The dict holds the last value of each requested metric, one
+    "Importance of <feature>" entry per feature, and "Testing Sample Count".
+    Extra keyword arguments are passed to the catboost `Pool` constructor.
+    The dict is always printed; `verbose` adds per-metric and importance output.
+    """
+    history = model.eval_metrics(  # type: ignore[attr-defined]
+        Pool(x, y, **kwargs), metrics
+    )
+
+    summary = {}
+    for metric_name in metrics:
+        final_value = history[metric_name][-1]
+        summary[metric_name] = final_value
+        if verbose:
+            print(f"{metric_name}: {final_value}")
+
+    importance_table = model.get_feature_importance(  # type: ignore[attr-defined]
+        prettified=True
+    )
+    if verbose:
+        print(importance_table)
+    feature_ids = importance_table["Feature Id"]
+    feature_scores = importance_table["Importances"]
+    for feature_id, score in zip(feature_ids, feature_scores):
+        summary[f"Importance of {feature_id}"] = score
+
+    summary["Testing Sample Count"] = len(x)
+    print(summary)
+    return summary
+
+
+# Model persistence helpers: both resolve FILEPATH_MODEL relative to the given `path`.
+def write_model(model: CatBoost, path: Path) -> None:
+    """Save `model` to `path / FILEPATH_MODEL`, creating parent directories as needed."""
+    # `path.cwd()` is the classmethod `Path.cwd()`, so the `path` argument was ignored.
+    model_path = path / FILEPATH_MODEL
+    model_path.parent.mkdir(parents=True, exist_ok=True)
+    model.save_model(str(model_path))
+
+
+def load_model(
+    path: Path, model_type: Literal["classifier", "regressor"]
+) -> CatBoost:
+    """
+    Load the model stored at `path / FILEPATH_MODEL`.
+
+    model_type: either "classifier" or "regressor"; any other value raises ValueError.
+    """
+    if model_type not in ("classifier", "regressor"):
+        raise ValueError("model_type must be either 'classifier' or 'regressor'")
+    model = CatBoostClassifier() if model_type == "classifier" else CatBoostRegressor()
+
+    # As before, the value returned by `model.load_model(...)` is handed to the caller.
+    return model.load_model(str(path / FILEPATH_MODEL))
+
+
+class _BoostingModelTrainer(BaseEstimator):
+    """Shared fit/predict plumbing for the CatBoost trainer classes in this module."""
+
+    def __init__(
+        self,
+        iterations: int,
+        cat_features: Optional[List[str]],
+        text_features: Optional[List[str]],
+        embedding_features: Optional[List[str]],
+        early_stopping_rounds: int,
+        use_best_model: bool,
+        verbose: bool,
+        random_state: int,
+        loss_function: str,
+        eval_metric: str,
+    ):
+        self.iterations = iterations
+        self.cat_features = cat_features
+        self.text_features = text_features
+        self.embedding_features = embedding_features
+        self.early_stopping_rounds = early_stopping_rounds
+        self.use_best_model = use_best_model
+        self.verbose = verbose
+        self.random_state = random_state
+        self.loss_function = loss_function
+        self.eval_metric = eval_metric
+        # Placeholder: the subclasses in this module assign a CatBoost estimator here.
+        self.model_ = None
+
+    @property
+    def get_model(self):
+        if not self.model_:
+            raise NotFittedError("You must call `.fit()` first")
+        return self.model_
+
+    def fit(
+        self,
+        x: pd.DataFrame,
+        y: pd.Series,
+        stratify_by: Optional[pd.DataFrame] = None,
+        x_val: Optional[pd.DataFrame] = None,
+        y_val: Optional[pd.Series] = None,
+        **kwargs,
+    ) -> CatBoost:
+        """
+        Fit the wrapped model and return it, printing a short training summary.
+
+        If either x_val or y_val is missing, x/y are split with train_test_split
+        (passing stratify_by as `stratify`); otherwise all of x/y is used for training.
+        """
+        x = x.reset_index(drop=True)
+        y = y.reset_index(drop=True)
+        if x_val is not None and y_val is not None:
+            train_x, train_y = x, y
+        else:
+            train_x, x_val, train_y, y_val = train_test_split(
+                x, y, stratify=stratify_by
+            )
+
+        fitted = self.model_.fit(  # type: ignore[attr-defined]
+            train_x, train_y, eval_set=(x_val, y_val), **kwargs
+        )
+        self.model_ = fitted
+
+        # TODO (2024/07/03) @srose: Implement batched fitting if required
+
+        completed = min(
+            self.model_.best_iteration_  # type: ignore[attr-defined]
+            + self.early_stopping_rounds
+            + 1,
+            self.iterations,
+        )
+        report = {
+            "Iterations Completed": completed,
+            "Training Sample Count": len(train_x),
+        }
+        print(report)
+        return self.model_
+
+    def predict(self, x: pd.DataFrame) -> np.ndarray:
+        """Return the wrapped model's predictions for `x`."""
+        return self.model_.predict(x)  # type: ignore[attr-defined]
+
+
+class CBClassifierTrainer(_BoostingModelTrainer):
+    """
+    Trainer that wraps a `CatBoostClassifier`.
+
+    NOTE: this class is set up for binary classification
+    """
+
+    def __init__(
+        self,
+        iterations=10_000,
+        cat_features=None,
+        text_features=None,
+        embedding_features=None,
+        early_stopping_rounds=25,
+        use_best_model=True,
+        verbose=True,
+        random_state=42,
+        loss_function="Logloss",
+        eval_metric="AUC",  # TODO: Does this trigger early stopping or Loss Function?
+        **kwargs,
+    ):
+        super().__init__(
+            iterations=iterations,
+            cat_features=cat_features,
+            text_features=text_features,
+            embedding_features=embedding_features,
+            early_stopping_rounds=early_stopping_rounds,
+            use_best_model=use_best_model,
+            verbose=verbose,
+            random_state=random_state,
+            loss_function=loss_function,
+            eval_metric=eval_metric,
+        )
+        # Stored attributes plus any extra keyword arguments configure the estimator.
+        settings = dict(
+            iterations=self.iterations,
+            cat_features=self.cat_features,
+            text_features=self.text_features,
+            embedding_features=self.embedding_features,
+            early_stopping_rounds=self.early_stopping_rounds,
+            use_best_model=self.use_best_model,
+            verbose=self.verbose,
+            random_state=self.random_state,
+            loss_function=self.loss_function,
+            eval_metric=self.eval_metric,
+        )
+        self.model_ = CatBoostClassifier(**settings, **kwargs)
+
+
+class CBRegressorTrainer(_BoostingModelTrainer):
+    """Trainer that wraps a `CatBoostRegressor`."""
+
+    def __init__(
+        self,
+        iterations=10_000,
+        cat_features=None,
+        text_features=None,
+        embedding_features=None,
+        early_stopping_rounds=25,
+        use_best_model=True,
+        verbose=True,
+        random_state=42,
+        loss_function="RMSE",
+        eval_metric="RMSE",
+        **kwargs,
+    ):
+        super().__init__(
+            iterations=iterations,
+            cat_features=cat_features,
+            text_features=text_features,
+            embedding_features=embedding_features,
+            early_stopping_rounds=early_stopping_rounds,
+            use_best_model=use_best_model,
+            verbose=verbose,
+            random_state=random_state,
+            loss_function=loss_function,
+            eval_metric=eval_metric,
+        )
+
+        self.eval_metric = eval_metric  # same value the base initialiser already stored
+        # Stored attributes plus any extra keyword arguments configure the estimator.
+        settings = dict(
+            iterations=self.iterations,
+            cat_features=self.cat_features,
+            text_features=self.text_features,
+            embedding_features=self.embedding_features,
+            early_stopping_rounds=self.early_stopping_rounds,
+            use_best_model=self.use_best_model,
+            verbose=self.verbose,
+            random_state=self.random_state,
+            loss_function=self.loss_function,
+            eval_metric=self.eval_metric,
+        )
+        self.model_ = CatBoostRegressor(**settings, **kwargs)
+        
\ No newline at end of file
diff --git a/rosey/gbm_auto_feature_selection.py b/rosey/gbm_auto_feature_selection.py
new file mode 100644
index 0000000..67567f6
--- /dev/null
+++ b/rosey/gbm_auto_feature_selection.py
@@ -0,0 +1,277 @@
+"""
+Implement an automated feature selection pipeline for CatBoost models
+
+INPUTS:
+    - pd.DataFrame of raw data
+    - target variable name
+    - model type (classifier or regressor)
+
+OUTPUT:
+    - dictionary of feature name, feature importance, and feature type
+
+This process will include the following:
+    . Set assumptions around:
+        - What data types are feasible
+        - how much of a feature can be null
+        - Correlation threshold
+    . Ingest a pandas DataFrame of raw data
+    . Perform data cleaning/prep using generalized pipeline based on feature Dtypes
+        - Define strategy for processing text and categorical variables
+    . Drop features based on null data fraction
+    . Drop features based on correlation threshold
+    . Train a CatBoost model
+    . Use catboost.select_features() method
+"""
+
+import pandas as pd # type: ignore
+import numpy as np # type: ignore
+from gbm import CBClassifierTrainer, CBRegressorTrainer # type: ignore
+from sklearn.preprocessing import LabelEncoder # type: ignore
+
+# TODO: Build out complete list of Pandas Dtypes
+FEASIBLE_DTYPES = [  # NOTE(review): not referenced by the functions visible in this diff
+    "int64",
+    "float64",
+    "bool",
+    "datetime64",
+    "object"
+    ]
+
+# TODO: Is there a general rule of thumb for this threshold? (rosey/README.md quotes 0.8 - confirm which is intended)
+NULL_DATA_FRAC_THRESHOLD = 0.5  # columns with a null fraction above this are dropped by _null_data_fraction_filter
+
+# TODO: Find basis for assumption
+UNIQUE_VALUE_THRESHOLD = 0.9  # columns with a distinct-value ratio above this are dropped by _unique_value_filter
+
+# TODO: Find basis for assumption
+HIGH_CORR_THRESHOLD = 0.95  # passed by main() to _drop_highly_correlated_columns as `threshold`
+
+FILL_NULL_DICT = {  # replacement values for nulls, keyed by a dtype-like name
+    "int64": pd.NA, # TODO: Will pd.NA work here?
+    "float64": pd.NA, # TODO: Will pd.NA work here?
+    "object": "<unknown>",
+    "string": "<unknown>",
+    "date": "1900/1/1", # TODO: Should this be a date in the far past?
+    "bool": None,
+}
+
+# NOTE: MAX Number of features to keep by end of feature selection process
+TOP_K_FEATURES = 10
+
+def _null_data_fraction_filter(df: pd.DataFrame, column_name: str) -> tuple[pd.DataFrame, str]:
+    """
+    Drop `column_name` from `df` (in place) when its share of null values is greater
+    than NULL_DATA_FRAC_THRESHOLD; return `df` with the outcome: "kept" or "dropped".
+    """
+    null_count = df[column_name].isnull().sum()
+    too_sparse = null_count / len(df) > NULL_DATA_FRAC_THRESHOLD
+
+    if not too_sparse:
+        return df, "kept"
+
+    df.drop(columns=[column_name], inplace=True)
+    return df, "dropped"
+
+def _process_string(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
+    """
+    Normalise a text column of `df` in place and return `df`.
+
+    Nulls are replaced with FILL_NULL_DICT["string"] before the cast to str, because
+    `astype(str)` would otherwise turn them into literal "nan"/"None" text that
+    `fillna` can no longer see. Values are then stripped and lower-cased.
+    """
+    cleaned = df[column_name].fillna(FILL_NULL_DICT["string"])
+    df[column_name] = cleaned.astype(str).str.strip().str.lower()
+    # Return the frame: `fillna(..., inplace=True)` returns None, which the caller
+    # would otherwise have bound to its `df`.
+    return df
+
+def _process_numerics(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
+    """
+    Cast a numeric column of `df` to float and round it to 2 decimal places,
+    updating the column in place; return `df`.
+    """
+    # Store the rounded values back and return the whole frame: the caller rebinds
+    # its DataFrame to the result, and the original returned a bare Series.
+    df[column_name] = df[column_name].astype(float).round(2)
+    return df
+
+def _process_date(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
+    """
+    Parse `column_name` as datetimes and store it back as '%Y-%m-%d' strings;
+    return `df`. Null values are not filled by this function.
+    """
+    # Use the requested column (was hard-coded to 'date_column') and drop the
+    # `infer_datetime_format` flag, which pandas 2.x deprecates.
+    parsed = pd.to_datetime(df[column_name])
+    # NOTE: CatBoost models cannot handle datetime data types, so cast to String
+    df[column_name] = parsed.dt.strftime('%Y-%m-%d')
+    return df
+
+def _process_bool(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
+    """
+    Cast a boolean column of `df` to float in place (True -> 1.0, False -> 0.0)
+    and return `df`; the original returned only the converted Series.
+    """
+    df[column_name] = df[column_name].astype(float)
+    return df
+
+def _unique_value_filter(df: pd.DataFrame, column_name: str) -> tuple[pd.DataFrame, str]:
+    """
+    Drop `column_name` from `df` (in place) when its ratio of distinct values to rows
+    is greater than UNIQUE_VALUE_THRESHOLD; return `df` with "kept" or "dropped".
+    """
+    distinct_count = df[column_name].nunique()
+    mostly_unique = distinct_count / len(df) > UNIQUE_VALUE_THRESHOLD
+
+    if not mostly_unique:
+        return df, "kept"
+
+    df.drop(columns=[column_name], inplace=True)
+    return df, "dropped"
+
+def _remove_numeric_outliers(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
+    """
+    Remove rows whose `column_name` value is an outlier under the 1.5 * IQR rule.
+
+    PARAMS:
+        - DataFrame
+        - Column name to process
+    RETURNS:
+        - DataFrame without the rows that fall outside the IQR fences
+    """
+    values = df[column_name]
+    q1, q3 = values.quantile(0.25), values.quantile(0.75)
+    fence = 1.5 * (q3 - q1)
+
+    # Inclusive fences: with strict comparisons a column whose IQR is 0 (for
+    # example a constant column) lost every row. Null values are not outliers, so
+    # those rows are kept as well instead of failing the comparison.
+    keep = values.between(q1 - fence, q3 + fence) | values.isna()
+    return df[keep]
+
+def _catboost_data_preprocessing(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Process raw data for CatBoost model training.
+
+    Each column is dropped if it fails the null-fraction filter; otherwise it is
+    cleaned according to its dtype and then checked by the unique-value filter.
+    Numeric columns also go through outlier removal, which removes rows.
+    """
+    # Snapshot the names: `df` is rebound and columns are dropped inside the loop.
+    for column in list(df.columns):
+        df, column_state = _null_data_fraction_filter(df, column)
+        if column_state == "dropped":
+            print(f"Column {column} dropped due to high null data fraction")
+            continue
+
+        dtype = df[column].dtype
+        if dtype == "object":
+            df = _process_string(df, column)
+
+        elif dtype in ("float64", "int64"):
+            df = _process_numerics(df, column)
+            df = _remove_numeric_outliers(df, column)
+
+        # `dtype == "datetime64"` never matched unit-qualified dtypes such as
+        # datetime64[ns], so date columns always fell through to "not supported".
+        elif pd.api.types.is_datetime64_any_dtype(dtype):
+            df = _process_date(df, column)
+
+        elif dtype == "bool":
+            df = _process_bool(df, column)
+        else:
+            print(f"Data type {dtype} is not supported")
+
+        df, column_state = _unique_value_filter(df, column)
+        if column_state == "dropped":
+            print(f"Column {column} dropped due to high unique value frequency")
+
+    return df
+
+# TODO: Design high correlation feature removal function
+def _drop_highly_correlated_columns(
+    df: pd.DataFrame, features: list[str], threshold: float = 0.9
+) -> tuple[pd.DataFrame, list[str]]:
+    """
+    Drop features that are highly correlated with an earlier feature.
+
+    Object/category features are label-encoded on a copy so a correlation matrix can
+    be computed; a feature is dropped when its absolute correlation with any feature
+    listed before it exceeds `threshold`.
+
+    Parameters:
+        df (pd.DataFrame): The DataFrame containing the customer data.
+        features (list[str]): The list of features to be analyzed.
+        threshold (float): The threshold value for the correlation matrix.
+
+    Returns:
+        Tuple[pd.DataFrame, list[str]]: `df` without the dropped columns, and
+        `features` without the dropped names.
+    """
+    encoded = df[features].copy()
+    categorical_columns = encoded.select_dtypes(include=["object", "category"]).columns
+    for name in categorical_columns:
+        encoded[name] = LabelEncoder().fit_transform(encoded[name].astype(str))
+
+    abs_corr = encoded.corr().abs()
+    # Keep only the strict upper triangle so each pair is considered once.
+    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
+    pairwise = abs_corr.where(above_diagonal)
+    to_drop = [name for name in pairwise.columns if any(pairwise[name] > threshold)]
+    print(f"Columns to drop due to high correlation: {to_drop}")
+
+    remaining = [feature for feature in features if feature not in to_drop]
+    return df.drop(columns=to_drop), remaining
+
+def main(df: pd.DataFrame, required_features: list, target_column: str, model_type: str) -> dict:
+    """
+    Main function to run the automated feature selection pipeline
+
+    INPUTS:
+        - pd.DataFrame of raw data
+        - list of required features, that cannot be dropped (the list is not modified)
+        - target variable name
+        - model type (classifier or regressor)
+
+    OUTPUT:
+        - the value returned by the CatBoost model's `select_features` call
+    RAISES: ValueError if model_type is not 'classifier' or 'regressor'
+    """
+    trainers = {"classifier": CBClassifierTrainer, "regressor": CBRegressorTrainer}
+    if model_type not in trainers:
+        raise ValueError("Model type must be either 'classifier' or 'regressor'")
+
+    # Build a new list: appending to `required_features` mutated the caller's list.
+    protected = [*required_features, target_column]
+    candidates = _catboost_data_preprocessing(df.drop(columns=protected))
+    # The helper returns (frame, features); the original bound that tuple to `df`.
+    candidates, selectable = _drop_highly_correlated_columns(
+        candidates, list(candidates.columns), HIGH_CORR_THRESHOLD
+    )
+    # Inner join: preprocessing can remove rows, which must not come back as NaN rows.
+    df = pd.concat([df[protected], candidates], axis=1, join="inner")
+
+    x, y = df.drop(columns=[target_column]), df[target_column]
+    model = trainers[model_type]().fit(x, y)
+    # `select_features` needs data and selection arguments; `model` is already fitted, so skip the final refit.
+    n_keep = min(TOP_K_FEATURES, len(selectable))
+    return model.select_features(
+        x, y, features_for_select=selectable, num_features_to_select=n_keep, train_final_model=False,
+    )
+
+if __name__ == "__main__":
+
+    # Test Input Data: three identical feature columns plus a binary target
+    test_data = {
+        **{f"feature_{idx}": [1, 2, 3, 4, 5] for idx in (1, 2, 3)},
+        "target": [0, 1, 0, 1, 0],
+    }
+    df = pd.DataFrame(test_data)
+
+    # Settings for a pipeline run.
+    # NOTE(review): main() is not called in this block; only the inputs are built.
+    model_type, target_variable = "classifier", "target"
+
+    # Signals that the script body ran to completion.
+    print("Done")
\ No newline at end of file