diff --git a/Titanic-Survival-Prediction/README.md b/Titanic-Survival-Prediction/README.md new file mode 100644 index 0000000..a6a810d --- /dev/null +++ b/Titanic-Survival-Prediction/README.md @@ -0,0 +1,38 @@ +# Titanic Survival Predictor 🚢 + +A machine learning model that predicts whether a Titanic passenger would have survived, based on features like age, gender, class, and fare. + +## Results +| Model | Accuracy | +|---|---| +| Logistic Regression | 81.01% | +| Random Forest | **82.12%** | + +## Project Structure +``` +titanic-survival-predictor/ +├── data/ # Raw dataset +├── explore.py # Data exploration & visualization +├── preprocess.py # Data cleaning & feature engineering +├── train.py # Model training & evaluation +├── predict.py # Make predictions on new passengers +└── requirements.txt # Dependencies +``` + +## Setup +```bash +pip install -r requirements.txt +``` +## Dataset +Download the dataset from [here](https://github.com/datasciencedojo/datasets/blob/master/Titanic.csv) and save it as `data/titanic.csv` (lowercase — the scripts load exactly that path). 
"""Print a quick overview of the Titanic dataset and save survival charts."""
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
df = pd.read_csv('data/titanic.csv')

# --- Basic overview ---
print("Shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())
print("\nColumn info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would add a stray "None" line to the output.
df.info()
print("\nMissing values:")
print(df.isnull().sum())

# --- Survival rate by key features ---
fig, axes = plt.subplots(1, 3, figsize=(14, 4))

sns.barplot(x='Sex', y='Survived', data=df, ax=axes[0])
axes[0].set_title('Survival by Sex')

sns.barplot(x='Pclass', y='Survived', data=df, ax=axes[1])
axes[1].set_title('Survival by Passenger Class')

sns.histplot(data=df, x='Age', hue='Survived', bins=30, ax=axes[2])
axes[2].set_title('Survival by Age')

plt.tight_layout()
# Save before show(): the figure is written to disk first, then displayed.
plt.savefig('exploration.png')
plt.show()
print("\nChart saved as exploration.png")
# Fixed category codes. They equal what a LabelEncoder fitted on the full
# dataset produces (labels sorted alphabetically), but they do not change
# when a frame happens to lack one of the categories.
_SEX_CODES = {'female': 0, 'male': 1}
_EMBARKED_CODES = {'C': 0, 'Q': 1, 'S': 2}

# Identifier-like columns that are dropped before modelling.
_DROPPED_COLUMNS = ['PassengerId', 'Name', 'Ticket', 'Cabin']


def _encode_categories(column, codes):
    """Map *column* to int64 codes via *codes*, rejecting unmapped values."""
    encoded = column.map(codes)
    unmapped = column[encoded.isna()]
    if not unmapped.empty:
        raise ValueError(
            f"Cannot encode column {column.name!r}: unexpected values "
            f"{sorted(map(str, unmapped.unique()))}; expected one of {sorted(codes)}"
        )
    return encoded.astype('int64')


def preprocess(df):
    """Return a cleaned, fully numeric copy of a Titanic passenger frame.

    Steps:
      * drop the PassengerId, Name, Ticket and Cabin columns when present;
      * fill missing ``Age`` values with the column median;
      * fill missing ``Embarked`` values with the most common port;
      * encode ``Sex`` (female=0, male=1) and ``Embarked`` (C=0, Q=1, S=2).

    The input frame is not modified.

    Args:
        df: DataFrame containing at least ``Age``, ``Sex`` and ``Embarked``.

    Returns:
        A new DataFrame with the columns above dropped/filled/encoded.

    Raises:
        KeyError: if ``Age``, ``Sex`` or ``Embarked`` is absent.
        ValueError: if ``Sex`` or ``Embarked`` holds a value that has no code
            (including an ``Embarked`` column with no known value at all).
    """
    # errors='ignore' lets frames without the identifier columns (e.g. a
    # hand-built passenger record) pass through; drop() returns a new frame,
    # so the assignments below never touch the caller's data.
    df = df.drop(columns=_DROPPED_COLUMNS, errors='ignore')

    # Fill missing Age with median (more robust than mean)
    df['Age'] = df['Age'].fillna(df['Age'].median())

    # Fill missing Embarked with most common value. mode() is empty when every
    # value is missing, so guard the [0] lookup instead of raising IndexError.
    embarked_mode = df['Embarked'].mode()
    if not embarked_mode.empty:
        df['Embarked'] = df['Embarked'].fillna(embarked_mode[0])

    # Convert Sex and Embarked from text to numbers with a fixed mapping so
    # training and later predictions always agree on the codes.
    df['Sex'] = _encode_categories(df['Sex'], _SEX_CODES)
    df['Embarked'] = _encode_categories(df['Embarked'], _EMBARKED_CODES)

    return df
"""Train two Titanic survival classifiers, compare them, and save the better one."""
import os

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

from preprocess import preprocess

# Read the raw CSV and run it through the shared cleaning step.
passengers = preprocess(pd.read_csv('data/titanic.csv'))

# Target is the Survived column; every other column is a feature.
y = passengers['Survived']
X = passengers.drop(columns=['Survived'])

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Candidate 1: logistic regression ---
lr = LogisticRegression(max_iter=200)
lr.fit(X_train, y_train)
lr_preds = lr.predict(X_test)
lr_acc = accuracy_score(y_test, lr_preds)

# --- Candidate 2: random forest ---
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)
rf_acc = accuracy_score(y_test, rf_preds)

# --- Side-by-side accuracy ---
print(f"Logistic Regression Accuracy: {lr_acc:.2%}")
print(f"Random Forest Accuracy: {rf_acc:.2%}")

# Choose the winner once; a tie goes to the random forest.
if rf_acc >= lr_acc:
    best_name, best_model, best_preds = "Random Forest", rf, rf_preds
else:
    best_name, best_model, best_preds = "Logistic Regression", lr, lr_preds
print(f"\nBest model: {best_name}")

# --- Detailed metrics for the winner ---
print("\nClassification Report:")
print(classification_report(y_test, best_preds))
print("Confusion Matrix:")
print(confusion_matrix(y_test, best_preds))

# --- Persist the winner ---
os.makedirs('models', exist_ok=True)
joblib.dump(best_model, 'models/titanic_model.pkl')
print("\nModel saved to models/titanic_model.pkl")