Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions Titanic-Survival-Prediction/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Titanic Survival Predictor 🚢

A machine learning model that predicts whether a Titanic passenger would have survived, based on features like age, gender, class, and fare.

## Results
| Model | Accuracy |
|---|---|
| Logistic Regression | 81.01% |
| Random Forest | **82.12%** |

## Project Structure
```
Titanic-Survival-Prediction/
├── data/ # Raw dataset
├── explore.py # Data exploration & visualization
├── preprocess.py # Data cleaning & feature engineering
├── train.py # Model training & evaluation
├── predict.py # Make predictions on new passengers
└── requirements.txt # Dependencies
```

## Setup
```bash
pip install -r requirements.txt
```
## Dataset
Download the Titanic dataset from [here](https://github.com/datasciencedojo/datasets/blob/master/Titanic.csv), save it as `titanic.csv` (all lowercase — the scripts read `data/titanic.csv`), and place it inside the `data/` folder.

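## Usage
Train the models first (this saves the best one to `models/titanic_model.pkl`), then run a prediction with the saved model:
```bash
python train.py
python predict.py
```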

## What I learned
- Data cleaning and handling missing values
- Feature engineering and label encoding
- Training and comparing ML models
- Evaluating with confusion matrix and classification report
32 changes: 32 additions & 0 deletions Titanic-Survival-Prediction/explore.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw passenger data (expects the CSV at data/titanic.csv)
df = pd.read_csv('data/titanic.csv')

# --- Basic overview ---
print("Shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())
print("\nColumn info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
df.info()
print("\nMissing values:")
print(df.isnull().sum())

# --- Survival rate by key features ---
# One row of three panels: by sex, by passenger class, and by age.
fig, axes = plt.subplots(1, 3, figsize=(14, 4))

sns.barplot(x='Sex', y='Survived', data=df, ax=axes[0])
axes[0].set_title('Survival by Sex')

sns.barplot(x='Pclass', y='Survived', data=df, ax=axes[1])
axes[1].set_title('Survival by Passenger Class')

# Age is continuous, so show its distribution split by outcome instead of bars
sns.histplot(data=df, x='Age', hue='Survived', bins=30, ax=axes[2])
axes[2].set_title('Survival by Age')

plt.tight_layout()
# Write the image before plt.show(), so the file is saved while the figure
# is still intact.
plt.savefig('exploration.png')
plt.show()
print("\nChart saved as exploration.png")
23 changes: 23 additions & 0 deletions Titanic-Survival-Prediction/predict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import pandas as pd
import joblib

# Restore the trained classifier from disk
model = joblib.load('models/titanic_model.pkl')

# Feature values for one example passenger (edit these to try other cases)
features = {
    'Pclass': 3,      # 1=First, 2=Second, 3=Third class
    'Sex': 1,         # 1=Male, 0=Female
    'Age': 22,
    'SibSp': 1,       # siblings/spouses aboard
    'Parch': 0,       # parents/children aboard
    'Fare': 7.25,
    'Embarked': 2,    # 0=Cherbourg, 1=Queenstown, 2=Southampton
}
# The model takes a table, so wrap the single record in a one-row DataFrame
passenger = pd.DataFrame([features])

# Hard class label first, then the per-class probabilities for the same row
label = model.predict(passenger)[0]
class_probs = model.predict_proba(passenger)[0]

if label == 1:
    outcome = 'Survived'
else:
    outcome = 'Did not survive'
survival_prob = class_probs[1]
death_prob = class_probs[0]

print(f"Prediction: {outcome}")
print(f"Survival probability: {survival_prob:.2%}")
print(f"Death probability: {death_prob:.2%}")
30 changes: 30 additions & 0 deletions Titanic-Survival-Prediction/preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Fixed text -> integer codes. They equal what an alphabetical label encoding
# produces on the full Titanic data (female=0, male=1; C=0, Q=1, S=2), but
# because they are constants the result no longer depends on which
# categories happen to be present in the frame being cleaned.
_SEX_CODES = {'female': 0, 'male': 1}
_EMBARKED_CODES = {'C': 0, 'Q': 1, 'S': 2}


def _encode(column, codes):
    """Translate a text column to int64 codes, rejecting unknown values."""
    encoded = column.map(codes)
    unknown = column[encoded.isna()]
    if not unknown.empty:
        raise ValueError(
            f"Unrecognised {column.name} value(s): "
            f"{sorted(unknown.astype(str).unique())}"
        )
    return encoded.astype('int64')


def preprocess(df):
    """Clean a Titanic passenger table and make it fully numeric.

    Steps, in order:
      1. Drop PassengerId, Name, Ticket and Cabin when present.
      2. Fill missing Age values with the column median.
      3. Fill missing Embarked values with the most frequent port.
      4. Encode Sex (female=0, male=1) and Embarked (C=0, Q=1, S=2).

    Args:
        df: DataFrame with at least the Age, Sex and Embarked columns.
            It is not modified; a new frame is returned.

    Returns:
        A new DataFrame with the dropped columns removed and Sex/Embarked
        stored as int64 codes.

    Raises:
        ValueError: if Sex or Embarked holds a value outside the code tables.
    """
    # drop() returns a new frame, so the assignments below never touch the
    # caller's data. errors='ignore' lets frames that already lack these
    # columns be cleaned too.
    df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')

    # Median is less sensitive to outliers than the mean
    df['Age'] = df['Age'].fillna(df['Age'].median())

    # Most common embarkation port stands in for the missing entries
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

    # Models need numbers, not "male"/"female" or port letters
    df['Sex'] = _encode(df['Sex'], _SEX_CODES)
    df['Embarked'] = _encode(df['Embarked'], _EMBARKED_CODES)

    return df

if __name__ == '__main__':
    # Manual smoke check: clean the raw CSV and print a summary of the result
    cleaned = preprocess(pd.read_csv('data/titanic.csv'))

    print("Cleaned shape:", cleaned.shape)

    print("\nMissing values after cleaning:")
    remaining_gaps = cleaned.isna().sum()
    print(remaining_gaps)

    print("\nFirst 5 rows after cleaning:")
    preview = cleaned.head()
    print(preview)
6 changes: 6 additions & 0 deletions Titanic-Survival-Prediction/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pandas
numpy
scikit-learn
matplotlib
seaborn
joblib
52 changes: 52 additions & 0 deletions Titanic-Survival-Prediction/train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import os
from preprocess import preprocess

# Read the raw CSV and push it through the shared cleaning step
frame = preprocess(pd.read_csv('data/titanic.csv'))

# Target column vs. everything the models are allowed to learn from
y = frame['Survived']
X = frame.drop(columns=['Survived'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Model 1: Logistic Regression ---
lr = LogisticRegression(max_iter=200)
lr_preds = lr.fit(X_train, y_train).predict(X_test)
lr_acc = accuracy_score(y_test, lr_preds)

# --- Model 2: Random Forest ---
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_preds = rf.fit(X_train, y_train).predict(X_test)
rf_acc = accuracy_score(y_test, rf_preds)

# --- Compare ---
print(f"Logistic Regression Accuracy: {lr_acc:.2%}")
print(f"Random Forest Accuracy: {rf_acc:.2%}")

# Decide the winner once; a tie goes to the Random Forest
if rf_acc >= lr_acc:
    best_name, best_model, best_preds = "Random Forest", rf, rf_preds
else:
    best_name, best_model, best_preds = "Logistic Regression", lr, lr_preds
print(f"\nBest model: {best_name}")

# --- Detailed report for the winning model ---
print("\nClassification Report:")
print(classification_report(y_test, best_preds))
print("Confusion Matrix:")
print(confusion_matrix(y_test, best_preds))

# --- Persist the winner so it can be loaded later for predictions ---
os.makedirs('models', exist_ok=True)
joblib.dump(best_model, 'models/titanic_model.pkl')
print("\nModel saved to models/titanic_model.pkl")
Loading