Grow-with-Open-Source · zain-cs · Jul 2, 2026
diff --git a/Titanic-Survival-Prediction/README.md b/Titanic-Survival-Prediction/README.md
@@ -0,0 +1,38 @@
+# Titanic Survival Predictor 🚢
+
+A machine learning model that predicts whether a Titanic passenger would have survived, based on features like age, gender, class, and fare.
+
+## Results
+| Model | Accuracy |
+|---|---|
+| Logistic Regression | 81.01% |
+| Random Forest | **82.12%** |
+
+## Project Structure
+```
+Titanic-Survival-Prediction/
+├── data/               # Raw dataset
+├── explore.py          # Data exploration & visualization
+├── preprocess.py       # Data cleaning & feature engineering
+├── train.py            # Model training & evaluation
+├── predict.py          # Make predictions on new passengers
+└── requirements.txt    # Dependencies
+```
+
+## Setup
+```bash
+pip install -r requirements.txt
+```
+## Dataset
+Download `Titanic.csv` from [here](https://github.com/datasciencedojo/datasets/blob/master/Titanic.csv) and save it as `data/titanic.csv` — the scripts read exactly that lowercase path, which matters on case-sensitive filesystems.
+
+## Usage
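+```bash
+python train.py     # trains both models and saves the best one to models/titanic_model.pkl
+python predict.py   # loads the saved model and predicts for the example passenger
+```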
+
+## What I learned
+- Data cleaning and handling missing values
+- Feature engineering and label encoding
+- Training and comparing ML models
+- Evaluating with confusion matrix and classification report
diff --git a/Titanic-Survival-Prediction/explore.py b/Titanic-Survival-Prediction/explore.py
@@ -0,0 +1,32 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+# Load the data
+df = pd.read_csv('data/titanic.csv')
+
+# --- Basic overview ---
+print("Shape:", df.shape)
+print("\nFirst 5 rows:")
+print(df.head())
+print("\nColumn info:")
+print(df.info())
+print("\nMissing values:")
+print(df.isnull().sum())
+
+# --- Survival rate by key features ---
+fig, axes = plt.subplots(1, 3, figsize=(14, 4))
+
+sns.barplot(x='Sex', y='Survived', data=df, ax=axes[0])
+axes[0].set_title('Survival by Sex')
+
+sns.barplot(x='Pclass', y='Survived', data=df, ax=axes[1])
+axes[1].set_title('Survival by Passenger Class')
+
+sns.histplot(data=df, x='Age', hue='Survived', bins=30, ax=axes[2])
+axes[2].set_title('Survival by Age')
+
+plt.tight_layout()
+plt.savefig('exploration.png')
+plt.show()
+print("\nChart saved as exploration.png")
diff --git a/Titanic-Survival-Prediction/predict.py b/Titanic-Survival-Prediction/predict.py
@@ -0,0 +1,23 @@
+import pandas as pd
+import joblib
+
+# Load the saved model
+model = joblib.load('models/titanic_model.pkl')
+
+# Example passenger (you can change these values)
+passenger = pd.DataFrame([{
+    'Pclass':   3,      # 1=First, 2=Second, 3=Third class
+    'Sex':      1,      # 1=Male, 0=Female
+    'Age':      22,
+    'SibSp':    1,      # siblings/spouses aboard
+    'Parch':    0,      # parents/children aboard
+    'Fare':     7.25,
+    'Embarked': 2       # 0=Cherbourg, 1=Queenstown, 2=Southampton
+}])
+
+result = model.predict(passenger)[0]
+probability = model.predict_proba(passenger)[0]
+
+print(f"Prediction: {'Survived' if result == 1 else 'Did not survive'}")
+print(f"Survival probability:  {probability[1]:.2%}")
+print(f"Death probability:     {probability[0]:.2%}")
diff --git a/Titanic-Survival-Prediction/preprocess.py b/Titanic-Survival-Prediction/preprocess.py
@@ -0,0 +1,30 @@
+import pandas as pd
+from sklearn.preprocessing import LabelEncoder
+
+def preprocess(df):
+    # Drop columns that are useless for prediction
+    df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
+
+    # Fill missing Age with median (more robust than mean)
+    df['Age'] = df['Age'].fillna(df['Age'].median())
+
+    # Fill missing Embarked with most common value
+    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
+
+    # Convert Sex and Embarked from text to numbers
+    # Machine learning models only understand numbers, not "male"/"female"
+    le = LabelEncoder()
+    df['Sex'] = le.fit_transform(df['Sex'])         # male=1, female=0
+    df['Embarked'] = le.fit_transform(df['Embarked']) # S=2, C=0, Q=1
+
+    return df
+
+if __name__ == '__main__':
+    df = pd.read_csv('data/titanic.csv')
+    df = preprocess(df)
+
+    print("Cleaned shape:", df.shape)
+    print("\nMissing values after cleaning:")
+    print(df.isnull().sum())
+    print("\nFirst 5 rows after cleaning:")
+    print(df.head())
diff --git a/Titanic-Survival-Prediction/requirements.txt b/Titanic-Survival-Prediction/requirements.txt
@@ -0,0 +1,6 @@
+pandas
+numpy
+scikit-learn
+matplotlib
+seaborn
+joblib
diff --git a/Titanic-Survival-Prediction/train.py b/Titanic-Survival-Prediction/train.py
@@ -0,0 +1,52 @@
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+import joblib
+import os
+from preprocess import preprocess
+
+# Load and preprocess
+df = pd.read_csv('data/titanic.csv')
+df = preprocess(df)
+
+# Split features and target
+X = df.drop(columns=['Survived'])  # everything except what we're predicting
+y = df['Survived']                 # what we're predicting
+
+# 80% for training, 20% for testing
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+# --- Model 1: Logistic Regression ---
+lr = LogisticRegression(max_iter=200)
+lr.fit(X_train, y_train)
+lr_preds = lr.predict(X_test)
+lr_acc = accuracy_score(y_test, lr_preds)
+
+# --- Model 2: Random Forest ---
+rf = RandomForestClassifier(n_estimators=100, random_state=42)
+rf.fit(X_train, y_train)
+rf_preds = rf.predict(X_test)
+rf_acc = accuracy_score(y_test, rf_preds)
+
+# --- Compare ---
+print(f"Logistic Regression Accuracy: {lr_acc:.2%}")
+print(f"Random Forest Accuracy:       {rf_acc:.2%}")
+
+# Pick the better model
+best_model = rf if rf_acc >= lr_acc else lr
+best_name = "Random Forest" if rf_acc >= lr_acc else "Logistic Regression"
+print(f"\nBest model: {best_name}")
+
+# --- Detailed report for best model ---
+best_preds = rf_preds if rf_acc >= lr_acc else lr_preds
+print("\nClassification Report:")
+print(classification_report(y_test, best_preds))
+print("Confusion Matrix:")
+print(confusion_matrix(y_test, best_preds))
+
+# --- Save the best model ---
+os.makedirs('models', exist_ok=True)
+joblib.dump(best_model, 'models/titanic_model.pkl')
+print("\nModel saved to models/titanic_model.pkl")