{
"cells": [
{
"cell_type": "markdown",
"id": "fcaa9518",
"metadata": {},
"source": [
"## Import Libraries "
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "14066cad",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, fbeta_score"
]
},
{
"cell_type": "markdown",
"id": "c71eebbb",
"metadata": {},
"source": [
"## Import Data "
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "ca76ad9c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"No. of records are: (891, 4)\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Fare | \n",
" Age | \n",
" Sex | \n",
" Survived | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 7.2500 | \n",
" 22.0 | \n",
" male | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 71.2833 | \n",
" 38.0 | \n",
" female | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 7.9250 | \n",
" 26.0 | \n",
" female | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 53.1000 | \n",
" 35.0 | \n",
" female | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" 8.0500 | \n",
" 35.0 | \n",
" male | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Fare Age Sex Survived\n",
"0 7.2500 22.0 male 0\n",
"1 71.2833 38.0 female 1\n",
"2 7.9250 26.0 female 1\n",
"3 53.1000 35.0 female 1\n",
"4 8.0500 35.0 male 0"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = pd.read_csv(\"titanic-data.csv\")\n",
"data = data[['Fare', 'Age', 'Sex', 'Survived']]\n",
"print(\"No. of records are:\", data.shape)\n",
"data.head()"
]
},
{
"cell_type": "markdown",
"id": "c0638e9c",
"metadata": {},
"source": [
"## PreProcessing"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "448cac1a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Fare | \n",
" Age | \n",
" Sex | \n",
" Survived | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 7.2500 | \n",
" 22.0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 71.2833 | \n",
" 38.0 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 7.9250 | \n",
" 26.0 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 53.1000 | \n",
" 35.0 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" 8.0500 | \n",
" 35.0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Fare Age Sex Survived\n",
"0 7.2500 22.0 0 0\n",
"1 71.2833 38.0 1 1\n",
"2 7.9250 26.0 1 1\n",
"3 53.1000 35.0 1 1\n",
"4 8.0500 35.0 0 0"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def replace_string(value):\n",
" if value == \"male\":\n",
" return 0\n",
" if value == \"female\":\n",
" return 1\n",
" \n",
"data['Sex'] = data['Sex'].apply(replace_string)\n",
"data['Age'] = data['Age'].fillna(data['Age'].mean())\n",
"data['Age'].describe()\n",
"\n",
"data.head()"
]
},
{
"cell_type": "markdown",
"id": "24adc435",
"metadata": {},
"source": [
"## Model Training "
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "1ef98aa8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RandomForestClassifier()"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X = data.drop(['Survived'], axis = 1)\n",
"y = data['Survived']\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42, test_size=0.3)\n",
"\n",
"rc = RandomForestClassifier()\n",
"rc.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c681e3f",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "d60c4263",
"metadata": {},
"source": [
"## Model Predictions "
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "2ea7df32",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Actual_Target | \n",
" Predicted_Target | \n",
" Predicted_Probability | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0.050000 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0.068333 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 0.315230 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 0 | \n",
" 0.030000 | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" 0 | \n",
" 0.080000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Actual_Target Predicted_Target Predicted_Probability\n",
"0 0 0 0.050000\n",
"1 0 0 0.068333\n",
"2 0 0 0.315230\n",
"3 1 0 0.030000\n",
"4 1 0 0.080000"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predictions = rc.predict(X_test)\n",
"predictions_prob = rc.predict_proba(X_test)\n",
"\n",
"y_test = y_test.reset_index(drop = True)\n",
"\n",
"result = pd.DataFrame()\n",
"result['Actual_Target'] = y_test\n",
"result['Predicted_Target'] = predictions\n",
"result['Predicted_Probability'] = predictions_prob[:, 1]\n",
"result.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b6de461b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 268.000000\n",
"mean 0.392465\n",
"std 0.345726\n",
"min 0.000000\n",
"25% 0.064125\n",
"50% 0.314282\n",
"75% 0.722500\n",
"max 1.000000\n",
"Name: Predicted_Probability, dtype: float64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result.Predicted_Probability.describe()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "843cb524",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 167\n",
"1 101\n",
"Name: Predicted_Target, dtype: int64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result['Predicted_Target'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "346d6ae4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(101, 3)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result[result['Predicted_Probability'] > 0.5].shape"
]
},
{
"cell_type": "markdown",
"id": "3a2d10c4",
"metadata": {},
"source": [
"## Confusion Matrix "
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "b14b3b16",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" Actual_Target | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" Predicted_Target | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 135 | \n",
" 32 | \n",
"
\n",
" \n",
" 1 | \n",
" 30 | \n",
" 71 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Actual_Target 0 1\n",
"Predicted_Target \n",
"0 135 32\n",
"1 30 71"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.crosstab(result['Predicted_Target'], result['Actual_Target'])"
]
},
{
"cell_type": "markdown",
"id": "e030bbbe",
"metadata": {},
"source": [
"## Custom Threshold "
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "07d11e27",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Actual_Target | \n",
" Predicted_Target | \n",
" Predicted_Probability | \n",
" Custom_Target | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0.050000 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0.068333 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 0.315230 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 0 | \n",
" 0.030000 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" 0 | \n",
" 0.080000 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Actual_Target Predicted_Target Predicted_Probability Custom_Target\n",
"0 0 0 0.050000 0\n",
"1 0 0 0.068333 0\n",
"2 0 0 0.315230 0\n",
"3 1 0 0.030000 0\n",
"4 1 0 0.080000 0"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result['Custom_Target'] = result['Predicted_Probability'].apply(lambda r: 1 if r > 0.6 else 0)\n",
"result.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "c1e542a0",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" Actual_Target | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" Predicted_Target | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 135 | \n",
" 32 | \n",
"
\n",
" \n",
" 1 | \n",
" 30 | \n",
" 71 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Actual_Target 0 1\n",
"Predicted_Target \n",
"0 135 32\n",
"1 30 71"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.crosstab(result['Predicted_Target'], result['Actual_Target'])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "b2f5f7ca",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" Actual_Target | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" Custom_Target | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 145 | \n",
" 37 | \n",
"
\n",
" \n",
" 1 | \n",
" 20 | \n",
" 66 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Actual_Target 0 1\n",
"Custom_Target \n",
"0 145 37\n",
"1 20 66"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.crosstab(result['Custom_Target'], result['Actual_Target'])"
]
},
{
"cell_type": "markdown",
"id": "e158c35d",
"metadata": {},
"source": [
"## Calculating Metrics - Predicted Target "
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "1d3c7ae5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0.7029702970297029,\n",
" 0.6893203883495146,\n",
" 0.7686567164179104,\n",
" 0.6960784313725491)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tp = result[(result['Actual_Target'] == 1) & (result['Predicted_Target'] == 1)].shape[0]\n",
"fn = result[(result['Actual_Target'] == 1) & (result['Predicted_Target'] == 0)].shape[0]\n",
"tn = result[(result['Actual_Target'] == 0) & (result['Predicted_Target'] == 0)].shape[0]\n",
"fp = result[(result['Actual_Target'] == 0) & (result['Predicted_Target'] == 1)].shape[0]\n",
"\n",
"Accuracy = (tp + tn) / (tp + fn + tn + fp)\n",
"Precision = (tp) / (tp + fp)\n",
"Recall = (tp) / (tp + fn)\n",
"F1Score = 2*Precision*Recall/(Precision+Recall)\n",
"\n",
"Precision, Recall, Accuracy, F1Score"
]
},
{
"cell_type": "markdown",
"id": "f7d73a5c",
"metadata": {},
"source": [
"## Calculating Metrics - Custom Target "
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "8c9bc7bd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0.7674418604651163,\n",
" 0.6407766990291263,\n",
" 0.7873134328358209,\n",
" 0.6984126984126984)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tp = result[(result['Actual_Target'] == 1) & (result['Custom_Target'] == 1)].shape[0]\n",
"fn = result[(result['Actual_Target'] == 1) & (result['Custom_Target'] == 0)].shape[0]\n",
"tn = result[(result['Actual_Target'] == 0) & (result['Custom_Target'] == 0)].shape[0]\n",
"fp = result[(result['Actual_Target'] == 0) & (result['Custom_Target'] == 1)].shape[0]\n",
"\n",
"Accuracy = (tp + tn) / (tp + fn + tn + fp)\n",
"Precision = (tp) / (tp + fp)\n",
"Recall = (tp) / (tp + fn)\n",
"F1Score = 2*Precision*Recall/(Precision+Recall)\n",
"\n",
"Precision, Recall, Accuracy, F1Score"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5352c933",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}