Lab Manual for scikit-learn with Various Datasets


Lab 1: Housing Price Prediction (Regression)

Dataset: California Housing Dataset (the Boston Housing dataset was removed from scikit-learn 1.2 and is no longer available via load_boston)

Objective: Build a regression model to predict housing prices.

Instructions:

  1. Load and Explore Dataset:

     from sklearn.datasets import fetch_california_housing
     import pandas as pd
    
     # Load dataset (load_boston was removed in scikit-learn 1.2,
     # so the built-in California Housing dataset is used instead)
     housing = fetch_california_housing()
     X = pd.DataFrame(housing.data, columns=housing.feature_names)
     y = housing.target
    
     # Display dataset information
     print(X.head())
     print(f"Target variable (median house values): {y[:5]}")
    
  2. Train-Test Split and Model Training:

     from sklearn.model_selection import train_test_split
     from sklearn.linear_model import LinearRegression
     from sklearn.metrics import mean_squared_error
    
     # Split dataset
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
     # Train model
     model = LinearRegression()
     model.fit(X_train, y_train)
    
     # Make predictions
     y_pred = model.predict(X_test)
    
     # Evaluate model
     mse = mean_squared_error(y_test, y_pred)
     print(f"Mean Squared Error: {mse:.2f}")
    

Notes:

  • Linear Regression is used here to predict continuous values (house prices).

  • Mean Squared Error (MSE) is used to evaluate the performance of regression models.

Exercise:

  • Try other regression algorithms such as Ridge or Lasso.

  • Implement feature scaling and observe its impact on model performance (see the sketch below).
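
A minimal sketch of both exercise ideas, chaining StandardScaler and Ridge in a single pipeline; it reuses X_train, X_test, y_train, and y_test from step 2, and alpha=1.0 is an arbitrary starting value rather than a tuned choice:

     from sklearn.pipeline import make_pipeline
     from sklearn.preprocessing import StandardScaler
     from sklearn.linear_model import Ridge
     from sklearn.metrics import mean_squared_error
    
     # Scale features, then fit a regularized (Ridge) regression
     ridge_pipeline = make_pipeline(StandardScaler(), Ridge(alpha=1.0))
     ridge_pipeline.fit(X_train, y_train)
    
     # Compare this MSE against the plain LinearRegression result above
     ridge_mse = mean_squared_error(y_test, ridge_pipeline.predict(X_test))
     print(f"Ridge (scaled) MSE: {ridge_mse:.2f}")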


Lab 2: Titanic Survival Prediction (Classification)

Dataset: Titanic Dataset

Objective: Build a classification model to predict passenger survival.

Instructions:

  1. Load and Preprocess Dataset:

     import pandas as pd
    
     # Load dataset
     df = pd.read_csv('https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv')
    
     # Preprocess data (this CSV lacks the Ticket, Cabin, and Embarked
     # columns found in the Kaggle version of the Titanic data)
     df = df.drop('Name', axis=1)
     df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    
     # Handle missing values
     df = df.fillna(df.mean())
    
     X = df.drop('Survived', axis=1)
     y = df['Survived']
    
  2. Train-Test Split and Model Training:

     from sklearn.model_selection import train_test_split
     from sklearn.ensemble import RandomForestClassifier
     from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
    
     # Split dataset
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
     # Train model
     model = RandomForestClassifier(n_estimators=100, random_state=42)
     model.fit(X_train, y_train)
    
     # Make predictions
     y_pred = model.predict(X_test)
    
     # Evaluate model
     accuracy = accuracy_score(y_test, y_pred)
     cm = confusion_matrix(y_test, y_pred)
     cr = classification_report(y_test, y_pred)
    
     print(f'Accuracy: {accuracy:.2f}')
     print('Confusion Matrix:\n', cm)
     print('Classification Report:\n', cr)
    

Notes:

  • The Titanic dataset is used for binary classification (survived vs. not survived).

  • Random Forest is a robust algorithm for classification tasks.

  • Evaluation metrics include accuracy, confusion matrix, and classification report.

Exercise:

  • Try different classifiers like LogisticRegression or SVC.

  • Experiment with feature engineering and observe the impact on model performance (see the sketch below).
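
A minimal sketch of both exercise ideas: an illustrative FamilySize feature (built from the sibling/spouse and parent/child counts as they are named in this CSV; adjust the column names if your copy differs) and a LogisticRegression baseline. It reuses X and y from step 1:

     from sklearn.model_selection import train_test_split
     from sklearn.linear_model import LogisticRegression
     from sklearn.metrics import accuracy_score
    
     # Engineer a simple feature; 'FamilySize' is an illustrative name
     X_fe = X.copy()
     X_fe['FamilySize'] = X_fe['Siblings/Spouses Aboard'] + X_fe['Parents/Children Aboard']
    
     X_train, X_test, y_train, y_test = train_test_split(X_fe, y, test_size=0.3, random_state=42)
    
     # max_iter raised so the solver converges on unscaled features
     logreg = LogisticRegression(max_iter=1000)
     logreg.fit(X_train, y_train)
     print(f"LogisticRegression accuracy: {accuracy_score(y_test, logreg.predict(X_test)):.2f}")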


Lab 3: Wine Classification

Dataset: Wine Recognition Dataset (scikit-learn's built-in load_wine; not the UCI Wine Quality dataset)

Objective: Classify wines by cultivar using various machine learning algorithms.

Instructions:

  1. Load and Explore Dataset:

     from sklearn.datasets import load_wine
     import pandas as pd
    
     # Load dataset
     wine = load_wine()
     X = pd.DataFrame(wine.data, columns=wine.feature_names)
     y = wine.target
    
     # Display dataset information
     print(X.head())
     print(f"Target variable (wine classes): {y[:5]}")
    
  2. Train-Test Split and Model Training:

     from sklearn.model_selection import train_test_split
     from sklearn.ensemble import GradientBoostingClassifier
     from sklearn.metrics import accuracy_score, classification_report
    
     # Split dataset
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
     # Train model
     model = GradientBoostingClassifier(n_estimators=100)
     model.fit(X_train, y_train)
    
     # Make predictions
     y_pred = model.predict(X_test)
    
     # Evaluate model
     accuracy = accuracy_score(y_test, y_pred)
     cr = classification_report(y_test, y_pred)
    
     print(f'Accuracy: {accuracy:.2f}')
     print('Classification Report:\n', cr)
    

Notes:

  • The Wine Recognition dataset is a multi-class problem whose targets are three grape cultivars, not quality ratings.

  • Gradient Boosting is effective for handling complex classification problems.

Exercise:

  • Compare the performance of Gradient Boosting with other classifiers such as KNeighborsClassifier.

  • Investigate feature importance provided by the model (see the sketch below).
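
A minimal sketch of both exercise ideas, reading the fitted model's feature_importances_ and fitting a KNeighborsClassifier as a point of comparison; it reuses model, X, and the train/test split from step 2:

     import pandas as pd
     from sklearn.neighbors import KNeighborsClassifier
     from sklearn.metrics import accuracy_score
    
     # Feature importances learned by the gradient boosting model
     importances = pd.Series(model.feature_importances_, index=X.columns)
     print(importances.sort_values(ascending=False).head())
    
     # Baseline comparison with k-nearest neighbors (k=5, the default)
     knn = KNeighborsClassifier(n_neighbors=5)
     knn.fit(X_train, y_train)
     print(f"KNN accuracy: {accuracy_score(y_test, knn.predict(X_test)):.2f}")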


Lab 4: Breast Cancer Detection (Classification)

Dataset: Breast Cancer Wisconsin (Diagnostic) Dataset

Objective: Classify breast cancer as malignant or benign.

Instructions:

  1. Load and Explore Dataset:

     from sklearn.datasets import load_breast_cancer
     import pandas as pd
    
     # Load dataset
     cancer = load_breast_cancer()
     X = pd.DataFrame(cancer.data, columns=cancer.feature_names)
     y = cancer.target
    
     # Display dataset information
     print(X.head())
     print(f"Target variable (cancer classes): {y[:5]}")
    
  2. Train-Test Split and Model Training:

     from sklearn.model_selection import train_test_split
     from sklearn.svm import SVC
     from sklearn.metrics import accuracy_score, classification_report
    
     # Split dataset
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
     # Train model
     model = SVC(kernel='linear')
     model.fit(X_train, y_train)
    
     # Make predictions
     y_pred = model.predict(X_test)
    
     # Evaluate model
     accuracy = accuracy_score(y_test, y_pred)
     cr = classification_report(y_test, y_pred)
    
     print(f'Accuracy: {accuracy:.2f}')
     print('Classification Report:\n', cr)
    

Notes:

  • The Breast Cancer dataset is a binary classification problem.

  • Support Vector Machines (SVM) with a linear kernel can be effective for this type of classification task.

Exercise:

  • Try different kernels for SVM (poly, rbf).

  • Experiment with scaling features and observe its impact on performance (see the sketch below).
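
A minimal sketch combining both exercise ideas, scaling features before an RBF-kernel SVC; it reuses the train/test split from step 2, and the default C and gamma values are kept rather than tuned:

     from sklearn.pipeline import make_pipeline
     from sklearn.preprocessing import StandardScaler
     from sklearn.svm import SVC
     from sklearn.metrics import accuracy_score
    
     # RBF kernels are sensitive to feature ranges, so scaling usually helps
     rbf_pipeline = make_pipeline(StandardScaler(), SVC(kernel='rbf'))
     rbf_pipeline.fit(X_train, y_train)
     print(f"RBF SVC (scaled) accuracy: {accuracy_score(y_test, rbf_pipeline.predict(X_test)):.2f}")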


Lab 5: Customer Segmentation (Clustering)

Dataset: Mall Customer Segmentation Dataset

Objective: Use clustering to segment customers based on their spending behavior.

Instructions:

  1. Load and Explore Dataset:

     import pandas as pd
    
     # Load dataset
     df = pd.read_csv('https://raw.githubusercontent.com/siddhantbhattarai/ML-Datasets/main/Mall_Customers.csv')
    
     # Preprocess data
     X = df[['Annual Income (k$)', 'Spending Score (1-100)']]
    
     # Display dataset information
     print(X.head())
    
  2. Apply K-Means Clustering:

     from sklearn.cluster import KMeans
     import matplotlib.pyplot as plt
    
     # Find optimal number of clusters using Elbow Method
     wcss = []
     for i in range(1, 11):
         kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=42)
         kmeans.fit(X)
         wcss.append(kmeans.inertia_)
    
     # Plot the Elbow graph
     plt.plot(range(1, 11), wcss)
     plt.title('Elbow Method')
     plt.xlabel('Number of clusters')
     plt.ylabel('WCSS')
     plt.show()
    
     # Apply K-Means with the optimal number of clusters (e.g., 5)
     kmeans = KMeans(n_clusters=5, init='k-means++', max_iter=300, n_init=10, random_state=42)
     y_kmeans = kmeans.fit_predict(X)
    
     # Plot clusters
     plt.scatter(X.iloc[y_kmeans == 0, 0], X.iloc[y_kmeans == 0, 1], s=100, c='red', label='Cluster 1')
     plt.scatter(X.iloc[y_kmeans == 1, 0], X.iloc[y_kmeans == 1, 1], s=100, c='blue', label='Cluster 2')
     plt.scatter(X.iloc[y_kmeans == 2, 0], X.iloc[y_kmeans == 2, 1], s=100, c='green', label='Cluster 3')
     plt.scatter(X.iloc[y_kmeans == 3, 0], X.iloc[y_kmeans == 3, 1], s=100, c='cyan', label='Cluster 4')
     plt.scatter(X.iloc[y_kmeans == 4, 0], X.iloc[y_kmeans == 4, 1], s=100, c='magenta', label='Cluster 5')
     plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='yellow', label='Centroids')
     plt.title('Customer Clusters')
     plt.xlabel('Annual Income (k$)')
     plt.ylabel('Spending Score (1-100)')
     plt.legend()
     plt.show()
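
A complementary check on the number of clusters chosen from the elbow plot is the silhouette score, where higher values indicate better-separated clusters; a minimal sketch, reusing X from step 1:

     from sklearn.cluster import KMeans
     from sklearn.metrics import silhouette_score
    
     # Silhouette score for each candidate k, as a complement to the elbow plot
     for k in range(2, 11):
         labels = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42).fit_predict(X)
         print(f"k={k}: silhouette score = {silhouette_score(X, labels):.3f}")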