RETRO21
A collection of AI and Machine Learning lab practicals (Prolog and Python), compiled for learning purposes.
Practical 1: Water Jug Problem (Prolog)
% Initial state
start((0, 0)).
% Goal state
goal((2, 0)).
% Move rules
move((X, Y), (5, Y)) :- X < 5. % Fill 5L jug
move((X, Y), (X, 4)) :- Y < 4. % Fill 4L jug
move((X, Y), (0, Y)) :- X > 0. % Empty 5L jug
move((X, Y), (X, 0)) :- Y > 0. % Empty 4L jug
% Pour from 5L to 4L
move((X, Y), (NX, NY)) :-
    X > 0, Y < 4,
    T is min(X, 4 - Y),
    NX is X - T,
    NY is Y + T.
% Pour from 4L to 5L
move((X, Y), (NX, NY)) :-
    Y > 0, X < 5,
    T is min(Y, 5 - X),
    NY is Y - T,
    NX is X + T.
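The clauses above define the state space but not a search driver. As a cross-check, here is a minimal Python breadth-first search over the same 5L/4L state space, assuming the same start (0, 0) and goal (2, 0); water_jug_bfs is a name introduced here for illustration, not part of the Prolog practical:
from collections import deque

def water_jug_bfs(start=(0, 0), goal=(2, 0), cap=(5, 4)):
    # Breadth-first search over (x, y) jug states; returns a shortest path to the goal
    queue = deque([[start]])
    visited = {start}
    while queue:
        path = queue.popleft()
        x, y = path[-1]
        if (x, y) == goal:
            return path
        pour_xy = min(x, cap[1] - y)  # amount pourable 5L -> 4L
        pour_yx = min(y, cap[0] - x)  # amount pourable 4L -> 5L
        # Same six moves as the Prolog rules: fill either jug, empty either jug, pour either way
        for state in [(cap[0], y), (x, cap[1]), (0, y), (x, 0),
                      (x - pour_xy, y + pour_xy), (x + pour_yx, y - pour_yx)]:
            if state not in visited:
                visited.add(state)
                queue.append(path + [state])
    return None

print(water_jug_bfs())
# [(0, 0), (5, 0), (1, 4), (1, 0), (0, 1), (5, 1), (2, 4), (2, 0)]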
Practical 2: Tic-Tac-Toe Game (Prolog)
display_board(Board) :-
    nl,
    display_row(Board, 1),
    display_row(Board, 2),
    display_row(Board, 3),
    nl.
display_row(Board, Row) :-
    display_cell(Board, Row, 1), write('|'),
    display_cell(Board, Row, 2), write('|'),
    display_cell(Board, Row, 3),
    nl,
    % Only print a separator between rows, not after the last one
    ( Row < 3 -> write('--|--|--'), nl ; true ).
display_cell(Board, Row, Col) :-
    member(cell(Row, Col, Player), Board),
    write(Player), !.
display_cell(_, _, _) :-
    write(' ').
win(Player, Board, Row, Col) :-
    ( member(cell(Row, 1, Player), Board),
      member(cell(Row, 2, Player), Board),
      member(cell(Row, 3, Player), Board)
    ; member(cell(1, Col, Player), Board),
      member(cell(2, Col, Player), Board),
      member(cell(3, Col, Player), Board)
    ; member(cell(1, 1, Player), Board),
      member(cell(2, 2, Player), Board),
      member(cell(3, 3, Player), Board)
    ; member(cell(1, 3, Player), Board),
      member(cell(2, 2, Player), Board),
      member(cell(3, 1, Player), Board)
    ).
game_over(Board, Row, Col) :-
    ( win('X', Board, Row, Col)
    ; win('O', Board, Row, Col)
    ; length(Board, 9)
    ).
make_move(Player, Row, Col, Board, NewBoard) :-
    \+ member(cell(Row, Col, _), Board),
    append(Board, [cell(Row, Col, Player)], NewBoard).
play :-
    play('X', []).
play(Player, Board) :-
    display_board(Board),
    ( game_over(Board, Row, Col) ->
        ( win('X', Board, Row, Col) -> write('X wins!\n')
        ; win('O', Board, Row, Col) -> write('O wins!\n')
        ; write('It\'s a draw!\n')
        )
    ; ( Player = 'X' -> write('Player X\'s turn\n') ; write('Player O\'s turn\n') ),
      write('Enter your move (row and column): '),
      read(Row), read(Col),
      ( (Row >= 1, Row =< 3, Col >= 1, Col =< 3) ->
          ( make_move(Player, Row, Col, Board, NewBoard) ->
              switch_player(Player, NextPlayer),
              play(NextPlayer, NewBoard)
          ; write('Invalid move. Try again\n'),
            play(Player, Board)
          )
      ; write('Invalid input. Row and Column must be between 1 and 3.\n'),
        play(Player, Board)
      )
    ).
switch_player('X','O').
switch_player('O','X').
Practical 3: Implementation of 8-puzzle problem using hill climbing (Prolog)
% Start and goal states
start(1/2/3/4/8/0/7/6/5).
goal(1/2/3/4/5/6/7/8/0).
% Move definitions
move(1/2/3/4/8/0/7/6/5, down, 1/2/3/4/8/5/7/6/0, 1).
move(1/2/3/4/8/5/7/6/0, left, 1/2/3/4/8/5/7/0/6, 1).
move(1/2/3/4/8/5/7/0/6, up, 1/2/3/4/0/5/7/8/6, 1).
move(1/2/3/4/0/5/7/8/6, right, 1/2/3/4/5/0/7/8/6, 1).
move(1/2/3/4/5/0/7/8/6, down, 1/2/3/4/5/6/7/8/0, 1).
% Perform sequence of moves and track cost
solve :-
    start(S0),
    move(S0, M1, S1, C1),
    write('Move: '), write(M1), write(' -> '), write(S1), write(', Cost: '), write(C1), nl,
    move(S1, M2, S2, C2),
    C12 is C1 + C2,
    write('Move: '), write(M2), write(' -> '), write(S2), write(', Cost: '), write(C12), nl,
    move(S2, M3, S3, C3),
    C123 is C12 + C3,
    write('Move: '), write(M3), write(' -> '), write(S3), write(', Cost: '), write(C123), nl,
    move(S3, M4, S4, C4),
    C1234 is C123 + C4,
    write('Move: '), write(M4), write(' -> '), write(S4), write(', Cost: '), write(C1234), nl,
    move(S4, M5, S5, C5),
    TotalCost is C1234 + C5,
    write('Move: '), write(M5), write(' -> '), write(S5), write(', Cost: '), write(TotalCost), nl,
    goal(S5),
    write('Goal reached! Total Cost = '), write(TotalCost), nl.
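The Prolog program above replays one hand-coded move sequence rather than searching. For comparison, here is a sketch in Python of hill climbing proper on the same start and goal, using a misplaced-tiles heuristic. Plain steepest-ascent stalls immediately on this instance because no first move strictly improves the heuristic, so the sketch allows sideways moves across plateaus and skips already-visited states; all names here (heuristic, neighbors, hill_climb) are introduced for illustration:
# Hill climbing on the 8-puzzle; states are 9-tuples read row by row, 0 is the blank
START = (1, 2, 3, 4, 8, 0, 7, 6, 5)
GOAL = (1, 2, 3, 4, 5, 6, 7, 8, 0)

def heuristic(state):
    # Number of misplaced tiles (the blank is not counted)
    return sum(1 for s, g in zip(state, GOAL) if s != g and s != 0)

def neighbors(state):
    # Slide the blank up/down/left/right where possible
    i = state.index(0)
    r, c = divmod(i, 3)
    for dr, dc in ((-1, 0), (1, 0), (0, -1), (0, 1)):
        nr, nc = r + dr, c + dc
        if 0 <= nr < 3 and 0 <= nc < 3:
            j = nr * 3 + nc
            s = list(state)
            s[i], s[j] = s[j], s[i]
            yield tuple(s)

def hill_climb(state):
    visited = {state}
    path = [state]
    while state != GOAL:
        # Consider unvisited neighbours only; equal-h (sideways) moves are allowed
        candidates = [s for s in neighbors(state) if s not in visited]
        if not candidates or heuristic(min(candidates, key=heuristic)) > heuristic(state):
            return path  # stuck in a local optimum or dead end
        state = min(candidates, key=heuristic)
        visited.add(state)
        path.append(state)
    return path

for s in hill_climb(START):
    print(s, "h =", heuristic(s))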
Practical 4: Introduction to Python Programming
Part 1: DataTypes, If-else and Functions
# 1. What is 2 to the power of 10?
print("1:", 2**10) # 1024
# 2. Declare n1=10, n2=20, n3=30 and display 'sum of 10 and 20 is 30' using format()
n1, n2, n3 = 10, 20, 30
print("2: sum of {} and {} is {}".format(n1, n2, n3))
# 3. Split the string into a list
str1 = "SIESCOMS Sector-5 Plot-1E Nerul 200706"
print("3:", str1.split())
# 4. Display 'Nerul' from the split string
print("4:", str1.split()[3])
# 5. Split string and create a list of colleges, display 'SIESCOMS'
str3 = "SI- ESCOMS&VESIT&MET&STERLING&BVIT"
colleges = str3.split('&')
print("5:", "SIESCOMS") # Assuming 'SIESCOMS' should be printed directly
# 6. Format planet and diameter
planet = "Earth"
diameter = 12742
print("6: The diameter of {} is {} kilometers.".format(planet, diameter))
# 7. Extract the word "hello" from a nested dictionary
d = {'key1': [1, 2, 3, {'key2': ['this', 'is', ['a', 'tricky', 'hello']]}]}
print("7:", d['key1'][3]['key2'][2][2])
# 8. Grab the domain from email
def get_domain(email):
    return email.split('@')[-1]
print("8:", get_domain("xyz@sies.edu.in"))
# 9. Count number of times the word "dog" occurs in a string
def count_dogs(text):
    return text.lower().split().count("dog")
essay = """The dog is a pet animal. A dog has sharp teeth so that it can eat flesh very easily. A dog has four legs, two ears, two eyes, a tail, a mouth, and a nose. A dog is a very clever animal and is very useful in catching thieves. A dog runs very fast, barks loudly and attacks the strangers. A dog saves the life of the master from danger. Dog are a very faithful animal. Usually, the dog eats fish, meat, milk, rice, bread, etc. Dogs are sometimes called canines. The lifespan of a dog is very small however it can live around 12-15 years long which depend on their size such as smaller dogs lives a longer life. A female dog gives birth to a baby and feed milk that's why dogs under the mammal category. The dog baby is called a puppy or pup and dog home is called kennel."""
print("9: Number of times 'dog' occurs:", count_dogs(essay))
# 10. Speeding ticket function with birthday consideration
def speeding_ticket(speed, is_birthday):
    # A birthday grants a 5 mph allowance on each threshold
    allowance = 5 if is_birthday else 0
    if speed <= 60 + allowance:
        return "No Ticket"
    elif speed <= 80 + allowance:
        return "Small Ticket"
    else:
        return "Big Ticket"
print("10:", speeding_ticket(70, False)) # Small Ticket
print("10:", speeding_ticket(81, True)) # Small Ticket
Part 2: NumPy
# Import NumPy
import numpy as np
# 1. Create an array of 10 zeros
zeros_array = np.zeros(10)
print("1:", zeros_array)
# 2. Create an array of 10 ones
ones_array = np.ones(10)
print("2:", ones_array)
# 3. Create an array of 10 fives
fives_array = np.full(10, 5)
print("3:", fives_array)
# 4. Create an array of integers from 10 to 50
arr_10_to_50 = np.arange(10, 51)
print("4:", arr_10_to_50)
# 5. Create an array of even integers from 10 to 50
even_arr = np.arange(10, 51, 2)
print("5:", even_arr)
# 6. Create a 3x3 matrix with values from 0 to 8
matrix_3x3 = np.arange(9).reshape(3, 3)
print("6:\n", matrix_3x3)
# 7. Create a 3x3 identity matrix
identity_matrix = np.eye(3)
print("7:\n", identity_matrix)
# 8. Generate a random number between 0 and 1
rand_num = np.random.rand()
print("8:", rand_num)
# 9. Generate an array of 25 random numbers from a standard normal distribution
rand_array_25 = np.random.randn(25)
print("9:", rand_array_25)
# 10. Create an array of 20 linearly spaced points between 0 and 1
linspace_20 = np.linspace(0, 1, 20)
print("10:", linspace_20)
# 11. Create the given 5x5 matrix
mat = np.arange(1, 26).reshape(5, 5)
print("11:\n", mat)
# 12. Get the sum of all values in mat
sum_mat = mat.sum()
print("12: Sum of all values:", sum_mat)
# 13. Get the standard deviation of the values in mat
std_mat = mat.std()
print("13: Standard deviation:", std_mat)
# 14. Get the sum of all the columns in mat
col_sum = mat.sum(axis=0)
print("14: Column-wise sum:", col_sum)
Part 3: Pandas
# Import required libraries
import pandas as pd
import numpy as np
# Basic DataFrame creation
df1 = pd.DataFrame({'Numbers': [1, 2, 3, 4, 5]})
print("Basic DataFrame:")
print(df1)
# DataFrame with custom index
df2 = pd.DataFrame({'Numbers': [1, 2, 3, 4, 5]},
index=['one', 'two', 'three', 'four', 'five'])
print("\nDataFrame with custom index:")
print(df2)
# Create DataFrame with multiple columns
data = {
'Name': ['Tom', 'Jack', 'Steve', 'Ricky'],
'Age': [28, 34, 29, 42],
'Mobile': [1234, 5678, 9876, 5432]
}
df4 = pd.DataFrame(data)
print("\nDataFrame with multiple columns:")
print(df4)
# Display specific columns
print("\nDisplay Names:")
print(df4['Name'])
# Display specific row
print("\nDisplay Jack's data:")
print(df4.loc[df4['Name'] == 'Jack'])
# Display multiple columns
print("\nDisplay name and mobile:")
print(df4[['Name', 'Mobile']])
# Create DataFrame with custom index
data = {
'Name': ['Tom', 'Jack', 'Steve', 'Ricky', 'Greg'],
'Age': [28, 34, 29, 42, 54],
'Mobile': [1234, 5678, 9876, 5432, 5555]
}
df5 = pd.DataFrame(data, index=['A', 'B', 'C', 'D', 'E'])
print("\nDataFrame with custom index:")
print(df5)
# Add new columns
df5['m1'] = [55, 78, 90, 89, 78]
df5['m2'] = [85, 89, 79, 80, 89]
print("\nAfter adding marks columns:")
print(df5)
# Calculate total
df5['Total'] = df5['m1'] + df5['m2']
print("\nAfter adding total:")
print(df5)
# Add remarks column
df5['remarks'] = ['F', 'P', 'P', 'P', 'P']
print("\nAfter adding remarks:")
print(df5)
# Remove column
df5 = df5.drop('remarks', axis=1)
print("\nAfter removing remarks:")
print(df5)
# Remove row
df5 = df5.drop(index='D')
print("\nAfter removing row D:")
print(df5)
# Check DataFrame shape
print("\nDataFrame shape:", df5.shape)
Part 4: Data Visualization
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Line Plot Example
height = [0, 100, 200, 300, 400, 500]
temperature = [30, 28, 25, 22, 20, 18]
plt.plot(height, temperature)
plt.xlabel("Height (m)")
plt.ylabel("Temperature (°C)")
plt.title("Temperature vs Height")
plt.show()
# Date-wise Temperature Plot
date = ["25/12", "26/12", "27/12"]
temp = [8.5, 10.5, 6.8]
plt.plot(date, temp)
plt.xlabel("Date")
plt.ylabel("Temperature (°C)")
plt.title("Date wise Temperature")
plt.grid(True)
plt.show()
# Weight vs Height Plot
height = [121.9, 124.5, 129.5, 134.6, 139.7, 147.3, 152.4, 157.5, 162.6]
weight = [19.7, 21.3, 23.5, 25.9, 28.5, 32.1, 35.7, 39.6, 43.2]
plt.plot(weight, height, marker='*', markersize=10, color='green',
linewidth=2, linestyle='dashed')
plt.xlabel("Weight (kg)")
plt.ylabel("Height (cm)")
plt.title("Average Weight vs Height")
plt.show()
# Pie Chart Example
df = pd.DataFrame({
'Category': ['A', 'B', 'C', 'D'],
'Values': [20, 30, 25, 25]
})
df.plot(kind='pie', y='Values', labels=df['Category'],
autopct='%1.2f%%', figsize=(6,6))
plt.title("Custom Pie Chart")
plt.show()
Note: For the data visualization examples that require CSV files (like salary_data.csv, Marks.csv, etc.), you'll need to have these files in your working directory. The paths in the code should be adjusted according to your file locations.
Practical 5: Perceptron algorithm for OR Function
import numpy as np
class Perceptron:
    def __init__(self, learning_rate=0.01, n_iterations=100):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        # Make sure targets are 0/1
        y_ = np.array([1 if i > 0 else 0 for i in y])
        for _ in range(self.n_iterations):
            for idx, x_i in enumerate(X):
                linear_output = np.dot(x_i, self.weights) + self.bias
                y_predicted = self.activation_function(linear_output)
                # Perceptron learning rule: shift weights toward misclassified points
                update = self.learning_rate * (y_[idx] - y_predicted)
                self.weights += update * x_i
                self.bias += update

    def activation_function(self, x):
        # Unit step function
        return np.where(x >= 0, 1, 0)

    def predict(self, X):
        linear_output = np.dot(X, self.weights) + self.bias
        y_predicted = self.activation_function(linear_output)
        return y_predicted
# OR gate inputs and outputs
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([0, 1, 1, 1])
# Initialize and train the perceptron
perceptron = Perceptron(learning_rate=0.1, n_iterations=10)
perceptron.fit(X, y)
# Test the perceptron
predictions = perceptron.predict(X)
print("Predictions:", predictions)
# Expected output: [0 1 1 1]
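As a quick sanity check you can also inspect the learned parameters; with these settings (zero-initialized weights, learning_rate=0.1) the perceptron settles at weights of roughly [0.1 0.1] and a bias of about -0.1, a line that separates (0, 0) from the other three OR inputs:
print("Learned weights:", perceptron.weights, "Bias:", perceptron.bias)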
Practical 6: Improve prediction accuracy using Stochastic Gradient Descent
import numpy as np
def compute_error_for_line_given_points(b, m, points):
    """Calculate mean squared error for a line defined by slope (m) and intercept (b)

    Args:
        b (float): y-intercept
        m (float): slope
        points (numpy.array): Array of [x, y] coordinates

    Returns:
        float: Mean squared error
    """
    totalError = 0
    for i in range(len(points)):
        x = points[i, 0]
        y = points[i, 1]
        totalError += (y - (m * x + b)) ** 2
    return totalError / float(len(points))

def step_gradient(b_current, m_current, points, learningRate):
    """Calculate one step of gradient descent

    Args:
        b_current (float): Current y-intercept
        m_current (float): Current slope
        points (numpy.array): Array of [x, y] coordinates
        learningRate (float): Step size for gradient descent

    Returns:
        list: Updated b and m values
    """
    b_gradient = 0
    m_gradient = 0
    N = float(len(points))
    for i in range(len(points)):
        x = points[i, 0]
        y = points[i, 1]
        b_gradient += -(2 / N) * (y - ((m_current * x) + b_current))
        m_gradient += -(2 / N) * x * (y - ((m_current * x) + b_current))
    new_b = b_current - (learningRate * b_gradient)
    new_m = m_current - (learningRate * m_gradient)
    return [new_b, new_m]

def gradient_descent_runner(points, starting_b, starting_m, learning_rate, num_iterations):
    """Run gradient descent algorithm

    Args:
        points (numpy.array): Array of [x, y] coordinates
        starting_b (float): Initial y-intercept
        starting_m (float): Initial slope
        learning_rate (float): Step size for gradient descent
        num_iterations (int): Number of iterations to run

    Returns:
        list: Final b and m values
    """
    b = starting_b
    m = starting_m
    # Print initial error
    print(f"Starting gradient descent at b = {b}, m = {m}, "
          f"error = {compute_error_for_line_given_points(b, m, points)}")
    for i in range(num_iterations):
        b, m = step_gradient(b, m, points, learning_rate)
    return [b, m]

def run():
    """Main function to run linear regression"""
    try:
        points = np.genfromtxt("data.csv", delimiter=",")
        learning_rate = 0.0001
        initial_b = 0  # initial y-intercept guess
        initial_m = 0  # initial slope guess
        num_iterations = 1000
        print("Running...")
        [b, m] = gradient_descent_runner(points, initial_b, initial_m,
                                         learning_rate, num_iterations)
        print(f"After {num_iterations} iterations b = {b}, "
              f"m = {m}, error = {compute_error_for_line_given_points(b, m, points)}")
    except FileNotFoundError:
        print("Error: Could not find data.csv file.")
    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == '__main__':
    run()
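Note that step_gradient above accumulates the gradient over every point before updating b and m, which is batch gradient descent. To match the practical's title, a per-sample (stochastic) variant might look like the following sketch; sgd_step is a hypothetical name, and the 2 * error terms come from differentiating the squared error used in compute_error_for_line_given_points:
def sgd_step(b, m, points, learning_rate):
    # One stochastic pass: update b and m after every single point,
    # visiting the points in a fresh random order each call
    for i in np.random.permutation(len(points)):
        x, y = points[i, 0], points[i, 1]
        error = y - (m * x + b)
        b += learning_rate * 2 * error       # -d/db of (y - (mx + b))^2
        m += learning_rate * 2 * error * x   # -d/dm of (y - (mx + b))^2
    return b, m
Swapping sgd_step in for step_gradient inside gradient_descent_runner gives noisier individual steps but typically reaches a good fit in fewer passes over the data.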
Practical 7: Implement Adaline algorithm for AND operation
import numpy as np
class Adaline:
    def __init__(self, input_size, learning_rate=0.1, epochs=100):
        self.weights = np.zeros(input_size)
        self.bias = 0
        self.learning_rate = learning_rate
        self.epochs = epochs

    def activation(self, x):
        # Linear activation (identity function)
        return x

    def predict(self, X):
        # Compute the linear output
        return self.activation(np.dot(X, self.weights) + self.bias)

    def train(self, X, y):
        # Train the model using Adaline's learning rule (Least Mean Squares)
        for epoch in range(self.epochs):
            for i in range(len(X)):
                # Calculate the prediction
                prediction = self.predict(X[i])
                # Compute the error
                error = y[i] - prediction
                # Update the weights and bias
                self.weights += self.learning_rate * error * X[i]
                self.bias += self.learning_rate * error

    def evaluate(self, X):
        # Make predictions for the input X
        return np.where(self.predict(X) >= 0.5, 1, 0)  # Convert to binary output
# AND operation input and output
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) # Input pairs
y = np.array([0, 0, 0, 1]) # AND outputs
# Initialize Adaline model with 2 input features (for A and B), learning rate, and epochs
adaline = Adaline(input_size=2, learning_rate=0.1, epochs=100)
# Train the Adaline model
adaline.train(X, y)
# Evaluate the trained model on the same inputs (X)
predictions = adaline.evaluate(X)
print("Predictions on the AND operation:")
for i, prediction in enumerate(predictions):
    print(f"Input: {X[i]} => Predicted: {prediction} => Actual: {y[i]}")
Practical 8: Feature Selection, Normalization, Standardization, and PCA
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
# Sample dataset
data = {
'Math': [85, 90, 88, 60, 76, 95],
'Science': [80, 85, 89, 65, 70, 100],
'English': [78, 85, 80, 70, 75, 90],
'Computer': [92, 96, 94, 65, 78, 98],
'Passed': [1, 1, 1, 0, 0, 1]
}
df = pd.DataFrame(data)
print("Original Data:\n", df)
# Feature extraction
X = df.drop('Passed', axis=1)
y = df['Passed']
# Feature selection
selector = SelectKBest(score_func=f_classif, k=2)
X_new = selector.fit_transform(X, y)
print("\nSelected Features (Top 2):\n", X_new)
# Normalization
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)
print("\nNormalized Data:\n", X_normalized)
# Standardization
standardizer = StandardScaler()
X_transformed = standardizer.fit_transform(X)
print("\nStandardized Data:\n", X_transformed)
# PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_transformed)
print("\nPCA Result:\n", X_pca)
Practical 9: Logistic Regression for Survival Prediction
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report
# Loading the data
train = pd.read_csv('titanic.csv')
print("First few rows of the dataset:")
print(train.head())
# Analyzing Missing Data
percent_missing = train.isnull().sum() * 100 / len(train)
result = pd.DataFrame({'cols': train.columns, 'percent_missing': percent_missing})
result.sort_values('percent_missing', inplace=True)
print("\nMissing Data Analysis:")
print(result)
# Analyzing Survival Counts
survival_counts = train['Survived'].value_counts()
print("\nSurvival Counts (0 = Not Survived, 1 = Survived):")
print(survival_counts)
# Data Cleaning: drop the text columns so every remaining feature is numeric,
# then remove rows with missing values
print("\nDataset Info:")
print(train.info())
train.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)
train.dropna(inplace=True)
print("\nDataset after dropping categorical features:")
print(train.head())
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(
train.drop('Survived', axis=1),
train['Survived'],
test_size=0.30,
random_state=101
)
# Training and Predicting
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)
predictions = logmodel.predict(X_test)
print("\nPredictions:")
print(predictions)
# Evaluation
confusion_matrix = metrics.confusion_matrix(y_test, predictions)
cm_display = metrics.ConfusionMatrixDisplay(
confusion_matrix=confusion_matrix,
display_labels=[0, 1]
)
cm_display.plot()
plt.show()
# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, predictions))
Practical 10: Bank Customers Retirement Predictions using SVM
Step 1: Import Libraries and Load Data
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the dataset
bank_df = pd.read_csv('Bank_Customer_retirement.csv')
# Display basic information
print("Dataset Shape:", bank_df.shape)
print("\nFirst few rows:")
print(bank_df.head())
# Visualize the data
sns.pairplot(bank_df, hue='Retire', vars=['Age', 'Savings'])
plt.show()
Step 2: Data Preprocessing
# Drop Customer ID column
bank_df = bank_df.drop(['Customer ID'], axis=1)
# Prepare features and target
X = bank_df.drop(['Retire'], axis=1)
y = bank_df['Retire']
# Standardize the features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_data_scaled = scaler.fit_transform(X)
# Split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
X_data_scaled, y, test_size=0.20, random_state=101
)
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)
Step 3: Model Training and Evaluation with Different Kernels
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Linear Kernel
print("Linear Kernel Results:")
svmmodel1 = SVC(kernel='linear')
svmmodel1.fit(X_train, y_train)
y_pred = svmmodel1.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True)
plt.show()
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
# Polynomial Kernel
print("\nPolynomial Kernel Results:")
svmmodel2 = SVC(kernel='poly')
svmmodel2.fit(X_train, y_train)
y_pred = svmmodel2.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True)
plt.show()
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
# RBF Kernel
print("\nRBF Kernel Results:")
svmmodel3 = SVC(kernel='rbf')
svmmodel3.fit(X_train, y_train)
y_pred = svmmodel3.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True)
plt.show()
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
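The three blocks above differ only in the kernel argument, so a compact loop, shown here as an optional refactoring rather than part of the original practical, collects the accuracies side by side:
# Compare all three kernels in one pass
for kernel in ['linear', 'poly', 'rbf']:
    model = SVC(kernel=kernel).fit(X_train, y_train)
    print(f"{kernel} kernel accuracy:", accuracy_score(y_test, model.predict(X_test)))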
Practical 11: Elbow K-means Clustering for Indian States/UTs
Step 1: Import Libraries and Load Data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.cluster import KMeans
# Load the dataset
data = pd.read_csv("India StatesUTs.csv")
# Display the dataset
print("Dataset:")
print(data)
# Display the shape of the dataset
print("\nDataset Shape:", data.shape)
Step 2: Prepare Data and Apply K-means
# Select features for clustering
x = data.iloc[:, 1:3] # Selecting Longitude and Latitude columns
# Apply K-means clustering and get cluster assignments in one step
kmeansmodel = KMeans(n_clusters=5)
identified_clusters = kmeansmodel.fit_predict(x)
print("\nCluster Assignments:")
print(identified_clusters)
# Add cluster information to the dataset
data_with_clusters = data.copy()
data_with_clusters['Cluster'] = identified_clusters
print("\nData with Clusters:")
print(data_with_clusters)
Step 3: Visualize Clusters
# Plot the clusters
plt.figure(figsize=(10, 6))
plt.scatter(data_with_clusters['Longitude'],
data_with_clusters['Latitude'],
c=data_with_clusters['Cluster'],
cmap='brg',
s=200)
plt.xlim(50, 100)
plt.ylim(0, 50)
plt.title('Clusters of Indian States/UTs')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.show()
Step 4: Elbow Method for Optimal Clusters
# Calculate WCSS for different numbers of clusters
wcss = []
for i in range(1, 7):
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(x)
    wcss_iter = kmeans.inertia_
    wcss.append(wcss_iter)
print("\nWithin-Cluster Sum of Squares (WCSS):")
print(wcss)
# Plot the elbow curve
plt.figure(figsize=(10, 6))
number_clusters = range(1, 7)
plt.plot(number_clusters, wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('Within-cluster Sum of Squares')
plt.show()
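Reading the elbow off the plot is subjective. As a rough numeric aid, not part of the original practical, you can print the percentage drop in WCSS between successive values of k and look for where it flattens:
# Percentage drop in WCSS from k to k+1
for k, (a, b) in enumerate(zip(wcss, wcss[1:]), start=1):
    print(f"k={k} -> k={k+1}: WCSS drops by {100 * (a - b) / a:.1f}%")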
Practical 12: Random Forest and Bagging Implementation
Part 1: Random Forest Implementation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.metrics import accuracy_score, classification_report
# Load and explore the dataset
df = pd.read_csv("playgolf.csv")
print("Dataset Preview:")
print(df.head(15))
print("\nDataset Info:")
print(df.info())
print("\nDataset Description:")
print(df.describe())
# Analyze categorical columns
categorical_col = []
for column in df.columns:
    categorical_col.append(column)
    print(f"{column} : {df[column].unique()}")
    print("====================================")
# Check target variable distribution
print("\nPlayGolf Distribution:")
print(df.PlayGolf.value_counts())
# Prepare categorical columns for encoding
categorical_col.remove('PlayGolf')
# Encode categorical variables
label = LabelEncoder()
for column in categorical_col:
    df[column] = label.fit_transform(df[column])
print("\nEncoded Dataset:")
print(df)
# Split the data
X = df.drop('PlayGolf', axis=1)
y = df.PlayGolf
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train Random Forest model
RandomForestmodel = RandomForestClassifier(n_estimators=10)
RandomForestmodel.fit(X_train, y_train)
# Apply Bagging
n_estimators = 10
bagging_classifier = BaggingClassifier(base_estimator=RandomForestmodel,
n_estimators=n_estimators)
# Train and evaluate
bagging_classifier.fit(X_train, y_train)
y_pred = bagging_classifier.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print(f"\nAccuracy Score: {accuracy_score(y_test, y_pred) * 100:.2f}%")
Part 2: Ensemble Bagging with Voting
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
# Create sample dataset
def CreateDataFrame(N):
    columns = ['a', 'b', 'c', 'y']
    df = pd.DataFrame(columns=columns)
    for i in range(N):
        a = np.random.randint(10)
        b = np.random.randint(20)
        c = np.random.randint(5)
        # Label depends on the sum of the three features
        y = "normal"
        if (a + b + c) > 25:
            y = "high"
        elif (a + b + c) < 12:
            y = "low"
        df.loc[i] = [a, b, c, y]
    return df
# Generate and prepare data
df = CreateDataFrame(200)
print("Generated Dataset Preview:")
print(df.head())
X = df[["a", "b", "c"]]
Y = df[["y"]]
# Encode target variable
le = LabelEncoder()
y = le.fit_transform(Y.values.ravel())  # flatten the single-column frame to 1-D
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Initialize base models
dtcmodel = DecisionTreeClassifier(criterion="entropy")
lrmodel = LogisticRegression()
bnbmodel = BernoulliNB()
gnbmodel = GaussianNB()
svcmodel = SVC()
# Train and evaluate base models with bagging
base_methods = [dtcmodel, lrmodel, bnbmodel, gnbmodel, svcmodel]
for bm in base_methods:
    print(f"\nMethod: {bm}")
    bag_model = BaggingClassifier(base_estimator=bm, n_estimators=100, bootstrap=True)
    bag_model.fit(X_train, y_train)
    ytest_pred = bag_model.predict(X_test)
    print(f"Accuracy: {bag_model.score(X_test, y_test)}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, ytest_pred))
# Create and evaluate voting classifier
voting_clf = VotingClassifier(estimators=[
('DecisionTree', dtcmodel),
('Logistic', lrmodel),
('Bernoulli', bnbmodel),
('Gaussian', gnbmodel),
('SVC', svcmodel)
])
# Train and evaluate voting classifier
voting_clf.fit(X_train, y_train)
predictions = voting_clf.predict(X_test)
print("\nVoting Classifier Results:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))
print("\nClassification Report:")
print(classification_report(y_test, predictions))
Practical 13: AdaBoost, Stochastic Gradient Boosting, and Voting Ensemble
Part 1: AdaBoost Classification
import pandas as pd
from sklearn import model_selection
from sklearn.ensemble import AdaBoostClassifier
# Load the dataset
df = pd.read_csv("pimaindiansdiabetes.csv")
print("Dataset Preview:")
print(df)
print("\nDataset Shape:", df.shape)
# Prepare features and target
X = df.iloc[:, 0:8]
y = df.iloc[:, 8]
# Implement AdaBoost
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=42)
num_trees = 30
model = AdaBoostClassifier(n_estimators=num_trees, random_state=42)
results = model_selection.cross_val_score(model, X, y, cv=kfold)
print("\nAdaBoost Mean Accuracy:", results.mean())
Part 2: Stochastic Gradient Boosting
import pandas as pd
from sklearn import model_selection
from sklearn.ensemble import GradientBoostingClassifier
# Load the dataset
df = pd.read_csv("pimaindiansdiabetes.csv")
print("Dataset Preview:")
print(df)
# Prepare features and target
X = df.iloc[:, 0:8]
y = df.iloc[:, 8]
# Implement Stochastic Gradient Boosting
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=42)
num_trees = 30
model = GradientBoostingClassifier(n_estimators=num_trees, random_state=42)
results = model_selection.cross_val_score(model, X, y, cv=kfold)
print("\nStochastic Gradient Boosting Mean Accuracy:", results.mean())
Part 3: Voting Ensemble
import pandas as pd
import warnings
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
# Suppress warnings
warnings.filterwarnings("ignore")
# Load the dataset
df = pd.read_csv("pimaindiansdiabetes.csv")
print("Dataset Preview:")
print(df)
# Prepare features and target
X = df.iloc[:, 0:8]
y = df.iloc[:, 8]
# Create base models
estimators = []
logmodel = LogisticRegression()
estimators.append(('logistic', logmodel))
DTmodel = DecisionTreeClassifier()
estimators.append(('cart', DTmodel))
SVCmodel = SVC()
estimators.append(('svm', SVCmodel))
# Create and evaluate voting ensemble
ensemble = VotingClassifier(estimators)
print("\nVoting Ensemble Model:", ensemble)
kfold = model_selection.KFold(n_splits=10)
results = model_selection.cross_val_score(ensemble, X, y, cv=kfold)
print("\nVoting Ensemble Mean Accuracy:", results.mean())
Practical 14: Feature Selection, Scaling, and PCA
Part 1: Feature Selection using Chi-Square Test
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
# Load the dataset
data = pd.read_csv('Mobile_Data.csv')
print("Dataset Preview:")
print(data.head(5))
# Prepare features and target
X = data.iloc[:, 0:20] # All columns except price range
y = data.iloc[:, -1] # Only price range column
# Apply Chi-Square Test to select top 10 features
bestfeatures = SelectKBest(score_func=chi2, k=10)
model = bestfeatures.fit(X, y)
# Create DataFrame with feature scores
dfscores = pd.DataFrame(model.scores_)
dfcolumns = pd.DataFrame(X.columns)
feature_scores = pd.concat([dfcolumns, dfscores], axis=1)
feature_scores.columns = ['Feature', 'Score']
print("\nFeature Scores:")
print(feature_scores.sort_values(by='Score', ascending=False))
Part 2: Feature Scaling - Normalization and Standardization
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Load the dataset
cols = ['loan_amount', 'interest_rate', 'installment']
data = pd.read_csv('Loan_Data.csv', usecols=cols)
print("Dataset Preview:")
print(data.head())
# Standardization
scaler = StandardScaler()
std_data_scaled = scaler.fit_transform(data)
print("\nStandardized Data:")
print(std_data_scaled)
print("\nStandardized Data Statistics:")
print("Mean:", std_data_scaled.mean(axis=0))
print("Standard Deviation:", std_data_scaled.std(axis=0))
# Normalization
scaler = MinMaxScaler()
norm_data_scaled = scaler.fit_transform(data)
print("\nNormalized Data:")
print(norm_data_scaled)
print("\nNormalized Data Statistics:")
print("Mean:", norm_data_scaled.mean(axis=0))
print("Standard Deviation:", norm_data_scaled.std(axis=0))
Part 3: Linear Discriminant Analysis (LDA)
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
# Load the dataset
dataset = pd.read_csv('Wine.csv')
X = dataset.iloc[:, 0:13].values
y = dataset.iloc[:, 13].values
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Apply LDA
lda = LDA(n_components=2)
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)
# Train Logistic Regression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)
# Make predictions and evaluate
y_pred = classifier.predict(X_test)
print("\nLDA Results:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
Part 4: Principal Component Analysis (PCA)
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
# Load the dataset
dataset = pd.read_csv('Wine.csv')
X = dataset.iloc[:, 0:13].values
y = dataset.iloc[:, 13].values
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Apply PCA
pca = PCA(n_components=2)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
# Train Logistic Regression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)
# Make predictions and evaluate
y_pred = classifier.predict(X_test)
print("\nPCA Results:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))