Quiz
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# Step 1: Get Iris dataset
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['target'] = iris.target
# Step 2: Perform one data understanding technique
# Let's display the first few rows and some basic statistics
print(df.head())
print(df.describe())
# Visualize the pairplot
sns.pairplot(df, hue='target')
plt.show()
# More plot templates, filled in with actual iris columns
sns.barplot(x='target', y='sepal length (cm)', data=df)
sns.scatterplot(x='sepal length (cm)', y='sepal width (cm)', hue='target', data=df)
sns.histplot(x='sepal length (cm)', data=df)
plt.show()
# Step 3: Perform one feature selection technique
# Calculate the correlation matrix
correlation_matrix = df.corr()
print(correlation_matrix)
# Visualize the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.show()
# Step 4: Perform any data preprocessing
# In this case, we can standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df.iloc[:, :-1])
y = df['target']
df['species'] = df['target'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})
# Check for missing values
print(df.isnull().sum())
# If there were missing values, we could handle them like this:
# df.fillna(df.mean(), inplace=True) # for numerical columns
# df.fillna(df.mode().iloc[0], inplace=True) # for categorical columns
# Step 5: Perform any other technique required for step 6
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
# Step 6: Break data into features and target and apply Logistic Regression algorithm
# Train the model
log_reg = LogisticRegression(max_iter=200)
log_reg.fit(X_train, y_train)
# Predict on the test set
y_pred = log_reg.predict(X_test)
# Evaluate the model
report = classification_report(y_test, y_pred, target_names=iris.target_names)
print(report)
Other Algos:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Step 6: Apply Logistic Regression (reuses the X_train/X_test/y_train/y_test split from Step 5 above)
log_reg = LogisticRegression(max_iter=200)
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
report = classification_report(y_test, y_pred, target_names=iris.target_names)
print("Logistic Regression Report:\n", report)
# Apply Naive Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
report = classification_report(y_test, y_pred, target_names=iris.target_names)
print("Naive Bayes Report:\n", report)
# Apply Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
report = classification_report(y_test, y_pred, target_names=iris.target_names)
print("Random Forest Report:\n", report)
Theory
Concise explanations for each concept:
1. **Linear Regression**: Models the relationship between a dependent variable and independent variables by fitting a linear equation. The goal is to minimize the sum of squared differences between observed and predicted values. It’s used for predicting continuous outcomes.
2. **Logistic Regression**: Used for binary classification, modeling the probability of a class using the logistic function. It predicts the likelihood of an event occurring by fitting data to a logistic curve. Outputs probabilities which can be thresholded to predict classes.
3. **K-Nearest Neighbors (KNN)**: Classifies data points based on the 'k' nearest neighbors' majority class. Uses distance metrics to find the nearest neighbors in the feature space. It's a simple, instance-based learning method for classification and regression.
4. **Naive Bayes**: Applies Bayes' theorem with strong independence assumptions between features for classification. Calculates the probability of each class given the feature vector and selects the class with the highest probability. Effective for text classification and spam filtering.
5. **Random Forest**: An ensemble of decision trees where each tree is trained on a random subset of data and features. Combines the predictions of multiple trees to improve accuracy and robustness. Reduces overfitting by averaging multiple decision trees.
6. **Bagging (Bootstrap Aggregating)**: Involves training multiple models on different random subsets of the data and averaging their predictions. Reduces variance and helps improve model stability and accuracy. Commonly used in Random Forests.
7. **Boosting**: Sequentially trains models to correct errors made by previous models, improving overall performance. Combines weak learners to create a strong learner. Popular algorithms include AdaBoost and Gradient Boosting.
8. **Stacking (Stacked Generalization)**: Combines multiple models by training a meta-model on their outputs to improve prediction accuracy. Uses the strengths of different models to enhance overall performance. Typically involves a two-level model training process (a combined bagging/boosting/stacking sketch appears after this list).
9. **Covariance**: Measures how two variables change together, with positive values indicating they increase together and negative values indicating one increases while the other decreases. Indicates the direction of the linear relationship. It’s not standardized.
10. **Correlation**: Quantifies the strength and direction of a linear relationship between two variables, ranging from -1 to 1. Standardized form of covariance. A correlation of 1 or -1 indicates a perfect linear relationship.
11. **Label Encoder**: Converts categorical labels into numeric values for model compatibility. Assigns a unique integer to each category. Suitable for ordinal categorical variables.
12. **Standard Scaler**: Standardizes features by removing the mean and scaling to unit variance. Centers the data around 0 with a standard deviation of 1. Useful for algorithms sensitive to feature scaling.
13. **Min-Max Scaler**: Scales features to a specified range, typically [0, 1]. Transforms data linearly based on the minimum and maximum values. Preserves the relationships between the original data values (see the scaling sketch after this list).
14. **Bias**: The error introduced by approximating a real-world problem with a simplified model. High bias can cause underfitting, missing relevant patterns in the data. Indicates model assumptions.
15. **Variance**: The error introduced by the model’s sensitivity to small fluctuations in the training data. High variance can cause overfitting, capturing noise as patterns. Reflects the model’s complexity.
16. **Outliers**: Data points significantly different from others in the dataset. Can distort statistical analyses and models. Handled by removal, transformation, or using robust algorithms.
17. **Underfitting**: When a model is too simple to capture the underlying data patterns, leading to poor performance. Can be addressed by increasing model complexity or adding features. Results in high bias and low variance.
18. **Overfitting**: When a model captures noise in the training data as if it were a true pattern, leading to poor generalization. Mitigated by techniques like cross-validation, regularization, or pruning. Results in low bias and high variance.
19. **One-Hot Encoder**: Converts categorical variables into a binary matrix representation. Each category is represented by a unique binary vector. Eliminates ordinal relationships in categorical data.
20. **Feature Selection**: The process of selecting the most relevant features for model training to improve performance. Helps reduce overfitting and improve model interpretability. Techniques include filter, wrapper, and embedded methods.
21. **Forward Feature Selection**: Iteratively adds features that improve model performance one at a time. Starts with no features and adds them based on performance improvement. Continues until no significant improvement is observed.
22. **Backward Feature Selection**: Starts with all features and removes the least significant feature one at a time. Continues until the removal of features worsens model performance. Helps identify the most important features.
23. **Preprocessing Techniques**: Involves steps like scaling, normalization, and encoding to prepare data for modeling. Ensures models perform well and generalize better. Essential for consistent and reliable model training.
24. **Ensemble Growing**: Building an ensemble by adding more models to improve performance. Techniques include bagging and boosting. Enhances accuracy and robustness of predictions.
25. **Ensemble Pruning**: Reducing the number of models in an ensemble to improve efficiency and avoid overfitting. Removes models that contribute little to performance. Streamlines the ensemble for better generalization.
26. **F1 Score**: Harmonic mean of precision and recall, balancing false positives and false negatives. \( F1 = 2 \cdot \frac{\text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}} \). Useful for imbalanced classification tasks (a precision/recall/F1 sketch appears after this list).
27. **Chi-Square Test**: A statistical test to determine the independence of categorical variables. Compares observed frequencies to expected frequencies. Useful in hypothesis testing and feature selection.
28. **Precision**: The ratio of true positive predictions to the total predicted positives. \( \text{Precision} = \frac{\text{True Positives}}{\text{True Positives} + \text{False Positives}} \). Indicates the accuracy of positive predictions.
29. **Recall**: The ratio of true positive predictions to all actual positives. \( \text{Recall} = \frac{\text{True Positives}}{\text{True Positives} + \text{False Negatives}} \). Measures the model's ability to identify all positive instances.
• Gradient Boosting: Gradient Boosting is an ensemble technique that builds models sequentially, each new model correcting errors made by the previous ones. It minimizes a loss function by adding models in a stage-wise fashion, often using decision trees as the weak learners. This process helps improve model accuracy and reduce bias.
• Decision Tree: A Decision Tree is a non-parametric model used for classification and regression. It splits the data into subsets based on feature values, creating a tree-like structure where each node represents a feature, each branch represents a decision rule, and each leaf represents an outcome. Decision Trees are easy to interpret but can overfit if not pruned properly.
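A minimal sketch of bagging, boosting, and stacking (items 6-8) using sklearn's generic wrappers; it assumes the scaled iris split (X_train, X_test, y_train, y_test) from Step 5 above is still in scope.
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
# Bagging: many trees, each trained on a bootstrap sample; predictions are combined by voting
bag = BaggingClassifier(DecisionTreeClassifier(), n_estimators=50, random_state=42)
# Boosting: weak learners trained sequentially, each focusing on the previous ones' errors
boost = AdaBoostClassifier(n_estimators=50, random_state=42)
# Stacking: a meta-model (logistic regression) is trained on the base models' outputs
stack = StackingClassifier(estimators=[('nb', GaussianNB()), ('dt', DecisionTreeClassifier())],
                           final_estimator=LogisticRegression(max_iter=200))
for name, model in [('Bagging', bag), ('Boosting', boost), ('Stacking', stack)]:
    model.fit(X_train, y_train)
    print(name, "accuracy:", model.score(X_test, y_test))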
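A minimal sketch contrasting StandardScaler (item 12) and MinMaxScaler (item 13) on the iris features; the printed means/stds and mins/maxes confirm what each scaler does.
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler, MinMaxScaler
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
X_std = StandardScaler().fit_transform(X)   # zero mean, unit variance per column
print(X_std.mean(axis=0).round(2), X_std.std(axis=0).round(2))
X_mm = MinMaxScaler().fit_transform(X)      # every column rescaled to [0, 1]
print(X_mm.min(axis=0), X_mm.max(axis=0))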
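A minimal precision/recall/F1 sketch (items 26, 28, 29) on made-up labels, so the formulas above can be checked by hand.
from sklearn.metrics import precision_score, recall_score, f1_score
y_true = [1, 0, 1, 1, 0, 1, 0, 0]  # made-up ground truth: TP=3, FP=1, FN=1, TN=3 against y_pred below
y_pred = [1, 0, 1, 0, 0, 1, 1, 0]  # made-up predictions
print(precision_score(y_true, y_pred))  # TP / (TP + FP) = 3/4 = 0.75
print(recall_score(y_true, y_pred))     # TP / (TP + FN) = 3/4 = 0.75
print(f1_score(y_true, y_pred))         # 2 * 0.75 * 0.75 / (0.75 + 0.75) = 0.75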
Lab notebook (ipynb)
import pandas as pd # data manipulation
import numpy as np # numerical computation
import matplotlib.pyplot as plt # plotting
import seaborn as sns # plotting; also ships with sample datasets
import sklearn # models
# load the built-in iris dataset
from sklearn.datasets import load_iris
df = load_iris()
x = df.data
y = df.target
df = pd.DataFrame(x, columns=df.feature_names)
df['target'] = y
# load a dataset from disk instead
df = pd.read_csv('iris.csv')    # if it is a CSV file
df = pd.read_excel('iris.xlsx') # if it is an Excel file
df.describe() # summary statistics
df.info()
n = df.describe() # summary statistics
print(n.index)
print(n.loc['count'])
df.duplicated().sum()
df.drop_duplicates() # usually we drop duplicates; skip this only when there are very few duplicate values
df.isnull().sum()
df.dtypes
# (these columns come from seaborn's titanic dataset, e.g. df = sns.load_dataset('titanic'))
df.survived = df.survived.astype('category')
df.pclass = df.pclass.astype('category')
df.embarked = df.embarked.astype('category')
df.sex = df.sex.astype('category')
df.who = df.who.astype('category') # this is how to change a column's datatype
df.dtypes
df.isnull().sum() # gives the count of null values in each column
df.dropna()
df.fillna(0) # fill NaNs with a constant value
df.fillna(method='ffill') # fill from the previous value
df.fillna(method='bfill') # fill from the next value
df.select_dtypes(include=['number']).mean()
# fill NaN values with mean
df.fillna(df.mean(numeric_only=True)) # the mean only makes sense for numeric columns
---------------------------
import seaborn as sns
tips = sns.load_dataset('tips')
tips.head(3)
tips.groupby('smoker')['total_bill'].mean()
df.corr(numeric_only=True)
df.cov(numeric_only=True)
# difference between correlation and covariance
# covariance is a measure of correlation, but it is not normalized
# correlation is normalized covariance
# Covariance and correlation are both used to measure the relationship between two variables, but they differ in a key way:
# Covariance: This tells you how much two variables move together, but it doesn't consider the strength of that relationship. It can be positive (both variables increase together or decrease together) or negative (one increases while the other decreases). The units of covariance depend on the units of your data.
# Correlation: This is a standardized version of covariance, taking into account the scales of the variables. It gives you a value between -1 and 1, where -1 indicates a perfect negative relationship, 1 indicates a perfect positive relationship, and 0 indicates no relationship.
# In simpler terms, covariance is like the raw signal, while correlation is the signal strength.
# if two features are strongly correlated with each other (close to 1 or -1), they carry similar information, so we drop one of them
# if a feature is strongly correlated with the target, that is a good thing
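# A tiny numeric check of the relationship above, on made-up arrays: correlation = covariance / (std_a * std_b)
import numpy as np
a = np.array([1.0, 2.0, 3.0, 4.0, 5.0])   # made-up data
b = np.array([2.0, 4.0, 5.0, 4.0, 5.0])
cov_ab = np.cov(a, b)[0, 1]       # sample covariance (units depend on the data)
corr_ab = np.corrcoef(a, b)[0, 1] # normalized to the range [-1, 1]
print(cov_ab, corr_ab, cov_ab / (a.std(ddof=1) * b.std(ddof=1)))  # the last two values match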
df.columns # list the column/feature names
# Information gain calculates the reduction in entropy from the transformation of a dataset. It can be
# used for feature selection by evaluating the Information gain of each variable in the context of the
# target variable.
from sklearn.feature_selection import mutual_info_classif
df = load_iris()
X = pd.DataFrame(df.data, columns=df.feature_names)
y = pd.DataFrame(df.target, columns=['target'])
importance = mutual_info_classif(X, y.values.ravel()) # scores each feature by how much it contributes to predicting the target
feat_importance = pd.Series(importance, index = df.feature_names)
feat_importance.plot(kind='barh', color='teal')
from sklearn.feature_selection import chi2
chiscore = chi2(X, y.values.ravel()) # returns (chi2 statistics, p-values) for each feature
feat_importance = pd.Series(chiscore[1], index=X.columns) # p-values per feature
feat_importance < 0.005 # True where the p-value is small, i.e. the feature is related to the target
from sklearn.feature_selection import SelectKBest, SelectPercentile,chi2
X = pd.DataFrame(df.data, columns=df.feature_names) # rebuild the features from the iris bunch
y = pd.Series(df.target)
# selector = SelectKBest(chi2, k=2)
selector = SelectPercentile(chi2, percentile = 50)
selector.fit_transform(X, y)
print(X.columns[selector.pvalues_ < 0.005])
col = X.columns[selector.get_support()]
col
from sklearn.impute import SimpleImputer
si = SimpleImputer(strategy='mean')
np.random.seed(0)
X[np.random.rand(*X.shape) < 0.5] = np.nan # randomly blank out ~50% of the values to demonstrate imputation
X_imputed = si.fit_transform(X)
X_imputed = pd.DataFrame(X_imputed, columns=X.columns)
X_imputed
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes, load_breast_cancer
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE, SelectFromModel
from sklearn.model_selection import train_test_split
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
housing_url = 'https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv'
housing = pd.read_csv(housing_url)
diabetes = load_diabetes()
breast_cancer = load_breast_cancer()
housing = pd.get_dummies(housing, drop_first=True)
housing = housing.dropna() # drop rows with missing values (total_bedrooms has some NaNs) so LinearRegression can fit
housing_features = housing.drop('median_house_value', axis=1)
housing_target = housing['median_house_value']
X_train_housing, X_test_housing, y_train_housing, y_test_housing = train_test_split(housing_features, housing_target, test_size=0.2, random_state=42)
sfs_forward = SFS(LinearRegression(),
                  k_features='best',
                  forward=True,
                  floating=False,
                  scoring='r2',
                  cv=5)
sfs_forward = sfs_forward.fit(X_train_housing, y_train_housing)
forward_selected_features = list(sfs_forward.k_feature_names_)
print("Forward Selected Features on Housing Data:", forward_selected_features)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# one encoding at a time
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = le.fit_transform(df[col])
df.head()
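# For nominal (unordered) categories, one-hot encoding avoids the fake ordering that integer labels imply.
# A minimal sketch with pd.get_dummies on the tips dataset loaded earlier:
tips_encoded = pd.get_dummies(tips, columns=['sex', 'smoker', 'day', 'time'], drop_first=True)
tips_encoded.head() # each category becomes its own 0/1 column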
sns.histplot(x = 'sepal length (cm)', data = df, kde=True)
Write the rest yourself, I'm not writing it all out.