AI Sprint Week 1: Learning Resources & Code Repository
Week 1: Machine Learning Fundamentals - Learning Resources
This page contains all the resources, code examples, and materials needed for Week 1 of the AI Sprint.
Week 1 Learning Objectives
Core Skills to Master
- Environment setup and dependency management
- Data loading, exploration, and preprocessing
- Basic data visualization techniques
- Linear/Logistic regression implementation
- sklearn pipeline and cross-validation
Expected Deliverables
- 1 complete notebook with end-to-end ML project
- Data processing pipeline implementation
- Basic model training and evaluation
- Performance metrics and analysis
Code Examples & Templates
1. Environment Setup Script
# Create virtual environment
python -m venv .venv
# Activate environment (Windows)
.venv\Scripts\activate
# Activate environment (Mac/Linux)
source .venv/bin/activate
# Install required packages
pip install -U numpy pandas scikit-learn matplotlib seaborn jupyter
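# Optional sanity check (a quick verification, run inside the activated environment)
python -c "import numpy, pandas, sklearn, matplotlib, seaborn; print('environment OK')"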
2. Data Loading Template
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
# Load dataset
def load_wine_dataset():
    """Load and prepare the sklearn wine dataset as a DataFrame"""
    from sklearn.datasets import load_wine
    wine = load_wine()
    df = pd.DataFrame(wine.data, columns=wine.feature_names)
    df['target'] = wine.target
    return df

# Data exploration
def explore_data(df):
    """Basic data exploration"""
    print(f"Dataset shape: {df.shape}")
    print(f"Features: {list(df.columns[:-1])}")
    print(f"Target distribution:\n{df['target'].value_counts()}")
    print(f"\nData types:\n{df.dtypes}")
    print(f"\nMissing values:\n{df.isnull().sum()}")
    return df.describe()
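A short usage pass over the two helpers above (assumes the imports at the top of this template have already been run):

# Load the dataset and print a quick overview
df = load_wine_dataset()
summary_stats = explore_data(df)
print(summary_stats)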
3. Data Visualization Templates
# Correlation heatmap
def plot_correlation_heatmap(df, target_col='target'):
    """Plot feature correlation heatmap"""
    plt.figure(figsize=(12, 10))
    correlation_matrix = df.corr()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
    plt.title('Feature Correlation Heatmap')
    plt.tight_layout()
    plt.show()
# Feature distribution plots
def plot_feature_distributions(df, target_col='target'):
    """Plot feature distributions by target class"""
    features = df.columns[:-1]  # Exclude target
    n_features = len(features)
    n_cols = 3
    n_rows = (n_features + n_cols - 1) // n_cols
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows))
    axes = axes.flatten()
    for i, feature in enumerate(features):
        for target_class in df[target_col].unique():
            subset = df[df[target_col] == target_class][feature]
            axes[i].hist(subset, alpha=0.7, label=f'Class {target_class}')
        axes[i].set_title(f'{feature} Distribution')
        axes[i].set_xlabel(feature)
        axes[i].set_ylabel('Frequency')
        axes[i].legend()
    # Hide empty subplots
    for i in range(n_features, len(axes)):
        axes[i].set_visible(False)
    plt.tight_layout()
    plt.show()
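With the df from the data loading template still in memory, both plots can be generated directly:

plot_correlation_heatmap(df)
plot_feature_distributions(df)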
4. Data Preprocessing Pipeline
# Data preprocessing class
class DataPreprocessor:
    def __init__(self):
        self.scaler = StandardScaler()
        self.is_fitted = False

    def fit_transform(self, X):
        """Fit scaler and transform data"""
        X_scaled = self.scaler.fit_transform(X)
        self.is_fitted = True
        return X_scaled

    def transform(self, X):
        """Transform data using fitted scaler"""
        if not self.is_fitted:
            raise ValueError("Scaler must be fitted before transform")
        return self.scaler.transform(X)

    def inverse_transform(self, X):
        """Inverse transform scaled data"""
        if not self.is_fitted:
            raise ValueError("Scaler must be fitted before inverse transform")
        return self.scaler.inverse_transform(X)
# Usage example
def prepare_data(df, target_col='target', test_size=0.2):
    """Prepare data for machine learning"""
    # Separate features and target
    X = df.drop(target_col, axis=1)
    y = df[target_col]
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )
    # Scale features
    preprocessor = DataPreprocessor()
    X_train_scaled = preprocessor.fit_transform(X_train)
    X_test_scaled = preprocessor.transform(X_test)
    return X_train_scaled, X_test_scaled, y_train, y_test, preprocessor
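A minimal usage sketch, reusing df from the data loading template:

X_train_scaled, X_test_scaled, y_train, y_test, preprocessor = prepare_data(df)
print(X_train_scaled.shape, X_test_scaled.shape)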
Datasets & Resources
Primary Dataset: Wine
- Source: sklearn built-in wine recognition dataset (load_wine)
- Size: 178 samples, 13 features
- Task: 3-class classification
- Features: Chemical properties of wine
- Target: Wine cultivar class (0, 1, 2)
Alternative Datasets
- Iris Dataset: 150 samples, 4 features, 3 classes
- Breast Cancer: 569 samples, 30 features, 2 classes
- Diabetes: 442 samples, 10 features, regression task
Dataset Loading Functions
def load_alternative_datasets():
    """Load alternative datasets for practice"""
    from sklearn.datasets import load_iris, load_breast_cancer, load_diabetes
    datasets = {
        'iris': load_iris(),
        'breast_cancer': load_breast_cancer(),
        'diabetes': load_diabetes()
    }
    return datasets
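Each entry above is a sklearn Bunch object. A small helper (hypothetical name bunch_to_frame) shows one way to turn any of them into a DataFrame, mirroring load_wine_dataset:

def bunch_to_frame(bunch):
    """Convert a sklearn Bunch into a DataFrame with a 'target' column"""
    df = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    df['target'] = bunch.target
    return df

# Example: the iris dataset as a DataFrame
iris_df = bunch_to_frame(load_alternative_datasets()['iris'])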
Machine Learning Implementation
1. Logistic Regression from Scratch
class LogisticRegression:
    """Binary logistic regression trained with batch gradient descent"""

    def __init__(self, learning_rate=0.01, max_iter=1000):
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.weights = None
        self.bias = None

    def sigmoid(self, z):
        """Sigmoid activation function"""
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    def fit(self, X, y):
        """Train logistic regression model"""
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        for _ in range(self.max_iter):
            # Forward pass
            linear_pred = np.dot(X, self.weights) + self.bias
            predictions = self.sigmoid(linear_pred)
            # Gradients
            dw = (1 / n_samples) * np.dot(X.T, (predictions - y))
            db = (1 / n_samples) * np.sum(predictions - y)
            # Update parameters
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        """Make predictions"""
        linear_pred = np.dot(X, self.weights) + self.bias
        y_pred = self.sigmoid(linear_pred)
        return (y_pred >= 0.5).astype(int)

    def predict_proba(self, X):
        """Predict probabilities"""
        linear_pred = np.dot(X, self.weights) + self.bias
        return self.sigmoid(linear_pred)
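The from-scratch class above handles binary targets only, while the wine dataset has three classes. One way to compare it against sklearn on equal footing (a sketch, assuming the scaled splits from prepare_data) is to restrict both models to two of the classes. Note that the sklearn import in the pipeline section below reuses the name LogisticRegression, so run this comparison before that import or alias one of the classes, as done here:

from sklearn.linear_model import LogisticRegression as SklearnLogisticRegression
from sklearn.metrics import accuracy_score

def compare_on_binary_subset(X_train, y_train, X_test, y_test):
    """Compare the from-scratch model with sklearn on classes 0 and 1 only"""
    train_mask = np.isin(y_train, [0, 1])
    test_mask = np.isin(y_test, [0, 1])
    Xtr, ytr = X_train[train_mask], np.asarray(y_train)[train_mask]
    Xte, yte = X_test[test_mask], np.asarray(y_test)[test_mask]

    scratch = LogisticRegression(learning_rate=0.1, max_iter=2000)
    scratch.fit(Xtr, ytr)
    sk_model = SklearnLogisticRegression(max_iter=1000)
    sk_model.fit(Xtr, ytr)

    print("From scratch accuracy:", accuracy_score(yte, scratch.predict(Xte)))
    print("sklearn accuracy:     ", accuracy_score(yte, sk_model.predict(Xte)))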
2. sklearn Pipeline Implementation
from sklearn.pipeline import Pipeline
# Note: this import reuses the name LogisticRegression, shadowing the from-scratch class above
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
def create_ml_pipeline():
    """Create machine learning pipeline"""
    # Define preprocessing steps
    preprocessor = Pipeline([
        ('scaler', StandardScaler())
    ])
    # Define models
    models = {
        'logistic_regression': LogisticRegression(random_state=42),
        'random_forest': RandomForestClassifier(random_state=42),
        'svm': SVC(random_state=42, probability=True)
    }
    # Create pipelines
    pipelines = {}
    for name, model in models.items():
        pipelines[name] = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])
    return pipelines
def hyperparameter_tuning(pipelines, X_train, y_train):
    """Perform hyperparameter tuning"""
    param_grids = {
        'logistic_regression': {
            'classifier__C': [0.1, 1, 10],
            'classifier__max_iter': [100, 200, 500]
        },
        'random_forest': {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_depth': [3, 5, 10, None]
        },
        'svm': {
            'classifier__C': [0.1, 1, 10],
            'classifier__kernel': ['rbf', 'linear']
        }
    }
    best_models = {}
    for name, pipeline in pipelines.items():
        grid_search = GridSearchCV(
            pipeline, param_grids[name], cv=5, scoring='f1_macro', n_jobs=-1
        )
        grid_search.fit(X_train, y_train)
        best_models[name] = grid_search.best_estimator_
        print(f"{name} - Best F1: {grid_search.best_score_:.4f}")
    return best_models
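Grid search above already cross-validates internally; for the standalone cross-validation step in the Week 1 objectives, a short sketch using sklearn's cross_val_score on any single pipeline:

from sklearn.model_selection import cross_val_score, StratifiedKFold

def cross_validate_pipeline(pipeline, X, y, n_splits=5):
    """Report mean and std of macro-F1 across stratified folds"""
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y, cv=cv, scoring='f1_macro')
    print(f"f1_macro: {scores.mean():.4f} +/- {scores.std():.4f}")
    return scores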
Evaluation & Metrics
Performance Metrics
def evaluate_model(model, X_test, y_test, model_name="Model"):
"""Evaluate model performance"""
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# Make predictions
y_pred = model.predict(X_test)
# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
print(f"\n{model_name} Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title(f'{model_name} Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()
return {
'accuracy': accuracy,
'precision': precision,
'recall': recall,
'f1': f1
}
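A compact sketch of the full Week 1 workflow using the templates above. It splits the raw features and lets each pipeline handle scaling (rather than reusing prepare_data) so the data is not standardized twice:

def run_week1_workflow():
    """Load data, tune pipelines, and evaluate the best models on a held-out test set"""
    df = load_wine_dataset()
    X = df.drop('target', axis=1)
    y = df['target']
    # Raw split; each pipeline scales its own inputs
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    pipelines = create_ml_pipeline()
    best_models = hyperparameter_tuning(pipelines, X_train, y_train)
    results = {name: evaluate_model(model, X_test, y_test, model_name=name)
               for name, model in best_models.items()}
    return results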
Daily Learning Checklist
Day 1: Environment & Data Loading
- [x] Set up Python virtual environment
- [x] Install required packages
- [x] Load and explore wine dataset
- [x] Basic data visualization
Day 2: Data Analysis & Preprocessing
- [ ] Feature correlation analysis
- [ ] Data quality assessment
- [ ] Feature scaling and normalization
- [ ] Train-test split preparation
Day 3: Algorithm Implementation
- [ ] Implement logistic regression from scratch
- [ ] Train and test custom model
- [ ] Compare with sklearn implementation
- [ ] Performance evaluation
Day 4: Pipeline & Validation
- [ ] Create sklearn pipeline
- [ ] Implement cross-validation
- [ ] Hyperparameter tuning
- [ ] Model selection
Day 5: End-to-End Project
- [ ] Complete ML workflow
- [ ] Performance analysis
- [ ] Documentation
- [ ] Project submission
Additional Resources
Online Courses & Tutorials
Books & References
- "Python for Data Analysis" by Wes McKinney
- "Hands-On Machine Learning" by Aurélien Géron
- "Introduction to Machine Learning with Python" by Andreas Müller
Practice Platforms
Progress Tracking
Week 1 Completion: 1/5 days (20%)
Next Milestone: Complete data preprocessing and algorithm implementation
Target Date: August 15, 2025
Use these resources to accelerate your Week 1 learning journey. Each code example is designed to be runnable and educational.