# Simple linear regression

 # Step a. Import libraries

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split


# Step b. Load the dataset
# The path is relative, so Salary_Data.csv is looked up in the current working directory.
data = pd.read_csv(filepath_or_buffer='Salary_Data.csv')


# Step c. Print first few rows
# Quick visual check of what was parsed from the file.
print(data.head(n=5))


# Step d. Assign variables
# Extract the feature (years of experience) and the target (salary) columns as
# NumPy arrays. Series.to_numpy() is the accessor pandas recommends over .values.
X = data['YearsExperience'].to_numpy()
Y = data['Salary'].to_numpy()


# Split data into 80% train and 20% test
# random_state pins the shuffle so the same rows land in each subset on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)


# Step e. Plot train dataset
# Set the axes text first, then draw the training points and display the figure.
plt.title('Training Data')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')

plt.scatter(x=X_train, y=Y_train, color='blue')

plt.show()


# Step f. Calculate slope and intercept (closed-form least squares)
#
#   slope (m)     = sum((x - mean_x) * (y - mean_y)) / sum((x - mean_x) ** 2)
#                 = covariance(X, Y) / variance(X)
#   intercept (c) = mean_y - m * mean_x
#
# The 1/n factors of the covariance and the variance cancel in the ratio, so
# plain sums of the deviation products are enough.

mean_x = np.mean(X_train)
mean_y = np.mean(Y_train)

# Compute each deviation vector once and reuse it for both sums.
x_dev = X_train - mean_x
y_dev = Y_train - mean_y

# np.sum reduces the array in native code; the builtin sum() would iterate the
# array element by element in Python.
numerator = np.sum(x_dev * y_dev)
denominator = np.sum(x_dev ** 2)

# With no spread in X the slope is undefined; fail with a clear message instead
# of dividing by zero and carrying inf/nan into the plots and the RMSE.
if denominator == 0:
    raise ValueError("Cannot compute slope: X_train has zero variance")

slope = numerator / denominator
intercept = mean_y - slope * mean_x

print(f"Slope (m): {slope}")
print(f"Intercept (c): {intercept}")


# Plot regression line
# Evaluate the fitted line y = m * x + c at every training input.
regression_line = intercept + slope * X_train

# Set the axes text, then overlay the fitted line (red) on the training points (blue).
plt.title('Regression Line with Training Data')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')

plt.scatter(x=X_train, y=Y_train, color='blue')
plt.plot(X_train, regression_line, color='red')

plt.show()


# Step g. Predict values for Test data
# Apply the fitted line y = m * x + c to the test inputs.
Y_pred = intercept + slope * X_test


# Step h. Calculate RMSE (Root Mean Squared Error)
# RMSE = sqrt(mean((actual - predicted) ** 2))
residuals = Y_test - Y_pred
mse = np.mean(residuals ** 2)
rmse = np.sqrt(mse)

print(f"Root Mean Squared Error: {rmse}")


# Comments