# Step a. Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# Steps b-h. Simple linear regression on the salary dataset.
# The numerical pieces live in small functions so they can be reused and
# checked on their own; main() runs the original step-by-step workflow.


def fit_line(x, y):
    """Fit ``y = slope * x + intercept`` by ordinary least squares.

    Formula: slope (m) = covariance(x, y) / variance(x), and the intercept
    is chosen so the line passes through the point (mean(x), mean(y)).

    Args:
        x: Sequence or array of predictor values.
        y: Sequence or array of response values with the same shape as x.

    Returns:
        A ``(slope, intercept)`` tuple.

    Raises:
        ValueError: If x and y differ in shape, are empty, or every x value
            is identical (the slope is undefined in that case).
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.shape != y.shape:
        raise ValueError("x and y must have the same shape")
    if x.size == 0:
        raise ValueError("cannot fit a line to empty data")

    mean_x = np.mean(x)
    mean_y = np.mean(y)
    centered_x = x - mean_x
    # np.sum keeps the reduction vectorised; the builtin sum() would iterate
    # over the array one element at a time.
    denominator = np.sum(centered_x ** 2)
    if denominator == 0:
        # Without this check the division below would yield nan/inf.
        raise ValueError("x has zero variance; slope is undefined")
    numerator = np.sum(centered_x * (y - mean_y))

    slope = numerator / denominator
    intercept = mean_y - slope * mean_x
    return slope, intercept


def predict(x, slope, intercept):
    """Return the fitted line's values ``slope * x + intercept`` for x."""
    return slope * np.asarray(x, dtype=float) + intercept


def root_mean_squared_error(y_true, y_pred):
    """Return the root mean squared error between y_true and y_pred."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean((y_true - y_pred) ** 2))


def main(csv_path='Salary_Data.csv'):
    """Run the full workflow: load, split, plot, fit, predict, and score.

    Args:
        csv_path: Path to a CSV file with 'YearsExperience' and 'Salary'
            columns. Defaults to 'Salary_Data.csv' in the working directory.
    """
    # Step b. Load the dataset
    data = pd.read_csv(csv_path)  # Make sure the CSV is in the same folder
    # Step c. Print first few rows
    print(data.head())
    # Step d. Assign variables
    X = data['YearsExperience'].values
    Y = data['Salary'].values
    # Split data into 80% train and 20% test
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, random_state=42
    )
    # Step e. Plot train dataset
    plt.scatter(X_train, Y_train, color='blue')
    plt.xlabel('Years of Experience')
    plt.ylabel('Salary')
    plt.title('Training Data')
    plt.show()
    # Step f. Calculate slope and intercept
    slope, intercept = fit_line(X_train, Y_train)
    print(f"Slope (m): {slope}")
    print(f"Intercept (c): {intercept}")
    # Plot regression line. The x values are sorted first so the line is
    # drawn left to right instead of doubling back over itself.
    order = np.argsort(X_train)
    sorted_x = X_train[order]
    plt.scatter(X_train, Y_train, color='blue')
    plt.plot(sorted_x, predict(sorted_x, slope, intercept), color='red')
    plt.xlabel('Years of Experience')
    plt.ylabel('Salary')
    plt.title('Regression Line with Training Data')
    plt.show()
    # Step g. Predict values for Test data
    Y_pred = predict(X_test, slope, intercept)
    # Step h. Calculate RMSE (Root Mean Squared Error)
    rmse = root_mean_squared_error(Y_test, Y_pred)
    print(f"Root Mean Squared Error: {rmse}")


if __name__ == "__main__":
    main()
# (Blog page footer, not part of the script: "Comments" / "Post a Comment")