Statistics & Learning#
Supervised Learning#
Linear Regression#
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data = pd.read_csv('https://piazza.com/redirect/s3?bucket=uploads&prefix=paste%2Fi5t4zjz64oq3sl%2Fe69559fac1a2f68f704222629e603299bfa8aabfd88e81225e52e125484764a0%2Fdata1.txt', sep = ",", header=None)
data.columns = ["Pop", "Profit"]
plt.scatter(data["Pop"], data["Profit"], marker = "x", alpha = 0.6, color = "r")
plt.xlabel("Population in 10,000s")
plt.ylabel("Profit in $10,000s")
plt.title("Scatter Plot of training data")
plt.show()
def normalEqn(X, y):
    # Closed-form least-squares solution via the normal equations
    return np.linalg.inv(X.T@X) @ X.T@y
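The function above implements the closed-form (normal equation) solution to least squares,

$$\theta = (X^\top X)^{-1} X^\top y,$$

where $X$ is the design matrix with a leading column of ones and $y$ is the vector of targets.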
X = pd.DataFrame(data["Pop"])
y = data["Profit"]
m = len(y)
constant = np.ones(m)
X.insert(0, "constant", constant)
b = normalEqn(X, y)
print("intercept =",b[0],", slope =", b[1])
intercept = -3.89578087831185 , slope = 1.1930336441895941
plt.scatter(data["Pop"], data["Profit"], marker = "x", alpha = 0.4, color = "r")
plt.plot(data["Pop"], b[0] + b[1]*data["Pop"])
plt.xlabel("Population in 10,000s")
plt.ylabel("Profit in $10,000s")
plt.title("Training data with fitted regression line")
plt.show()
def cost(X, y, theta):
    # Squared-error cost averaged over the m training examples
    m = len(y)
    J = (1/(2*m))*(((X@theta - y)**2).sum())
    return J
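This is the usual least-squares objective,

$$J(\theta) = \frac{1}{2m}\sum_{i=1}^{m}\left(\theta^\top x^{(i)} - y^{(i)}\right)^2.$$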
def gradientDescent(X, y, theta, alpha, iterations):
    # Batch gradient descent: update theta with the full-data gradient at each step
    m = len(y)
    J_history = np.zeros(iterations)
    for i in range(iterations):
        theta = theta - (alpha/m)*((X@theta - y).T@X).T
        J_history[i] = cost(X, y, theta)
    return theta, J_history
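Each iteration applies the batch gradient descent update, which in vector form is

$$\theta := \theta - \frac{\alpha}{m} X^\top (X\theta - y),$$

so the whole parameter vector takes a step of size $\alpha$ against the gradient of $J(\theta)$.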
theta = np.zeros(2)
iterations = 1500
alpha = 0.02
print(cost(X,y, theta))
32.072733877455676
theta, J_history = gradientDescent(X, y, theta, alpha, iterations)
print("intercept =",theta[0],", slope =", theta[1])
intercept = -3.878137690865592 , slope = 1.191261194638165
plt.plot(range(iterations),J_history)
plt.xlabel('Number of iterations')
plt.ylabel('Cost')
plt.show()
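If the learning rate is too large the cost can oscillate or diverge instead of decreasing smoothly like this. A minimal sketch for checking that, reusing the functions defined above (the trial rates below are illustrative, not from the original run):

# Sketch: compare convergence for a few trial learning rates (illustrative values)
for trial_alpha in [0.005, 0.01, 0.02]:
    _, J_trial = gradientDescent(X, y, np.zeros(2), trial_alpha, iterations)
    plt.plot(range(iterations), J_trial, label="alpha = {}".format(trial_alpha))
plt.xlabel('Number of iterations')
plt.ylabel('Cost')
plt.legend()
plt.show()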
from sklearn import linear_model
reg = linear_model.LinearRegression()
reg.fit(X.iloc[:,1:], y)
print("intercept =",reg.intercept_,", slope =", reg.coef_[0])
data = pd.read_csv('https://piazza.com/redirect/s3?bucket=uploads&prefix=paste%2Fi5t4zjz64oq3sl%2Fc19f8051a03d4b73be5bb9a60ace1c2f24adba2947cc0028e0f184d5680d4d21%2Fdata2.txt', sep = ",", header=None)
data.columns = ["Size", "Bedrooms", "Price"]
data.head()
   Size  Bedrooms   Price
0  2104         3  399900
1  1600         3  329900
2  2400         3  369000
3  1416         2  232000
4  3000         4  539900
X = data.iloc[:,:-1]
y = data.iloc[:,2]
m = len(y)
def normalize(X):
    # Feature scaling: subtract each column's mean and divide by its standard deviation
    mu = np.mean(X, 0)
    sigma = np.std(X, 0)
    X_norm = (X - mu)/sigma
    return X_norm, mu, sigma
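Each feature is rescaled to zero mean and unit standard deviation,

$$x_{\text{norm}} = \frac{x - \mu}{\sigma},$$

which keeps the differently-scaled Size and Bedrooms columns from dominating the gradient descent steps; the same $\mu$ and $\sigma$ must be reused when normalizing new inputs at prediction time.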
X, mu, sigma = normalize(X)
one = np.ones((m,1))
X.insert(0, 'constant', one)
alpha = 0.1
iterations = 500
theta = np.zeros(3)
theta, J_history = gradientDescent(X, y, theta, alpha, iterations)
print(theta)
constant 340412.659574
Size 109447.796460
Bedrooms -6578.354844
dtype: float64
print(normalEqn(X, y))
0 340412.659574
1 109447.796470
2 -6578.354854
dtype: float64
reg = linear_model.LinearRegression()
reg.fit(X.iloc[:,1:], y)
print(reg.intercept_, reg.coef_)
340412.6595744681 [109447.79646964 -6578.35485416]
# Predicting the price of a house with the following features: size = 2000 sq ft, number of bedrooms = 4
size = 2000
bedrooms = 4
normalized = (np.array([size, bedrooms]) - mu)/sigma
print("Predicted Price of house of size {} sq ft and number of bedrooms {} is around ${}".format(size, bedrooms, round(theta[0] + theta[1]*normalized[0] + theta[2]*normalized[1])))
Predicted Price of house of size 2000 sq ft and number of bedrooms 4 is around $333067
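A minimal sketch of the same prediction run through the scikit-learn model fitted above (the query DataFrame construction is an illustrative assumption; the model expects the features in the same normalized form and column order it was trained on):

# Sketch: same query through the fitted scikit-learn model (assumes reg, mu, sigma from above)
query = pd.DataFrame([[size, bedrooms]], columns=["Size", "Bedrooms"])
query_norm = (query - mu)/sigma      # reuse the training-set mean and std
print(reg.predict(query_norm))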
Logistic Regression#
data = pd.read_csv('https://piazza.com/redirect/s3?bucket=uploads&prefix=paste%2Fi5t4zjz64oq3sl%2Fed7298995232c17a7f1bc50c9807cb7216337563e4a7c00bfcaa8f5d6bd32f02%2Fdata3.txt', sep = ",", header=None)
data.columns = ['Exam1', 'Exam2', 'Admit']
X = data.iloc[:,:2]
y = data.iloc[:,2]
plt.scatter(X["Exam1"], X["Exam2"], c = y)
for i, label in enumerate(list(y)):
    plt.text(X["Exam1"][i], X["Exam2"][i], label)
plt.xlabel("Exam 1 Scores")
plt.ylabel("Exam 2 Scores")
plt.show()
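Logistic regression models the probability of admission with the sigmoid of a linear score,

$$h_\theta(x) = \frac{1}{1 + e^{-\theta^\top x}},$$

and predicts the positive class whenever $h_\theta(x) \ge 0.5$, i.e. whenever $\theta^\top x \ge 0$.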
from sklearn.linear_model import LogisticRegression
reg = LogisticRegression()
reg.fit(X,y)
print(reg.intercept_, reg.coef_)
[-25.05219314] [[0.20535491 0.2005838 ]]
t0 = reg.intercept_[0]
t1 = reg.coef_[0,0]
t2 = reg.coef_[0,1]
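The decision boundary is the set of points where the score is exactly zero, $\theta_0 + \theta_1 x_1 + \theta_2 x_2 = 0$; solving for $x_2$ gives the line plotted below,

$$x_2 = -\frac{\theta_0}{\theta_2} - \frac{\theta_1}{\theta_2}\,x_1.$$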
l = X['Exam1'].min()
u = X['Exam1'].max()
x1 = np.array([l, u])
x2 = (-t0/t2) - (t1/t2)*x1
plt.scatter(X["Exam1"], X["Exam2"], c = y)
plt.plot(x1, x2)
plt.xlabel("Exam 1 Scores")
plt.ylabel("Exam 2 Scores")
plt.show()
# Predicting the admission decision for a student with Exam 1 score = 80 and Exam 2 score = 45
s1 = 80
s2 = 45
positive = t0 + t1*s1 + t2*s2 >= 0
if positive:
    print('Chances of admission are high.')
else:
    print('Admission is unlikely.')
Chances of admission are high.
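The sign check above is equivalent to asking whether the predicted probability is at least 0.5. A minimal sketch of the explicit probability, computed both from the fitted coefficients and from scikit-learn's predict_proba (the query DataFrame below is an illustrative construction):

# Sketch: explicit admission probability for the same scores (assumes t0, t1, t2, reg from above)
prob = 1/(1 + np.exp(-(t0 + t1*s1 + t2*s2)))
print("Estimated probability of admission:", prob)
# Column 1 of predict_proba is the probability of the positive (admitted) class
print(reg.predict_proba(pd.DataFrame([[s1, s2]], columns=["Exam1", "Exam2"]))[0, 1])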