See the previous post here.
Before moving on to exploring the normal equation, let's go over the basics of matrix and vector multiplication.
Matrices
A matrix is a two-dimensional array of numbers. The dimension of the matrix is expressed by the number of rows and columns.
For example, A below is a matrix with four rows and two columns (a 4 × 2 matrix):

A = [[a11, a12],
     [a21, a22],
     [a31, a32],
     [a41, a42]]

In mathematical notation a matrix is usually assigned to a variable denoted by an uppercase letter, to distinguish it from other quantities in an equation.
A numpy array can be obtained from a pandas dataset via the DataFrame attribute df.values:
def ex_3_16():
    '''Convert a DataFrame (heights and weights) to a numpy array'''
    df = swimmer_data()[['Height, cm', 'Weight']]
    return df.values
Executing this example produces the following two-dimensional array:
array([[166., 68.],
[192., 89.],
[173., 65.],
...,
[188., 79.],
[187., 78.],
[183., 70.]])
You can also use the np.array function from the numpy library, which takes a sequence of scalars (or a sequence of sequences) and converts it to a numpy array (an object of type numpy.ndarray). A one-dimensional array can likewise be pulled straight out of a single pandas column:
def ex_3_17():
    '''Convert a Series (heights) to a one-dimensional numpy array'''
    return swimmer_data()['Height, cm'].head(20).values  # first 20 values
As a result, we get the following one-dimensional array:
array([166., 192., 173., 179., 201., 190., 175., 160., 202., 173., 175.,
205., 185., 175., 185., 170., 165., 179., 165., 170.])
Note that, instead of the pandas functions head and tail, numpy arrays use slicing (e.g. result_array[:5]); we will make use of this later.

Both pandas and numpy support elementwise operations: addition, subtraction, multiplication, division, and functions such as log, exp, and sqrt. Moreover, numpy functions applied to a pandas DataFrame (or Series) preserve the pandas type where it makes sense (e.g. np.exp(df), np.asarray(df), df.T.dot(df)).
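As a quick illustration (a minimal sketch with made-up numbers, not from the original post):

import numpy as np
import pandas as pd

df = pd.DataFrame({'x': [1.0, 2.0], 'y': [3.0, 4.0]})
print(np.exp(df))    # elementwise exponent; the result is still a DataFrame
print(df.T.dot(df))  # 2 x 2 matrix product, also returned as a DataFrame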
The element in the i-th row and j-th column of a matrix A is written Aij.

In pandas and numpy the dimension of a matrix is given by the shape attribute, which returns a tuple: the number of rows and the number of columns.
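For instance (a small sketch with made-up values):

arr = np.array([[166., 68.], [192., 89.], [173., 65.]])
print(arr.shape)  # (3, 2): three rows, two columns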
Vectors

A vector is a matrix with a single column. For example:

y = [[y1],
     [y2],
     [y3],
     [y4]]

Here y is a four-dimensional vector; its i-th element is written yi. Note that, by convention, a vector is denoted by a lowercase letter, which distinguishes it from a matrix.

Like matrices, vectors can be constructed from sequences of numbers. In numpy and pandas this is done with np.array or with the Series and DataFrame constructors, which accept a sequence of scalars or a sequence of sequences.
Back to Python and pandas. Recall that the regression equation includes an intercept term; to express it in matrix form we add a constant column of ones, a bias column, to the data. For example:
'''Add a bias column of ones'''
df = pd.DataFrame({'x': [2, 3, 6, 7], 'y': [8, 7, 4, 3]})
df['bias'] = 1
df
   x  y  bias
0  2  8     1
1  3  7     1
2  6  4     1
3  7  3     1
The bias column lets us treat the intercept like any other coefficient: β1 is multiplied by a constant feature x1 that always equals 1. Here y is the dependent variable and x is the independent variable.
Addition and scalar multiplication

Matrices are added elementwise. Only matrices of the same dimensions can be added, and the result has those same dimensions.

Example in Python with pandas:
df1 = pd.DataFrame([[1,0],[2,5],[3,1]])
df2 = pd.DataFrame([[4,0.5],[2,5],[0,1]])
df1 + df2
   0     1
0  5   0.5
1  4  10.0
2  3   2.0
Note that pandas aligns DataFrames by their row and column labels when adding; elements without a matching label come out as NaN.
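A minimal sketch of that alignment behavior (my own toy example):

a = pd.DataFrame([[1, 2]], columns=['x', 'y'])
b = pd.DataFrame([[10, 20]], columns=['y', 'z'])
print(a + b)  # only 'y' aligns; 'x' and 'z' become NaN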
Multiplying a matrix by a scalar multiplies each of its elements by that number. In Python with pandas:
df1 * 3
   0   1
0  3   0
1  6  15
2  9   3
Matrix-vector multiplication

Matrix-vector multiplication is performed with the dot function. Multiplying a 3 × 2 matrix by a 2 × 1 vector produces a 3 × 1 vector, for example:

Each element of the product Ax is the dot product of the corresponding row of A with the vector x. The first row of A contains 1 and 3; multiplying it elementwise by x, which contains 1 and 5, and summing gives 1 · 1 + 3 · 5 = 16. Repeating this for the remaining rows produces the whole result vector.
In Python with pandas:
df3 = pd.DataFrame([[1,3],[0,4],[2,1]])
vec = [1,5]
df3.dot(vec)
0    16
1    20
2     7
dtype: int64
Matrix-matrix multiplication

Matrix-matrix multiplication generalizes the previous operation: each element of the product of A and B is the dot product of a row of A with a column of B.

The matrices must be conformable: if A has dimensions mA × nA and B has dimensions mB × nB, then nA must equal mB, and the product has dimensions mA × nB.

For example:

In pandas this is again done with the dot function:
df3 = pd.DataFrame([[1,3],[0,4],[2,1]])
df4 = pd.DataFrame([[1,0],[5,6]])
df3.dot(df4)
    0   1
0  16  18
1  20  24
2   7   6
The same multiplication using pure numpy:
np.matmul(df3, np.asarray(df4))
Transposition

Transposing a matrix turns its rows into columns and its columns into rows. The transpose of A is written AT. For example, in Python with pandas:
df3.T
0 1 2
0 1 0 2
1 3 4 1
Identity matrix

Some matrices play a special role in multiplication. The identity matrix is a square matrix with ones on the main diagonal and zeros everywhere else. In numpy it is created with the np.identity function, which takes the matrix size as an argument. Multiplying any matrix by a conformable identity matrix leaves it unchanged, just as multiplying a number by 1 does.

In Python with pandas:
df = pd.DataFrame(np.identity(5))
df
0 1 2 3 4
0 1.0 0.0 0.0 0.0 0.0
1 0.0 1.0 0.0 0.0 0.0
2 0.0 0.0 1.0 0.0 0.0
3 0.0 0.0 0.0 1.0 0.0
4 0.0 0.0 0.0 0.0 1.0
Inversion

If a square matrix A has an inverse, written A-1, it satisfies A · A-1 = I, where I is the identity matrix.

Not every matrix is invertible; only square matrices with a nonzero determinant have an inverse. To compute the inverse (or, more generally, the Moore-Penrose pseudo-inverse) we can use the np.linalg.pinv function from numpy.

In Python with pandas:
df5 = pd.DataFrame(np.random.rand(3, 3), list('abc'), list('xyz'))
print(df5)
df_inv = pd.DataFrame(np.linalg.pinv(df5.values), df5.columns, df5.index)
print(df_inv)
x y z
a 0.625754 0.385261 0.462726
b 0.615084 0.111360 0.255420
c 0.723909 0.270869 0.221620
a b c
x -1.451613 1.303231 1.528861
y 1.584699 -6.402303 4.070011
z 2.804750 3.568103 -5.456161
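As a quick sanity check (my addition, reusing df5 and df_inv from above), the product of the matrix and its inverse should be the identity, up to floating-point error:

print(np.allclose(df5.dot(df_inv), np.identity(3)))  # True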
We now have everything we need to implement the normal equation, which computes the regression coefficients β directly:

β = (XᵀX)⁻¹Xᵀy

In words: to obtain β, multiply the inverse of X transposed times X by X transposed and by y, where X is the matrix of independent variables (one row per observation, one column per feature) and y is the vector of dependent-variable values. The result β is the vector of least-squares coefficients.

In Python this looks as follows:
def normal_equation(x, y):
    '''Normal equation'''
    # Instead of numpy.linalg.inv(A) it is better to use
    # numpy.linalg.solve(A, I), where I is the identity matrix,
    # because solve relies on lapack's LU decomposition and is
    # numerically more stable
    xtx = np.matmul(x.T.values, x.values)
    # pseudo-inverse of xtx
    xtxi = np.matmul(np.linalg.inv(np.matmul(xtx.T, xtx)), xtx.T)
    xty = np.matmul(x.T.values, y.values)
    return np.matmul(xtxi, xty)
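For comparison (my addition, not part of the original post), the same least-squares solution can be obtained in one step from the pseudo-inverse of X itself:

def normal_equation_pinv(x, y):
    '''Equivalent solution via the Moore-Penrose pseudo-inverse of X'''
    return np.matmul(np.linalg.pinv(x.values), y.values)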
Let's check that it works, regressing the logarithm of weight on height:
def ex_3_18():
    '''Solve the normal equation for the swimmers data
       (log weight on height)'''
    df = swimmer_data()
    X = df[['Height, cm']]
    X.insert(0, 'bias', 1)
    y = df['Weight'].apply(np.log)
    return normal_equation(X, y)
The result:
array([ 1.69103131, 0.01429648])
These are the coefficients β1 and β2, i.e. the intercept and the slope for height. They agree with the values obtained earlier by ordinary least squares.
The beauty of the matrix approach is that it works unchanged with any number of independent variables. Let's add the swimmers' age to the model; with pandas this takes a single line.

(Terminology note: the independent variables of a regression model are commonly called features, the term used in machine learning; synonyms include predictor, regressor, independent variable, and explanatory variable.)

For example:
def ex_3_19():
    '''Convert the feature matrix (with a bias column) to a NumPy array'''
    X = swimmer_data()[['Height, cm', 'Age']]
    X.insert(0, 'bias', 1)
    return X.values
The result:
array([[ 1., 166., 23.],
[ 1., 192., 22.],
[ 1., 173., 20.],
...,
[ 1., 188., 24.],
[ 1., 187., 19.],
[ 1., 183., 22.]])
Now let's solve the normal equation with two features:
def ex_3_20():
    '''Multiple linear regression:
       the normal equation with two features'''
    df = swimmer_data()
    X = df[['Height, cm', 'Age']]
    X.insert(0, 'bias', 1)
    y = df['Weight'].apply(np.log)
    return normal_equation(X, y)
The result:
array([1.69002036, 0.01395437, 0.00279859])
We obtain the intercept plus coefficients for height (0.013954) and age (0.002799). To see how much the extra feature improved the fit, let's compute R2 for the new model.
Multiple R-squared
Recall that R2, the coefficient of determination, measures the share of the variance of the dependent variable that is explained by the model:

R2 = 1 − var(ε) / var(y)

where var(ε) is the variance of the residuals and var(y) is the variance of the dependent variable.

Using the pandas dot function, this is easy to express in matrix form:
def matrix_r_squared(coefs, x, y):
    '''Matrix-based R-squared'''
    fitted = x.dot(coefs)
    residuals = y - fitted
    difference = y - y.mean()
    rss = residuals.dot(residuals)    # residual sum of squares
    ess = difference.dot(difference)  # explained sum of squares
    return 1 - (rss / ess)
Here rss is the residual sum of squares (RSS), and ess is the explained sum of squares (ESS). Let's compute R2 for the two-feature model:
def ex_3_21():
    '''Matrix R-squared of the model
       with two features: height and age'''
    df = swimmer_data()
    X = df[['Height, cm', 'Age']]
    X.insert(0, 'bias', 1)
    y = df['Weight'].apply(np.log)
    beta = normal_equation(X, y)
    return matrix_r_squared(beta, X, y)
0.7568466547183842
We get 0.757: adding age has increased the R2 of the model. However, since R2 essentially never decreases when a feature is added, we need a measure that also accounts for model complexity.
Adjusted R-squared
As we keep adding independent variables, R2 can only grow or stay the same; even a completely random feature will not lower it, since its coefficient will simply end up near 0. So R2 by itself cannot tell us whether a new feature genuinely improves the model.

The adjusted coefficient of determination, written R̅2, corrects for the number of parameters. Unlike R2, R̅2 increases only when the new feature improves the model more than would be expected by chance:

R̅2 = 1 − (1 − R2) · (n − 1) / (n − p − 1)

where n is the number of observations and p is the number of coefficients:
def matrix_adj_r_squared(coefs, x, y):
    '''Matrix-based adjusted R-squared'''
    r_squared = matrix_r_squared(coefs, x, y)
    n = y.shape[0]      # number of observations
    p = coefs.shape[0]  # number of coefficients
    dec = lambda x: x - 1
    return 1 - (1 - r_squared) * (dec(n) / dec(n - p))
Since R̅2 depends on n and p, let's recompute the quality of our two-feature model:
def ex_3_22():
    '''Matrix adjusted R-squared of the model
       with two features: height and age'''
    df = swimmer_data()
    X = df[['Height, cm', 'Age']]
    X.insert(0, 'bias', 1)
    y = df['Weight'].apply(np.log)
    beta = normal_equation(X, y)
    return matrix_adj_r_squared(beta, X, y)
0.7559934850858171
The adjusted value, 0.756, is slightly lower than the plain R2 but still confirms that adding age improved the model.
The numpy and scipy libraries
Everything up to and including R̅2 we have computed by hand, but numpy and scipy provide ready-made functions, np.linalg.lstsq and stats.linregress, that solve the least-squares problem for us; they are well tested and optimized (stats.linregress handles the simple one-feature case).

The numpy function np.linalg.lstsq takes the matrix x and the vector y (plus an optional cutoff for small singular values, rcond). It returns a tuple containing the least-squares solution x, the sums of squared residuals residuals, the rank of the matrix rank, and its singular values s. We only need the first element of the tuple, the coefficient vector. Let's wrap the call in a small function, which we will also use below for the F-test:
def linear_model(x, y):
    '''Linear model:
       returns the least-squares coefficient estimates,
       a drop-in replacement for normal_equation'''
    return np.linalg.lstsq(x, y, rcond=-1)[0]
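To make the return value concrete, here is a toy example (my own made-up data) showing all four elements of the tuple:

A = np.array([[1., 1.], [1., 2.], [1., 3.]])
b = np.array([1., 2., 2.])
solution, residuals, rank, s = np.linalg.lstsq(A, b, rcond=-1)
print(solution)   # coefficients: intercept and slope
print(residuals)  # sum of squared residuals
print(rank)       # rank of A: 2
print(s)          # singular values of A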
The F-test of model significance
Recall that the F-test checks whether the model as a whole is statistically significant. The null hypothesis is that all of the model's coefficients, apart from the intercept, are zero, i.e. that the features have no effect on the dependent variable; the alternative is that at least one coefficient is nonzero:

H0: β1 = β2 = ... = βj = 0
H1: βj ≠ 0 for at least one j

where the βj are the model's coefficients other than the intercept. The F-statistic is the ratio of explained to unexplained variance, i.e. the mean squared model (MSM) divided by the mean squared error (MSE):

F = MSM / MSE

The mean squared model (MSM) equals the explained sum of squares (ESS) divided by the model degrees of freedom, the number of coefficients minus one. The mean squared error (MSE) equals the residual sum of squares (RSS) divided by the residual degrees of freedom, the number of observations minus the number of coefficients.

The F-statistic is then converted to a p-value using the F-distribution with the corresponding degrees of freedom:
def f_test(fitted, x, y):
    '''F-test of model significance'''
    difference = fitted - y.mean()
    residuals = y - fitted
    ess = difference.dot(difference)  # explained sum of squares
    rss = residuals.dot(residuals)    # residual sum of squares
    p = x.shape[1]                    # number of coefficients
    n = y.shape[0]                    # number of observations
    df1 = p - 1
    df2 = n - p
    msm = ess / df1
    mse = rss / df2
    f_stat = msm / mse                # F-statistic: msm / mse
    f_test = 1 - stats.f.cdf(f_stat, df1, df2)
    return f_test
def ex_3_23():
    '''F-test of the significance of the model
       with two features: height and age'''
    df = swimmer_data()
    X = df[['Height, cm', 'Age']]
    X.insert(0, 'bias', 1.0)
    y = df['Weight'].apply(np.log)
    beta = linear_model(X, y)
    fittedvalues = np.dot(X, beta)
    # the p-value of the F-test
    return ('F-test', f_test(fittedvalues, X, y))
('F-test', 1.1102230246251565e-16)
The p-value, 1.11 × 10⁻¹⁶, is vanishingly small, far below any conventional significance level, so we reject the null hypothesis and conclude that the model is significant.

Note that the F-test applies to models with any number of features. Even if every coefficient on its own were only marginally significant, say at the 50-percent level, the features taken together could still produce a significant F-test for the model as a whole.
Categorical and dummy variables

The swimmers' sex is not a number: in the data it takes one of two values, male or female. Features like this are called categorical: they take one of a finite set of values (categories), and there is no intrinsic ordering among them. To include such a feature in a regression, it must first be converted to a number.

Our feature has only two categories, so how do we encode it? The simplest way is to create a dummy variable: a variable that equals 1 when the observation belongs to the category and 0 otherwise. Because sex here is binary, a single dummy variable suffices; we will encode male as 1 and female as 0.

The choice is arbitrary: encoding female as 1 and male as 0 would work just as well; only the sign and interpretation of the resulting coefficient would change.
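A minimal sketch of the encoding (my own toy example; 'M' and 'F' stand in for whatever codes the dataset actually uses):

s = pd.Series(['M', 'F', 'M'])
print(s.map({'M': 1, 'F': 0}))          # manual dummy variable
print(pd.get_dummies(s, prefix='sex'))  # pandas' built-in alternative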
Let's compute the adjusted R̅2 of the model with the sex dummy variable added:
def ex_3_25():
    '''Adjusted R-squared of the model
       with a dummy variable (sex)'''
    df = swimmer_data()
    df['bin_sex'] = df['Sex'].map({'M': 1, 'F': 0}).astype(int)  # categorical -> numeric
    X = df[['Height, cm', 'Age', 'bin_sex']]
    X.insert(0, 'bias', 1)
    y = df['Weight'].apply(np.log)
    beta = linear_model(X, y)
    return matrix_adj_r_squared(beta, X, y)
0.8082954905432824
The adjusted R̅2 is now about 0.81: height, age, and sex together explain more than 80% of the variance in the swimmers' log weight.
Relative importance of the features

Now that the model contains several features, a natural question is: which of them matters most? R2 measures the quality of the model as a whole, but says nothing about the relative contribution of each feature.

We cannot simply compare the raw coefficients, because the features are measured on different scales: height in centimeters, age in years, and sex as 0 or 1. To make them comparable we compute standardized coefficients, also called beta weights.

The beta weight of a feature is its coefficient multiplied by the ratio of the feature's standard deviation to the standard deviation of the dependent variable. In Python:
def beta_weight(coefs, x, y):
    '''Beta weights (standardized coefficients)'''
    sdx = x.std()
    sdy = y.std()
    return [sd / sdy * c for sd, c in zip(sdx, coefs)]
def ex_3_26():
    '''Beta weights of the features in the model
       with height, age, and sex'''
    df = swimmer_data()
    # dummy variable for sex
    df['bin_sex'] = df['Sex'].map({'M': 1, 'F': 0}).astype(int)
    X = df[['Height, cm', 'Age', 'bin_sex']]
    X.insert(0, 'bias', 1)
    y = df['Weight'].apply(np.log)
    beta = linear_model(X, y)
    res = beta_weight(beta, X, y)
    return res
The result (one beta weight per column, starting with the bias):
[0.0, 0.6501469135033348, 0.05842998157513067, 0.30387262631851747]
The bias column gets a weight of zero, as it must: it is constant and explains nothing. Height, with a beta weight of 0.65, is by far the strongest predictor of weight, followed by sex (0.30) and age (0.06).

Let's try adding one more feature. The dataset also records each swimmer's date of birth. A raw date cannot go into a regression directly, but we can extract the year of birth as a number with the pandas function pd.to_datetime:
'''Convert a date-of-birth string to a year
   via pandas DateTime'''
str_to_year = lambda x: pd.to_datetime(x).year
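For example (the date string below is invented; the real format depends on the dataset):

print(str_to_year('1990-05-12'))  # 1990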
def ex_3_27():
    '''Beta weights of the model
       with year of birth added'''
    df = swimmer_data()
    df['bin_sex'] = df['Sex'].map({'M': 1, 'F': 0}).astype(int)
    df['Year of birth'] = df['Date of birth'].map(str_to_year)
    X = df[['Height, cm', 'Age', 'bin_sex', 'Year of birth']]
    X.insert(0, 'bias', 1.0)
    y = df['Weight'].apply(np.log)
    beta = linear_model(X, y)
    return beta_weight(beta, X, y)
[-0.0,
0.650070475196164,
0.09580282723307212,
0.3041431115029873,
0.03769748899125406]
The new feature, year of birth, gets a beta weight of only 0.038, smaller than that of sex (0.30) and far smaller than height, which stays at 0.65. But notice that adding it changed the weight of age from 0.058 to 0.096. When the coefficients of existing features shift as a new one is added, something odd is going on: it suggests the features are not independent of one another.

Let's plot year of birth against age to check:
def ex_3_28():
    '''Scatter plot of age vs. year of birth'''
    df = swimmer_data()
    df['Year of birth'] = df['Date of birth'].map(str_to_year)
    xs = df['Age'].apply(jitter(0.5))
    ys = df['Year of birth']
    pd.DataFrame(np.array([xs, ys]).T).plot.scatter(0, 1, s=3, grid=True)
    plt.xlabel('Age')
    plt.ylabel('Year of birth')
    #saveplot('ex_3_28.png')
    plt.show()
The resulting scatter plot (the ages are jittered to separate overlapping points) shows a clear linear relationship between the two variables.

This is hardly surprising: a swimmer's age is determined almost entirely by the year of birth, so the two features are almost perfectly correlated.
Multiple linear regression assumes that, while the features may each be correlated with the dependent variable, they are independent of one another. When two or more features are strongly correlated, this assumption is violated; the situation is called multicollinearity.

With collinear features, the model cannot tell which one deserves the credit: should the change in weight be attributed to age, or to year of birth? If all we need is prediction, this may not matter much, since the fitted values can still be accurate. But if we want to interpret the coefficients, multicollinearity is a real problem: even with an overall fit around 0.8, the individual estimates for the collinear features become unstable and unreliable.

One way to detect the problem is to regress each feature on all the others: an R2 close to 1.0 in such a regression means the feature is nearly a linear combination of the rest and is therefore redundant.
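A quick way to confirm the collinearity directly (my addition; column names follow the conventions used in this post) is to compute the correlation between the two features:

df = swimmer_data()
df['Year of birth'] = df['Date of birth'].map(str_to_year)
print(df['Age'].corr(df['Year of birth']))  # close to -1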
Other warning signs of multicollinearity to watch for:

A coefficient changes markedly when features are added to or removed from the model.

A coefficient has an implausible sign, for example negative where the effect should clearly be positive.

A coefficient has an implausibly large magnitude or standard error.

The model as a whole is significant by the F-test, yet the individual coefficients are not.
Since age and year of birth carry essentially the same information, we should keep only one of them. To decide which, we can regress log weight on each feature separately and compare the fits:

'Age' alone gives R2 = 0.1049, while 'year of birth' gives R2 = 0.1050.

Each explains roughly 10% of the variance in log weight on its own; since year of birth does marginally better, we keep it and drop 'age'.
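A sketch of that comparison (my addition, reusing the helpers defined earlier in this post):

def single_feature_r_squared(feature):
    '''R-squared of a one-feature model for the given column'''
    df = swimmer_data()
    df['Year of birth'] = df['Date of birth'].map(str_to_year)
    X = df[[feature]]
    X.insert(0, 'bias', 1.0)
    y = df['Weight'].apply(np.log)
    beta = linear_model(X, y)
    return matrix_r_squared(beta, X, y)

print(single_feature_r_squared('Age'))            # ~0.105
print(single_feature_r_squared('Year of birth'))  # ~0.105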
The source code examples for this post are in my GitHub repo. All source data is taken from the book author's repository.
The next short post, post #4, will look at the prediction process.