Python, correlation and regression: part 3

Before moving on to exploring the normal equation, let's go over the basics of matrix and vector multiplication.


A matrix is a two-dimensional array of numbers. The dimension of a matrix is expressed as its number of rows and columns.

For example, $A$ is a four-row, two-column matrix:

In mathematical notation, the matrix is ​​usually assigned to a variable, which is denoted by an uppercase letter, in order to distinguish it from others in an equation.

A numpy array can be obtained from a pandas DataFrame through its `values` attribute (`df.values`):


def ex_3_16():
    '''Select columns from swimmer_data() and return them as a numpy array.'''
    # NOTE(review): the column labels below look truncated (text appears to
    # have been lost from the string literals) -- restore them before running.
    df = swimmer_data()[[', ', '']]
    return df.values

As a result of executing this example, we get the following two-dimensional array:

array([[166.,  68.],
       [192.,  89.],
       [173.,  65.],
       [188.,  79.],
       [187.,  78.],
       [183.,  70.]])

You can also use the numpy function `np.array`, which takes a sequence of scalars or a sequence of sequences and, if possible, converts it to an array of type `numpy.ndarray` (one-dimensional for a flat sequence of scalars):


def ex_3_17():
    '''Return the first 20 values of one swimmer_data() column as a numpy array.'''
    # NOTE(review): the column label looks truncated (text appears to have
    # been lost from the string literal) -- restore it before running.
    return swimmer_data()[', '].head(20).values  # first 20 rows only

As a result, we get the following one-dimensional array:

array([166., 192., 173., 179., 201., 190., 175., 160., 202., 173., 175.,
       205., 185., 175., 185., 170., 165., 179., 165., 170.])

, , , , pandas head tail

numpy (result_array[:5]

); .

pandas numpy , , log, exp, sqrt ., /. , numpy DataFrame ( Series) pandas , . , np.exp(df), np.asarray(df),

i- j- Aij. :

$A_{31} = 2$

. pandas numpy shape

, : , .

— , . :

— 4- ; i- yi. , , .

,   , . , .

numpy pandas ( Series DataFrame ) , , np.array .

, Python pandas. , , , . :

'''     ()'''
# Two data columns, then one extra column filled with the constant 1.
df = pd.DataFrame(data={'x': [2, 3, 6, 7], 'y': [8, 7, 4, 3]})
# NOTE(review): the empty label looks like a stripped column name.
df[''] = 1

	x	y	
0	2	8	1
1	3	7	1
2	6	4	1
3	7	3	1

, . , β1  , , x1  . , x .

— . , .

- . . , , .

Python pandas:

# Two 3x2 tables with matching labels; `+` adds them element by element.
df1 = pd.DataFrame(data=[[1, 0], [2, 5], [3, 1]])
df2 = pd.DataFrame(data=[[4, 0.5], [2, 5], [0, 1]])
df1 + df2

 	0	   1
0	5	 0.5
1	4	10.0
2	3	 2.0

, pandas , , . , .

Python pandas:

# Multiplying by a scalar scales every element of df1.
3 * df1

	0	 1
0	3	 0
1	6	15
2	9	 3



. , 3 × 2  2 × 1  3 × 1, :

Ax  x  . , A  1 3. x: 1 5. , 16. , , , , .

Python pandas:

# Matrix-vector product: each row of df3 is dotted with vec.
df3 = pd.DataFrame([[1, 3], [0, 4], [2, 1]])
vec = [1, 5]
df3.dot(vec)

0    16
1    20
2     7
dtype: int64


- - . A  B , , .

, , . A mA × nA, B — mB × nB, , nA  mB  .


, . pandas numpy , .

# Matrix-matrix product: a 3x2 table times a 2x2 table gives a 3x2 table.
df3 = pd.DataFrame([[1, 3], [0, 4], [2, 1]])
df4 = pd.DataFrame([[1, 0], [5, 6]])
df3.dot(df4)

	0    1
0	16	18
1	20	24
2	 7	 6



, . AT:

, :

, :


Python pandas:


	0	1	2
0	1	0	2
1	3	4	1

. , . :

— ( ). 1, .

numpy np.identity

, . , , , .

Python pandas:

# 5x5 identity matrix (ones on the diagonal, zeros elsewhere) as a DataFrame.
df = pd.DataFrame(data=np.identity(5))

	  0	  1	  2	  3	  4
0	1.0	0.0	0.0	0.0	0.0
1	0.0	1.0	0.0	0.0	0.0
2	0.0	0.0	1.0	0.0	0.0
3	0.0	0.0	0.0	1.0	0.0
4	0.0	0.0	0.0	0.0	1.0

A, A  A-1, , I — :

. . . np.linalg.pinv


Python pandas:

# 3x3 table of random numbers; rows are labelled a-c and columns x-z.
df5 = pd.DataFrame(np.random.rand(3, 3), list('abc'), list('xyz'))
# Pseudo-inverse of df5; its row/column labels are swapped relative to df5.
df_inv = pd.DataFrame(np.linalg.pinv(df5.values), df5.columns, df5.index)

          x         y         z
a  0.625754  0.385261  0.462726
b  0.615084  0.111360  0.255420
c  0.723909  0.270869  0.221620
          a         b         c
x -1.451613  1.303231  1.528861
y  1.584699 -6.402303  4.070011
z  2.804750  3.568103 -5.456161

, , . , :

« β  X, X  y», — ( ) — , . β  . , , .

Python, , :

def normal_equation(x, y):
    '''Estimate least-squares coefficients with the normal equation.

    Solves (X^T X) beta = X^T y for beta.

    x: design matrix (pandas DataFrame or array-like), one row per observation
    y: response values (pandas Series or array-like), one per observation
    Returns a numpy array with one coefficient per column of x.
    Raises numpy.linalg.LinAlgError when X^T X is singular.
    '''
    x_mat = np.asarray(x)
    y_vec = np.asarray(y)
    xtx = np.matmul(x_mat.T, x_mat)
    xty = np.matmul(x_mat.T, y_vec)
    # Solve the linear system directly instead of inverting
    # (X^T X)^T (X^T X): that product squares the condition number of
    # X^T X and needlessly amplifies rounding error.
    return np.linalg.solve(xtx, xty)

. ( ):

def ex_3_18():
    '''Fit the log of one swimmer_data() column with normal_equation.'''
    data = swimmer_data()
    # NOTE(review): column labels look truncated (text appears to have been
    # lost from the string literals) -- restore them before running.
    features = data[[', ']]
    features.insert(0, '', 1)  # constant column goes first
    log_target = data[''].apply(np.log)
    return normal_equation(features, log_target)


array([1.69103131, 0.01429648])

β1  β2, . , , .

, , , . , . pandas .

(-, . . « » « machine learning?») «», . feature. «», «», «», « », « ».


def ex_3_19():
    '''Build a feature matrix with a leading constant column as a NumPy array.'''
    # NOTE(review): column labels look truncated (text appears to have been
    # lost from the string literals) -- restore them before running.
    X = swimmer_data()[[', ', '']]
    X.insert(0, '', 1)  # constant column goes first
    return X.values


array([[  1., 166.,  23.],
       [  1., 192.,  22.],
       [  1., 173.,  20.],
       [  1., 188.,  24.],
       [  1., 187.,  19.],
       [  1., 183.,  22.]])

- :

def ex_3_20():
    '''Fit the log of one swimmer_data() column on several features.'''
    data = swimmer_data()
    # NOTE(review): column labels look truncated (text appears to have been
    # lost from the string literals) -- restore them before running.
    features = data[[', ', '']]
    features.insert(0, '', 1)  # constant column goes first
    log_target = data[''].apply(np.log)
    return normal_equation(features, log_target)


array([1.69002036, 0.01395437, 0.00279859])

, ( ) (0.013954) (0.002799). R2  .


R2  , , :

, — , var(ε)  var(y) R2:

. pandas dot

, R- :

def matrix_r_squared(coefs, x, y):
    '''Compute R-squared for a linear model in matrix form.

    coefs: coefficient vector, one entry per column of x
    x:     design matrix (anything with a .dot method, e.g. a DataFrame)
    y:     observed responses (e.g. a Series)
    Returns 1 - rss / ess.
    '''
    fitted = x.dot(coefs)            # model predictions
    residuals = y - fitted
    difference = y - y.mean()
    rss = residuals.dot(residuals)   # sum of squared residuals
    # Sum of squared deviations of y from its mean (kept under the
    # original name `ess`).
    ess = difference.dot(difference)
    return 1 - (rss / ess)

rss , . residual sum of squares (RSS), ess — , . explained sum of squares (ESS). R2  :

def ex_3_21():
    '''Compute R-squared for the normal-equation fit on swimmer_data().'''
    df = swimmer_data()
    # NOTE(review): column labels look truncated (text appears to have been
    # lost from the string literals) -- restore them before running.
    X = df[[', ', '']]
    X.insert(0, '', 1)  # constant column goes first
    y = df[''].apply(np.log)
    beta = normal_equation(X, y)
    return matrix_r_squared(beta, X, y)


0.757. R2  . , , R2  .


  , R2  . — , 0, R2  , .

, . , , , 2. R2, 2  , R2 , :

def matrix_adj_r_squared(coefs, x, y):
    '''Compute adjusted R-squared in matrix form.

    Scales the unexplained share (1 - R^2) by (n - 1) / (n - p - 1), where
    n is the number of observations and p the number of coefficients.
    '''
    n_obs = y.shape[0]
    # NOTE(review): p counts every entry of coefs; confirm whether a constant
    # term should be included in this count.
    n_coefs = coefs.shape[0]
    unexplained = 1 - matrix_r_squared(coefs, x, y)
    return 1 - unexplained * ((n_obs - 1) / (n_obs - n_coefs - 1))

2  , p, :

def ex_3_22():
    '''Compute adjusted R-squared for the normal-equation fit on swimmer_data().'''
    df = swimmer_data()
    # NOTE(review): column labels look truncated (text appears to have been
    # lost from the string literals) -- restore them before running.
    X = df[[', ', '']]
    X.insert(0, '', 1)  # constant column goes first
    y = df[''].apply(np.log)
    beta = normal_equation(X, y)
    return matrix_adj_r_squared(beta, X, y)


0.756. - , .

numpy scipy

2  , , numpy scipy np.linalg.lstsq


, , . , .

, numpy np.linalg.lstsq

x  y ( , ). x

, , residuals

, rank


. , . , , F-:

def linear_model(x, y):
    '''Return the least-squares coefficients of y on x via np.linalg.lstsq.'''
    # lstsq returns (solution, residuals, rank, singular values); only the
    # solution vector is needed here.
    coefs, _residuals, _rank, _singular = np.linalg.lstsq(x, y, rcond=-1)
    return coefs


, F- , . , - , .


j — , .. . F- () . , . mean squared model (MSM) , . mean square error (MSE):

(MSM) (ESS) , — . (MSE) (RSS) , — .

F- F-, :

def f_test(fitted, x, y):
    '''Return the p-value of the overall F-test for a fitted linear model.

    fitted: model predictions, one per observation
    x:      design matrix; only x.shape[1] (its number of columns) is used
    y:      observed responses
    '''
    difference = fitted - y.mean()
    residuals = y - fitted
    ess = difference.dot(difference)   # explained sum of squares
    rss = residuals.dot(residuals)     # residual sum of squares
    p = x.shape[1]    # number of columns of x
    n = y.shape[0]    # number of observations
    df1 = p - 1
    df2 = n - p
    msm = ess / df1   # mean squared model
    mse = rss / df2   # mean squared error
    f_stat = msm / mse
    # Upper-tail probability of the F distribution. 1 - cdf is kept so the
    # result matches the published output; stats.f.sf(f_stat, df1, df2) is
    # the more precise form for very small p-values.
    p_value = 1 - stats.f.cdf(f_stat, df1, df2)
    return p_value

def ex_3_23():
    '''Fit the linear model on swimmer_data() and run the F-test on it.

    Returns a (label, p-value) tuple.
    '''
    df = swimmer_data()
    # NOTE(review): column labels look truncated (text appears to have been
    # lost from the string literals) -- restore them before running.
    X = df[[', ', '']]
    X.insert(0, '', 1.0)  # constant column goes first
    y = df[''].apply(np.log)
    beta = linear_model(X, y)
    fittedvalues = np.dot(X, beta)  # predictions of the fitted model

    return ('F-', f_test(fittedvalues, X, y))

('F-', 1.1102230246251565e-16)

 1.11×10⁻¹⁶. , , , , .

, F- , . , , - , F- , 50%- .

«» , . , «» «». : , () . , , , .

. ? , .

, .. . , , , , .

, , , . , , , 0 1.

, , , . .. 1 0 — .

, 0 — 1 , .

2, :

def ex_3_25():
    '''Adjusted R-squared of a model that adds a 0/1 dummy-coded column.'''
    df = swimmer_data()
    # NOTE(review): column labels and mapping keys look truncated (text
    # appears to have been lost from the string literals) -- restore them
    # before running.
    df['_'] = df[''].map({'': 1, '': 0}).astype(int)  # category -> integer

    X = df[[', ', '', '_']]
    X.insert(0, '', 1)  # constant column goes first
    y = df[''].apply(np.log)
    beta = linear_model(X, y)
    return matrix_adj_r_squared(beta, X, y)


0.809. , , , 80% .

, , , : , ? R2 , ,   , .

, , , : , , 0 1.

, , -.

- . Python:

def beta_weight(coefs, x, y):
    '''Rescale each coefficient by std(column of x) / std(y).

    Returns a list with one weight per (column, coefficient) pair.
    '''
    sd_features = x.std()
    sd_target = y.std()
    return [sd / sd_target * coef for sd, coef in zip(sd_features, coefs)]

def ex_3_26():
    '''Beta weights of the linear model that includes the dummy-coded column.'''
    df = swimmer_data()
    # NOTE(review): column labels and mapping keys look truncated (text
    # appears to have been lost from the string literals) -- restore them
    # before running.
    df['_'] = df[''].map({'': 1, '': 0}).astype(int)
    X = df[[', ', '', '_']]
    X.insert(0, '', 1)  # constant column goes first
    y = df[''].apply(np.log)
    beta = linear_model(X, y)
    res = beta_weight(beta, X, y)
    return res

( ):

[0.0, 0.6501469135033348, 0.05842998157513067, 0.30387262631851747]

, , . , 0.65 .

, .

, « », . , , . , , pandas pd.to_datetime


# Convert a date string to DateTime and extract its year
def str_to_year(x):
    '''Parse x with pd.to_datetime and return the year of the result.'''
    return pd.to_datetime(x).year

def ex_3_27():
    '''Beta weights of the model extended with a year extracted from a date column.'''
    data = swimmer_data()
    # NOTE(review): column labels and mapping keys look truncated (text
    # appears to have been lost from the string literals) -- restore them
    # before running.
    data['_'] = data[''].map({'': 1, '': 0}).astype(int)
    data[' '] = data[' '].map(str_to_year)
    design = data[[', ', '', '_', ' ']]
    design.insert(0, '', 1.0)  # constant column goes first
    log_target = data[''].apply(np.log)
    coefs = linear_model(design, log_target)
    return beta_weight(coefs, design, log_target)


« » - 0.038, «», . , «» 0.096. 65%, « ». , , , .

« », . :

def ex_3_28():
    '''Scatter-plot a jittered swimmer_data() column against the extracted year.'''
    data = swimmer_data()
    # NOTE(review): column labels look truncated (text appears to have been
    # lost from the string literals) -- restore them before running.
    data[' '] = data[' '].map(str_to_year)
    jittered = data[''].apply(jitter(0.5))
    years = data[' ']
    points = pd.DataFrame(np.array([jittered, years]).T)
    points.plot.scatter(0, 1, s=3, grid=True)
    plt.ylabel(' ')

( ) . , :

, , — . , .

. , , . , .

, , , . , , ? , .

: , , . , , R2  .

, , . , 0.8 . , , , .

. R2  1.0, . R2  .


  • . .

  • . , , .

  • ( ).

  • . , - .

, , . , , .

«» R2 = 0.1049, « » R2 = 0.1050.

, , 10% . , «».

