Python, correlation and regression: part 3

See the previous post here.





Before moving on to exploring the normal equation, let's go over the basics of matrix and vector multiplication.





Matrices

A matrix is a two-dimensional array of numbers. The dimension of a matrix is expressed as its number of rows and columns.





For example, the following is a four-row, two-column matrix:





In mathematical notation, a matrix is usually assigned to a variable denoted by an uppercase letter, in order to distinguish it from the other variables in an equation.





A numpy array can be constructed from a dataset using the pandas attribute `df.values`:





def ex_3_16():
    '''Return two columns of the swimmer dataset as a numpy array.

    The columns are selected from the frame produced by swimmer_data() and
    exposed through the frame's ``values`` attribute.

    NOTE(review): the column labels below look truncated (their non-ASCII
    text appears to have been lost); confirm them against the dataset header.
    '''
    selected_columns = [', ', '']
    subset = swimmer_data()[selected_columns]
    return subset.values
      
      



As a result of executing this example, we get the following two-dimensional array:





array([[166.,  68.],
       [192.,  89.],
       [173.,  65.],
       ...,
       [188.,  79.],
       [187.,  78.],
       [183.,  70.]])
      
      



You can also use the numpy function `np.array`, which takes a sequence of scalars or a sequence of sequences and, if possible, converts them to an array (of type `numpy.ndarray`). A single column of the dataset, for example, yields a one-dimensional array:





def ex_3_17():
    '''Return the first 20 values of one swimmer-dataset column as a numpy array.

    NOTE(review): the column label looks truncated (non-ASCII text appears to
    have been lost); confirm it against the dataset header.
    '''
    column = swimmer_data()[', ']
    leading_rows = column.head(20)  # keep only the first 20 entries
    return leading_rows.values
      
      



As a result, we get the following one-dimensional array:





array([166., 192., 173., 179., 201., 190., 175., 160., 202., 173., 175.,
       205., 185., 175., 185., 170., 165., 179., 165., 170.])
      
      



, , , , pandas head tail



numpy (result_array[:5]



); .





pandas numpy , , log, exp, sqrt ., /. , numpy DataFrame ( Series) pandas , . , np.exp(df), np.asarray(df), df.T.dot(df)).





i- j- Aij. :





$A_{31} = 2$

. pandas numpy shape



, : , .





— , . :





— 4- ; i- yi. , , .





,   , . , .





numpy pandas ( Series DataFrame ) , , np.array .





, Python pandas. , , , . :





'''Build a small data frame and append a column filled with ones.'''
# Two numeric columns, four rows.
df = pd.DataFrame({'x':[2, 3, 6, 7],'y':[8, 7, 4, 3]})
# Assigning a scalar broadcasts it to every row of the new column.
# NOTE(review): the new column's label is an empty string here; its original
# (non-ASCII) text appears to have been lost — confirm the intended name.
df[''] = 1
# Last expression of the cell: displays the resulting frame.
df
      
      



	x	y	
0	2	8	1
1	3	7	1
2	6	4	1
3	7	3	1
      
      



, . , β1  , , x1  . , x .





— . , .





- . . , , .





Python pandas:





# Two 3x2 frames with identical default row/column labels.
df1 = pd.DataFrame([[1,0],[2,5],[3,1]])
df2 = pd.DataFrame([[4,0.5],[2,5],[0,1]])
# Element-wise matrix addition: entries at the same position are summed.
df1 + df2
      
      



 	0	   1
0	5	 0.5
1	4	10.0
2	3	 2.0
      
      



, pandas , , . , .





Python pandas:





# Scalar multiplication: every element of df1 is multiplied by 3.
df1 * 3
      
      



	0	 1
0	3	 0
1	6	15
2	9	 3
      
      



-

dot



. , 3 × 2  2 × 1  3 × 1, :





Ax  x  . , A  1 3. x: 1 5. , 16. , , , , .





Python pandas:





# A 3x2 matrix and a length-2 vector.
df3 = pd.DataFrame([[1,3],[0,4],[2,1]])
vec = [1,5]
# Matrix-vector product: each row of df3 is dotted with vec, giving 3 values.
df3.dot(vec)
      
      



0    16
1    20
2     7
dtype: int64
      
      



-

- - . A  B , , .





, , . A mA × nA, B — mB × nB, , nA  mB  .





:





, . pandas numpy , .





# A 3x2 matrix and a 2x2 matrix: the inner dimensions (2 and 2) agree.
df3 = pd.DataFrame([[1,3],[0,4],[2,1]])
df4 = pd.DataFrame([[1,0],[5,6]])     
# Matrix-matrix product; the result has shape 3x2.
df3.dot(df4)
      
      



	0    1
0	16	18
1	20	24
2	 7	 6
      
      



numpy:





# The same product computed with numpy; df4 is converted to an ndarray first.
np.matmul(df3,np.asarray(df4))
      
      



, . AT:





, :





, :





:





Python pandas:





# Transpose: rows become columns, so the 3x2 frame becomes 2x3.
df3.T
      
      



	0	1	2
0	1	0	2
1	3	4	1
      
      



. GitHub .





. , . :





— ( ). 1, .





numpy np.identity



, . , , , .





Python pandas:





# np.identity(5) builds the 5x5 identity matrix: ones on the main diagonal,
# zeros everywhere else. Wrapping it in a DataFrame is only for display.
df = pd.DataFrame(np.identity(5))
df
      
      



	  0	  1	  2	  3	  4
0	1.0	0.0	0.0	0.0	0.0
1	0.0	1.0	0.0	0.0	0.0
2	0.0	0.0	1.0	0.0	0.0
3	0.0	0.0	0.0	1.0	0.0
4	0.0	0.0	0.0	0.0	1.0
      
      



A, A  A-1, , I — :





. . . np.linalg.pinv



numpy.





Python pandas:





# 3x3 frame of random numbers with row labels a, b, c and column labels x, y, z.
df5 = pd.DataFrame(np.random.rand(3, 3), list('abc'), list('xyz'))
print(df5)
# np.linalg.pinv computes the pseudo-inverse of the underlying array.
# The labels are swapped on purpose: the result's rows are df5's columns and
# its columns are df5's rows.
df_inv = pd.DataFrame(np.linalg.pinv(df5.values), df5.columns, df5.index)
print(df_inv)
      
      



          x         y         z
a  0.625754  0.385261  0.462726
b  0.615084  0.111360  0.255420
c  0.723909  0.270869  0.221620
          a         b         c
x -1.451613  1.303231  1.528861
y  1.584699 -6.402303  4.070011
z  2.804750  3.568103 -5.456161
      
      



, , . , :





« β  X, X  y», — ( ) — , . β  . , , .





Python, , :





def normal_equation(x, y):
    '''Estimate least-squares coefficients by solving the normal equation.

    Finds the vector beta that satisfies (X^T X) beta = X^T y.

    The system is solved directly with numpy.linalg.solve instead of forming
    an explicit (pseudo-)inverse of X^T X. Building (X^T X)^T (X^T X) and
    inverting it squares an already squared condition number, which throws
    away accuracy even for mildly collinear design matrices.

    Parameters
    ----------
    x : pandas.DataFrame or array-like, shape (n, p)
        Design matrix, one row per observation. Include a column of ones if
        an intercept is wanted.
    y : pandas.Series or array-like, shape (n,)
        Response values.

    Returns
    -------
    numpy.ndarray
        The p estimated coefficients, in the column order of ``x``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If X^T X is singular (for example, perfectly collinear columns).
    '''
    # Work in floating point so integer inputs cannot overflow in the products.
    design = np.asarray(x, dtype=float)
    target = np.asarray(y, dtype=float)
    xtx = design.T @ design   # X^T X, shape (p, p)
    xty = design.T @ target   # X^T y, shape (p,)
    return np.linalg.solve(xtx, xty)
      
      



. ( ):





def ex_3_18():
    '''Fit a one-predictor model of a log-transformed response.

    Builds a design matrix from one swimmer-dataset column plus a constant
    column inserted at position 0, takes the natural log of the response
    column, and returns the result of normal_equation for the two.

    NOTE(review): the column labels look truncated (non-ASCII text appears
    to have been lost); confirm them against the dataset header.
    '''
    swimmers = swimmer_data()
    design = swimmers[[', ']]
    design.insert(0, '', 1)  # constant (bias) column goes first
    log_response = swimmers[''].apply(np.log)
    return normal_equation(design, log_response)
      
      



:





array([1.69103131, 0.01429648])
      
      



β1  β2, . , , .





, , , . , . pandas .





(-, . . « » « machine learning?») «», . feature. «», «», «», « », « ».





:





def ex_3_19():
    '''Return a design matrix for the swimmer data as a numpy array.

    Two dataset columns are selected and a column of ones is inserted in
    front of them; the frame's ``values`` are returned.

    NOTE(review): the column labels look truncated (non-ASCII text appears
    to have been lost); confirm them against the dataset header.
    '''
    predictors = [', ', '']
    design = swimmer_data()[predictors]
    design.insert(0, '', 1)  # constant (bias) column goes first
    return design.values
      
      



:





array([[  1., 166.,  23.],
       [  1., 192.,  22.],
       [  1., 173.,  20.],
       ...,
       [  1., 188.,  24.],
       [  1., 187.,  19.],
       [  1., 183.,  22.]])
      
      



- :





def ex_3_20():
    '''Fit a two-predictor model of a log-transformed response.

    Builds a design matrix from two swimmer-dataset columns plus a constant
    column inserted at position 0, and passes it with the natural log of the
    response column to normal_equation.

    NOTE(review): the column labels look truncated (non-ASCII text appears
    to have been lost); confirm them against the dataset header.
    '''
    swimmers = swimmer_data()
    design = swimmers[[', ', '']]
    design.insert(0, '', 1)  # constant (bias) column goes first
    log_response = swimmers[''].apply(np.log)
    return normal_equation(design, log_response)
      
      



:





array([1.69002036, 0.01395437, 0.00279859])
      
      



, ( ) (0.013954) (0.002799). R2  .





R-

R2  , , :





, — , var(ε)  var(y) R2:





. pandas dot



, R- :





def matrix_r_squared(coefs, x, y):
    '''Compute R-squared for a fitted linear model in matrix form.

    Parameters
    ----------
    coefs : coefficient vector, one entry per column of ``x``
    x     : design matrix (must provide ``dot``)
    y     : observed responses

    Returns
    -------
    1 minus the ratio of the residual sum of squares to the sum of squared
    deviations of ``y`` from its mean.
    '''
    predicted = x.dot(coefs)
    errors = y - predicted            # what the model fails to reproduce
    centered = y - y.mean()           # deviations of y from its own mean
    unexplained = errors @ errors     # residual sum of squares
    total = centered @ centered       # total sum of squares about the mean
    return 1 - unexplained / total
      
      



rss , . residual sum of squares (RSS), ess — , . explained sum of squares (ESS). R2  :





def ex_3_21():
    '''Compute R-squared of the two-predictor model of a log-transformed response.

    The coefficients come from normal_equation and are scored with
    matrix_r_squared on the same design matrix and response.

    NOTE(review): the column labels look truncated (non-ASCII text appears
    to have been lost); confirm them against the dataset header.
    '''
    swimmers = swimmer_data()
    design = swimmers[[', ', '']]
    design.insert(0, '', 1)  # constant (bias) column goes first
    log_response = swimmers[''].apply(np.log)
    coefficients = normal_equation(design, log_response)
    return matrix_r_squared(coefficients, design, log_response)
      
      



0.7568466547183842
      
      



0.757. R2  . , , R2  .





R-

  , R2  . — , 0, R2  , .





, . , , , 2. R2, 2  , R2 , :





def matrix_adj_r_squared(coefs, x, y):
    '''Compute adjusted R-squared for a fitted linear model in matrix form.

    The plain R-squared from matrix_r_squared is penalised by the factor
    (n - 1) / (n - p - 1), where n is the number of observations and p is
    the length of ``coefs``.

    NOTE(review): p counts every coefficient, so it includes the intercept
    when ``x`` carries a constant column; the usual textbook formula counts
    predictors only — confirm which is intended.
    '''
    plain_r2 = matrix_r_squared(coefs, x, y)
    n_obs = y.shape[0]
    n_coefs = coefs.shape[0]
    penalty = (n_obs - 1) / (n_obs - n_coefs - 1)
    return 1 - (1 - plain_r2) * penalty
      
      



2  , p, :





def ex_3_22():
    '''Compute adjusted R-squared of the two-predictor model of a log-transformed response.

    The coefficients come from normal_equation and are scored with
    matrix_adj_r_squared on the same design matrix and response.

    NOTE(review): the column labels look truncated (non-ASCII text appears
    to have been lost); confirm them against the dataset header.
    '''
    swimmers = swimmer_data()
    design = swimmers[[', ', '']]
    design.insert(0, '', 1)  # constant (bias) column goes first
    log_response = swimmers[''].apply(np.log)
    coefficients = normal_equation(design, log_response)
    return matrix_adj_r_squared(coefficients, design, log_response)
      
      



0.7559934850858171
      
      



0.756. - , .





numpy scipy

2  , , numpy scipy np.linalg.lstsq



stats.linregress



, , . , .





, numpy np.linalg.lstsq



x  y ( , ). x



, , residuals



, rank



s



. , . , , F-:





def linear_model(x, y):
    '''Fit a linear model by least squares and return its coefficients.

    Thin wrapper around numpy.linalg.lstsq that keeps only the solution
    vector, so it can be used wherever normal_equation is used.
    ``rcond=-1`` is passed through to lstsq unchanged.
    '''
    solution, _residuals, _rank, _singular_values = np.linalg.lstsq(
        x, y, rcond=-1)
    return solution
      
      



F-

, F- , . , - , .





:





j — , .. . F- () . , . mean squared model (MSM) , . mean square error (MSE):





(MSM) (ESS) , — . (MSE) (RSS) , — .





F- F-, :





def f_test(fitted, x, y):
    '''Return the p-value of the overall F-test for a fitted linear model.

    Parameters
    ----------
    fitted : array-like, shape (n,)
        Values predicted by the model.
    x : array-like, shape (n, p)
        Design matrix; only its column count p is used.
    y : pandas.Series or numpy.ndarray, shape (n,)
        Observed responses.

    Returns
    -------
    float
        Upper-tail probability of the F statistic MSM / MSE with
        (p - 1, n - p) degrees of freedom.
    '''
    explained = fitted - y.mean()
    residuals = y - fitted
    ess = explained.dot(explained)   # explained sum of squares
    rss = residuals.dot(residuals)   # residual sum of squares
    p = x.shape[1]                   # number of columns in the design matrix
    n = y.shape[0]                   # number of observations
    df_model = p - 1
    df_error = n - p
    msm = ess / df_model             # mean squared model
    mse = rss / df_error             # mean squared error
    f_stat = msm / mse
    # Use the survival function rather than 1 - cdf: the subtraction cancels
    # catastrophically for large F and bottoms out at ~1e-16 or exactly 0.
    return stats.f.sf(f_stat, df_model, df_error)
      
      



def ex_3_23():
    '''Run the overall F-test on the two-predictor model of a log-transformed response.

    Fits the model with linear_model, computes the fitted values as the
    product of the design matrix and the coefficients, and returns a
    (label, p-value) tuple where the p-value comes from f_test.

    NOTE(review): the column labels and the returned label look truncated
    (non-ASCII text appears to have been lost); confirm the intended text.
    '''
    swimmers = swimmer_data()
    design = swimmers[[', ', '']]
    design.insert(0, '', 1.0)  # constant (bias) column goes first
    log_response = swimmers[''].apply(np.log)
    coefficients = linear_model(design, log_response)
    predictions = np.dot(design, coefficients)

    p_value = f_test(predictions, design, log_response)
    return ('F-', p_value)
      
      



('F-', 1.1102230246251565e-16)
      
      



1.11e-16 (that is, 1.11 × 10⁻¹⁶). , , , , .





, F- , . , , - , F- , 50%- .





«» , . , «» «». : , () . , , , .





. ? , .





, .. . , , , , .





, , , . , , , 0 1.





, , , . .. 1 0 — .





, 0 — 1 , .





2, :





def ex_3_25():
    '''Compute adjusted R-squared after adding a 0/1 dummy predictor.

    A categorical column is mapped to integers through a dictionary and
    stored as a new column, which joins the two numeric predictors and the
    constant column in the design matrix. The model is fitted with
    linear_model and scored with matrix_adj_r_squared.

    NOTE(review): the column labels and the keys of the mapping dictionary
    look truncated (non-ASCII text appears to have been lost) — as written
    the two keys are identical; confirm the intended values.
    '''
    swimmers = swimmer_data()
    # category -> integer code
    encoded = swimmers[''].map({'': 1, '': 0})
    swimmers['_'] = encoded.astype(int)

    design = swimmers[[', ', '', '_']]
    design.insert(0, '', 1)  # constant (bias) column goes first
    log_response = swimmers[''].apply(np.log)

    coefficients = linear_model(design, log_response)
    return matrix_adj_r_squared(coefficients, design, log_response)
      
      



0.8082954905432824
      
      



0.808. , , , 80% .





, , , : , ? R2 , ,   , .





, , , : , , 0 1.





, , -.





- . Python:





def beta_weight(coefs, x, y):
    '''Return standardized (beta) weights for a set of coefficients.

    Each coefficient is multiplied by the ratio of its column's standard
    deviation to the standard deviation of ``y``.

    Parameters
    ----------
    coefs : iterable of coefficients, in the column order of ``x``
    x     : frame whose ``std()`` yields one value per column
    y     : response values providing ``std()``

    Returns
    -------
    list with one weight per (column, coefficient) pair
    '''
    column_sds = x.std()
    response_sd = y.std()
    weights = []
    for column_sd, coef in zip(column_sds, coefs):
        weights.append(column_sd / response_sd * coef)
    return weights
      
      



def ex_3_26():
    '''Compute beta weights for the model that includes the 0/1 dummy predictor.

    The design matrix holds the constant column, two numeric predictors and
    the integer-coded categorical column; the model is fitted with
    linear_model and the coefficients are standardized with beta_weight.

    NOTE(review): the column labels and the keys of the mapping dictionary
    look truncated (non-ASCII text appears to have been lost); confirm the
    intended values.
    '''
    swimmers = swimmer_data()
    # category -> integer code
    encoded = swimmers[''].map({'': 1, '': 0})
    swimmers['_'] = encoded.astype(int)
    design = swimmers[[', ', '', '_']]
    design.insert(0, '', 1)  # constant (bias) column goes first
    log_response = swimmers[''].apply(np.log)
    coefficients = linear_model(design, log_response)
    return beta_weight(coefficients, design, log_response)
      
      



( ):





[0.0, 0.6501469135033348, 0.05842998157513067, 0.30387262631851747]
      
      



, , . , 0.65 .





, .





, « », . , , . , , pandas pd.to_datetime



:





def str_to_year(x):
    '''Parse a date given as text with pandas and return its year.'''
    timestamp = pd.to_datetime(x)
    return timestamp.year

def ex_3_27():
    '''Compute beta weights after adding a year-valued predictor.

    On top of the integer-coded categorical column, a date column is
    converted to years with str_to_year and added to the design matrix.
    The model is fitted with linear_model and the coefficients are
    standardized with beta_weight.

    NOTE(review): the column labels and the keys of the mapping dictionary
    look truncated (non-ASCII text appears to have been lost); confirm the
    intended values.
    '''
    swimmers = swimmer_data()
    encoded = swimmers[''].map({'': 1, '': 0})
    swimmers['_'] = encoded.astype(int)
    swimmers[' '] = swimmers[' '].map(str_to_year)  # date text -> year
    design = swimmers[[', ', '', '_', ' ']]
    design.insert(0, '', 1.0)  # constant (bias) column goes first
    log_response = swimmers[''].apply(np.log)

    coefficients = linear_model(design, log_response)
    return beta_weight(coefficients, design, log_response)
      
      



[-0.0,
 0.650070475196164,
 0.09580282723307212,
 0.3041431115029873,
 0.03769748899125406]
      
      



« » - 0.038, «», . , «» 0.096. 65%, « ». , , , .





« », . :





def ex_3_28():
    '''Show a scatter plot of a jittered column against a year-valued column.

    One dataset column is converted to years with str_to_year and used for
    the vertical axis; another is passed through jitter(0.5) and used for
    the horizontal axis. The figure is displayed with plt.show().

    NOTE(review): the column and axis labels look truncated (non-ASCII text
    appears to have been lost); confirm the intended text.
    '''
    swimmers = swimmer_data()
    swimmers[' '] = swimmers[' '].map(str_to_year)
    horizontal = swimmers[''].apply(jitter(0.5))
    vertical = swimmers[' ']
    points = np.array([horizontal, vertical]).T  # one (x, y) row per record
    pd.DataFrame(points).plot.scatter(0, 1, s=3, grid=True)
    plt.xlabel('')
    plt.ylabel(' ')
    #saveplot('ex_3_28.png')
    plt.show()
      
      



( ) . , :





, , — . , .





. , , . , .





, , , . , , ? , .





: , , . , , R2  .





, , . , 0.8 . , , , .





. R2  1.0, . R2  .





:





  • . .





  • . , , .





  • ( ).





  • . , - .





, , . , , .





«» R2 = 0.1049, « » R2 = 0.1050.





, , 10% . , «».





The source code examples for this post are in my GitHub repo. All source data is taken from the repository of the author of the book.





The next short post, post #4, will look at the prediction process.








All Articles