Useful tricks and best practices from Kaggle

In anticipation of the start of the " Machine Learning. Professional " course , we share the traditional translation of useful material.






In this article, you will learn what can only be learned by spending countless hours of study and practice.





About this project

Kaggle — . - . , , , ,  , .





164 . . , , Kaggle.





data science Kaggle, . , , .





, . , . , , , .





, . « Kaggle». , data science, , , Kaggle, . . - , . !





1.

. , (features) . , .





, , , :





, . , . . , :





houses = pd.read_csv('data/melb_data.csv')

# Calculate pairwise-correlation
matrix = houses.corr()

# Create a mask
mask = np.triu(np.ones_like(matrix, dtype=bool))

# Create a custom diverging palette
cmap = sns.diverging_palette(250, 15, s=75, l=40,
                             n=9, center="light", as_cmap=True)

plt.figure(figsize=(16, 12))

sns.heatmap(matrix, mask=mask, center=0, annot=True,
             fmt='.2f', square=True, cmap=cmap)

plt.show();
      
      



, . , DataFrame .corr



. np.ones_like dtype



, bool, True , DataFrame



:





>>> np.ones_like(matrix, dtype=bool)[:5]

array([[ True, True, True, True, True, True, True, True, True,
 True, True, True, True],
 [ True, True, True, True, True, True, True, True, True,
 True, True, True, True],
 [ True, True, True, True, True, True, True, True, True,
 True, True, True, True],
 [ True, True, True, True, True, True, True, True, True,
 True, True, True, True],
 [ True, True, True, True, True, True, True, True, True,
 True, True, True, True]])
      
      



Numpy .triu



, , False . Seaborn heatmap :





sns.heatmap(matrix, mask=mask, center=0, annot=True,
               fmt='.2f', square=True, cmap=cmap)
      
      



, , , .





2. value_counts

value_counts



, , dropna False:





>>> houses.CouncilArea.value_counts(dropna=False, normalize=True).head()

NaN           0.100810
Moreland      0.085641
Boroondara    0.085420
Moonee Valley 0.073417
Darebin       0.068778
Name: CouncilArea, dtype: float64
      
      



, , , . , , value_counts



- . :





>>> missing_props = houses.isna().sum() / len(houses)
>>> missing_props[missing_props > 0].sort_values(ascending=False
                                                 
BuildingArea 0.474963
YearBuilt    0.395803
CouncilArea  0.100810
Car          0.004566
dtype: float64
      
      



, DataFrame



. 0%, .. .





3. Pandas DataFrame Styler

 pandas. pandas DataFrame’. .style



DataFrame’ pandas, . , :





>>> diamonds = sns.load_dataset('diamonds')

>>> pd.crosstab(diamonds.cut, diamonds.clarity).\
                style.background_gradient(cmap='rocket_r')
      
      



Seaborn heatmap. pd.crosstab



. .style.background_gradient



, , . DataFrame



, , - VS2.





, :





>>> pd.crosstab(diamonds.cut, diamonds.clarity,
          aggfunc=np.mean, values=diamonds.price).\
          style.background_gradient(cmap='flare')
      
      



. DataFrame



, VS2 . , , . .style



:





>>> agg_prices = pd.crosstab(diamonds.cut, diamonds.clarity,
                         aggfunc=np.mean, values=diamonds.price).\
                         style.background_gradient(cmap='flare')

>>> agg_prices.format('{:.2f}')
      
      



.format {:.2f} 2 .





.style



- . CSS, . pandas .





4. Matplotlib

EDA (Exploratory Data Analysis) , Matplotlib . , , , , . .





, . , rcParams Matplotlib :





from matplotlib import rcParams







rcParams



- Python, Matplotlib:





. , , , :





# Remove top and right spines
rcParams['axes.spines.top'] = False
rcParams['axes.spines.right'] = False

# Set fixed figure size
rcParams['figure.figsize'] = [12, 9]

# Set dots per inch to 300, very high quality images
rcParams['figure.dpi'] = 300

# Enable autolayout
rcParams['figure.autolayout'] = True

# Set global fontsize
rcParams['font.style'] = 16

# Fontsize of ticklabels
rcParams['xtick.labelsize'] = 10
rcParams['ytick.labelsize'] = 10
      
      



, Matplotlib. , rcParams.keys()



.





5. Pandas

Matplotlib, pandas , . , . , pandas 5 , pandas:





  • get_option()



    / set_option()



    - / .





  • reset_option()



    - .





  • description_option()



    - .





  • option_context()



    - , .





. pd.get_option



, , , set_option



:





>>> pd.get_option(‘display.max_columns’)
20
      
      



, , , DataFrame



. 20 , , .head



, pandas , :





>>> houses.head()







, . :





>>> pd.set_option(‘display.max_columns’, None)







:





>>> houses.head()







:





pd.reset_option(‘display.max_columns’)







, . display.max_rows



5, .head()



:





>>> pd.set_option(‘display.max_rows’, 5)>>> houses







plotly



, pandas. , , .plot



DataFrame’ pandas:





pd.set_option(‘plotting.backend’, ‘plotly’)







, plotly.





, pd.option_context



. , . , , pandas . , :





>>> df = pd.DataFrame(np.random.randn(5, 5))
>>> pd.reset_option('display.max_rows')
>>> with pd.option_context('float_format', '{:f}'.format):
        df.describe()
      
      



pandas.






"Machine Learning. Professional"








All Articles