Python, data science and elections: part 5

This fifth and final post in the series for beginners is devoted to the comparative visualization of electoral data.





Comparative visualization of electoral data

Now consider a dataset from another general election, this time the one held in Russia in 2011. Russia is a much larger country, so its voter data is much more voluminous. To work with it, we load one large TSV file (with tab-delimited fields) into RAM.





def load_ru():
    '''Load the 2011 Russian election results into a DataFrame.

    Reads the tab-separated file ``data/ch01/Russia2011.tsv``; the path is
    relative to the current working directory.

    Returns:
        pandas.DataFrame: the parsed contents of the TSV file.
    '''
    # The separator is passed by keyword: since pandas 2.0 every read_csv
    # argument after the path is keyword-only, so a positional '\t' raises
    # a TypeError.
    return pd.read_csv('data/ch01/Russia2011.tsv', sep='\t')
      
      



Let's see what column names are available in Russian data:





def ex_1_29():
    '''Return the column labels of the Russian election dataset.'''
    russia = load_ru()
    return russia.columns
      
      



The following list of columns will be displayed:





Index([' ', ' №', ' ',
' ,    ',
...
'   ',
'   - -  ',
'  " "',
'     ',
'  "    ""',
'  " "',
'   " "'],
dtype='object')
      
      



The column names in the Russian dataset are very descriptive, but perhaps longer than necessary. It would also be convenient if columns representing the same attributes that we already saw in the UK election data (for example, winner rate and voter turnout) were labeled in the same way in both datasets. Let's rename them appropriately.





In addition to the dataset itself, the pandas `rename` function expects a dictionary that maps the current column names (the keys) to the new names (the values). Combining this with the data we have already looked at gives the following:





def load_ru_victors():
    '''Load the Russian data reduced to the renamed columns plus two ratios.

    Renames three source columns via ``new_cols_dict``, keeps only the
    renamed columns, then adds two derived columns, each computed as the
    element-wise quotient of two existing columns.

    NOTE(review): the column-name literals below look as if their
    non-ASCII characters were stripped somewhere upstream (two of the new
    names are empty strings). Restore them from the original data set
    before relying on this function.

    Returns:
        pandas.DataFrame: the selected columns plus the derived ratios.
    '''
    new_cols_dict = {
        ' ,    ': '',
        '   ': ' ',
        '  " "': '',
    }
    newcols = list(new_cols_dict.values())
    df = load_ru().rename(columns=new_cols_dict)[newcols]
    # Fixed: the closing bracket of the right-hand indexer was missing,
    # which made this function (and the whole module) a syntax error.
    df[' '] = df[''] / df[' ']
    df[''] = df[' '] / df['']
    return df
      
      



pandas divide



, /



, . (nan



) , fill_value



. nan



, . :





 # Element-wise quotient through .divide(), which takes a fill_value
 # (here 1) in addition to the divisor.
 df[' '] = df[''].divide(df[' '], fill_value=1)
      
      



, ( ). , , , :





def ex_1_30():
    '''Show a 20-bin histogram of one column of the Russian data.

    NOTE(review): the column key and both axis labels are blank or
    whitespace-only; presumably their original text was lost. Confirm
    against the original data set.
    '''
    victors = load_ru_victors()
    column = victors['']
    column.hist(bins=20)
    plt.xlabel('  ')
    plt.ylabel('')
    plt.show()
      
      



:





, . , 80% 100% β€” , .





, (), . :





def ex_1_31():
    '''Draw a Q-Q plot of one column of the Russian data.

    Missing values are dropped before the column is handed to ``qqplot``.
    '''
    column = load_ru_victors()[' ']
    without_gaps = column.dropna()
    qqplot(without_gaps)
    plt.show()
      
      



:





, , S- . , . , , .





, , , : 0.5 1.0 , 0.7 1.0. , 100% , ( , ), 1.0 .





, , 100% . .





, . , , , .





, , :





  • , , ,





  • ,





, , .





(), . Probability Mass Function (PMF), , . , , , , , . , , 0 1, ( 1 ), 1.





, . β€” , . β€” ( ).





, , 0 1. , , :





def plot_as_pmf(dt, label, ax, bins=40):
    '''Plot data as a probability mass function over equal-width bins.

    Args:
        dt: pandas Series of numeric values to bin.
        label: legend label for the plotted line.
        ax: axes to draw on; the callers in this file pass None for the
            first series and the returned axes for later ones.
        bins: number of equal-width bins (defaults to 40, the value that
            used to be hard-coded).

    Returns:
        The axes object returned by the pandas ``plot`` call.
    '''
    # labels=False makes pd.cut return the integer bin index of each value.
    bin_ids = pd.cut(dt, bins=bins, labels=False)
    counts = bin_ids.value_counts().sort_index()
    # Normalise by the number of values that were actually binned.
    # value_counts() drops missing values, so dividing by len(bin_ids)
    # would make the probabilities sum to less than 1 whenever the input
    # contains NaN.
    pmf = counts / counts.sum()
    return pmf.plot(label=label, grid=True, ax=ax)
      
      



, :





def ex_1_32():
    '''Overlay the binned distributions of a UK and a Russian column.

    Both series are drawn with ``plot_as_pmf`` on the same axes, then the
    chart is labelled, given a legend and shown.

    NOTE(review): column keys, legend labels and axis labels are blank or
    whitespace-only; presumably their original text was lost.
    '''
    uk_column = load_uk_victors()['']
    shared_ax = plot_as_pmf(uk_column, '', None)
    ru_column = load_ru_victors()['']
    plot_as_pmf(ru_column, '', shared_ax)
    plt.xlabel('  ')
    plt.ylabel('')
    plt.legend(loc='best')
    plt.show()
      
      



:





. , β€” , (0.6366 0.6523) β€” 100%. , . , , , , ( ), .





, , , 50% . (Peter Klimek) , .





, , , . , . , , , , - . , , , .





, , .





, , . : , , , . pandas scatter



, plot



.





def ex_1_33():
    '''Show a scatter plot of two columns of the UK data.

    The first selected column goes on the x axis and the second on the
    y axis; points are drawn with marker size 3.
    '''
    wanted = ['', ' ']
    uk_pair = load_uk_victors()[wanted]
    # Columns are addressed by position within the two-column selection.
    uk_pair.plot.scatter(x=0, y=1, s=3)
    plt.xlabel('')
    plt.ylabel(' ')
    plt.show()
      
      



:





, . β€” , , , : , , .





, 2010 . : «» . , «» , , . .





, :





def ex_1_34():
    '''Show a scatter plot of two columns of the Russian data.

    The first selected column goes on the x axis and the second on the
    y axis; points are drawn with marker size 3.
    '''
    wanted = ['', ' ']
    ru_pair = load_ru_victors()[wanted]
    # Columns are addressed by position within the two-column selection.
    ru_pair.plot.scatter(x=0, y=1, s=3)
    plt.xlabel('')
    plt.ylabel(' ')
    plt.show()
      
      



:





, . , , .





, , , . , , , .





-, pandas alpha



scatter 0 1, 1 , 0 β€” .





def ex_1_35():
    '''Show a semi-transparent scatter plot of a random sample of the Russian data.

    Draws 10000 rows at random (with replacement) from the two selected
    columns and plots them with alpha=0.1, with both axes fixed to the
    range 0 to 1.05.
    '''
    df = load_ru_victors()[['', ' ']]
    # Sample with replacement, as the previous random.choice call did.
    # DataFrame.sample needs no scipy.random alias (a deprecated re-export
    # of numpy.random) and picks rows by position, so it does not depend
    # on the index labels being unique.
    sample = df.sample(n=10000, replace=True)
    sample.plot.scatter(0, 1, s=3, alpha=0.1)
    plt.xlabel('')
    plt.ylabel(' ')
    plt.axis([0, 1.05, 0, 1.05])
    plt.show()
      
      



:





. Β« Β» , 100% 100%- . , . , , 2011 . .





. - - , , .





Github. .





. , .





, , . , Python scipy. , pandas, , . .





. - , . , , .








All Articles