32.    Python Package - pandas

January 28, 2019
home

Contents

01. Streamline Data Analysis using DataFrame in package pandas

dataframe

02. database content to DATA FRAME directly

        print('----------   mysql3.py --------')
        import pandas as pd
        import mysql.connector

        mydb = mysql.connector.connect(
          host="XXXX.winhost.com",
          user="xxxxxxxx",
          password="xxx2xxx",
          database="mysql_56314_peter"
        )

        df = pd.read_sql("SELECT * FROM dogs", con=mydb)

        print(df)

        '''
          dog_name dog_color
        0    Tairo     brown
        1      Emi     white
        2      Joy     black
        3    Lucky    yellow
        '''

        print('---------- end    test 5 --------')
            

03. creating dataframes for reporting

        print('-------  dataframe ==> csv file  for reporting ---')

        print('--- 1. prepare a dataframe -------')
        import pandas as pd

        df =pd.DataFrame({                       # python dictionary
            'name': ['Happy', 'Lucky', 'Wiwi'],  # key-value pairs
            'weight' : [35.50, 65.33, 42.50],
            'length' : [2.5,  4.3,  3.2],
            'color': ['brown', 'white', 'black']
        })
        print(df)
        # The values for color are not measurable,
        # The statistical data will not be generated for it.

        print('--- 2. create a report -------')
        info = df.describe()
        print(info)
        print('type of info = ' + str(type(info)))   # dataframe
        '''
                  weight    length
        count   3.000000  3.000000
        mean   47.776667  3.333333
        std    15.599347  0.907377
        min    35.500000  2.500000
        25%    39.000000  2.850000
        50%    42.500000  3.200000
        75%    53.915000  3.750000
        max    65.330000  4.300000

        note-1: The index are created using method describe.
        note-2: Method describe is defined in a standard way.
        '''

        print('--- 3. save the df to csv file ----')
        info.to_csv('dogs.csv', header = True, index = True)
        #the file will be in the parent of the current folder for my mac.

        print('---------  end of test  ---------')

            

04. Storing python objects

        import pickle        # built-in module

        print('--- 1.  pickle     write  ------------')

        my_dict = {1: 'Lucky', 2: 'Happy', 3: 'Joy'}
        pickle_out = open('my_dict.pickle', 'wb')   #write output file, byte                        
        pickle.dump(my_dict, pickle_out)            #from, to
        pickle_out.close()

        print('--- 2.  pickle     read  ------------')
        pickle_in = open('my_dict.pickle', 'rb') 
        dict2 = pickle.load(pickle_in)
        pickle_in.close()
        print(dict2)   #{1: 'Lucky', 2: 'Happy', 3: 'Joy'}
        print('type of dict2 = ' + str(type(dict2)))   #dict

        print('---      end     test  -----')
            

05. Creating DataFrames from CSV files

06. Numpy, Series

        print('--- 1, create df  from a python dictionary ----')
        import pandas as pd
        dogs = {
                  'name':  ['Lucky', 'Happy', 'Joy'],        # column names
                  'age':   [8, 6, 3],                        # column 1
                  'weight':[41.5, 32.7, 64.4],               # column 2
               }                        
        df = pd.DataFrame(dogs)
        print(df)

        c01 = df['age'][1]
        print('c01 = ' + str(c01))               # 6
        print('c01 type is ' + str(type(c01)))   # class 'numpy.int64'

        c11 = df['weight'][1]                    
        print(c11)                               # 32.7
        print('c11 type is ' + str(type(c11)))   # class 'numpy.float64'
            
        import numpy as np
        import pandas as pd

        data = np.array([['', 'Col1', 'Col2','name'],        #python lists
                         [0, '11npa','12npa', 'A'],          #python lists
                         [1, '21npa','22npa', 'B']])         #python lists


        df = pd.DataFrame(data = data[1:,1:],
                          index=data[1:,0],
                          columns= data[0,1:])
        print(df)

        print('df.index = ' + str(df.index))
        print('df.columns = ' +str(df.columns))
        print('df.values = ' + str(df.values))
            
        # continue to use the dataframe for dogs data
        print('--- 2. get a column ---------------')
        c1 = df['weight']
        print(' c1 = ' + str(c1))
        print('c1 type = ' + str(type(c1)))  # pandas.core.series.Series
        '''
        
        0    41.5
        1    32.7
        2    64.4
        
        '''
        print('--- 3. get a row ---------------')
        r1 = df.iloc[1]
        print(r1)
        print(' r1 type ' + str(type(r1)))   # pandas.core.series.Series
        '''   note: column names are on the left
        name      Happy
        age           6
        weight     32.7
        '''
            
        import pandas as pd
        c1 = pd.Series([22,55,33])     #  Series for one column values
        c2 = pd.Series([99,77,11])     #  Series for one column values
        print('c1 type is ' + str(type(c1))) # class pandas.core.series.Series

        dict = {'col1': c1,'col2': c2}  # two column names
        print('dict type is ' + str(type(dict)))    #class 'dict'

        df = pd.DataFrame(dict) # class dataframe
        print(df)
            

07. DataFrame.loc for both rows and columns



08.Columns first or Rows first

08.1 Columns First

        print('------- dataframe column test , Jan 16, 2019 -----')
        import pandas as pd

        print('--- 1. create df ----')
        import pandas as pd

        dogs = {
                  'name':  ['Lucky', 'Happy', 'Joy'],        
                  'age':   [8, 6, 3],                        
                  'weight':[41.5, 32.7, 64.4],              
               }                        
        df = pd.DataFrame(dogs)
        print(df)
        '''
            name  age  weight
        0  Lucky    8    41.5
        1  Happy    6    32.7
        2    Joy    3    64.4
        '''

        print('--- 2.  rearrange columns ----')
        
        cols2 = ['age', 'weight', 'name']          #python list
        df2 = df[cols2]
        
        print(df2)

        print('--- 3.  drop a column -------')
        df3 = df.drop('weight', 1)  # axis number, 1 for column
        print(df3)

        print('--- 4. add a column ---------')
        
        dogs_color = ['white', 'brown', 'black']    #python list, no need for Series
        df['color'] = dogs_color                    # color is the new column name
        
        print(df)

        print('--- 5. sort ---------')
        
        df5 = df.sort_values(by=['age'])             # no need from index for row
        
        print(df5)

        print('--- 6. access  one column ---------------------')
        column_weight = df['weight']
        print('column weight = ' + str(column_weight))
        '''
        0    41.5
        1    32.7
        2    64.4
        '''

        print('--- 7. access a cell --------------------------')
        weight4Joy = df['weight'][2]                # index is not set for row
        print('weight4Joy = ' + str(weight4Joy))    #64.4

        '''
        
        notes:
        1. There are TWO ways to access a cell in a dataframe.
        2. This is one way to get all the column first. Then select a row.
        
        

        '''

        print('---------- end of test 5 ------------------------')
            

08.2 Rows FIRST

        import pandas as pd
        import pandas as pd
        dogs = {
                  'name':  ['Lucky', 'Happy', 'Joy'],        
                  'age':   [8, 6, 3],                        
                  'weight':[41.5, 32.7, 64.4],              
               }                        
        df = pd.DataFrame(dogs)
        print(df)
        print(df.index)   #RangeIndex(start=0, stop=3, step=1)
        '''
            name  age  weight
        0  Lucky    8    41.5
        1  Happy    6    32.7
        2    Joy    3    64.4
        '''
        d1 = df.iloc[1]                         # 1 row
        print('d1 = ' + str(d1))
        print('d1 type is ' + str(type(d1)))    #class 'pandas.core.series.Series'
        '''
        name      Happy
        age           6
        weight     32.7
        '''
        d01 = df.iloc[0:2]                       # 0, 1 rows, row 2 is excluded.
        print('d01 = ' + str(d01))
        print('d01 type is ' + str(type(d01)))   #dataframe 

        '''
        d01 =     name  age  weight
        0  Lucky    8    41.5
        1  Happy    6    32.7
        '''
            
        import pandas as pd
        dogs = {
                  'name':  ['Lucky', 'Happy', 'Joy'],        
                  'age':   [8, 6, 3],                        
                  'weight':[41.5, 32.7, 64.4],              
               }                        
        df = pd.DataFrame(dogs)
        
        df.set_index('name', inplace=True)
        df.sort_values(by='name', ascending=True, inplace=True)
        
        print(df)

        '''
               age  weight
        name
        Happy    6    32.7
        Joy      3    64.4
        Lucky    8    41.5
        '''

        d1 = df.loc['Joy']                       # 1 row
        print('d1 = ' + str(d1))
        print('d1 type is ' + str(type(d1)))     # Series
        '''
         age        3.0
         weight    64.4
         Name: Joy, dtype: float64
        '''

        d01 = df.loc['Happy':'Joy']                 # 0, 1 rows, row 1 is included.
        print('d01 = ' + str(d01))
        print('d01 type is ' + str(type(d01)))      # dataframe
        '''
                age  weight
        name
        Happy    6    32.7
        Joy      3    64.4
        ''' 
            
    Joy_weight = df.loc['Joy']['weight']
    print('Joy_weight = ' + str(Joy_weight))  # 64.5                
            

09. Multilevel indices and grouping

        # --- 1. prepare a csv file  ----
        name,month,weight
        Joy, 1, 45.9
        Happy, 3, 12.3
        Happy, 1, 11.1
        Joy, 3, 42.2
        Happy, 2, 10.5
        Joy, 2, 43.1

        #  ---2. create a df, sort
        import pandas as pd
        import pandas as pd
        df = pd.read_csv('dogs_weight.csv') 
        df2 = df.sort_values(by=['name','month'])
        print(df2)
        '''
            name  month  weight
        2  Happy      1    11.1
        4  Happy      2    10.5
        1  Happy      3    12.3
        0    Joy      1    45.9
        5    Joy      2    43.1
        3    Joy      3    42.2

        '''
        #----  3. create mutilevel indices
        df3 = df2.set_index(['name', 'month'])
        print(df3)
        '''
                     weight
        name  month
        Happy 1        11.1
              2        10.5
              3        12.3
        Joy   1        45.9
              2        43.1
              3        42.2
        '''
        #---  4. grouping -------------------------
        df4 = df3.groupby('name') 
        s_report = df4.mean()   
        print(s_report)

        '''    weight
         name
        Happy   11.30
        Joy     43.73
        '''

        #---  5. having the 2nd index to be a column
        df5 =df3.unstack()
        print(df5)
        '''
           month     1     2     3
         name
           Happy   11.1  10.5  12.3
           Joy     45.9  43.1  42.2  
        '''    
            

10. pivoting-drill down

        -------------  csv   data  ------------------------
        name,store,day,time,tip
        Joe,Pandas,1,lunch,250.50
        Joe,Pandas,1,dinner,420.50
        Joe,Pandas,2,lunch,110.50
        Joe,Pandas,2,dinner,120.50
        Nancy,Pandas,1,lunch,150.50
        Nancy,Pandas,1,dinner,320.50
        Nancy,Pandas,2,lunch,410.50
        Nancy,Pandas,2,dinner,220.50
        Michael,Amber,1,lunch,310.00             note: one data for Muchael is not there
        Michael,Amber,2,lunch,110.50
        Michael,Amber,2,dinner,315.00
        Carey,Amber,1,lunch,110.00
        Carey,Amber,1,dinner,315.00
        Carey,Amber,2,lunch,150.50
        Carey,Amber,2,dinner,     215.00          note: space is ok

        ----------------    code -----------------------------------
        print('------ pivot table, drilldown  -----')
        import pandas as pd

        print('--- 1. prepare df --------')
        df = pd.read_csv('tips.csv')
        print(df.head())
        '''
            name   store  day    time    tip
        0    Joe  Pandas    1   lunch  250.5
        1    Joe  Pandas    1  dinner  420.5
        2    Joe  Pandas    2   lunch  110.5
        3    Joe  Pandas    2  dinner  120.5
        4  Nancy  Pandas    1   lunch  150.5
        '''

        print('---2. create pivot for drill down   ---')

        df2 = pd.pivot_table(df,'tip',['store','name'],['day','time'])
              
        print(df2)
        '''
                  day        1               2
                  time  dinner  lunch  dinner  lunch
        store  name
        Amber  Carey    315.0  110.0    215.0  150.5
               Michael    NaN  310.0    315.0  110.5
        Pandas Joe      420.5  250.5    120.5  110.5
               Nancy    320.5  150.5    220.5  410.5
        '''
        

        print('----     end of test  -------')
            

11. Combining dataframes

        import pandas as pd
        df1 = pd.DataFrame([[11,12],[21,22]], columns = ['A', 'B'])
        df2 = pd.DataFrame([[31,32],[41,42]], columns = ['A', 'B'])

        df12 = df1.append(df2, ignore_index = True)
        print('df12 = ' + str(df12))
        '''
             A   B
        0  11  12
        1  21  22
        2  31  32
        3  41  42
        '''

        df12x =pd.concat([df1, df2], ignore_index = True)
        print(' df12x = ' + str(df12x))    # the same
            
                print('------------------- test merge -----------------------')
                import pandas as pd
                
                print('------  workers ------------------------------')
                workers = pd.DataFrame([{'name': 'Joe','emp_id': 12,'rate': 15.50},
                                        {'name': 'Emi','emp_id': 35,'rate': 20.10}])
                print(workers)
                '''
                   emp_id name  rate
                0      12  Joe  15.5
                1      35  Emi  20.1
                '''
                
                print('-----  works ------------------')
                works = pd.DataFrame([{'id': 12,'hours': 24},
                                      {'id': 35,'hours': 41}])
                                                   
                print(works)
                '''
                   hours  id
                0     24  12
                1     41  35
                '''
                
                print('-----   merge ---------------------')
                m = pd.merge(workers, works, left_on='emp_id', right_on='id').drop('id',axis=1) 
                
                print(m)
                '''
                   emp_id name  rate  hours
                0      12  Joe  15.5     24
                1      35  Emi  20.1     41
                '''
                print('------------   end -- test 3----------------------------')
            

12. Data Cleanup

    # ---  senario-1 --------------------------------------  
    # --- mydogs.csv-----
    name,month,weight
    Joy, 1, 45.9
    Happy, 3
    Happy, 1, 11.1
    Joy, 3
    Happy, 2, 10.5
    Joy, 2, 43.1

    # --- python code ----
    import pandas as pd 
    print('---   data with missing values ----')
    dogs= pd.read_csv('mydogs.csv')
    print(dogs)
    '''
    name  month  weight
    0    Joy      1    45.9
    1  Happy      3     NaN
    2  Happy      1    11.1
    3    Joy      3     NaN
    4  Happy      2    10.5
    5    Joy      2    43.1
    ''''
    print('----- solution: fill 0  -----------')  
    dogs['weight'].fillna(0, inplace=True)
    print(dogs)
    '''
    name  month  weight
    0    Joy      1    45.9
    1  Happy      3     0.0
    2  Happy      1    11.1
    3    Joy      3     0.0
    4  Happy      2    10.5
    5    Joy      2    43.1
    '''

    # ---  senario-2 ---------------------------------------------  
    # --- my_expenses.csv-----
    item,          amount       Weekday
    misc,           50.10        Mon
    foods,         120.99        Wed
    haircut,        20.55        Thurs
    dinner,         99.99        Sun
    ,               15.00        Sun
    misc,          100.50        Sat

    # ---python code -----
    import pandas as pd 

    print('---   data with missing values -------------')
    expenses= pd.read_csv('my_expenses.csv')
    print(expenses)
    '''
        item             amount       Weekday
    0     misc                 50.10        Mon
    1    foods                120.99        Wed
    2  haircut               20.55        Thurs
    3   dinner                 99.99        Sun
    4      NaN                 15.00        Sun
    5     misc                100.50        Sat
    '''

    print('----- solution:   fill them with reasonable values -----------')

    expenses['item'].fillna(value='misc', inplace= True)
    print(expenses)
    '''
        item             amount       Weekday
    0     misc                 50.10        Mon
    1    foods                120.99        Wed
    2  haircut               20.55        Thurs
    3   dinner                 99.99        Sun
    4     misc                 15.00        Sun
    5     misc                100.50        Sat
    '''
    
    print('------- end of test  -------------')

            

13. List Comprehension

        print('---  1. prepare a dataframe')
        import pandas as pd
        
        data = {'product': ['A', 'B', 'C', 'D'],
                'id': [1, 2, 3, 4],
                'hourly_pay': [35.50, 16.50, 17.50, 120.50]}
        df = pd.DataFrame(data)
        print(df)
        '''
            product  id  hourly_pay
        0       A   1        35.5
        1       B   2        16.5
        2       C   3        17.5
        3       D   4       120.5
        '''
                        
        print('--- 2. Using list comprehension and a function to convert a list ---')
        def myfunc(p):
            if p < 100:
                return p * 1.1
            else:
                return p

        new_hourly_pay = [myfunc(p)  for p in df['hourly_pay']] 
        
        print('---  3. add a new column  ------------')
        df['new_hourly_pay'] = new_hourly_pay
        print(df)
        '''
            product  id  hourly_pay  new_hourly_pay
        0       A   1        35.5           39.05
        1       B   2        16.5           18.15
        2       C   3        17.5           19.25
        3       D   4       120.5          120.50
        '''
        print('--- or    4.  update ------------------')
        #df['hourly_pay'] = new_hourly_pay
        #print(df)
                

14. method iterrows

        import pandas as pd

        print('-----   1. create df -------------')
        
        df = pd.DataFrame({
            'type': ['A', 'B', 'C'],
            'price': [11.11, 22.22, 33.33],
            'quantity': [2, 4, 3]
        })
        df.set_index('type', inplace=True)          
        print(df)
        '''
                price  quantity
        type
        A     11.11         2
        B     22.22         4
        C     33.33         3
        '''

        print('----    2. Using method iterrows to iterate each row ---')
        total_amts = []
        for index, row in df.iterrows():
            amt = row['price'] * row['quantity']
            total_amts.append(amt)
            
        df['total_amt'] = total_amts       # add a column
        print(df)
        '''
                price  quantity  total_amt
        type
        A     11.11         2      22.22
        B     22.22         4      88.88
        C     33.33         3      99.99
        '''

                

15. code visual studio for python