DataFrame group by 이해하기

학습목표

  1. DataFrame groupby 이해하기
import pandas as pd
import numpy as np

group by

  • 아래의 세 단계를 적용하여 데이터를 그룹화(groupping) (SQL의 group by 와 개념적으로는 동일, 사용법은 유사)
    • 데이터 분할
    • operation 적용
    • 데이터 병합
# data 출처: https://www.kaggle.com/hesh97/titanicdataset-traincsv/data
df = pd.read_csv('./train.csv')
df.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

GroupBy groups 속성

  • 각 그룹과 그룹에 속한 index를 dict 형태로 표현
class_group = df.groupby('Pclass')
class_group
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000018605571DC0>
class_group.groups
{1: [1, 3, 6, 11, 23, 27, 30, 31, 34, 35, 52, 54, 55, 61, 62, 64, 83, 88, 92, 96, 97, 102, 110, 118, 124, 136, 137, 139, 151, 155, 166, 168, 170, 174, 177, 185, 187, 194, 195, 209, 215, 218, 224, 230, 245, 248, 252, 256, 257, 258, 262, 263, 268, 269, 270, 273, 275, 284, 290, 291, 295, 297, 298, 299, 305, 306, 307, 309, 310, 311, 318, 319, 325, 329, 331, 332, 334, 336, 337, 339, 341, 351, 356, 366, 369, 370, 373, 375, 377, 380, 383, 390, 393, 412, 430, 434, 435, 438, 445, 447, ...], 2: [9, 15, 17, 20, 21, 33, 41, 43, 53, 56, 58, 66, 70, 72, 78, 84, 98, 99, 117, 120, 122, 123, 133, 134, 135, 144, 145, 148, 149, 150, 161, 178, 181, 183, 190, 191, 193, 199, 211, 213, 217, 219, 221, 226, 228, 232, 234, 236, 237, 238, 239, 242, 247, 249, 259, 265, 272, 277, 288, 292, 303, 308, 312, 314, 316, 317, 322, 323, 327, 340, 342, 343, 344, 345, 346, 357, 361, 385, 387, 389, 397, 398, 399, 405, 407, 413, 416, 417, 418, 426, 427, 432, 437, 439, 440, 443, 446, 450, 458, 463, ...], 3: [0, 2, 4, 5, 7, 8, 10, 12, 13, 14, 16, 18, 19, 22, 24, 25, 26, 28, 29, 32, 36, 37, 38, 39, 40, 42, 44, 45, 46, 47, 48, 49, 50, 51, 57, 59, 60, 63, 65, 67, 68, 69, 71, 73, 74, 75, 76, 77, 79, 80, 81, 82, 85, 86, 87, 89, 90, 91, 93, 94, 95, 100, 101, 103, 104, 105, 106, 107, 108, 109, 111, 112, 113, 114, 115, 116, 119, 121, 125, 126, 127, 128, 129, 130, 131, 132, 138, 140, 141, 142, 143, 146, 147, 152, 153, 154, 156, 157, 158, 159, ...]}
gender_group = df.groupby('Sex')
gender_group.groups
{'female': [1, 2, 3, 8, 9, 10, 11, 14, 15, 18, 19, 22, 24, 25, 28, 31, 32, 38, 39, 40, 41, 43, 44, 47, 49, 52, 53, 56, 58, 61, 66, 68, 71, 79, 82, 84, 85, 88, 98, 100, 106, 109, 111, 113, 114, 119, 123, 128, 132, 133, 136, 140, 141, 142, 147, 151, 156, 161, 166, 167, 172, 177, 180, 184, 186, 190, 192, 194, 195, 198, 199, 205, 208, 211, 215, 216, 218, 229, 230, 233, 235, 237, 240, 241, 246, 247, 251, 254, 255, 256, 257, 258, 259, 264, 268, 269, 272, 274, 275, 276, ...], 'male': [0, 4, 5, 6, 7, 12, 13, 16, 17, 20, 21, 23, 26, 27, 29, 30, 33, 34, 35, 36, 37, 42, 45, 46, 48, 50, 51, 54, 55, 57, 59, 60, 62, 63, 64, 65, 67, 69, 70, 72, 73, 74, 75, 76, 77, 78, 80, 81, 83, 86, 87, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 101, 102, 103, 104, 105, 107, 108, 110, 112, 115, 116, 117, 118, 120, 121, 122, 124, 125, 126, 127, 129, 130, 131, 134, 135, 137, 138, 139, 143, 144, 145, 146, 148, 149, 150, 152, 153, 154, 155, ...]}

groupping 함수

  • 그룹 데이터에 적용 가능한 통계 함수(NaN은 제외하여 연산)
  • count - 데이터 개수
  • sum - 데이터의 합
  • mean, std, var - 평균, 표준편차, 분산
  • min, max - 최소, 최대값
class_group.count()
PassengerId Survived Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
Pclass
1 216 216 216 216 186 216 216 216 216 176 214
2 184 184 184 184 173 184 184 184 184 16 184
3 491 491 491 491 355 491 491 491 491 12 491
class_group.mean()['Survived']
Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64
  • 성별에 따른 생존율 구해보기
df.groupby('Sex').mean()['Survived']
Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

복수 columns로 groupping 하기

  • groupby에 column 리스트를 전달
  • 통계함수를 적용한 결과는 multiindex를 갖는 dataframe

  • 클래스와 성별에 따른 생존률 구해보기
df.groupby(['Pclass', 'Sex']).mean()['Survived']
Pclass  Sex   
1       female    0.968085
        male      0.368852
2       female    0.921053
        male      0.157407
3       female    0.500000
        male      0.135447
Name: Survived, dtype: float64
df.groupby(['Pclass', 'Sex']).mean()
PassengerId Survived Age SibSp Parch Fare
Pclass Sex
1 female 469.212766 0.968085 34.611765 0.553191 0.457447 106.125798
male 455.729508 0.368852 41.281386 0.311475 0.278689 67.226127
2 female 443.105263 0.921053 28.722973 0.486842 0.605263 21.970121
male 447.962963 0.157407 30.740707 0.342593 0.222222 19.741782
3 female 399.729167 0.500000 21.750000 0.895833 0.798611 16.118810
male 455.515850 0.135447 26.507589 0.498559 0.224784 12.661633
df.groupby(['Pclass', 'Sex']).mean().loc[(2, 'female')] #Pclass 가 2인 평균값
PassengerId    443.105263
Survived         0.921053
Age             28.722973
SibSp            0.486842
Parch            0.605263
Fare            21.970121
Name: (2, female), dtype: float64
df.groupby(['Pclass', 'Sex']).mean().index
MultiIndex([(1, 'female'),
            (1,   'male'),
            (2, 'female'),
            (2,   'male'),
            (3, 'female'),
            (3,   'male')],
           names=['Pclass', 'Sex'])

index를 이용한 group by

  • index가 있는 경우, groupby 함수에 level 사용 가능
    • level은 index의 depth를 의미하며, 가장 왼쪽부터 0부터 증가
  • set_index 함수
  • column 데이터를 index 레벨로 변경
  • reset_index 함수
  • 인덱스 초기화
df.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
df.set_index(['Pclass', 'Sex'])
PassengerId Survived Name Age SibSp Parch Ticket Fare Cabin Embarked
Pclass Sex
3 male 1 0 Braund, Mr. Owen Harris 22.0 1 0 A/5 21171 7.2500 NaN S
1 female 2 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 38.0 1 0 PC 17599 71.2833 C85 C
3 female 3 1 Heikkinen, Miss. Laina 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
1 female 4 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 35.0 1 0 113803 53.1000 C123 S
3 male 5 0 Allen, Mr. William Henry 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ...
2 male 887 0 Montvila, Rev. Juozas 27.0 0 0 211536 13.0000 NaN S
1 female 888 1 Graham, Miss. Margaret Edith 19.0 0 0 112053 30.0000 B42 S
3 female 889 0 Johnston, Miss. Catherine Helen "Carrie" NaN 1 2 W./C. 6607 23.4500 NaN S
1 male 890 1 Behr, Mr. Karl Howell 26.0 0 0 111369 30.0000 C148 C
3 male 891 0 Dooley, Mr. Patrick 32.0 0 0 370376 7.7500 NaN Q

891 rows × 10 columns

df.set_index(['Pclass', 'Sex']).reset_index()
Pclass Sex PassengerId Survived Name Age SibSp Parch Ticket Fare Cabin Embarked
0 3 male 1 0 Braund, Mr. Owen Harris 22.0 1 0 A/5 21171 7.2500 NaN S
1 1 female 2 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 38.0 1 0 PC 17599 71.2833 C85 C
2 3 female 3 1 Heikkinen, Miss. Laina 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 1 female 4 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 35.0 1 0 113803 53.1000 C123 S
4 3 male 5 0 Allen, Mr. William Henry 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 2 male 887 0 Montvila, Rev. Juozas 27.0 0 0 211536 13.0000 NaN S
887 1 female 888 1 Graham, Miss. Margaret Edith 19.0 0 0 112053 30.0000 B42 S
888 3 female 889 0 Johnston, Miss. Catherine Helen "Carrie" NaN 1 2 W./C. 6607 23.4500 NaN S
889 1 male 890 1 Behr, Mr. Karl Howell 26.0 0 0 111369 30.0000 C148 C
890 3 male 891 0 Dooley, Mr. Patrick 32.0 0 0 370376 7.7500 NaN Q

891 rows × 12 columns

df.set_index('Age').groupby(level=0).mean()
PassengerId Survived Pclass SibSp Parch Fare
Age
0.42 804.0 1.0 3.0 0.0 1.0 8.5167
0.67 756.0 1.0 2.0 1.0 1.0 14.5000
0.75 557.5 1.0 3.0 2.0 1.0 19.2583
0.83 455.5 1.0 2.0 0.5 1.5 23.8750
0.92 306.0 1.0 1.0 1.0 2.0 151.5500
... ... ... ... ... ... ...
70.00 709.5 0.0 1.5 0.5 0.5 40.7500
70.50 117.0 0.0 3.0 0.0 0.0 7.7500
71.00 295.5 0.0 1.0 0.0 0.0 42.0792
74.00 852.0 0.0 3.0 0.0 0.0 7.7750
80.00 631.0 1.0 1.0 0.0 0.0 30.0000

88 rows × 6 columns

나이대별로 생존율 구하기

import math
def age_categorize(age):
    if math.isnan(age):
        return -1
    return math.floor(age / 10) * 10
df.set_index('Age').groupby(age_categorize).mean()['Survived']
-1     0.293785
 0     0.612903
 10    0.401961
 20    0.350000
 30    0.437126
 40    0.382022
 50    0.416667
 60    0.315789
 70    0.000000
 80    1.000000
Name: Survived, dtype: float64

MultiIndex를 이용한 groupping

df.set_index(['Pclass', 'Sex']).groupby(level=[0, 1]).mean()['Age']
Pclass  Sex   
1       female    34.611765
        male      41.281386
2       female    28.722973
        male      30.740707
3       female    21.750000
        male      26.507589
Name: Age, dtype: float64

aggregate(집계) 함수 사용하기

  • groupby 결과에 집계함수를 적용하여 그룹별 데이터 확인 가능
df.set_index(['Pclass', 'Sex']).groupby(level=[0, 1]).aggregate([np.mean, np.sum, np.max])
PassengerId Survived Age SibSp Parch Fare
mean sum amax mean sum amax mean sum amax mean sum amax mean sum amax mean sum amax
Pclass Sex
1 female 469.212766 44106 888 0.968085 91 1 34.611765 2942.00 63.0 0.553191 52 3 0.457447 43 2 106.125798 9975.8250 512.3292
male 455.729508 55599 890 0.368852 45 1 41.281386 4169.42 80.0 0.311475 38 3 0.278689 34 4 67.226127 8201.5875 512.3292
2 female 443.105263 33676 881 0.921053 70 1 28.722973 2125.50 57.0 0.486842 37 3 0.605263 46 3 21.970121 1669.7292 65.0000
male 447.962963 48380 887 0.157407 17 1 30.740707 3043.33 70.0 0.342593 37 2 0.222222 24 2 19.741782 2132.1125 73.5000
3 female 399.729167 57561 889 0.500000 72 1 21.750000 2218.50 63.0 0.895833 129 8 0.798611 115 6 16.118810 2321.1086 69.5500
male 455.515850 158064 891 0.135447 47 1 26.507589 6706.42 74.0 0.498559 173 8 0.224784 78 5 12.661633 4393.5865 69.5500