DataFrame group by 이해하기
학습목표
- DataFrame groupby 이해하기
import pandas as pd
import numpy as np
group by
- 아래의 세 단계를 적용하여 데이터를 그룹화(groupping) (SQL의 group by 와 개념적으로는 동일, 사용법은 유사)
- 데이터 분할
- operation 적용
- 데이터 병합
# data 출처: https://www.kaggle.com/hesh97/titanicdataset-traincsv/data
df = pd.read_csv('./train.csv')
df.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
GroupBy groups 속성
- 각 그룹과 그룹에 속한 index를 dict 형태로 표현
class_group = df.groupby('Pclass')
class_group
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000018605571DC0>
class_group.groups
{1: [1, 3, 6, 11, 23, 27, 30, 31, 34, 35, 52, 54, 55, 61, 62, 64, 83, 88, 92, 96, 97, 102, 110, 118, 124, 136, 137, 139, 151, 155, 166, 168, 170, 174, 177, 185, 187, 194, 195, 209, 215, 218, 224, 230, 245, 248, 252, 256, 257, 258, 262, 263, 268, 269, 270, 273, 275, 284, 290, 291, 295, 297, 298, 299, 305, 306, 307, 309, 310, 311, 318, 319, 325, 329, 331, 332, 334, 336, 337, 339, 341, 351, 356, 366, 369, 370, 373, 375, 377, 380, 383, 390, 393, 412, 430, 434, 435, 438, 445, 447, ...], 2: [9, 15, 17, 20, 21, 33, 41, 43, 53, 56, 58, 66, 70, 72, 78, 84, 98, 99, 117, 120, 122, 123, 133, 134, 135, 144, 145, 148, 149, 150, 161, 178, 181, 183, 190, 191, 193, 199, 211, 213, 217, 219, 221, 226, 228, 232, 234, 236, 237, 238, 239, 242, 247, 249, 259, 265, 272, 277, 288, 292, 303, 308, 312, 314, 316, 317, 322, 323, 327, 340, 342, 343, 344, 345, 346, 357, 361, 385, 387, 389, 397, 398, 399, 405, 407, 413, 416, 417, 418, 426, 427, 432, 437, 439, 440, 443, 446, 450, 458, 463, ...], 3: [0, 2, 4, 5, 7, 8, 10, 12, 13, 14, 16, 18, 19, 22, 24, 25, 26, 28, 29, 32, 36, 37, 38, 39, 40, 42, 44, 45, 46, 47, 48, 49, 50, 51, 57, 59, 60, 63, 65, 67, 68, 69, 71, 73, 74, 75, 76, 77, 79, 80, 81, 82, 85, 86, 87, 89, 90, 91, 93, 94, 95, 100, 101, 103, 104, 105, 106, 107, 108, 109, 111, 112, 113, 114, 115, 116, 119, 121, 125, 126, 127, 128, 129, 130, 131, 132, 138, 140, 141, 142, 143, 146, 147, 152, 153, 154, 156, 157, 158, 159, ...]}
gender_group = df.groupby('Sex')
gender_group.groups
{'female': [1, 2, 3, 8, 9, 10, 11, 14, 15, 18, 19, 22, 24, 25, 28, 31, 32, 38, 39, 40, 41, 43, 44, 47, 49, 52, 53, 56, 58, 61, 66, 68, 71, 79, 82, 84, 85, 88, 98, 100, 106, 109, 111, 113, 114, 119, 123, 128, 132, 133, 136, 140, 141, 142, 147, 151, 156, 161, 166, 167, 172, 177, 180, 184, 186, 190, 192, 194, 195, 198, 199, 205, 208, 211, 215, 216, 218, 229, 230, 233, 235, 237, 240, 241, 246, 247, 251, 254, 255, 256, 257, 258, 259, 264, 268, 269, 272, 274, 275, 276, ...], 'male': [0, 4, 5, 6, 7, 12, 13, 16, 17, 20, 21, 23, 26, 27, 29, 30, 33, 34, 35, 36, 37, 42, 45, 46, 48, 50, 51, 54, 55, 57, 59, 60, 62, 63, 64, 65, 67, 69, 70, 72, 73, 74, 75, 76, 77, 78, 80, 81, 83, 86, 87, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 101, 102, 103, 104, 105, 107, 108, 110, 112, 115, 116, 117, 118, 120, 121, 122, 124, 125, 126, 127, 129, 130, 131, 134, 135, 137, 138, 139, 143, 144, 145, 146, 148, 149, 150, 152, 153, 154, 155, ...]}
groupping 함수
- 그룹 데이터에 적용 가능한 통계 함수(NaN은 제외하여 연산)
- count - 데이터 개수
- sum - 데이터의 합
- mean, std, var - 평균, 표준편차, 분산
- min, max - 최소, 최대값
class_group.count()
PassengerId | Survived | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|
Pclass | |||||||||||
1 | 216 | 216 | 216 | 216 | 186 | 216 | 216 | 216 | 216 | 176 | 214 |
2 | 184 | 184 | 184 | 184 | 173 | 184 | 184 | 184 | 184 | 16 | 184 |
3 | 491 | 491 | 491 | 491 | 355 | 491 | 491 | 491 | 491 | 12 | 491 |
class_group.mean()['Survived']
Pclass
1 0.629630
2 0.472826
3 0.242363
Name: Survived, dtype: float64
- 성별에 따른 생존율 구해보기
df.groupby('Sex').mean()['Survived']
Sex
female 0.742038
male 0.188908
Name: Survived, dtype: float64
복수 columns로 groupping 하기
- groupby에 column 리스트를 전달
-
통계함수를 적용한 결과는 multiindex를 갖는 dataframe
- 클래스와 성별에 따른 생존률 구해보기
df.groupby(['Pclass', 'Sex']).mean()['Survived']
Pclass Sex
1 female 0.968085
male 0.368852
2 female 0.921053
male 0.157407
3 female 0.500000
male 0.135447
Name: Survived, dtype: float64
df.groupby(['Pclass', 'Sex']).mean()
PassengerId | Survived | Age | SibSp | Parch | Fare | ||
---|---|---|---|---|---|---|---|
Pclass | Sex | ||||||
1 | female | 469.212766 | 0.968085 | 34.611765 | 0.553191 | 0.457447 | 106.125798 |
male | 455.729508 | 0.368852 | 41.281386 | 0.311475 | 0.278689 | 67.226127 | |
2 | female | 443.105263 | 0.921053 | 28.722973 | 0.486842 | 0.605263 | 21.970121 |
male | 447.962963 | 0.157407 | 30.740707 | 0.342593 | 0.222222 | 19.741782 | |
3 | female | 399.729167 | 0.500000 | 21.750000 | 0.895833 | 0.798611 | 16.118810 |
male | 455.515850 | 0.135447 | 26.507589 | 0.498559 | 0.224784 | 12.661633 |
df.groupby(['Pclass', 'Sex']).mean().loc[(2, 'female')] #Pclass 가 2인 평균값
PassengerId 443.105263
Survived 0.921053
Age 28.722973
SibSp 0.486842
Parch 0.605263
Fare 21.970121
Name: (2, female), dtype: float64
df.groupby(['Pclass', 'Sex']).mean().index
MultiIndex([(1, 'female'),
(1, 'male'),
(2, 'female'),
(2, 'male'),
(3, 'female'),
(3, 'male')],
names=['Pclass', 'Sex'])
index를 이용한 group by
- index가 있는 경우, groupby 함수에 level 사용 가능
- level은 index의 depth를 의미하며, 가장 왼쪽부터 0부터 증가
- set_index 함수
- column 데이터를 index 레벨로 변경
- reset_index 함수
- 인덱스 초기화
df.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
df.set_index(['Pclass', 'Sex'])
PassengerId | Survived | Name | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | ||
---|---|---|---|---|---|---|---|---|---|---|---|
Pclass | Sex | ||||||||||
3 | male | 1 | 0 | Braund, Mr. Owen Harris | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | female | 2 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
3 | female | 3 | 1 | Heikkinen, Miss. Laina | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
1 | female | 4 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
3 | male | 5 | 0 | Allen, Mr. William Henry | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2 | male | 887 | 0 | Montvila, Rev. Juozas | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S |
1 | female | 888 | 1 | Graham, Miss. Margaret Edith | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
3 | female | 889 | 0 | Johnston, Miss. Catherine Helen "Carrie" | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
1 | male | 890 | 1 | Behr, Mr. Karl Howell | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
3 | male | 891 | 0 | Dooley, Mr. Patrick | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
891 rows × 10 columns
df.set_index(['Pclass', 'Sex']).reset_index()
Pclass | Sex | PassengerId | Survived | Name | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | male | 1 | 0 | Braund, Mr. Owen Harris | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 1 | female | 2 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | female | 3 | 1 | Heikkinen, Miss. Laina | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 1 | female | 4 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 3 | male | 5 | 0 | Allen, Mr. William Henry | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
886 | 2 | male | 887 | 0 | Montvila, Rev. Juozas | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S |
887 | 1 | female | 888 | 1 | Graham, Miss. Margaret Edith | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
888 | 3 | female | 889 | 0 | Johnston, Miss. Catherine Helen "Carrie" | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
889 | 1 | male | 890 | 1 | Behr, Mr. Karl Howell | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
890 | 3 | male | 891 | 0 | Dooley, Mr. Patrick | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
891 rows × 12 columns
df.set_index('Age').groupby(level=0).mean()
PassengerId | Survived | Pclass | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|
Age | ||||||
0.42 | 804.0 | 1.0 | 3.0 | 0.0 | 1.0 | 8.5167 |
0.67 | 756.0 | 1.0 | 2.0 | 1.0 | 1.0 | 14.5000 |
0.75 | 557.5 | 1.0 | 3.0 | 2.0 | 1.0 | 19.2583 |
0.83 | 455.5 | 1.0 | 2.0 | 0.5 | 1.5 | 23.8750 |
0.92 | 306.0 | 1.0 | 1.0 | 1.0 | 2.0 | 151.5500 |
... | ... | ... | ... | ... | ... | ... |
70.00 | 709.5 | 0.0 | 1.5 | 0.5 | 0.5 | 40.7500 |
70.50 | 117.0 | 0.0 | 3.0 | 0.0 | 0.0 | 7.7500 |
71.00 | 295.5 | 0.0 | 1.0 | 0.0 | 0.0 | 42.0792 |
74.00 | 852.0 | 0.0 | 3.0 | 0.0 | 0.0 | 7.7750 |
80.00 | 631.0 | 1.0 | 1.0 | 0.0 | 0.0 | 30.0000 |
88 rows × 6 columns
나이대별로 생존율 구하기
import math
def age_categorize(age):
if math.isnan(age):
return -1
return math.floor(age / 10) * 10
df.set_index('Age').groupby(age_categorize).mean()['Survived']
-1 0.293785
0 0.612903
10 0.401961
20 0.350000
30 0.437126
40 0.382022
50 0.416667
60 0.315789
70 0.000000
80 1.000000
Name: Survived, dtype: float64
MultiIndex를 이용한 groupping
df.set_index(['Pclass', 'Sex']).groupby(level=[0, 1]).mean()['Age']
Pclass Sex
1 female 34.611765
male 41.281386
2 female 28.722973
male 30.740707
3 female 21.750000
male 26.507589
Name: Age, dtype: float64
aggregate(집계) 함수 사용하기
- groupby 결과에 집계함수를 적용하여 그룹별 데이터 확인 가능
df.set_index(['Pclass', 'Sex']).groupby(level=[0, 1]).aggregate([np.mean, np.sum, np.max])
PassengerId | Survived | Age | SibSp | Parch | Fare | ||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
mean | sum | amax | mean | sum | amax | mean | sum | amax | mean | sum | amax | mean | sum | amax | mean | sum | amax | ||
Pclass | Sex | ||||||||||||||||||
1 | female | 469.212766 | 44106 | 888 | 0.968085 | 91 | 1 | 34.611765 | 2942.00 | 63.0 | 0.553191 | 52 | 3 | 0.457447 | 43 | 2 | 106.125798 | 9975.8250 | 512.3292 |
male | 455.729508 | 55599 | 890 | 0.368852 | 45 | 1 | 41.281386 | 4169.42 | 80.0 | 0.311475 | 38 | 3 | 0.278689 | 34 | 4 | 67.226127 | 8201.5875 | 512.3292 | |
2 | female | 443.105263 | 33676 | 881 | 0.921053 | 70 | 1 | 28.722973 | 2125.50 | 57.0 | 0.486842 | 37 | 3 | 0.605263 | 46 | 3 | 21.970121 | 1669.7292 | 65.0000 |
male | 447.962963 | 48380 | 887 | 0.157407 | 17 | 1 | 30.740707 | 3043.33 | 70.0 | 0.342593 | 37 | 2 | 0.222222 | 24 | 2 | 19.741782 | 2132.1125 | 73.5000 | |
3 | female | 399.729167 | 57561 | 889 | 0.500000 | 72 | 1 | 21.750000 | 2218.50 | 63.0 | 0.895833 | 129 | 8 | 0.798611 | 115 | 6 | 16.118810 | 2321.1086 | 69.5500 |
male | 455.515850 | 158064 | 891 | 0.135447 | 47 | 1 | 26.507589 | 6706.42 | 74.0 | 0.498559 | 173 | 8 | 0.224784 | 78 | 5 | 12.661633 | 4393.5865 | 69.5500 |