transform 함수의 이해 및 활용하기

학습목표

  1. transform 함수 이해하기
import numpy as np
import pandas as pd
# Data source: https://www.kaggle.com/hesh97/titanicdataset-traincsv/data
# Load the Titanic training set from the working directory and preview
# the first five rows.
df = pd.read_csv(filepath_or_buffer='./train.csv')
df.head(5)
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

transform 함수

  • groupby 후 transform 함수를 사용하면 그룹별 통계값을 원래의 index(행 개수와 순서)를 유지한 상태로 각 행에 적용 (그룹당 한 행만 반환하는 집계 함수와 다름)
  • 전체 데이터의 집계가 아닌 각 그룹에서의 집계를 계산
  • 따라서 새로 생성된 데이터를 원본 dataframe과 합치기 쉬움
# Per-class mean of every numeric column (one row per Pclass).
# numeric_only=True is required on pandas >= 2.0, where string columns such as
# Name/Sex/Ticket are no longer silently dropped and would raise a TypeError.
df.groupby('Pclass').mean(numeric_only=True)
PassengerId Survived Age SibSp Parch Fare
Pclass
1 461.597222 0.629630 38.233441 0.416667 0.356481 84.154687
2 445.956522 0.472826 29.877630 0.402174 0.380435 20.662183
3 439.154786 0.242363 25.140620 0.615071 0.393075 13.675550
# Same per-class means, but broadcast back onto every original row so the
# result keeps the original index (891 rows).
# The string alias 'mean' replaces the np.mean callable (deprecated as a
# transform argument in recent pandas), and numeric_only=True skips the string
# columns that would otherwise raise a TypeError.
df.groupby('Pclass').transform('mean', numeric_only=True)
PassengerId Survived Age SibSp Parch Fare
0 439.154786 0.242363 25.140620 0.615071 0.393075 13.675550
1 461.597222 0.629630 38.233441 0.416667 0.356481 84.154687
2 439.154786 0.242363 25.140620 0.615071 0.393075 13.675550
3 461.597222 0.629630 38.233441 0.416667 0.356481 84.154687
4 439.154786 0.242363 25.140620 0.615071 0.393075 13.675550
... ... ... ... ... ... ...
886 445.956522 0.472826 29.877630 0.402174 0.380435 20.662183
887 461.597222 0.629630 38.233441 0.416667 0.356481 84.154687
888 439.154786 0.242363 25.140620 0.615071 0.393075 13.675550
889 461.597222 0.629630 38.233441 0.416667 0.356481 84.154687
890 439.154786 0.242363 25.140620 0.615071 0.393075 13.675550

891 rows × 6 columns

# Attach each passenger's class-level mean age as a new column.
# Selecting 'Age' before transforming computes only the needed column and
# avoids averaging the non-numeric columns, which fails on modern pandas.
df['Age2'] = df.groupby('Pclass')['Age'].transform('mean')
df
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Age2
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 25.140620
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 38.233441
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 25.140620
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 38.233441
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S 25.140620
... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S 29.877630
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S 38.233441
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S 25.140620
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C 38.233441
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q 25.140620

891 rows × 13 columns

# Mean of every numeric column for each (Pclass, Sex) group.
# numeric_only=True is required on pandas >= 2.0, where the remaining string
# columns are no longer silently dropped and would raise a TypeError.
df.groupby(['Pclass', 'Sex']).mean(numeric_only=True)
PassengerId Survived Age SibSp Parch Fare Age2
Pclass Sex
1 female 469.212766 0.968085 34.611765 0.553191 0.457447 106.125798 38.233441
male 455.729508 0.368852 41.281386 0.311475 0.278689 67.226127 38.233441
2 female 443.105263 0.921053 28.722973 0.486842 0.605263 21.970121 29.877630
male 447.962963 0.157407 30.740707 0.342593 0.222222 19.741782 29.877630
3 female 399.729167 0.500000 21.750000 0.895833 0.798611 16.118810 25.140620
male 455.515850 0.135447 26.507589 0.498559 0.224784 12.661633 25.140620
# (Pclass, Sex) group means broadcast back onto every original row, keeping
# the original index.
# The string alias 'mean' replaces the np.mean callable (deprecated as a
# transform argument in recent pandas), and numeric_only=True skips the string
# columns that would otherwise raise a TypeError.
df.groupby(['Pclass', 'Sex']).transform('mean', numeric_only=True)
PassengerId Survived Age SibSp Parch Fare Age2
0 455.515850 0.135447 26.507589 0.498559 0.224784 12.661633 25.140620
1 469.212766 0.968085 34.611765 0.553191 0.457447 106.125798 38.233441
2 399.729167 0.500000 21.750000 0.895833 0.798611 16.118810 25.140620
3 469.212766 0.968085 34.611765 0.553191 0.457447 106.125798 38.233441
4 455.515850 0.135447 26.507589 0.498559 0.224784 12.661633 25.140620
... ... ... ... ... ... ... ...
886 447.962963 0.157407 30.740707 0.342593 0.222222 19.741782 29.877630
887 469.212766 0.968085 34.611765 0.553191 0.457447 106.125798 38.233441
888 399.729167 0.500000 21.750000 0.895833 0.798611 16.118810 25.140620
889 455.729508 0.368852 41.281386 0.311475 0.278689 67.226127 38.233441
890 455.515850 0.135447 26.507589 0.498559 0.224784 12.661633 25.140620

891 rows × 7 columns

# Attach each passenger's (Pclass, Sex) group mean age as a new column.
# Selecting 'Age' before transforming computes only the needed column and
# avoids averaging the non-numeric columns, which fails on modern pandas.
df['Age3'] = df.groupby(['Pclass', 'Sex'])['Age'].transform('mean')
df
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Age2 Age3
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 25.140620 26.507589
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 38.233441 34.611765
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 25.140620 21.750000
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 38.233441 34.611765
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S 25.140620 26.507589
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S 29.877630 30.740707
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S 38.233441 34.611765
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S 25.140620 21.750000
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C 38.233441 41.281386
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q 25.140620 26.507589

891 rows × 14 columns