transform 함수의 이해 및 활용하기
학습목표
- transform 함수 이해하기
import numpy as np
import pandas as pd
# Data source: https://www.kaggle.com/hesh97/titanicdataset-traincsv/data
# Load the CSV from the working directory into a DataFrame.
df = pd.read_csv('./train.csv')
# Preview the first rows to check the columns that were read.
df.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
transform 함수
- groupby 후 transform 함수를 사용하면 원래의 index를 유지한 상태로 통계함수를 적용
- 전체 데이터의 집계가 아닌 각 그룹에서의 집계를 계산
- 따라서 새로 생성된 데이터를 원본 dataframe과 합치기 쉬움
# Per-class mean of every numeric column: the result has one row per Pclass.
# numeric_only=True is needed on pandas >= 2.0, where text columns such as
# Name/Sex/Ticket are no longer dropped silently and raise a TypeError instead.
df.groupby('Pclass').mean(numeric_only=True)
PassengerId | Survived | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|
Pclass | ||||||
1 | 461.597222 | 0.629630 | 38.233441 | 0.416667 | 0.356481 | 84.154687 |
2 | 445.956522 | 0.472826 | 29.877630 | 0.402174 | 0.380435 | 20.662183 |
3 | 439.154786 | 0.242363 | 25.140620 | 0.615071 | 0.393075 | 13.675550 |
# Same per-class means, but broadcast back onto the original row index:
# every row receives the mean of the Pclass group it belongs to.
# The string alias 'mean' is used instead of np.mean (recent pandas versions
# warn about NumPy callables here), and numeric_only=True skips the text
# columns, which would otherwise raise a TypeError on pandas >= 2.0.
df.groupby('Pclass').transform('mean', numeric_only=True)
PassengerId | Survived | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|
0 | 439.154786 | 0.242363 | 25.140620 | 0.615071 | 0.393075 | 13.675550 |
1 | 461.597222 | 0.629630 | 38.233441 | 0.416667 | 0.356481 | 84.154687 |
2 | 439.154786 | 0.242363 | 25.140620 | 0.615071 | 0.393075 | 13.675550 |
3 | 461.597222 | 0.629630 | 38.233441 | 0.416667 | 0.356481 | 84.154687 |
4 | 439.154786 | 0.242363 | 25.140620 | 0.615071 | 0.393075 | 13.675550 |
... | ... | ... | ... | ... | ... | ... |
886 | 445.956522 | 0.472826 | 29.877630 | 0.402174 | 0.380435 | 20.662183 |
887 | 461.597222 | 0.629630 | 38.233441 | 0.416667 | 0.356481 | 84.154687 |
888 | 439.154786 | 0.242363 | 25.140620 | 0.615071 | 0.393075 | 13.675550 |
889 | 461.597222 | 0.629630 | 38.233441 | 0.416667 | 0.356481 | 84.154687 |
890 | 439.154786 | 0.242363 | 25.140620 | 0.615071 | 0.393075 | 13.675550 |
891 rows × 6 columns
# Store each passenger's class-level mean age in a new column.
# Selecting 'Age' before transforming computes only the column that is needed
# (instead of transforming every column and discarding the rest) and avoids
# touching the text columns, which cannot be averaged.
# The transformed Series keeps df's index, so it aligns row by row.
df['Age2'] = df.groupby('Pclass')['Age'].transform('mean')
df
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Age2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 25.140620 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 38.233441 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 25.140620 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 38.233441 |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 25.140620 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S | 29.877630 |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S | 38.233441 |
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S | 25.140620 |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C | 38.233441 |
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q | 25.140620 |
891 rows × 13 columns
# Mean of every numeric column for each (Pclass, Sex) combination.
# numeric_only=True is needed on pandas >= 2.0, where the remaining text
# columns (Name, Ticket, ...) raise a TypeError instead of being dropped.
df.groupby(['Pclass', 'Sex']).mean(numeric_only=True)
PassengerId | Survived | Age | SibSp | Parch | Fare | Age2 | ||
---|---|---|---|---|---|---|---|---|
Pclass | Sex | |||||||
1 | female | 469.212766 | 0.968085 | 34.611765 | 0.553191 | 0.457447 | 106.125798 | 38.233441 |
male | 455.729508 | 0.368852 | 41.281386 | 0.311475 | 0.278689 | 67.226127 | 38.233441 | |
2 | female | 443.105263 | 0.921053 | 28.722973 | 0.486842 | 0.605263 | 21.970121 | 29.877630 |
male | 447.962963 | 0.157407 | 30.740707 | 0.342593 | 0.222222 | 19.741782 | 29.877630 | |
3 | female | 399.729167 | 0.500000 | 21.750000 | 0.895833 | 0.798611 | 16.118810 | 25.140620 |
male | 455.515850 | 0.135447 | 26.507589 | 0.498559 | 0.224784 | 12.661633 | 25.140620 |
# (Pclass, Sex) group means broadcast back onto the original row index:
# every row receives the mean of the (Pclass, Sex) group it belongs to.
# The string alias 'mean' is used instead of np.mean (recent pandas versions
# warn about NumPy callables here), and numeric_only=True skips the text
# columns, which would otherwise raise a TypeError on pandas >= 2.0.
df.groupby(['Pclass', 'Sex']).transform('mean', numeric_only=True)
PassengerId | Survived | Age | SibSp | Parch | Fare | Age2 | |
---|---|---|---|---|---|---|---|
0 | 455.515850 | 0.135447 | 26.507589 | 0.498559 | 0.224784 | 12.661633 | 25.140620 |
1 | 469.212766 | 0.968085 | 34.611765 | 0.553191 | 0.457447 | 106.125798 | 38.233441 |
2 | 399.729167 | 0.500000 | 21.750000 | 0.895833 | 0.798611 | 16.118810 | 25.140620 |
3 | 469.212766 | 0.968085 | 34.611765 | 0.553191 | 0.457447 | 106.125798 | 38.233441 |
4 | 455.515850 | 0.135447 | 26.507589 | 0.498559 | 0.224784 | 12.661633 | 25.140620 |
... | ... | ... | ... | ... | ... | ... | ... |
886 | 447.962963 | 0.157407 | 30.740707 | 0.342593 | 0.222222 | 19.741782 | 29.877630 |
887 | 469.212766 | 0.968085 | 34.611765 | 0.553191 | 0.457447 | 106.125798 | 38.233441 |
888 | 399.729167 | 0.500000 | 21.750000 | 0.895833 | 0.798611 | 16.118810 | 25.140620 |
889 | 455.729508 | 0.368852 | 41.281386 | 0.311475 | 0.278689 | 67.226127 | 38.233441 |
890 | 455.515850 | 0.135447 | 26.507589 | 0.498559 | 0.224784 | 12.661633 | 25.140620 |
891 rows × 7 columns
# Store each passenger's (Pclass, Sex) group mean age in a new column.
# Selecting 'Age' before transforming computes only the column that is needed
# and avoids touching the text columns, which cannot be averaged.
# The transformed Series keeps df's index, so it aligns row by row.
df['Age3'] = df.groupby(['Pclass', 'Sex'])['Age'].transform('mean')
df
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Age2 | Age3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 25.140620 | 26.507589 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 38.233441 | 34.611765 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 25.140620 | 21.750000 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 38.233441 | 34.611765 |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 25.140620 | 26.507589 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S | 29.877630 | 30.740707 |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S | 38.233441 | 34.611765 |
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S | 25.140620 | 21.750000 |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C | 38.233441 | 41.281386 |
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q | 25.140620 | 26.507589 |
891 rows × 14 columns