python, R, vimでデータマイニング

python, R, vim で疑問に思ったことなどを

count encodingの実装

データフレーム全体に対してCount Encodingするパッケージを作成しました。

別データ(テストデータなど)に適用することも考慮して、次の2つの関数に分けました。

変換テーブルの役割を持つ辞書を作成する関数

上記の辞書をデータに適用する関数

create_count_encoding_dicts

apply_encode_dicts

 

以下 README

 

3. utils4ml.encoding

Utilities for encoding.

3.1. create_count_encoding_dicts

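Create count-encoding dictionaries (one per column, mapping each category value to its number of occurrences) for a pd.DataFrame or pd.Series.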

3.1.1. Usage

from utils4ml.encoding import create_count_encoding_dicts

3.1.2. Definition

def create_count_encoding_dicts(
    X: Union[pd.DataFrame, pd.Series],
) -> Dict[str, Dict[str, int]]:

3.1.3. Example

Code:
# %%
# Example: build count-encoding dictionaries from the categorical columns,
# apply them to the same data, and save both results under output/.
import json
from utils4ml.encoding import create_count_encoding_dicts
from utils4ml.encoding import apply_encode_dicts

from utils4ml.utils import load_bank_classifier

X, y = load_bank_classifier()
# Only the category-dtype columns are encoded.
Xcat = X.select_dtypes(include="category")

encoding_dicts = create_count_encoding_dicts(Xcat)
X_encoded = apply_encode_dicts(
    Xcat,
    encoding_dicts
)
# Use a context manager so the JSON file is flushed and closed
# deterministically instead of relying on garbage collection.
with open("output/encoding_dicts.json", "w") as f:
    json.dump(encoding_dicts, f, indent=4)
X_encoded.head(10).to_csv("output/X_encoded.csv")
# %%
Result: encoding_dicts.json
{
    "job": {
        "management": 969,
        "blue-collar": 946,
        "technician": 768,
        "admin.": 478,
        "services": 417,
        "retired": 230,
        "self-employed": 183,
        "entrepreneur": 168,
        "unemployed": 128,
        "housemaid": 112,
        "student": 84,
        "unknown": 38
    },
    "marital": {
        "married": 2797,
        "single": 1196,
        "divorced": 528
    },
    "education": {
        "secondary": 2306,
        "tertiary": 1350,
        "primary": 678,
        "unknown": 187
    },
    "default": {
        "no": 4445,
        "yes": 76
    },
    "housing": {
        "yes": 2559,
        "no": 1962
    },
    "loan": {
        "no": 3830,
        "yes": 691
    },
    "contact": {
        "cellular": 2896,
        "unknown": 1324,
        "telephone": 301
    },
    "month": {
        "may": 1398,
        "jul": 706,
        "aug": 633,
        "jun": 531,
        "nov": 389,
        "apr": 293,
        "feb": 222,
        "jan": 148,
        "oct": 80,
        "sep": 52,
        "mar": 49,
        "dec": 20
    },
    "poutcome": {
        "unknown": 3705,
        "failure": 490,
        "other": 197,
        "success": 129
    }
}

3.2. apply_encode_dicts

Apply encoding dictionaries (such as those returned by create_count_encoding_dicts) to a pd.DataFrame or pd.Series.

3.2.1. Usage

from utils4ml.encoding import apply_encode_dicts

3.2.2. Definition

def apply_encode_dicts(
    X: Union[pd.DataFrame, pd.Series],
    encode_dicts: Dict[str, Dict[str, int]],
    suffix: str='',
    fillna: int=1,
) -> pd.DataFrame:

3.2.3. Example

Code:
# %%
# Example: apply previously created count-encoding dictionaries to the
# categorical columns and save the first 10 encoded rows as CSV.
import json
from utils4ml.encoding import create_count_encoding_dicts
from utils4ml.encoding import apply_encode_dicts

from utils4ml.utils import load_bank_classifier

X, y = load_bank_classifier()
# Only the category-dtype columns are encoded.
Xcat = X.select_dtypes(include="category")

encoding_dicts = create_count_encoding_dicts(Xcat)
X_encoded = apply_encode_dicts(
    Xcat,
    encoding_dicts
)
# Use a context manager so the JSON file is flushed and closed
# deterministically instead of relying on garbage collection.
with open("output/encoding_dicts.json", "w") as f:
    json.dump(encoding_dicts, f, indent=4)
X_encoded.head(10).to_csv("output/X_encoded.csv")
# %%
Table 6. Result: X_encoded.csv
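|   | job | marital | education | default | housing | loan | contact | month | poutcome |
|---|-----|---------|-----------|---------|---------|------|---------|-------|----------|
| 0 | 128 | 2797 | 678 | 4445 | 1962 | 3830 | 2896 | 80 | 3705 |
| 1 | 417 | 2797 | 2306 | 4445 | 2559 | 691 | 2896 | 1398 | 490 |
| 2 | 969 | 1196 | 1350 | 4445 | 2559 | 3830 | 2896 | 293 | 490 |
| 3 | 969 | 2797 | 1350 | 4445 | 2559 | 691 | 1324 | 531 | 3705 |
| 4 | 946 | 2797 | 2306 | 4445 | 2559 | 3830 | 1324 | 1398 | 3705 |
| 5 | 969 | 1196 | 1350 | 4445 | 1962 | 3830 | 2896 | 222 | 490 |
| 6 | 183 | 2797 | 1350 | 4445 | 2559 | 3830 | 2896 | 1398 | 197 |
| 7 | 768 | 2797 | 2306 | 4445 | 2559 | 3830 | 2896 | 1398 | 3705 |
| 8 | 168 | 2797 | 1350 | 4445 | 2559 | 3830 | 1324 | 1398 | 3705 |
| 9 | 417 | 2797 | 678 | 4445 | 2559 | 691 | 2896 | 293 | 490 |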