Explore Data in train_static

Kaggle
Python
Author

F.L and Team

Published

February 9, 2024

import pandas as pd
import os
from directory_tree import display_tree
from sklearn.metrics import roc_auc_score
from pathlib import Path
import duckdb
import ibis
from ibis import _
from ibis import selectors as s
import re
from importlib import reload 
from IPython.display import display, HTML
# reload(tools)

from tools import FeatureDefinition, pretty_display
import altair as alt
from functools import reduce
DATA_DIR = 'home-credit-credit-risk-model-stability/parquet_files/train'
p = Path(DATA_DIR)

ibis.options.interactive=True

# load data into directory
data_files=list(p.glob('*.parquet'))#load_all data
ds = {re.sub('.parquet','',f.name):ibis.read_parquet(f) for f in data_files}

Why train_static? Please this dataset don’t seems to have cardinality problems.

from importlib import reload 
from IPython.display import display, HTML
# reload(tools)

from tools import FeatureDefinition, pretty_display
# lookup column definition here if you don't know what 
intersting_cols = [
    'bankacctype_710L',
    'credtype_322L','disbursementtype_67L',
    'inittransactioncode_186L','lastst_736L','paytype1st_925L','paytype_783L',
    'twobodfilling_608L','typesuite_864L'
]
pretty_display(
    FeatureDefinition().lookup_col(intersting_cols)
)
Variable.str.contains("bankacctype_710L") or Variable.str.contains("credtype_322L") or Variable.str.contains("disbursementtype_67L") or Variable.str.contains("inittransactioncode_186L") or Variable.str.contains("lastst_736L") or Variable.str.contains("paytype1st_925L") or Variable.str.contains("paytype_783L") or Variable.str.contains("twobodfilling_608L") or Variable.str.contains("typesuite_864L")
Variable Description File
0 bankacctype_710L Type of applicant's bank account. [train_static_0_0, train_static_0_1]
1 credtype_322L Type of credit. [train_static_0_0, train_static_0_1]
2 disbursementtype_67L Type of disbursement. [train_static_0_0, train_static_0_1]
3 inittransactioncode_186L Transaction type of the initial credit
transaction.
[train_static_0_0, train_static_0_1]
4 lastst_736L Status of the client's previous credit
application.
[train_static_0_0, train_static_0_1]
5 paytype1st_925L Type of first payment of the client. [train_static_0_0, train_static_0_1]
6 paytype_783L Type of payment. [train_static_0_0, train_static_0_1]
7 twobodfilling_608L Type of application process. [train_static_0_0, train_static_0_1]
8 typesuite_864L Persons accompanying the client during the loan
application process.
[train_static_0_0, train_static_0_1]

Time Sensitivity?

Maybe not to include time-features in as training dataset.

import matplotlib.pyplot as plt
import seaborn as sns

import tools

d=(
    ds.get('train_base')
    .mutate(date_decision=_.date_decision.to_timestamp('%Y-%m-%d').date())
    .mutate(dm=_.WEEK_NUM)
    .group_by([_.dm, _.target])
    .aggregate(n=_.count())
    .to_pandas()
)
# d

import altair as alt
c0=alt.Chart(
    ds.get('train_base')
        .mutate(date_decision=_
                .date_decision
                .to_timestamp('%Y-%m-%d').date()
                .truncate('M')
                )
        .group_by([_.date_decision])
        .aggregate(n=_.count())
        .to_pandas()
    ,
    title=['Loan Approval Volumn',
           'There is a sharp drope during covid'
        ],height=150, width=670
    ).mark_line(color='midnightblue').encode(
        x=alt.X('date_decision',title='Decision Date'),
        y=alt.Y('n',title='Loan Approved')
)
c1=alt.Chart(d,title='Volumn of Approval and Deafult').mark_area().encode(
    x=alt.X('dm',title='Week Number'),
    y=alt.Y('n',title='Number of Case'),
    color='target:N'
)
c2=alt.Chart(d,title='Propotion of Default Overtime').mark_area().encode(
    x=alt.X('dm',title='Week Number'),
    y=alt.Y('n',title='Propotion of Case').stack('normalize'),
    color='target:N'
)
c0 & (c1  | c2)
reload(tools)
from tools import FeatureDefinition
pretty_display(
    FeatureDefinition().lookup_dsc('change')
    .query('File=="train_static_0_0"')
)
Variable File Description
415 equalitydataagreement_891L train_static_0_0 Flag indicating sudden changes in client's social-
demographic data (e.g. education, family status,
housing type).
417 equalityempfrom_62L train_static_0_0 Flag indicating a sudden change in the client's
length of employment.

Explore Categorical Variables

fd=FeatureDefinition()
intersting_cols = [
    'equalitydataagreement_891L',
    'equalityempfrom_62L',
    'bankacctype_710L',
    'credtype_322L','disbursementtype_67L',
    'inittransactioncode_186L','lastst_736L','paytype1st_925L','paytype_783L',
    'twobodfilling_608L','typesuite_864L',
    'lastrejectreasonclient_4145040M'
]
# pretty_display(
#     fd.lookup_col(intersting_cols)
# )
d2=(
    ibis.union(
        ds.get('train_static_0_0'),
        ds.get('train_static_0_1')
    )
    .select(s.contains(intersting_cols + ['case_id']))
    .join(ds.get('train_base'), ['case_id'])
)
C = []
for c in intersting_cols:
    # display(d2.select(c,'target').head(1))
    description = fd.lookup_col(c).Description.str.wrap(30).loc[0]
    # print(description)
    d=(d2
        .group_by([c,'target'])
        .aggregate(n=_.count())
        .to_pandas())
    ct1=alt.Chart(
        d
    ).mark_bar().encode(
        y=alt.Y(c + ':N', title=description.split('\n')),
        x=alt.X('n:Q',title=''),
        color='target:N'
    )
    ct2=alt.Chart(
        d,width=100
    ).mark_bar().encode(
        y=alt.Y(c + ':N', title='', axis=None),
        x=alt.X('n:Q',title='').stack('normalize'),
        color='target:N',
    )
    ct=ct1 | ct2
    C += [ct]
reduce(alt.vconcat, C).configure_axisY(
    titleAngle=0,
    titleAlign="left",
    titleY=10,
    titleX=-200,
    titleColor='#404040',
    titleFontWeight='lighter'
).properties(
    title=[
        "Categorical Variable Appears in Static Dataset",
        "Nothing distinct by propotion except 'Status of the client's previous application'"
        ]
)
pretty_display(fd
               .lookup_tbl('static_0_0')
               .query('Variable.str.endswith("M")')
               .reset_index(drop=True))
Variable File Description
0 lastapprcommoditycat_1041M train_static_0_0 Commodity category of the last loan applications
made by the applicant.
1 lastapprcommoditytypec_5251766M train_static_0_0 Commodity type of the last application.
2 lastcancelreason_561M train_static_0_0 Cancellation reason of the last application.
3 lastrejectcommoditycat_161M train_static_0_0 Category of commodity in the applicant's last
rejected application.
4 lastrejectcommodtypec_5251769M train_static_0_0 Commodity type of the last rejected application.
5 lastrejectreason_759M train_static_0_0 Reason for rejection on the most recent rejected
application.
6 lastrejectreasonclient_4145040M train_static_0_0 Reason for the client's last loan rejection.
7 previouscontdistrict_112M train_static_0_0 Contact district of the client's previous approved
application.
d=(
    ds.get('train_static_0_0')
    .join(ds.get('train_base'),'case_id')
    .select(s.endswith('M'), _.target, _.case_id)
    .drop('WEEK_NUM')
    # .pivot_longer(
    #     s.endswith('M')
    # )
    # .group_by([_.name,_.value])
    # .aggregate(n=_.count())
)
(d
.select(c, 'target')
.group_by([s.contains(c),s.contains('target')])
.aggregate(n=_.count())
.to_pandas())
previouscontdistrict_112M target n
0 P6_35_77 0 2506
1 P41_138_103 0 1766
2 P54_133_26 0 8831
3 P197_47_166 0 34040
4 P111_135_181 0 11957
... ... ... ...
393 P159_160_144 1 27
394 P217_60_135 1 16
395 P78_30_175 1 26
396 P7_110_89 1 18
397 P31_42_128 1 7

398 rows × 3 columns

fd=FeatureDefinition()
mcols = [i for i in d.columns if i not in ['target', "case_id"]]
C=[]
for c in mcols:
    data=(d
        .select(c, 'target')
        .group_by([s.contains(c),s.contains('target')])
        .aggregate(n=_.count())
        .to_pandas())
    description=fd.lookup_col(c).Description.str.wrap(30).loc[0]
    cl1=(
        alt
        .Chart(data)
        .mark_bar()
        .encode(
            x='n:Q',
            y=alt.Y(c + ':N',title=description.split('\n')),
            color='target:N'
        )
    )
    cl2=(
        alt
        .Chart(data,width=100)
        .mark_bar(
        )
        .encode(
            x=alt.X('n:Q',title='').stack('normalize'),
            y=alt.Y(c + ':N',title='',axis=None),
            color='target:N'
        )
    )
    C+=[cl1 | cl2]
CH=reduce(alt.vconcat,C)
CH.configure_axisY(
    titleAngle=0,
    titleAlign="left",
    titleY=10,
    titleX=-250,
    titleColor='#404040',
    titleFontWeight='lighter'
).properties(
    title='All Category Variable'
)
# alt.data_transformers.enable("vegafusion")
# alt.Chart(d.to_pandas()).transform_fold(
#     [i for i in d.columns if i not in ['target', "case_id"]]
# ).mark_bar().encode(
#     x='count():Q',
#     y='value:N',
#     column='key:N'
# )
d
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┓
┃ lastapprcommoditycat_1041M  lastapprcommoditytypec_5251766M  lastcancelreason_561M  lastrejectcommoditycat_161M  lastrejectcommodtypec_5251769M  lastrejectreason_759M  lastrejectreasonclient_4145040M  previouscontdistrict_112M  target  case_id ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━┩
│ stringstringstringstringstringstringstringstringint64int64   │
├────────────────────────────┼─────────────────────────────────┼───────────────────────┼─────────────────────────────┼────────────────────────────────┼───────────────────────┼─────────────────────────────────┼───────────────────────────┼────────┼─────────┤
│ a55475b1                  a55475b1                       a55475b1             a55475b1                   a55475b1                      a55475b1             a55475b1                       a55475b1                 00 │
│ a55475b1                  a55475b1                       a55475b1             a55475b1                   a55475b1                      a55475b1             a55475b1                       a55475b1                 02 │
│ a55475b1                  a55475b1                       P94_109_143          a55475b1                   a55475b1                      P94_109_143          a55475b1                       a55475b1                 03 │
│ a55475b1                  a55475b1                       P94_109_143          a55475b1                   a55475b1                      P94_109_143          a55475b1                       a55475b1                 06 │
│ a55475b1                  a55475b1                       a55475b1             a55475b1                   a55475b1                      a55475b1             a55475b1                       a55475b1                 07 │
│ a55475b1                  a55475b1                       a55475b1             a55475b1                   a55475b1                      a55475b1             a55475b1                       a55475b1                 08 │
│ a55475b1                  a55475b1                       P73_130_169          a55475b1                   a55475b1                      a55475b1             a55475b1                       a55475b1                 010 │
│ a55475b1                  a55475b1                       a55475b1             a55475b1                   a55475b1                      a55475b1             a55475b1                       a55475b1                 011 │
│ a55475b1                  a55475b1                       a55475b1             a55475b1                   a55475b1                      a55475b1             a55475b1                       a55475b1                 012 │
│ a55475b1                  a55475b1                       P94_109_143          a55475b1                   a55475b1                      P94_109_143          a55475b1                       a55475b1                 013 │
│  │
└────────────────────────────┴─────────────────────────────────┴───────────────────────┴─────────────────────────────┴────────────────────────────────┴───────────────────────┴─────────────────────────────────┴───────────────────────────┴────────┴─────────┘