import pandas as pd
import os
from directory_tree import display_tree
from sklearn.metrics import roc_auc_score
from pathlib import Path
import duckdb
import ibis
from ibis import _
from ibis import selectors as s
import re
from importlib import reload 
from IPython.display import display, HTML
# reload(tools)
from tools import FeatureDefinition, pretty_display
import altair as alt
from functools import reduceDATA_DIR = 'home-credit-credit-risk-model-stability/parquet_files/train'
p = Path(DATA_DIR)
ibis.options.interactive=True
# load data into directory
data_files=list(p.glob('*.parquet'))#load_all data
ds = {re.sub('.parquet','',f.name):ibis.read_parquet(f) for f in data_files}Why train_static? Please this dataset don’t seems to have cardinality problems.
from importlib import reload 
from IPython.display import display, HTML
# reload(tools)
from tools import FeatureDefinition, pretty_display
# lookup column definition here if you don't know what 
intersting_cols = [
    'bankacctype_710L',
    'credtype_322L','disbursementtype_67L',
    'inittransactioncode_186L','lastst_736L','paytype1st_925L','paytype_783L',
    'twobodfilling_608L','typesuite_864L'
]
pretty_display(
    FeatureDefinition().lookup_col(intersting_cols)
)Variable.str.contains("bankacctype_710L") or Variable.str.contains("credtype_322L") or Variable.str.contains("disbursementtype_67L") or Variable.str.contains("inittransactioncode_186L") or Variable.str.contains("lastst_736L") or Variable.str.contains("paytype1st_925L") or Variable.str.contains("paytype_783L") or Variable.str.contains("twobodfilling_608L") or Variable.str.contains("typesuite_864L")| Variable | Description | File | |
|---|---|---|---|
| 0 | bankacctype_710L | Type of applicant's bank account. | [train_static_0_0, train_static_0_1] | 
| 1 | credtype_322L | Type of credit. | [train_static_0_0, train_static_0_1] | 
| 2 | disbursementtype_67L | Type of disbursement. | [train_static_0_0, train_static_0_1] | 
| 3 | inittransactioncode_186L | Transaction type of the initial credit transaction. | [train_static_0_0, train_static_0_1] | 
| 4 | lastst_736L | Status of the client's previous credit application. | [train_static_0_0, train_static_0_1] | 
| 5 | paytype1st_925L | Type of first payment of the client. | [train_static_0_0, train_static_0_1] | 
| 6 | paytype_783L | Type of payment. | [train_static_0_0, train_static_0_1] | 
| 7 | twobodfilling_608L | Type of application process. | [train_static_0_0, train_static_0_1] | 
| 8 | typesuite_864L | Persons accompanying the client during the loan application process. | [train_static_0_0, train_static_0_1] | 
Time Sensitivity?
Maybe not to include time-features in as training dataset.
import matplotlib.pyplot as plt
import seaborn as sns
import tools
d=(
    ds.get('train_base')
    .mutate(date_decision=_.date_decision.to_timestamp('%Y-%m-%d').date())
    .mutate(dm=_.WEEK_NUM)
    .group_by([_.dm, _.target])
    .aggregate(n=_.count())
    .to_pandas()
)
# d
import altair as alt
c0=alt.Chart(
    ds.get('train_base')
        .mutate(date_decision=_
                .date_decision
                .to_timestamp('%Y-%m-%d').date()
                .truncate('M')
                )
        .group_by([_.date_decision])
        .aggregate(n=_.count())
        .to_pandas()
    ,
    title=['Loan Approval Volumn',
           'There is a sharp drope during covid'
        ],height=150, width=670
    ).mark_line(color='midnightblue').encode(
        x=alt.X('date_decision',title='Decision Date'),
        y=alt.Y('n',title='Loan Approved')
)
c1=alt.Chart(d,title='Volumn of Approval and Deafult').mark_area().encode(
    x=alt.X('dm',title='Week Number'),
    y=alt.Y('n',title='Number of Case'),
    color='target:N'
)
c2=alt.Chart(d,title='Propotion of Default Overtime').mark_area().encode(
    x=alt.X('dm',title='Week Number'),
    y=alt.Y('n',title='Propotion of Case').stack('normalize'),
    color='target:N'
)
c0 & (c1  | c2)reload(tools)
from tools import FeatureDefinition
pretty_display(
    FeatureDefinition().lookup_dsc('change')
    .query('File=="train_static_0_0"')
)| Variable | File | Description | |
|---|---|---|---|
| 415 | equalitydataagreement_891L | train_static_0_0 | Flag indicating sudden changes in client's social- demographic data (e.g. education, family status, housing type). | 
| 417 | equalityempfrom_62L | train_static_0_0 | Flag indicating a sudden change in the client's length of employment. | 
Explore Categorical Variables
fd=FeatureDefinition()
intersting_cols = [
    'equalitydataagreement_891L',
    'equalityempfrom_62L',
    'bankacctype_710L',
    'credtype_322L','disbursementtype_67L',
    'inittransactioncode_186L','lastst_736L','paytype1st_925L','paytype_783L',
    'twobodfilling_608L','typesuite_864L',
    'lastrejectreasonclient_4145040M'
]
# pretty_display(
#     fd.lookup_col(intersting_cols)
# )
d2=(
    ibis.union(
        ds.get('train_static_0_0'),
        ds.get('train_static_0_1')
    )
    .select(s.contains(intersting_cols + ['case_id']))
    .join(ds.get('train_base'), ['case_id'])
)
C = []
for c in intersting_cols:
    # display(d2.select(c,'target').head(1))
    description = fd.lookup_col(c).Description.str.wrap(30).loc[0]
    # print(description)
    d=(d2
        .group_by([c,'target'])
        .aggregate(n=_.count())
        .to_pandas())
    ct1=alt.Chart(
        d
    ).mark_bar().encode(
        y=alt.Y(c + ':N', title=description.split('\n')),
        x=alt.X('n:Q',title=''),
        color='target:N'
    )
    ct2=alt.Chart(
        d,width=100
    ).mark_bar().encode(
        y=alt.Y(c + ':N', title='', axis=None),
        x=alt.X('n:Q',title='').stack('normalize'),
        color='target:N',
    )
    ct=ct1 | ct2
    C += [ct]
reduce(alt.vconcat, C).configure_axisY(
    titleAngle=0,
    titleAlign="left",
    titleY=10,
    titleX=-200,
    titleColor='#404040',
    titleFontWeight='lighter'
).properties(
    title=[
        "Categorical Variable Appears in Static Dataset",
        "Nothing distinct by propotion except 'Status of the client's previous application'"
        ]
)pretty_display(fd
               .lookup_tbl('static_0_0')
               .query('Variable.str.endswith("M")')
               .reset_index(drop=True))| Variable | File | Description | |
|---|---|---|---|
| 0 | lastapprcommoditycat_1041M | train_static_0_0 | Commodity category of the last loan applications made by the applicant. | 
| 1 | lastapprcommoditytypec_5251766M | train_static_0_0 | Commodity type of the last application. | 
| 2 | lastcancelreason_561M | train_static_0_0 | Cancellation reason of the last application. | 
| 3 | lastrejectcommoditycat_161M | train_static_0_0 | Category of commodity in the applicant's last rejected application. | 
| 4 | lastrejectcommodtypec_5251769M | train_static_0_0 | Commodity type of the last rejected application. | 
| 5 | lastrejectreason_759M | train_static_0_0 | Reason for rejection on the most recent rejected application. | 
| 6 | lastrejectreasonclient_4145040M | train_static_0_0 | Reason for the client's last loan rejection. | 
| 7 | previouscontdistrict_112M | train_static_0_0 | Contact district of the client's previous approved application. | 
d=(
    ds.get('train_static_0_0')
    .join(ds.get('train_base'),'case_id')
    .select(s.endswith('M'), _.target, _.case_id)
    .drop('WEEK_NUM')
    # .pivot_longer(
    #     s.endswith('M')
    # )
    # .group_by([_.name,_.value])
    # .aggregate(n=_.count())
)(d
.select(c, 'target')
.group_by([s.contains(c),s.contains('target')])
.aggregate(n=_.count())
.to_pandas())| previouscontdistrict_112M | target | n | |
|---|---|---|---|
| 0 | P6_35_77 | 0 | 2506 | 
| 1 | P41_138_103 | 0 | 1766 | 
| 2 | P54_133_26 | 0 | 8831 | 
| 3 | P197_47_166 | 0 | 34040 | 
| 4 | P111_135_181 | 0 | 11957 | 
| ... | ... | ... | ... | 
| 393 | P159_160_144 | 1 | 27 | 
| 394 | P217_60_135 | 1 | 16 | 
| 395 | P78_30_175 | 1 | 26 | 
| 396 | P7_110_89 | 1 | 18 | 
| 397 | P31_42_128 | 1 | 7 | 
398 rows × 3 columns
fd=FeatureDefinition()
mcols = [i for i in d.columns if i not in ['target', "case_id"]]
C=[]
for c in mcols:
    data=(d
        .select(c, 'target')
        .group_by([s.contains(c),s.contains('target')])
        .aggregate(n=_.count())
        .to_pandas())
    description=fd.lookup_col(c).Description.str.wrap(30).loc[0]
    cl1=(
        alt
        .Chart(data)
        .mark_bar()
        .encode(
            x='n:Q',
            y=alt.Y(c + ':N',title=description.split('\n')),
            color='target:N'
        )
    )
    cl2=(
        alt
        .Chart(data,width=100)
        .mark_bar(
        )
        .encode(
            x=alt.X('n:Q',title='').stack('normalize'),
            y=alt.Y(c + ':N',title='',axis=None),
            color='target:N'
        )
    )
    C+=[cl1 | cl2]
CH=reduce(alt.vconcat,C)
CH.configure_axisY(
    titleAngle=0,
    titleAlign="left",
    titleY=10,
    titleX=-250,
    titleColor='#404040',
    titleFontWeight='lighter'
).properties(
    title='All Category Variable'
)# alt.data_transformers.enable("vegafusion")
# alt.Chart(d.to_pandas()).transform_fold(
#     [i for i in d.columns if i not in ['target', "case_id"]]
# ).mark_bar().encode(
#     x='count():Q',
#     y='value:N',
#     column='key:N'
# )
d┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┓ ┃ lastapprcommoditycat_1041M ┃ lastapprcommoditytypec_5251766M ┃ lastcancelreason_561M ┃ lastrejectcommoditycat_161M ┃ lastrejectcommodtypec_5251769M ┃ lastrejectreason_759M ┃ lastrejectreasonclient_4145040M ┃ previouscontdistrict_112M ┃ target ┃ case_id ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━┩ │ string │ string │ string │ string │ string │ string │ string │ string │ int64 │ int64 │ ├────────────────────────────┼─────────────────────────────────┼───────────────────────┼─────────────────────────────┼────────────────────────────────┼───────────────────────┼─────────────────────────────────┼───────────────────────────┼────────┼─────────┤ │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ 0 │ 0 │ │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ 0 │ 2 │ │ a55475b1 │ a55475b1 │ P94_109_143 │ a55475b1 │ a55475b1 │ P94_109_143 │ a55475b1 │ a55475b1 │ 0 │ 3 │ │ a55475b1 │ a55475b1 │ P94_109_143 │ a55475b1 │ a55475b1 │ P94_109_143 │ a55475b1 │ a55475b1 │ 0 │ 6 │ │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ 0 │ 7 │ │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ 0 │ 8 │ │ a55475b1 │ a55475b1 │ P73_130_169 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ 0 │ 10 │ │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ 0 │ 11 │ │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ a55475b1 │ 0 │ 12 │ │ a55475b1 │ a55475b1 │ P94_109_143 │ a55475b1 │ a55475b1 │ P94_109_143 │ a55475b1 │ a55475b1 │ 0 │ 13 │ │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ └────────────────────────────┴─────────────────────────────────┴───────────────────────┴─────────────────────────────┴────────────────────────────────┴───────────────────────┴─────────────────────────────────┴───────────────────────────┴────────┴─────────┘