8) Feature engineering

class Xfrmer_rateofinterest(BaseEstimator, TransformerMixin):
    """
    Derive a rate-of-interest feature from CNT_PAYMENT and AMT_ANNUITY.

    With n = CNT_PAYMENT and a = AMT_ANNUITY the value computed per row is

        6 * (a - n) / (n**2 + (5 + 2*a) * n - 2*a)

    The original author hoped this would turn out to be one of the
    "golden features".
    """

    def __init__(self):
        # Placeholder attribute; fit/transform never read it.
        self.rteofintrst = None

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """
        Return the feature as a 2-D array with one column.

        Infinite ratios are replaced by -9 and NaN entries by 0.
        """
        term = X["CNT_PAYMENT"]
        payment = X["AMT_ANNUITY"]

        top = 6 * (payment - term)
        bottom = np.square(term) + (5 + 2 * payment) * term - 2 * payment

        ratio = top / bottom
        ratio = ratio.replace((np.inf, -np.inf), (-9, -9))

        X = ratio.values.reshape(-1, 1)
        X[np.isnan(X)] = 0
        print('rteint shape', X.shape)
        return X


class Xfrmer_rteofintrst(BaseEstimator, TransformerMixin):
    """
    Derive a rate-of-interest feature from CNT_PAYMENT, AMT_ANNUITY and
    AMT_CREDIT.

    Formula taken from other competitors:
    https://www.kaggle.com/c/home-credit-default-risk/discussion/64598

        interest = CNT_PAYMENT * AMT_ANNUITY - AMT_CREDIT
        rate     = 24 * interest / (AMT_CREDIT * (CNT_PAYMENT + 1))
    """

    def __init__(self):
        # Placeholder attribute; fit/transform never read it.
        self.rteofintrst = None

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """
        Return the feature as a 2-D array with one column.

        Infinite ratios are replaced by -9 and NaN entries by 0.
        """
        term = X["CNT_PAYMENT"]
        payment = X["AMT_ANNUITY"]
        credit = X["AMT_CREDIT"]

        interest = (term * payment) - credit
        top = 24 * interest
        bottom = credit * (term + 1)

        ratio = top / bottom
        ratio = ratio.replace((np.inf, -np.inf), (-9, -9))

        X = ratio.values.reshape(-1, 1)
        X[np.isnan(X)] = 0
        return X
```
def display_mising_value(self):
    """
    Compute per-column missing-value percentages and skewness, then bucket
    the columns by how much data they are missing.

    Reads ``self.df_source`` and stores the results on the instance:

    * ``df_mis_val``     - one row per source column with "PCT" (percentage
      of null rows) and "SKEWNES" (skew rounded to 2 decimals; NaN for
      columns that are not numeric).
    * ``df_misval_lvl1`` - columns with PCT < 5
    * ``df_misval_lvl2`` - columns with 5 <= PCT < 20
    * ``df_misval_lvl3`` - columns with 20 <= PCT < 75
    * ``df_misval_lvl4`` - columns with PCT >= 75

    Every level frame is sorted by ascending "PCT". Nothing is printed or
    returned.
    """
    src = self.df_source

    dct_tmp = {}
    # Divide by the number of rows. ``shape`` itself is a (rows, cols) tuple,
    # so dividing by it does not yield a per-column percentage.
    dct_tmp["PCT"] = src.isnull().sum() * 100 / src.shape[0]
    # numeric_only=True: skew is undefined for object/string columns and
    # current pandas raises instead of silently skipping them.
    dct_tmp["SKEWNES"] = src.skew(numeric_only=True).round(2)
    self.df_mis_val = pd.DataFrame(dct_tmp, columns=["PCT", "SKEWNES"])

    pct = self.df_mis_val["PCT"]
    self.df_misval_lvl1 = self.df_mis_val.loc[pct < 5].sort_values(by="PCT", axis=0)
    self.df_misval_lvl2 = self.df_mis_val.loc[(pct >= 5) & (pct < 20)].sort_values(by="PCT", axis=0)
    self.df_misval_lvl3 = self.df_mis_val.loc[(pct >= 20) & (pct < 75)].sort_values(by="PCT", axis=0)
    self.df_misval_lvl4 = self.df_mis_val.loc[pct >= 75].sort_values(by="PCT", axis=0)
# Impute OWN_CAR_AGE: rows flagged as car owners with a missing age get the
# median age over all car owners; rows flagged 'N' with a missing age get 0.
mdn = X.loc[(X['FLAG_OWN_CAR'] == 'Y'), 'OWN_CAR_AGE'].median()
X.loc[(X['FLAG_OWN_CAR'] == 'Y') & (X['OWN_CAR_AGE'].isna()), 'OWN_CAR_AGE'] = mdn
X.loc[(X['FLAG_OWN_CAR'] == 'N') & (X['OWN_CAR_AGE'].isna()), 'OWN_CAR_AGE'] = 0

# name_income_type='Commercial associate' and occupation_type:
# fill missing OCCUPATION_TYPE with the most frequent occupation in that group.
# value_counts() sorts by descending frequency, so index[0] is the mode.
cat_mode = (
    X.loc[(X['NAME_INCOME_TYPE'] == 'Commercial associate'), 'OCCUPATION_TYPE']
    .value_counts()
    .index[0]
)
# NOTE(review): the pasted code filtered on NAME_INCOME_TYPE == 'Civil marriage'
# here, which is not the group the mode was computed from; aligned with the
# 'Commercial associate' group named in the comment above - confirm intent.
X.loc[
    (X['NAME_INCOME_TYPE'] == 'Commercial associate') & (X['OCCUPATION_TYPE'].isna()),
    'OCCUPATION_TYPE',
] = cat_mode

9) Model Explanation

class Xfrmer_replacenp(BaseEstimator, TransformerMixin):
    """
    Globally overwrite placeholder values in the input with 0.

    The values replaced are 365243.0 (described by the original author as
    specific to this case study) and the strings "XAP", "XNA" and "nan".
    The input object is modified in place and also returned.

    NOTE(review): the earlier description of this class also promised
    replacement of +/-inf and NaN, but no statement here does that.
    """

    def __init__(self):
        # Placeholder attribute; fit/transform never read it.
        self._features = None

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Set every cell equal to one of the placeholder values to 0."""
        for placeholder in (365243.0, "XAP", "XNA", "nan"):
            X[X == placeholder] = 0
        print('all replace', X.shape)
        return X
# Fragment: `tmpval` is defined outside this excerpt.
# When its dtype is 'object' it is wrapped in a DataFrame; missing values are
# filled with 0 in place and the result is bound to X.
# NOTE(review): indentation was lost in this excerpt, so it is unclear whether
# the fillna and the `X = tmpval` assignment belong to the `if` body or run
# unconditionally - confirm against the original notebook.
if tmpval.dtype == 'object':
tmpval= pd.DataFrame(tmpval)
tmpval.fillna(0,inplace=True)
X = tmpval
# Build readable column names for the transformed feature matrices and attach
# the transformed columns to the original train/test frames.
ft_union = crdcrd_ft_gen_piplin['engineer_data']

# transformer_list is a list of (name, transformer) tuples.
tpl_xfrmr_lst = ft_union.transformer_list

# The loop below treats every entry except the last as a numerical feature,
# so the last entry is taken to be the categorical pipeline; unpack the
# transformer object from its (name, transformer) tuple.
catpipe = tpl_xfrmr_lst[-1][1]
ccohe = catpipe.named_steps['CC_OHE']
#print(type(ccohe))
print(len(ccohe.categories_))
# categories_ holds one array of category labels per encoded input column.
lst_ohe_ft_name = ccohe.categories_

print('Generating column names for numerical features...')
# Use the transformer's name (first tuple element) as the column name.
# assumes each non-final transformer contributes exactly one column - TODO confirm
col_lvl_ft_names = [xfrmr_name for xfrmr_name, _ in tpl_xfrmr_lst[:-1]]

# add OHE categories as column names
print('Generating column names for categorical features...')
for ft_categories in lst_ohe_ft_name:
    for itm in ft_categories:
        col_lvl_ft_names.append("CC_NMECTRCTSTAT_" + str(itm).upper())

# recreate the dataframes with imputed column values
df_trn_colvl = pd.DataFrame(X_trn_xfrm, columns=col_lvl_ft_names)
df_tst_colvl = pd.DataFrame(x_tst_xfrm, columns=col_lvl_ft_names)

# concatenate the two dataframes to the original dataframes
# NOTE(review): concat(axis=1) aligns on the index; the new frames carry a
# default RangeIndex, so rows only line up if X_train / x_test do as well -
# verify against the caller.
dtprcs.X_train = pd.concat([dtprcs.X_train, df_trn_colvl], axis=1)
dtprcs.x_test = pd.concat([dtprcs.x_test, df_tst_colvl], axis=1)