Feature Engineering — Outlier Trimming and Capping

from feature_engine.outlier_removers import Winsorizer
import scipy.stats as stats

Find Quantile Upper and Lower Boundaries

def find_quantile_boundaries(df, col, distance):
    """Return the IQR-based outlier limits for one column.

    The limits follow the inter-quartile proximity rule:
    ``Q3 + distance * IQR`` for the upper limit and
    ``Q1 - distance * IQR`` for the lower limit.

    Args:
        df: Table-like object indexed by column label whose columns expose
            ``quantile`` (e.g. a pandas DataFrame).
        col: Label of the column to inspect.
        distance: Multiplier applied to the IQR (1.5 is the classic choice).

    Returns:
        A ``(upper_boundary, lower_boundary)`` tuple -- note that the upper
        limit comes first.
    """
    # Compute each quartile once instead of re-evaluating it per expression.
    first_quartile = df[col].quantile(0.25)
    third_quartile = df[col].quantile(0.75)
    IQR = third_quartile - first_quartile

    lower_boundary = first_quartile - (IQR * distance)
    upper_boundary = third_quartile + (IQR * distance)

    print("UPPER AND LOWER BOUNDRY FOR : ", col, upper_boundary, lower_boundary)
    return upper_boundary, lower_boundary

Find Mean Upper and Lower Boundaries

def find_mean_boundaries(data, col, distance):
    """Return the mean/standard-deviation outlier limits for one column.

    The limits are ``mean + distance * std`` (upper) and
    ``mean - distance * std`` (lower).

    Args:
        data: Table-like object indexed by column label whose columns expose
            ``mean`` and ``std`` (e.g. a pandas DataFrame).
        col: Label of the column to inspect.
        distance: Number of standard deviations away from the mean.

    Returns:
        A ``(upper_boundary, lower_boundary)`` tuple -- note that the upper
        limit comes first.
    """
    # Evaluate the statistics once and reuse them for both limits.
    col_mean = data[col].mean()
    col_std = data[col].std()

    upper_boundary = col_mean + distance * col_std
    lower_boundary = col_mean - distance * col_std

    print("UPPER AND LOWER BOUNDRY FOR : ", col, upper_boundary, lower_boundary)
    return upper_boundary, lower_boundary
def trim_outliers(data, cols):
    """Drop every row that is an IQR outlier in at least one of ``cols``.

    For each column the limits come from
    ``find_quantile_boundaries(data, col, 1.5)``; a row is removed when its
    value lies strictly above the upper limit or strictly below the lower
    limit in any of the listed columns.

    Args:
        data: DataFrame-like object supporting ``data[col]``, ``.loc`` and
            ``.shape``.
        cols: Iterable of column labels to check. When empty, nothing is
            trimmed and a copy of ``data`` is returned.

    Returns:
        The rows of ``data`` that are not outliers in any checked column.
    """
    # Accumulate one boolean mask across ALL columns. The mask must live
    # outside the loop: re-initialising it per iteration would keep only the
    # last column's outliers.
    outlier_mask = None
    for col in cols:
        col_upper_limit, col_lower_limit = find_quantile_boundaries(data, col, 1.5)
        col_outliers = (data[col] > col_upper_limit) | (data[col] < col_lower_limit)
        if outlier_mask is None:
            outlier_mask = col_outliers
        else:
            outlier_mask = outlier_mask | col_outliers

    if outlier_mask is None:
        # No columns requested: there is nothing to trim.
        data_trimmed = data.copy()
    else:
        data_trimmed = data.loc[~outlier_mask]

    print("shape data", data.shape)
    print("shape trimmed", data_trimmed.shape)
    print("*********")
    return data_trimmed

Feature-engine Winsorizer — Gaussian, Skewed (IQR) and Quantile Outlier Capping

def FE_Outlier_Capper(data, cols):
    """Cap both tails of ``cols`` with a 'skewed' Winsorizer (fold 1.5).

    Args:
        data: Data set passed to the Winsorizer's ``fit`` and ``transform``.
        cols: Variables handed to the Winsorizer via ``variables``.

    Returns:
        The transformed (capped) data returned by ``Winsorizer.transform``.
    """
    # NOTE(review): 'skewed' with fold=1.5 is presumably the IQR proximity
    # rule in the feature_engine release this file targets -- confirm against
    # the installed version's Winsorizer documentation.
    windsoriser = Winsorizer(
        distribution='skewed',
        tail='both',
        fold=1.5,
        variables=cols,
    )

    windsoriser.fit(data)
    data_capped = windsoriser.transform(data)

    # Report the learned caps for each tail.
    print("TAIL CAPS LEFT", windsoriser.left_tail_caps_)
    print("TAIL CAPS RIGHT", windsoriser.right_tail_caps_)

    return data_capped
*******
def FE_Outlier_Gaussian_Capper(data, cols):
    """Cap both tails of ``cols`` with a 'gaussian' Winsorizer (fold 3).

    Args:
        data: Data set passed to the Winsorizer's ``fit`` and ``transform``.
        cols: Variables handed to the Winsorizer via ``variables``.

    Returns:
        The transformed (capped) data returned by ``Winsorizer.transform``.
    """
    # NOTE(review): 'gaussian' with fold=3 presumably caps at
    # mean +/- 3 standard deviations -- confirm against the installed
    # feature_engine version's Winsorizer documentation.
    windsoriser = Winsorizer(
        distribution='gaussian',
        tail='both',
        fold=3,
        variables=cols,
    )

    windsoriser.fit(data)
    data_capped = windsoriser.transform(data)

    # Report the learned caps for each tail.
    print("TAIL CAPS LEFT", windsoriser.left_tail_caps_)
    print("TAIL CAPS RIGHT", windsoriser.right_tail_caps_)

    return data_capped
*********
def FE_Outlier_Quantile_Capper(data, cols):
    """Cap both tails of ``cols`` with a 'quantiles' Winsorizer (fold 0.05).

    Args:
        data: Data set passed to the Winsorizer's ``fit`` and ``transform``.
        cols: Variables handed to the Winsorizer via ``variables``.

    Returns:
        The transformed (capped) data returned by ``Winsorizer.transform``.
    """
    # NOTE(review): 'quantiles' with fold=0.05 presumably caps at the 5th and
    # 95th percentiles -- confirm against the installed feature_engine
    # version's Winsorizer documentation.
    windsoriser = Winsorizer(
        distribution='quantiles',
        tail='both',
        fold=0.05,
        variables=cols,
    )

    windsoriser.fit(data)
    data_capped = windsoriser.transform(data)

    # Report the learned caps for each tail.
    print("TAIL CAPS LEFT", windsoriser.left_tail_caps_)
    print("TAIL CAPS RIGHT", windsoriser.right_tail_caps_)

    return data_capped

Bonus: Rare Category Encoding

def rare_labels(data, cols, target):
    """Fit a rare-label encoder on the training split and apply it to both splits.

    The data is split with ``split_data(data, target)``, missing values are
    replaced by the string ``'Missing'``, and a
    ``RareLabelCategoricalEncoder(tol=0.05, n_categories=4)`` is fitted on the
    training features and used to transform the train and test features.

    Args:
        data: Full data set handed to ``split_data``.
        cols: Variables the encoder should process.
        target: Target identifier forwarded to ``split_data``.

    Returns:
        A ``(X_train, X_test)`` tuple holding the transformed feature sets.
    """
    # split_data is defined outside this section; it is expected to return
    # (X_train, X_test, y_train, y_test) in that order.
    X_train, X_test, y_train, y_test = split_data(data, target)

    rare_encoder = RareLabelCategoricalEncoder(
        tol=0.05,
        n_categories=4,
        variables=cols,
    )

    # NOTE(review): fillna('Missing') is applied to every column of the split,
    # not only to `cols` -- confirm this is intended for numeric columns.
    X_train_filled = X_train.fillna('Missing')
    rare_encoder.fit(X_train_filled)

    print("RARE ENCODER")
    print(rare_encoder.variables)
    print(rare_encoder.encoder_dict_)
    print("ORIGINAL SHAPE", X_train.shape, X_test.shape)

    X_train = rare_encoder.transform(X_train_filled)
    X_test = rare_encoder.transform(X_test.fillna('Missing'))
    print("TRANSFORMED SHAPE", X_train.shape, X_test.shape)

    # Show the category frequencies after encoding for each processed column.
    for col in cols:
        print(X_train[col].value_counts())
    return X_train, X_test

ML2021_DSB