import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.metrics as metrics
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder,RobustScaler
from sklearn.pipeline import make_pipeline

# Load the raw banking dataset from a CSV file in the working directory.
url = 'banking.csv'
data = pd.read_csv(url)

# Preview the first rows.
data.head()

# Summary of columns, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp_var_rate    41188 non-null  float64
 16  cons_price_idx  41188 non-null  float64
 17  cons_conf_idx   41188 non-null  float64
 18  euribor3m       41188 non-null  float64
 19  nr_employed     41188 non-null  float64
 20  y               41188 non-null  int64  
dtypes: float64(5), int64(6), object(10)
memory usage: 6.6+ MB

# Print the frequency of every distinct value for each object-typed column.
for col, status in data.select_dtypes(include='object').items():
    print(status.value_counts())

job
admin.           10422
blue-collar       9254
technician        6743
services          3969
management        2924
retired           1720
entrepreneur      1456
self-employed     1421
housemaid         1060
unemployed        1014
student            875
unknown            330
Name: count, dtype: int64
marital
married     24928
single      11568
divorced     4612
unknown        80
Name: count, dtype: int64
education
university.degree      12168
high.school             9515
basic.9y                6045
professional.course     5243
basic.4y                4176
basic.6y                2292
unknown                 1731
illiterate                18
Name: count, dtype: int64
default
no         32588
unknown     8597
yes            3
Name: count, dtype: int64
housing
yes        21576
no         18622
unknown      990
Name: count, dtype: int64
loan
no         33950
yes         6248
unknown      990
Name: count, dtype: int64
contact
cellular     26144
telephone    15044
Name: count, dtype: int64
month
may    13769
jul     7174
aug     6178
jun     5318
nov     4101
apr     2632
oct      718
sep      570
mar      546
dec      182
Name: count, dtype: int64
day_of_week
thu    8623
mon    8514
wed    8134
tue    8090
fri    7827
Name: count, dtype: int64
poutcome
nonexistent    35563
failure         4252
success         1373
Name: count, dtype: int64

# Work on a copy so the raw `data` frame stays available unchanged.
df = data.copy()

# Pie chart of how many columns there are per dtype.
df.dtypes.value_counts().plot.pie()

<Axes: ylabel='count'>

# Fraction of missing values per column, sorted from lowest to highest.
taux_valeurs_manquantes = df.isna().sum().div(df.shape[0]).sort_values()
taux_valeurs_manquantes

age               0.0
euribor3m         0.0
cons_conf_idx     0.0
cons_price_idx    0.0
emp_var_rate      0.0
poutcome          0.0
previous          0.0
pdays             0.0
campaign          0.0
nr_employed       0.0
duration          0.0
month             0.0
contact           0.0
loan              0.0
housing           0.0
default           0.0
education         0.0
marital           0.0
job               0.0
day_of_week       0.0
y                 0.0
dtype: float64

# b. Drop unrepresentative features
#df.drop(columns = ["month", "previous", "day_of_week", "pdays"],inplace = True)
#df.head()
#Numerical features: ["age", "balance", "duration", "campaign"]
#Categorical Features: ["job", "marital", "education", "default", "housing", "loan", "poutcome", "y"]
def imputation(df):
    """Drop incomplete rows and the columns that are not used for modelling.

    Rows containing at least one missing value are removed first, then the
    columns listed in ``unused_columns`` are dropped.  The frame passed in is
    never modified; a new frame is returned.

    Columns from the list that are not present are skipped, so the function
    can be applied again to a frame that has already been cleaned without
    raising ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        New frame without incomplete rows and without the unused columns.
    """
    unused_columns = [
        "month", "previous", "day_of_week", "pdays",
        "emp_var_rate", "cons_price_idx", "cons_conf_idx",
        "euribor3m", "nr_employed", "contact",
    ]
    # Non in-place calls: nothing is mutated on an intermediate result, which
    # avoids pandas' setting-on-a-copy warnings in the later cleaning steps.
    return df.dropna(axis=0).drop(columns=unused_columns, errors="ignore")

def deleteInconherenteVal(df):
    """Replace the "unknown" placeholder in object columns by the column mode.

    For every object-typed column, each ``"unknown"`` entry is replaced by the
    most frequent value among the *known* entries of that column.  When
    several values are tied, the first one returned by ``Series.mode`` is
    used.  A column that contains no known value is left untouched.

    The frame is modified in place (columns are reassigned) and also returned,
    so the function can be chained.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object columns may contain the string ``"unknown"``.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with the placeholders replaced.
    """
    for col in df.select_dtypes('object'):
        # Exclude the placeholder itself, otherwise a column dominated by
        # "unknown" would be "replaced" by "unknown".
        known_values = df[col][df[col] != "unknown"]
        most_frequent = known_values.mode()
        if most_frequent.empty:
            continue
        # Assign the result back instead of using a chained in-place replace,
        # which pandas deprecates and which does not update the frame under
        # copy-on-write.
        df[col] = df[col].replace("unknown", most_frequent.iloc[0])
    return df

def encodage(df):
    """Replace every object-typed column of ``df`` by label codes.

    For each object column, the distinct values are fed to a fresh
    ``LabelEncoder``; the resulting value -> code table is then mapped over
    the column.  Columns of other dtypes are not touched.

    The frame is modified in place and also returned.
    """
    for name in df.select_dtypes(include='object').columns:
        distinct = df[name].unique()
        codes = LabelEncoder().fit_transform(distinct)
        # Pair each distinct value with the code it received.
        df[name] = df[name].map(dict(zip(distinct, codes)))
    return df

def apurement(df):
    """Run the full cleaning pipeline on ``df`` and return the cleaned frame.

    The steps are applied in this order: ``imputation``, then
    ``deleteInconherenteVal``, then ``encodage``.
    """
    return encodage(deleteInconherenteVal(imputation(df)))

# Clean the working copy with the full pipeline and preview the result.
df_apurer=apurement(df)
df_apurer.head()

C:\Users\lucre\AppData\Local\Temp\ipykernel_22932\4192318323.py:3: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].replace(["unknown"],df[col].mode(),inplace = True)

# Annotated heatmap of the pairwise correlations between the cleaned columns.
plt.figure(figsize=(20,10))
sns.heatmap(df_apurer.corr(),annot = True)

<Axes: >

# Apply the seaborn theme BEFORE creating the figure: rc settings such as
# figure.figsize only affect figures created after they are set.
sns.set(rc={'figure.figsize':(11,8)}, font_scale=1.5, style='whitegrid')
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True) # gridspec_kw= {"height_ratios": (0.3, 1)}

# Central-tendency markers drawn on both panels.
mean=df_apurer['duration'].mean()
median=df_apurer['duration'].median()
mode=df_apurer['duration'].mode().values[0]

# Top panel: one horizontal box per target class.  Both variables are numeric,
# so the orientation is given explicitly; otherwise seaborn treats `duration`
# as the grouping variable.
duration = sns.boxplot(data=df_apurer, x="duration", y="y", orient="h", ax=ax_box,
                       order = df_apurer["y"].value_counts().index)
duration.set(xscale="log")
ax_box.axvline(mean, color='r', linestyle='--')
ax_box.axvline(median, color='g', linestyle='-')
ax_box.axvline(mode, color='b', linestyle='-')

# Bottom panel: distribution of `duration` with a density estimate.
sns.histplot(data=df_apurer, x="duration", ax=ax_hist, kde=True)
ax_hist.axvline(mean, color='r', linestyle='--', label="Mean")
ax_hist.axvline(median, color='g', linestyle='-', label="Median")
ax_hist.axvline(mode, color='b', linestyle='-', label="Mode")
ax_hist.legend()
ax_box.set(xlabel='')
plt.show()

# Central-tendency markers for `age`, drawn on both panels below.
mean=df_apurer['age'].mean()
median=df_apurer['age'].median()
mode=df_apurer['age'].mode().values[0]

# Same layout as the duration figure: box plot on top, histogram below.
# Drawing both on one axes makes them overlap on incompatible y scales.
f_age, (ax_age_box, ax_age_hist) = plt.subplots(2, sharex=True)

# Both variables are numeric, so the orientation is given explicitly to get
# one horizontal box per target class.
sns.boxplot(data=df_apurer, x="age", y="y", orient="h", ax=ax_age_box)
ax_age_box.axvline(mean, color='r', linestyle='--')
ax_age_box.axvline(median, color='g', linestyle='-')
ax_age_box.axvline(mode, color='b', linestyle='-')

sns.histplot(data=df_apurer, x="age", kde=True, ax=ax_age_hist)
ax_age_hist.axvline(mean, color='r', linestyle='--', label="Mean")
ax_age_hist.axvline(median, color='g', linestyle='-', label="Median")
ax_age_hist.axvline(mode, color='b', linestyle='-', label="Mode")
ax_age_hist.legend()
ax_age_box.set(xlabel='')

plt.show()

# Count of rows per (encoded) job, split by target class; bars are ordered by
# decreasing job frequency.
sns.countplot(
    data=df_apurer,
    x="job",
    hue="y",
    order=df_apurer["job"].value_counts().index,
)
plt.title("Analyse de la relation entre 'job' et 'y'")
plt.show()

# Same view for the `default` column.
default = sns.countplot(
    data=df_apurer,
    x="default",
    hue="y",
    order=df_apurer["default"].value_counts().index,
)
plt.title("Analyse bivariée entre 'default' et 'y'")
plt.show()

# Hold out 20% of the cleaned rows for testing; the seed is fixed so the split
# is reproducible.
trainset, testset = train_test_split(df_apurer, test_size=0.2, random_state=0)

# Class balance of the target in the training split.
trainset['y'].value_counts()

y
0    29223
1     3727
Name: count, dtype: int64

# Class balance of the target in the test split.
testset['y'].value_counts()

y
0    7325
1     913
Name: count, dtype: int64

def preprocessing(df):
    """Clean ``df`` and split it into features and target.

    The "unknown" placeholders are replaced *before* label encoding, in the
    same order as in ``apurement``.  Encoding first would turn "unknown" into
    an ordinary category code and leave the replacement step with no object
    column to act on.  On a frame that is already cleaned and encoded, both
    steps leave the data unchanged.

    The class counts of the target are printed as a quick sanity check.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and the target column ``'y'``.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` without the ``'y'`` column and ``y``
        is the ``'y'`` column.
    """
    df = deleteInconherenteVal(df)
    df = encodage(df)

    X = df.drop('y', axis=1)
    y = df['y']

    print(y.value_counts())

    return X, y

# Build the training data from the training split only.  Using the full
# cleaned frame would include the test rows in the fit and inflate the
# evaluation scores.
X_train, y_train = preprocessing(trainset)

y
0    36548
1     4640
Name: count, dtype: int64

# Build the evaluation features/target from the held-out split (the call also
# prints the class counts of the target).
X_test, y_test = preprocessing(testset)

y
0    7325
1     913
Name: count, dtype: int64

# Baseline classifier: logistic regression with the liblinear solver.
model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)
# Score of the fitted model on the held-out data.
model.score(X_test, y_test)

0.8983976693372178

def evaluation(model):
    """Fit ``model`` and print its confusion matrix and classification report.

    Uses the module-level ``X_train``/``y_train`` for fitting and
    ``X_test``/``y_test`` for the predictions and the reports.  Nothing is
    returned; the two reports are printed.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Confusion matrix first, then the per-class precision/recall/F1 report.
    for metric in (confusion_matrix, classification_report):
        print(metric(y_test, predictions))

# Refit the logistic regression and print its confusion matrix and
# classification report on the test split.
evaluation(model)

[[7217  108]
 [ 729  184]]
              precision    recall  f1-score   support

           0       0.91      0.99      0.95      7325
           1       0.63      0.20      0.31       913

    accuracy                           0.90      8238
   macro avg       0.77      0.59      0.63      8238
weighted avg       0.88      0.90      0.87      8238

SCORING BANCAIRE (Prédiction des bons et mauvais clients pour des emprunts)

Cette vérification se fait grâce à l'analyse de plusieurs paramètres tels que les revenus, les biens, les dépenses actuelles du client, etc. Cette analyse est encore effectuée manuellement par plusieurs banques ; elle est donc très consommatrice en temps et en ressources financières.

Grâce au Machine Learning, il est possible d'automatiser cette tâche et de prédire avec plus de précision les clients qui seront en défaut de paiement.

Technologies utilisées : Python, Machine Learning, Traitement de Données.

Dans ce projet, j’ai :

CODING TIME ...

1- CHARGEMENT DES PACKAGES NECESSAIRES

2- NETTOYAGE DE LA BASE DE DONNEES

ANALYSE DE LA FORME DES DONNEES

IMPUTATION

SUPPRESSION DES INCOHERENCES

ENCODAGE

DEFINITION DE LA FONCTION DE NETTOYAGE DE LA BASE DE DONNEES

EXECUTION DE LA FONCTION DE NETTOYAGE DE LA BASE DE DONNEES

MATRICE DE CORRELATION DES VARIABLES

3- Analyses Descriptives (Duration, âge, job, default)

Duration Feature

Age Feature

Job Feature

Default Feature

4- Modélisation

5- EVALUATION DU MODELE

	age	job	marital	education	default	housing	loan	contact	month	day_of_week	...	campaign	pdays	previous	poutcome	emp_var_rate	cons_price_idx	cons_conf_idx	euribor3m	nr_employed	y
0	44	blue-collar	married	basic.4y	unknown	yes	no	cellular	aug	thu	...	1	999	0	nonexistent	1.4	93.444	-36.1	4.963	5228.1	0
1	53	technician	married	unknown	no	no	no	cellular	nov	fri	...	1	999	0	nonexistent	-0.1	93.200	-42.0	4.021	5195.8	0
2	28	management	single	university.degree	no	yes	no	cellular	jun	thu	...	3	6	2	success	-1.7	94.055	-39.8	0.729	4991.6	1
3	39	services	married	high.school	no	no	no	cellular	apr	fri	...	2	999	0	nonexistent	-1.8	93.075	-47.1	1.405	5099.1	0
4	55	retired	married	basic.4y	no	yes	no	cellular	aug	fri	...	1	3	1	success	-2.9	92.201	-31.4	0.869	5076.2	1

	age	job	marital	education	housing	duration	campaign	poutcome	y
0	44	1	1	0	1	210	1	1	0
1	53	9	1	6	0	138	1	1	0
2	28	4	2	6	1	339	3	2	1
3	39	7	1	3	0	185	2	1	0
4	55	5	1	0	1	137	1	2	1

	age	job	marital	education	housing	duration	campaign	poutcome	y
0	44	1	1	0	1	210	1	1	0
1	53	9	1	6	0	138	1	1	0
2	28	4	2	6	1	339	3	2	1
3	39	7	1	3	0	185	2	1	0
4	55	5	1	0	1	137	1	2	1

	age	job	marital	education	housing	duration	campaign	poutcome	y
0	44	1	1	0	1	210	1	1	0
1	53	9	1	6	0	138	1	1	0
2	28	4	2	6	1	339	3	2	1
3	39	7	1	3	0	185	2	1	0
4	55	5	1	0	1	137	1	2	1