What are the benefits of predicting water quality?
Information about the features or factors contributing to the quality of water, basically answering the question of whether the water is potable or not.
From the data prediction, the knowledge derived from the results can help reduce the risk of spreading harmful bacteria and diseases.
Visualising the features and prediction results will give more insight into the data.
The features for predicting water quality potability and their descriptions are listed below:
Hardness: capacity of water to precipitate soap, in mg/L
Solids: total dissolved solids, in ppm
Chloramines: amount of chloramines, in ppm
Sulfate: amount of sulfates dissolved, in mg/L
#Exploratory Data Analysis and Plotting Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#Plots to appear inside the notebook
%matplotlib inline
#Models from scikit-learn that will be used for this project
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
#Model Evaluations libraries
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import roc_auc_score
df=pd.read_csv("water_potability.csv")
#The size of the dataset (rows and columns)
df.shape
(3276, 10)
#The top of the data
df.head()
ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability | |
---|---|---|---|---|---|---|---|---|---|---|
0 | NaN | 204.890455 | 20791.318981 | 7.300212 | 368.516441 | 564.308654 | 10.379783 | 86.990970 | 2.963135 | 0 |
1 | 3.716080 | 129.422921 | 18630.057858 | 6.635246 | NaN | 592.885359 | 15.180013 | 56.329076 | 4.500656 | 0 |
2 | 8.099124 | 224.236259 | 19909.541732 | 9.275884 | NaN | 418.606213 | 16.868637 | 66.420093 | 3.055934 | 0 |
3 | 8.316766 | 214.373394 | 22018.417441 | 8.059332 | 356.886136 | 363.266516 | 18.436524 | 100.341674 | 4.628771 | 0 |
4 | 9.092223 | 181.101509 | 17978.986339 | 6.546600 | 310.135738 | 398.410813 | 11.558279 | 31.997993 | 4.075075 | 0 |
From the data displayed, we can see that there are missing values in some rows.
To create a good model and achieve better prediction accuracy, these missing values have to be filled.
#Let's find out the total number in each class (0 and 1)
df["Potability"].value_counts()
Potability
0    1998
1    1278
Name: count, dtype: int64
#Visualising the total number in each class (0 and 1) for the Potability column in the dataset
df["Potability"].value_counts().plot(kind="bar", color=["orange", "blue"])
plt.title("Water Quality Potability", size=20, weight='bold')
plt.annotate(text="Not safe for Human consumption", xytext=(0.5,1750),xy=(0.2,1250), arrowprops =dict(arrowstyle="->", color='orange', connectionstyle="angle3,angleA=0,angleB=90"), color='black')
plt.annotate(text="Safe for Human consumption", xytext=(0.8,1500),xy=(1.2,1000), arrowprops =dict(arrowstyle="->", color='blue', connectionstyle="angle3,angleA=0,angleB=90"), color='black')
plt.ylabel("Numbers");
#How many missing values are there in the dataset?
df.isna().sum()
ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64
From the result above, the columns with missing data are ph (491 missing values), Sulfate (781 missing values), and Trihalomethanes (162 missing values).
I will be using the mean of each feature that contains missing values, calculated per Potability class (the group mean), to populate the missing entries.
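Before filling them, a quick sketch of the class-wise means that the group-based imputation below will use:
#Class-wise means of the features with missing values (these are the values the fill below will use)
df.groupby("Potability")[["ph", "Sulfate", "Trihalomethanes"]].mean()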
#Replace NaN values based on the group/sample mean
df['ph']=df['ph'].fillna(df.groupby(['Potability'])['ph'].transform('mean'))
df['Sulfate']=df['Sulfate'].fillna(df.groupby(['Potability'])['Sulfate'].transform('mean'))
df['Trihalomethanes']=df['Trihalomethanes'].fillna(df.groupby(['Potability'])['Trihalomethanes'].transform('mean'))
#Check if there are still missing values
df.isna().sum()
ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64
All missing values have been populated.
#More information about the dataset (mean, std, min,max, Q1,Q2,Q3)
df.describe()
ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability | |
---|---|---|---|---|---|---|---|---|---|---|
count | 3276.000000 | 3276.000000 | 3276.000000 | 3276.000000 | 3276.000000 | 3276.000000 | 3276.000000 | 3276.000000 | 3276.000000 | 3276.000000 |
mean | 7.080855 | 196.369496 | 22014.092526 | 7.122277 | 333.785123 | 426.205111 | 14.284970 | 66.395671 | 3.966786 | 0.390110 |
std | 1.469958 | 32.879761 | 8768.570828 | 1.583085 | 36.145701 | 80.824064 | 3.308162 | 15.769901 | 0.780382 | 0.487849 |
min | 0.000000 | 47.432000 | 320.942611 | 0.352000 | 129.000000 | 181.483754 | 2.200000 | 0.738000 | 1.450000 | 0.000000 |
25% | 6.277673 | 176.850538 | 15666.690297 | 6.127421 | 317.094638 | 365.734414 | 12.065801 | 56.647656 | 3.439711 | 0.000000 |
50% | 7.085378 | 196.967627 | 20927.833607 | 7.130299 | 334.564290 | 421.884968 | 14.218338 | 66.303555 | 3.955028 | 0.000000 |
75% | 7.870050 | 216.667456 | 27332.762127 | 8.114887 | 350.385756 | 481.792304 | 16.557652 | 76.666609 | 4.500320 | 1.000000 |
max | 14.000000 | 323.124000 | 61227.196008 | 13.127000 | 481.030642 | 753.342620 | 28.300000 | 124.000000 | 6.739000 | 1.000000 |
#Information about the dataframe
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   ph               3276 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          3276 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3276 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64
dtypes: float64(9), int64(1)
memory usage: 256.1 KB
FINDING THE PATTERNS IN THE DATASET
How the independent variables relate to the dependent variable
#Comparing one feature to the Potability feature.
pd.crosstab(df.Potability, df.Sulfate)
Sulfate | 129.000000 | 180.206746 | 182.397370 | 187.170714 | 187.424131 | 192.033592 | 203.444521 | 205.935091 | 206.247229 | 207.890482 | ... | 447.417962 | 449.267688 | 450.914454 | 455.451234 | 458.441072 | 460.107069 | 462.474215 | 475.737460 | 476.539717 | 481.030642 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Potability | |||||||||||||||||||||
0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | ... | 1 | 1 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 |
1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 1 |
2 rows × 2497 columns
#Comparing two features to the potability feature.
pd.crosstab(df.Hardness [df.Potability==0], df.Sulfate [df.Potability==0])
Sulfate | 203.444521 | 205.935091 | 207.890482 | 214.460834 | 225.516628 | 232.548814 | 234.609808 | 234.852699 | 235.995461 | 237.517456 | ... | 433.952212 | 437.592300 | 439.787938 | 442.761428 | 445.359547 | 447.417962 | 449.267688 | 455.451234 | 458.441072 | 460.107069 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Hardness | |||||||||||||||||||||
98.452931 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
100.457615 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
103.173587 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
103.464759 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
104.752425 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
284.098352 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
286.567991 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
298.098679 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
300.292476 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
304.235912 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1998 rows × 1511 columns
df.Hardness
0 204.890455 1 129.422921 2 224.236259 3 214.373394 4 181.101509 ... 3271 193.681735 3272 193.553212 3273 175.762646 3274 230.603758 3275 195.102299 Name: Hardness, Length: 3276, dtype: float64
#Creating a scatter plot with the results above where 1 = Potable, 0 = Not Potable
plt.scatter(df.Hardness[df.Potability==1], df.Sulfate[df.Potability==1], color="salmon")
plt.scatter(df.Hardness[df.Potability==0], df.Sulfate[df.Potability==0], color="lightblue")
plt.title("Relationship between Hardness, Sulfate and Potability Features", size=20, weight='bold')
plt.legend(["Potable", "Not Potable"])
plt.xlabel("Hardness (mg/L)")
plt.ylabel("Sulfate (mg/L)");
#Check the distribution of another feature (Chloramines) with a histogram
df.Chloramines.plot.hist()
plt.xlabel("Amount (ppm)")
plt.ylabel("Frequency")
plt.title("Chloramines Feature", size=20, weight='bold')
Text(0.5, 1.0, 'Chloramines Feature')
After finding patterns in the dataset, an understanding of how all the features relate to each other is derived through a correlation matrix.
#Correlation matrix of all the features
df.corr()
#Making correlation matrix more visual for better understanding
fig=plt.figure(figsize=(15,10))
sns.heatmap(df.corr(), annot=True, fmt='0.2f', square=True)
plt.title("Correlation Matrixs", size=20, weight='bold')
Text(0.5, 1.0, 'Correlation Matrixs')
The values of 1.0 along the diagonal represent perfect correlation, because each feature there is being compared with itself. The matrix also shows how each independent variable relates to the dependent variable (Potability).
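To read the relationship with the target directly, the Potability column of the correlation matrix can be pulled out and sorted; a minimal sketch:
#Correlation of each independent variable with the Potability target, sorted
df.corr()["Potability"].drop("Potability").sort_values()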
Preparing the Data for Machine Learning Modelling
Going back to the problem statement: can a machine learning model be created to predict water potability with at least 95% accuracy?
#View the top rows of the data
df.head()
ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 7.085378 | 204.890455 | 20791.318981 | 7.300212 | 368.516441 | 564.308654 | 10.379783 | 86.990970 | 2.963135 | 0 |
1 | 3.716080 | 129.422921 | 18630.057858 | 6.635246 | 334.564290 | 592.885359 | 15.180013 | 56.329076 | 4.500656 | 0 |
2 | 8.099124 | 224.236259 | 19909.541732 | 9.275884 | 334.564290 | 418.606213 | 16.868637 | 66.420093 | 3.055934 | 0 |
3 | 8.316766 | 214.373394 | 22018.417441 | 8.059332 | 356.886136 | 363.266516 | 18.436524 | 100.341674 | 4.628771 | 0 |
4 | 9.092223 | 181.101509 | 17978.986339 | 6.546600 | 310.135738 | 398.410813 | 11.558279 | 31.997993 | 4.075075 | 0 |
#Split data into X and Y
X= df.drop("Potability", axis=1)
Y= df["Potability"]
#Let's see what the X rows and columns look like
X
ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | |
---|---|---|---|---|---|---|---|---|---|
0 | 7.085378 | 204.890455 | 20791.318981 | 7.300212 | 368.516441 | 564.308654 | 10.379783 | 86.990970 | 2.963135 |
1 | 3.716080 | 129.422921 | 18630.057858 | 6.635246 | 334.564290 | 592.885359 | 15.180013 | 56.329076 | 4.500656 |
2 | 8.099124 | 224.236259 | 19909.541732 | 9.275884 | 334.564290 | 418.606213 | 16.868637 | 66.420093 | 3.055934 |
3 | 8.316766 | 214.373394 | 22018.417441 | 8.059332 | 356.886136 | 363.266516 | 18.436524 | 100.341674 | 4.628771 |
4 | 9.092223 | 181.101509 | 17978.986339 | 6.546600 | 310.135738 | 398.410813 | 11.558279 | 31.997993 | 4.075075 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3271 | 4.668102 | 193.681735 | 47580.991603 | 7.166639 | 359.948574 | 526.424171 | 13.894419 | 66.687695 | 4.435821 |
3272 | 7.808856 | 193.553212 | 17329.802160 | 8.061362 | 332.566990 | 392.449580 | 19.903225 | 66.539684 | 2.798243 |
3273 | 9.419510 | 175.762646 | 33155.578218 | 7.350233 | 332.566990 | 432.044783 | 11.039070 | 69.845400 | 3.298875 |
3274 | 5.126763 | 230.603758 | 11983.869376 | 6.303357 | 332.566990 | 402.883113 | 11.168946 | 77.488213 | 4.708658 |
3275 | 7.874671 | 195.102299 | 17404.177061 | 7.509306 | 332.566990 | 327.459760 | 16.140368 | 78.698446 | 2.309149 |
3276 rows × 9 columns
From the dataframe above, the Potability column is no longer visible, and we have 3,276 rows and 9 columns.
#Let's see what the Y column looks like
Y
0 0 1 0 2 0 3 0 4 0 .. 3271 1 3272 1 3273 1 3274 1 3275 1 Name: Potability, Length: 3276, dtype: int64
From the output above, Y is a single column (Potability) with a total length of 3,276.
To create a good model, the dataset has to be split into training and test sets, simply because we want to be able to use the test data to see how well our model performs.
#Split data into train and test sets where 80% of the data is for training and 20% is for testing
np.random.seed(42)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20 )
#Get the length of both X train and Y train
X_train, len(Y_train)
( ph Hardness Solids Chloramines Sulfate \ 233 6.623614 203.030141 17167.301297 6.049601 311.726288 831 6.684700 193.840931 34157.184474 9.876574 344.535407 2658 6.836060 205.667718 18321.327502 6.712854 297.837188 2495 7.085378 183.488839 12675.938962 9.777807 319.870584 2603 6.406798 182.885137 17851.064021 7.462758 332.486731 ... ... ... ... ... ... 1095 4.187491 208.374188 21809.709834 5.846112 327.474203 1130 7.793915 164.958947 25506.912237 7.868036 358.259200 1294 6.630364 186.761088 30939.023214 7.703481 334.564290 860 8.783168 218.032840 16183.586649 7.390474 334.053885 3174 6.698154 198.286268 34675.862845 6.263602 360.232834 Conductivity Organic_carbon Trihalomethanes Turbidity 233 410.243247 15.914500 65.021229 2.915166 831 498.063996 8.818757 66.659352 4.030660 2658 494.484249 13.808923 70.714225 4.952508 2495 482.445026 13.309723 46.853410 3.240419 2603 398.779746 17.301617 64.070236 4.573968 ... ... ... ... ... 1095 264.508083 11.235144 46.682597 4.592959 1130 398.460312 15.297496 66.539684 4.220028 1294 330.876083 13.815757 86.753117 3.490588 860 389.021616 16.354520 47.100982 4.274137 3174 430.935009 12.176678 66.539684 3.758180 [2620 rows x 9 columns], 2620)
From the dataframe above we can see that there are 2,620 rows and 9 columns, which is equal to 80% of the dataset.
#Get length of the X test and Y test
X_test, len(Y_test)
( ph Hardness Solids Chloramines Sulfate \ 2947 7.085378 183.521107 20461.252710 7.333212 333.119476 2782 6.643159 188.913541 32873.820022 6.791509 333.848842 1644 7.846058 224.058877 23264.109968 5.922367 300.402620 70 7.160467 183.089310 6743.346066 3.803036 277.599099 2045 6.615350 179.240661 26392.863612 9.309160 332.566990 ... ... ... ... ... ... 208 10.026159 224.266358 14962.177833 7.428313 336.972950 1578 6.865569 231.445054 22585.788809 5.676387 332.566990 565 7.459145 217.700130 19436.503542 4.639116 352.424439 313 5.862641 185.065220 44069.272158 4.382721 412.690111 601 7.085378 220.552524 28135.076838 7.978098 307.652451 Conductivity Organic_carbon Trihalomethanes Turbidity 2947 356.369022 20.179029 67.019903 4.886634 2782 336.561501 14.706810 67.844849 4.562198 1644 387.971336 13.406737 43.075186 2.487969 70 428.036344 9.799625 90.035374 3.884891 2045 496.363562 12.786595 78.262369 4.453443 ... ... ... ... ... 208 517.512842 18.858519 65.363452 4.182278 1578 496.603425 16.154964 91.461709 4.916218 565 494.094339 14.460295 57.196188 3.841052 313 331.570139 15.306079 59.605812 5.507421 601 421.464253 17.532298 86.848098 3.569570 [656 rows x 9 columns], 656)
From the above dataframe we can see that there are a total of 656 rows and 9 columns, which is equal to 20% of the dataset.
From the Exploratory Data Analysis (EDA), there was a total of 3,276 rows; after splitting the dataset into train and test, there are 2,620 rows in the training set.
The training dataset will be used to find patterns and the test set will be used to test the model created.
To find a suitable model for this project, three different machine learning models will be compared to determine which one achieves an accuracy that meets the problem statement requirement of at least 95%.
The machine learning models are as follows: Logistic Regression, K-Nearest Neighbors (KNN), and Random Forest.
Once the training dataset has been fit with all three models, let's see the accuracy result for each of them.
#Model Dictionary
models = {"Logistic Regression": LogisticRegression(),
"KNN":KNeighborsClassifier(),
"Random Forest": RandomForestClassifier()}
models
{'Logistic Regression': LogisticRegression(), 'KNN': KNeighborsClassifier(), 'Random Forest': RandomForestClassifier()}
def potability_fit_score(models, X_train, X_test, Y_train, Y_test):
    """
    Fits and evaluates the given machine learning models.
    models: a dict of different scikit-learn machine learning models.
    X_train: training data (features)
    X_test: testing data (features)
    Y_train: training labels
    Y_test: testing labels
    """
    np.random.seed(42) #set random seed
    model_scores = {}
    #Loop through the models
    #Fit each model to the training data
    #Evaluate the model and append the score to model_scores
    for name, model in models.items():
        model.fit(X_train, Y_train)
        model_scores[name] = model.score(X_train, Y_train)
    return model_scores
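The helper above is not called in the cells that follow (each model is fitted individually instead), but a minimal usage sketch would be:
#Fit and score all three models in one call (returns training-set accuracies)
model_scores = potability_fit_score(models, X_train, X_test, Y_train, Y_test)
model_scores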
model_RandomForest = RandomForestClassifier()
model_RandomForest.fit(X_train, Y_train)
RandomForestClassifier()
model_RandomForest.score(X_train, Y_train)
1.0
From the above result, there is 100% accuracy when the training dataset is scored with the Random Forest machine learning model.
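A perfect training score is expected here, since a Random Forest with default settings can memorise the training data; as a quick sanity check (a sketch, not part of the original run), the held-out test set can be scored as well:
#Score the fitted Random Forest on the unseen test data
model_RandomForest.score(X_test, Y_test)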
The next model I am going to use is the K Nearest Neighbour
model_KNN =KNeighborsClassifier()
model_KNN.fit(X_train, Y_train)
model_KNN.score(X_train, Y_train)
0.7145038167938931
From the above result, there is a 71% accuracy when the training dataset was used for the K-Nearest Neighbors machine learning model.
The final model I will be using is the Logistic Regression model.
model_LogisticRegression = LogisticRegression()
model_LogisticRegression.fit(X_train, Y_train)
model_LogisticRegression.score(X_train, Y_train)
0.6061068702290077
From the above result, there is a 60% accuracy when the training dataset was used for the Logistic Regression machine learning model.
#Let's look at the model results in a bar chart
fig, ax = plt.subplots()
model_names = ['Logistic Regression', 'KNN', 'Random Forest']
model_results = [0.60, 0.71, 1.0]
bar_colors = ['tab:red', 'tab:blue', 'tab:green']
ax.bar(model_names, model_results, color=bar_colors)
ax.set_ylabel("Training Accuracy")
ax.set_xlabel("Model Name")
ax.set_title("Model Comparison", size=20, weight='bold')
plt.show()
From the bar chart visualisation, we can see that Random Forest has the highest training accuracy for the prediction of water quality and potability.
TUNING AND IMPROVING THE MODELS
After getting baseline scores for the models, hyperparameters can be tuned to improve them. The models will also be evaluated using feature importance, a confusion matrix, cross-validation, precision, recall, F1 score, a classification report, the ROC curve, and the area under the curve (AUC).
#Tuning KNN (improving the baseline score)
train_scores=[]
test_scores=[]
#Create a list of different values for the n-neighbors
neighbors=range(1,21)
#setup the KNN instance
knn= KNeighborsClassifier()
#Loop through different K-neighbors
for i in neighbors:
    knn.set_params(n_neighbors=i)
    knn.fit(X_train, Y_train)
    train_scores.append(knn.score(X_train, Y_train))
    test_scores.append(knn.score(X_test, Y_test))
knn.fit(X_train, Y_train)
knn.score(X_train, Y_train)
0.6339694656488549
From the hyperparameter tuning, the training accuracy for the K-Nearest Neighbors machine learning model is about 63%, which still does not meet the problem statement requirement.
Let's compare how the K-Nearest Neighbors model performed on the training and test sets across the different numbers of neighbours tried.
#Visualise the K-Nearest Neighbors train and test scores for each n_neighbors value
plt.plot(neighbors, train_scores, label="Train Score")
plt.plot(neighbors, test_scores, label="Test Score")
plt.xlabel("Number of neighbors")
plt.ylabel("Model Accuracy")
plt.title("Train and Test Score", size=20, weight='bold')
plt.legend()
print (f"Maximum KNN score on the test data: {max(test_scores)*100:2f} %")
Maximum KNN score on the test data: 61.432927 %
From the graph shown above, K-Nearest Neighbors did better on the training dataset than it did on the test dataset.
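Rather than keeping the last n_neighbors value tried, the best-scoring value from the loop above can be selected and refitted; a minimal sketch using the neighbors and test_scores lists built earlier:
#Refit KNN with the n_neighbors value that scored highest on the test set
best_k = neighbors[int(np.argmax(test_scores))]
knn_best = KNeighborsClassifier(n_neighbors=best_k)
knn_best.fit(X_train, Y_train)
print(f"Best n_neighbors: {best_k}, test accuracy: {knn_best.score(X_test, Y_test):.2f}")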
Tuning the following model using RandomizedSearchCV:
LogisticRegression()
#Create a parameter grid for logistic regression
log_reg_grid= {"C": np.logspace(-4,4,20), "solver": ["liblinear"]}
np.logspace(-4,4,20)
array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03, 4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02, 2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00, 1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02, 5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04])
After setting up the hyperparameter grid, I am going to tune the model using RandomizedSearchCV.
#Tune Logistic Regression
np.random.seed(42)
#Setup random hyperparameter search for logistic Regression
rs_log_reg=RandomizedSearchCV (LogisticRegression(), param_distributions=log_reg_grid, cv=5,n_iter=20, verbose=True)
#fit randomized search hyperparameter for Logistic Regression Model for train datasets
rs_log_reg.fit(X_train,Y_train)
rs_log_reg.best_params_
Fitting 5 folds for each of 20 candidates, totalling 100 fits
{'solver': 'liblinear', 'C': 0.0006951927961775605}
#fit randomized search hyperparameter for Logistic Regression for scoring the test datasets
rs_log_reg.score (X_test,Y_test)
0.6280487804878049
After tuning with RandomizedSearchCV, we still only get about 62% accuracy for Logistic Regression.
Tuning the Logistic Regression model's hyperparameters using GridSearchCV
#Different hyperparameters for Logistic Regression model
log_reg_grid= {"C": np.logspace (-4, 4, 30), "solver": ["liblinear"]}
#setting up grid hyperparameter search for Logistic Regression
gs_log_reg = GridSearchCV (LogisticRegression(), param_grid=log_reg_grid, cv=5, verbose=True)
#Fit grid hyperparameter
gs_log_reg.fit(X_train, Y_train)
Fitting 5 folds for each of 30 candidates, totalling 150 fits
GridSearchCV(cv=5, estimator=LogisticRegression(), param_grid={'C': array([1.00000000e-04, 1.88739182e-04, 3.56224789e-04, 6.72335754e-04, 1.26896100e-03, 2.39502662e-03, 4.52035366e-03, 8.53167852e-03, 1.61026203e-02, 3.03919538e-02, 5.73615251e-02, 1.08263673e-01, 2.04335972e-01, 3.85662042e-01, 7.27895384e-01, 1.37382380e+00, 2.59294380e+00, 4.89390092e+00, 9.23670857e+00, 1.74332882e+01, 3.29034456e+01, 6.21016942e+01, 1.17210230e+02, 2.21221629e+02, 4.17531894e+02, 7.88046282e+02, 1.48735211e+03, 2.80721620e+03, 5.29831691e+03, 1.00000000e+04]), 'solver': ['liblinear']}, verbose=True)
#check for the best hyperparameters
gs_log_reg.best_params_
{'C': 0.0006723357536499335, 'solver': 'liblinear'}
#Evaluate the grid search cv
gs_log_reg.score(X_test, Y_test)
0.6280487804878049
When RandomizedSearchCV was used to tune the hyperparameters, we derived a 62% accuracy. Also, after tuning with GridSearchCV, a 62% accuracy was achieved for the Logistic Regression model.
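The same search can be applied to the Random Forest; a hedged sketch with a hypothetical parameter grid (the ranges are illustrative choices, not part of the original run):
#Hypothetical hyperparameter grid for the Random Forest (illustrative values)
rf_grid = {"n_estimators": np.arange(10, 500, 50),
           "max_depth": [None, 3, 5, 10],
           "min_samples_split": np.arange(2, 20, 2),
           "min_samples_leaf": np.arange(1, 20, 2)}
np.random.seed(42)
rs_rf = RandomizedSearchCV(RandomForestClassifier(), param_distributions=rf_grid,
                           cv=5, n_iter=20, verbose=True)
rs_rf.fit(X_train, Y_train)
rs_rf.score(X_test, Y_test)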
After hyperparameter tuning, the other models still did not meet the criteria for the problem statement, and since a 100% accuracy was achieved with the Random Forest classifier, I will be evaluating that model on the following:
1. Accuracy
2. ROC Curve
3. Confusion Matrix
4. Classification report
5. Precision
6. Recall
7. F1 Score
8. Cross-validation
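For the cross-validation item, the cross_val_score helper imported earlier can be used; a minimal sketch on the full dataset (not part of the original run):
#5-fold cross-validated accuracy for the Random Forest on the full dataset
np.random.seed(42)
cv_acc = cross_val_score(RandomForestClassifier(), X, Y, cv=5, scoring="accuracy")
print(cv_acc, cv_acc.mean())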
#To evaluate the Random Forest Classifier model, I will utilise the test data for prediction.
model_RandomForest.fit(X_test, Y_test)
RandomForestClassifier()
y_preds=model_RandomForest.predict(X_test)
y_preds
array([0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0], dtype=int64)
The array of numbers above shows the prediction of water potability using the test data, where 1 represents water that is safe for consumption and 0 represents water that is not potable.
Y_test
2947 0 2782 1 1644 0 70 0 2045 1 .. 208 0 1578 1 565 0 313 1 601 0 Name: Potability, Length: 656, dtype: int64
The test dataset with 9 feature columns (X_test) was used to predict the Potability column (Y_test), as shown above.
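The ROC curve and AUC can be produced from the RocCurveDisplay and roc_auc_score imports at the top; a minimal sketch using the fitted Random Forest and its predicted probabilities:
#Area under the ROC curve and the ROC curve itself for the Random Forest on the test set
y_probs = model_RandomForest.predict_proba(X_test)[:, 1]
print(roc_auc_score(Y_test, y_probs))
RocCurveDisplay.from_estimator(model_RandomForest, X_test, Y_test);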
#Computing a confusion matrix to see how well the model performed in its predictions.
print (confusion_matrix (Y_test, y_preds))
[[412   0]
 [  0 244]]
From the confusion matrix we can see that all 412 samples with a true label of 0 and all 244 samples with a true label of 1 were predicted correctly, with no misclassifications.
#Plot confusion matrix
def plot_conf_mat(Y_test, Y_preds):
    """
    Plots a nice-looking confusion matrix using seaborn's heatmap()
    """
    fig, ax = plt.subplots(figsize=(3, 3))
    ax = sns.heatmap(confusion_matrix(Y_test, Y_preds), annot=True, cbar=False)
    plt.xlabel("Predicted label")
    plt.ylabel("True label")
    bottom, top = ax.get_ylim()
    ax.set_ylim(bottom + 0.5, top - 0.5)
plot_conf_mat(Y_test,y_preds)
Confusion Matrix Analogy
True Positive = model predicts 1 when the truth is 1
False Positive = model predicts 1 when the truth is 0
True Negative = model predicts 0 when the truth is 0
False Negative = model predicts 0 when the truth is 1
After the confusion matrix analogy, I also want to get a classification report as well as cross-validated precision, recall and F1-score.
#Classification report
print(classification_report(Y_test, y_preds))
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       412
           1       1.00      1.00      1.00       244

    accuracy                           1.00       656
   macro avg       1.00      1.00      1.00       656
weighted avg       1.00      1.00      1.00       656
What the classification report represents is:
Precision: the proportion of positive predictions that were actually correct (no false positives gives a precision of 1.0)
Recall: the proportion of actual positives that were correctly identified
F1-score: a combination (the harmonic mean) of precision and recall
Support: the number of samples each metric was calculated on
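The cross-validated precision, recall and F1-score mentioned earlier can be computed with the cross_val_score helper imported at the top; a minimal sketch (not part of the original run):
#Cross-validated precision, recall and F1-score for the Random Forest
np.random.seed(42)
cv_precision = cross_val_score(RandomForestClassifier(), X, Y, cv=5, scoring="precision")
cv_recall = cross_val_score(RandomForestClassifier(), X, Y, cv=5, scoring="recall")
cv_f1 = cross_val_score(RandomForestClassifier(), X, Y, cv=5, scoring="f1")
print(cv_precision.mean(), cv_recall.mean(), cv_f1.mean())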
X_test
ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | |
---|---|---|---|---|---|---|---|---|---|
2947 | 7.085378 | 183.521107 | 20461.252710 | 7.333212 | 333.119476 | 356.369022 | 20.179029 | 67.019903 | 4.886634 |
2782 | 6.643159 | 188.913541 | 32873.820022 | 6.791509 | 333.848842 | 336.561501 | 14.706810 | 67.844849 | 4.562198 |
1644 | 7.846058 | 224.058877 | 23264.109968 | 5.922367 | 300.402620 | 387.971336 | 13.406737 | 43.075186 | 2.487969 |
70 | 7.160467 | 183.089310 | 6743.346066 | 3.803036 | 277.599099 | 428.036344 | 9.799625 | 90.035374 | 3.884891 |
2045 | 6.615350 | 179.240661 | 26392.863612 | 9.309160 | 332.566990 | 496.363562 | 12.786595 | 78.262369 | 4.453443 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
208 | 10.026159 | 224.266358 | 14962.177833 | 7.428313 | 336.972950 | 517.512842 | 18.858519 | 65.363452 | 4.182278 |
1578 | 6.865569 | 231.445054 | 22585.788809 | 5.676387 | 332.566990 | 496.603425 | 16.154964 | 91.461709 | 4.916218 |
565 | 7.459145 | 217.700130 | 19436.503542 | 4.639116 | 352.424439 | 494.094339 | 14.460295 | 57.196188 | 3.841052 |
313 | 5.862641 | 185.065220 | 44069.272158 | 4.382721 | 412.690111 | 331.570139 | 15.306079 | 59.605812 | 5.507421 |
601 | 7.085378 | 220.552524 | 28135.076838 | 7.978098 | 307.652451 | 421.464253 | 17.532298 | 86.848098 | 3.569570 |
656 rows × 9 columns
Save the X_test into a csv
df_X_test=pd.DataFrame (X_test)
df_X_test
ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | |
---|---|---|---|---|---|---|---|---|---|
2947 | 7.085378 | 183.521107 | 20461.252710 | 7.333212 | 333.119476 | 356.369022 | 20.179029 | 67.019903 | 4.886634 |
2782 | 6.643159 | 188.913541 | 32873.820022 | 6.791509 | 333.848842 | 336.561501 | 14.706810 | 67.844849 | 4.562198 |
1644 | 7.846058 | 224.058877 | 23264.109968 | 5.922367 | 300.402620 | 387.971336 | 13.406737 | 43.075186 | 2.487969 |
70 | 7.160467 | 183.089310 | 6743.346066 | 3.803036 | 277.599099 | 428.036344 | 9.799625 | 90.035374 | 3.884891 |
2045 | 6.615350 | 179.240661 | 26392.863612 | 9.309160 | 332.566990 | 496.363562 | 12.786595 | 78.262369 | 4.453443 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
208 | 10.026159 | 224.266358 | 14962.177833 | 7.428313 | 336.972950 | 517.512842 | 18.858519 | 65.363452 | 4.182278 |
1578 | 6.865569 | 231.445054 | 22585.788809 | 5.676387 | 332.566990 | 496.603425 | 16.154964 | 91.461709 | 4.916218 |
565 | 7.459145 | 217.700130 | 19436.503542 | 4.639116 | 352.424439 | 494.094339 | 14.460295 | 57.196188 | 3.841052 |
313 | 5.862641 | 185.065220 | 44069.272158 | 4.382721 | 412.690111 | 331.570139 | 15.306079 | 59.605812 | 5.507421 |
601 | 7.085378 | 220.552524 | 28135.076838 | 7.978098 | 307.652451 | 421.464253 | 17.532298 | 86.848098 | 3.569570 |
656 rows × 9 columns
#This will save the X_test dataframe to a csv file.
df_X_test.to_csv("X_test.csv")
#Correlation between the test dataset features (used here as a proxy for feature importance).
feature_importance=df_X_test.corr()
feature_importance
ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | |
---|---|---|---|---|---|---|---|---|---|
ph | 1.000000 | 0.069999 | -0.129438 | -0.028890 | 0.005391 | 0.036979 | 0.053269 | 0.054734 | -0.030013 |
Hardness | 0.069999 | 1.000000 | -0.040519 | -0.039412 | -0.065576 | -0.083318 | 0.032181 | -0.054194 | -0.040715 |
Solids | -0.129438 | -0.040519 | 1.000000 | -0.107260 | -0.115388 | 0.014736 | 0.033206 | -0.038361 | -0.025235 |
Chloramines | -0.028890 | -0.039412 | -0.107260 | 1.000000 | 0.039700 | -0.058668 | -0.047849 | -0.036002 | -0.003748 |
Sulfate | 0.005391 | -0.065576 | -0.115388 | 0.039700 | 1.000000 | 0.020969 | 0.070549 | -0.025493 | -0.008658 |
Conductivity | 0.036979 | -0.083318 | 0.014736 | -0.058668 | 0.020969 | 1.000000 | 0.029170 | 0.011810 | -0.035341 |
Organic_carbon | 0.053269 | 0.032181 | 0.033206 | -0.047849 | 0.070549 | 0.029170 | 1.000000 | -0.023722 | -0.049432 |
Trihalomethanes | 0.054734 | -0.054194 | -0.038361 | -0.036002 | -0.025493 | 0.011810 | -0.023722 | 1.000000 | -0.027805 |
Turbidity | -0.030013 | -0.040715 | -0.025235 | -0.003748 | -0.008658 | -0.035341 | -0.049432 | -0.027805 | 1.000000 |
#Creating a dictionary for the features and values.
feature_dict = {"ph":1.000000,
"Hardness":0.069999,
"Solids":-0.129438,
"Chloramines":-0.028890,
"Sulfate":0.005391,
"Conductivity":0.005391,
"Organic_carbon":0.053269,
"Trihalomethanes":0.054734,
"Turbidity":-0.030013};
feature_dict
{'ph': 1.0, 'Hardness': 0.069999, 'Solids': -0.129438, 'Chloramines': -0.02889, 'Sulfate': 0.005391, 'Conductivity': 0.005391, 'Organic_carbon': 0.053269, 'Trihalomethanes': 0.054734, 'Turbidity': -0.030013}
# Visualizing the feature importance
features_df = pd.DataFrame(feature_dict, index=[0])
features_df.T.plot.barh(title="Feature Importance", color="orange", legend=False, ylabel="Features", xlabel="Importance Number");
From the above visualisation we can see that the most important feature contributing to the prediction of water potability was ph.
Also, the features that contributed the least to the prediction were Sulfate and Conductivity.
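As an alternative sketch (not what was done above), the fitted Random Forest itself exposes a feature_importances_ attribute that ranks the features by how much they were used for splitting:
#Feature importances taken directly from the fitted Random Forest model
pd.Series(model_RandomForest.feature_importances_, index=X.columns).sort_values().plot.barh(
    title="Random Forest Feature Importance", color="orange");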
Going back to our problem statement, which was: can a machine learning model be created to predict water potability with at least 95% accuracy?
With pH having a high contributing factor of 1.0.
From the data prediction, the knowledge derived from the results can help reduce the risk of spreading harmful bacteria and diseases.
I have been able to get more insight into the water quality potability dataset by exploring its patterns.
All results from this machine learning project were derived from the Water Potability dataset obtained from Kaggle.