In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.callbacks import ModelCheckpoint
Loading Dataset¶
In [2]:
# Download the water potability dataset from the Cainvas S3 bucket.
!wget 'https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/water_potability.csv'
In [3]:
# Load the downloaded CSV into a DataFrame.
df= pd.read_csv('water_potability.csv')
In [4]:
# Display the frame (NOTE(review): df.head() would avoid dumping every row).
df
Out[4]:
In [5]:
# Summary statistics for every numeric column.
df.describe()
Out[5]:
In [6]:
# Count missing values per column.
df.isnull().sum()
Out[6]:
In [7]:
# Class balance of the target as a pie chart; the trailing ';' suppresses the axes repr.
df.Potability.value_counts().plot(kind ='pie');
Resampling Data¶
In [8]:
# Raw class counts for the target (motivates the resampling below).
df['Potability'].value_counts()
Out[8]:
In [9]:
zero = df[df['Potability']==0]  # majority class: rows with Potability == 0
one = df[df['Potability']==1]   # minority class: rows with Potability == 1
from sklearn.utils import resample
# Upsample the minority class (1) with replacement until it matches the
# majority class size, so the classifier is not biased toward class 0.
# len(zero) replaces the hard-coded 1998 so this stays correct if the
# dataset changes; random_state makes the resampling reproducible.
df_minority_upsampled = resample(one, replace = True, n_samples = len(zero), random_state = 42)
# Concatenate the majority class with the upsampled minority class.
df = pd.concat([zero, df_minority_upsampled])
from sklearn.utils import shuffle
df = shuffle(df, random_state = 42)  # shuffle so there is NO particular class sequence
df.Potability.value_counts().plot(kind ='pie');
Dealing with Null Values¶
In [10]:
from sklearn.impute import SimpleImputer

# Fill missing entries with the column mean. Each fit_transform call refits
# the imputer on the single column it receives, so the three results are
# independent per-column imputations.
imp = SimpleImputer(strategy='mean')
r, s, t = (imp.fit_transform(df[[col]]) for col in ('ph', 'Sulfate', 'Trihalomethanes'))
In [11]:
# Write the mean-imputed columns back into the DataFrame.
# NOTE(review): the imputer was fit on the full dataset before the
# train/test split — mild test-set leakage; confirm this is acceptable.
df['ph']=r
df['Sulfate']= s
df['Trihalomethanes']=t
In [12]:
df.isnull().sum()
Out[12]:
Correlation in Data using Heatmap¶
In [13]:
plt.figure(figsize=(10,8))
sns.set_context('paper')
# Pairwise Pearson correlations between all columns. linewidth is a numeric
# matplotlib property (was the string '1'); annot=True prints the coefficient
# inside each cell.
sns.heatmap(df.corr(),cmap='Blues',linecolor='White',linewidth=1,annot=True,square=True)
plt.title('Feature correlation matrix')
plt.show()
Out[13]:
As the heatmap shows, the features are largely uncorrelated with one another.
Normalizing the Data¶
In [14]:
# Features: the first 9 columns; target: the 10th column (Potability).
X = df.iloc[:,:9].values
# The 9:10 slice keeps y two-dimensional with shape (n, 1).
y = df.iloc[:,9:10].values
In [15]:
# Standardize each feature to zero mean / unit variance.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# NOTE(review): the scaler is fit on ALL rows before the train/test split,
# which leaks test-set statistics into training — consider fitting on the
# training split only and transforming the test split with it.
X = sc.fit_transform(X)
print('Normalized data:')
print(X[0])
Splitting the Data¶
In [16]:
#Train test split of model
from sklearn.model_selection import train_test_split
# Hold out 10% for testing; random_state pins the split for reproducibility.
# NOTE(review): stratify=y would preserve the class balance in both splits — confirm.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.1,random_state = 5)
Building and Fitting of Model¶
In [17]:
#importing libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Feed-forward binary classifier: 9 inputs -> 128 -> 64 -> 32 -> 1 (sigmoid),
# with dropout after each hidden layer for regularization.
model = keras.Sequential()
model.add(layers.Dense(128, input_shape=(9,), activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.4))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dropout(0.4))
model.add(layers.Dense(1, activation='sigmoid'))

# Binary cross-entropy loss with the Adam optimizer; track accuracy.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train for 400 epochs, validating on the held-out split after each epoch;
# verbose=False silences the per-epoch log lines.
history = model.fit(X_train, y_train, epochs=400, validation_data=(X_test, y_test), verbose=False)

# Training-set performance: [loss, accuracy].
model.evaluate(X_train, y_train)
Out[17]:
In [18]:
# Layer-by-layer architecture and parameter counts.
model.summary()
In [19]:
# Held-out test-set performance: [loss, accuracy].
model.evaluate(X_test, y_test)
Out[19]:
Plotting Accuracy and Loss¶
In [20]:
# Training vs. validation accuracy across epochs.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'])
ax.plot(history.history['val_accuracy'])
ax.set_title('Model accuracy')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Test'], loc='upper left')
plt.show()
In [21]:
# Training vs. validation loss across epochs.
fig, ax = plt.subplots()
ax.plot(history.history['loss'])
ax.plot(history.history['val_loss'])
ax.set_title('Model loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Test'], loc='upper left')
plt.show()
Saving the model¶
In [22]:
# Persist the trained model (architecture + weights) in HDF5 format.
model.save('water_potability_test.h5')
Predictions¶
In [23]:
from tensorflow.keras.models import load_model
# Reload the saved model to confirm the HDF5 artifact round-trips.
m = load_model('water_potability_test.h5')
In [24]:
y_pred= model.predict(X_test)
In [25]:
y_pred = (y_pred>0.5)
In [26]:
y_pred[0:20]
Out[26]:
Deep CC¶
In [27]:
# Compile the saved Keras model with deepCC (Cainvas' compiler) for edge deployment.
!deepCC water_potability_test.h5