This is an exploratory analysis to find out whether rainfall can be predicted from a set of environmental parameters alone. Such a model could be used in offline devices that have to decide whether to perform an action depending on whether it is going to rain, which can be of great use in the agricultural sector.
This is not a time-series analysis of weather patterns. It is a binary classification problem: predicting whether it is going to rain based on certain parameters. Although the data could be used for a multivariate time-series analysis of rainfall, that is not our goal here.
Data taken from here
In this notebook we use a DNN built with TensorFlow, with class weights adjusted for the distribution of the labels. We do this because our dataset is imbalanced, i.e., there are far fewer samples with label 1 than with label 0. We will also use precision, recall and other metrics along with accuracy.
############# import necessary libraries ##########################
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
import sklearn
import zipfile
path = os.getcwd()
if not os.path.exists('archive.zip'):
!wget -N "https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/archive.zip"
if not os.path.exists('DataFiles'):
zip_ref = zipfile.ZipFile('archive.zip', 'r')
zip_ref.extractall('DataFiles/')
filenames = os.listdir('DataFiles')
print(filenames)
# Build one combined dataframe from every file found in DataFiles.
# The city each file belongs to is ignored for now.
list_of_df = [
    pd.read_csv(os.path.join('DataFiles', csv_name))
    for csv_name in filenames
]

# Stack the per-file frames row-wise and renumber the index from 0.
df = pd.concat(list_of_df, axis=0, ignore_index=True)
df.head()
# Remove the columns that are not used as model inputs.
df.drop(
    columns=[
        'date_time',
        'sunrise',
        'sunset',
        'moonrise',
        'moonset',
        'totalSnow_cm',
        'maxtempC',
        'mintempC',
        'sunHour',
    ],
    inplace=True,
)

# Turn the precipitation amount into a binary target:
# 1 when the value is greater than zero, 0 otherwise.
df['precipMM'] = (df['precipMM'] > 0).astype('int64')
# Quick look at the data: how many rows fall into each target class.
df['precipMM'].value_counts()
# Author's observation from the output: the data is imbalanced.

# Count the missing values in every column.
df.isnull().sum()
# Author's observation from the output: there are no nulls.
# Plot each selected feature on its own, coloured by the binary rain label,
# to compare its distribution between the two classes.
for feature in ('pressure', 'tempC', 'windspeedKmph'):
    sns.pairplot(df, vars=[feature], dropna=True, hue='precipMM')
# Split the frame into a feature matrix (every column except the target)
# and the target vector, both as NumPy arrays.
# NOTE(review): the features are used unscaled here although MinMaxScaler is
# imported at the top of the file - confirm that this is intentional.
feature_frame = df.drop(columns=['precipMM'])
data = np.array(feature_frame)
label = np.array(df['precipMM'])
data.shape
# Hold out 10 percent of the rows for testing. The test share is kept small
# because model.fit makes one more (validation) split later on.
# Stratifying on the label keeps the class ratio the same in both parts.
train_data, test_data, train_label, test_label = train_test_split(
    data,
    label,
    test_size=0.1,
    random_state=42,
    shuffle=True,
    stratify=label,
)

# Report the shapes in the order: train features, train labels,
# test features, test labels.
for split_array in (train_data, train_label, test_data, test_label):
    print(split_array.shape)
# Metrics tracked during training. The `name` of each metric becomes its key
# in the training history (with a `val_` prefix for the validation split).
keras_metrics = tf.keras.metrics
METRICS = [
    keras_metrics.BinaryAccuracy(name='accuracy'),
    keras_metrics.Precision(name='precision'),
    keras_metrics.Recall(name='recall'),
    keras_metrics.AUC(name='auc'),
    keras_metrics.AUC(name='prc', curve='PR'),  # area under the precision-recall curve
]
# Small fully connected binary classifier: 32 -> 16 -> dropout -> 8 -> 1.
# The input width is taken from the number of feature columns.
n_features = train_data.shape[1]

model = Sequential()
model.add(Dense(32, input_shape=(n_features,), activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(8, activation='relu'))
# Single sigmoid unit: the output is the predicted probability of label 1.
model.add(Dense(1, activation='sigmoid'))

# Disabled alternative (kept for reference): a larger network of
# Dense(250, relu) -> Dropout(0.4) -> Dense(200, relu) -> Dropout(0.4)
# -> Dense(100, relu) -> Dropout(0.3) -> Dense(50, relu) -> Dense(1, sigmoid).

model.summary()
# Compute per-class weights so that the rarer class counts more in the loss.
# The arguments are passed by keyword: recent scikit-learn releases make them
# keyword-only, so the positional form raises a TypeError there, while the
# keyword form works on older releases as well.
from sklearn.utils.class_weight import compute_class_weight

unique_labels = np.unique(train_label)
balanced_weights = compute_class_weight(
    class_weight='balanced',
    classes=unique_labels,
    y=train_label,
)

# Map each label value to its weight, the form expected by model.fit(class_weight=...).
class_weights = dict(zip(unique_labels, balanced_weights))
print(class_weights)
# Binary cross-entropy matches the single sigmoid output of the model.
model.compile(
    loss='binary_crossentropy',
    optimizer='adadelta',
    metrics=METRICS,
)

# Disabled learning-rate sweep callback, kept for reference:
# lr_schedule = tf.keras.callbacks.LearningRateScheduler(lambda epoch: 1e-5 * 10**(epoch/10))

# 30 percent of the training rows are set aside by Keras for validation;
# the class weights computed earlier are applied to the training loss.
fit_options = dict(
    validation_split=0.3,
    batch_size=100,
    epochs=60,
    class_weight=class_weights,
)
history = model.fit(train_data, train_label, **fit_options)

# Show the optimizer settings that were actually used.
model.optimizer.get_config()
# Disabled learning-rate sweep plot, kept for reference (it needs an 'lr'
# entry in the history, which the current fit call does not record):
# plt.semilogx(history.history['lr'],history.history['loss'])
# plt.axis([1e-5, 1e-2, 0, 30])
# plt.show()

print(history.history.keys())

# Figure 1: loss and accuracy for the training and validation splits.
# Each curve is labelled so the four lines can be told apart.
for curve in ('loss', 'val_loss', 'accuracy', 'val_accuracy'):
    plt.plot(history.history[curve], label=curve)
plt.xlabel('epoch')
plt.legend()
plt.show()

# Figure 2: recall for the training and validation splits.
# An explicit show() makes the figure appear outside inline notebooks too.
for curve in ('recall', 'val_recall'):
    plt.plot(history.history[curve], label=curve)
plt.xlabel('epoch')
plt.legend()
plt.show()
# Evaluate on the held-out test split.
expected_y = test_label

# Sequential.predict_classes was removed in TensorFlow 2.6. For a single
# sigmoid output the equivalent is to threshold the predicted probability
# at 0.5; the result keeps the same (n_samples, 1) int32 shape.
predicted_y = (model.predict(test_data) > 0.5).astype('int32')

print(metrics.classification_report(expected_y, predicted_y))
print(metrics.confusion_matrix(expected_y, predicted_y))

# Persist the trained model in HDF5 format.
model.save("rainfall_prediction.h5")
deepCC
# Run the deepCC command-line tool on the saved model file.
# The `!` prefix is an IPython shell escape, so this line only works in a notebook.
!deepCC rainfall_prediction.h5