Tweet Authenticity Prediction
Our objective is to predict whether a tweet is describing a disaster or not.
Important Libraries
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import model_selection,feature_extraction
from collections import Counter
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import re
!pip install contractions
import contractions
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
!pip install pyspellchecker
from spellchecker import SpellChecker
import tensorflow as tf
from tensorflow import keras
!pip install wget
!wget -N 'https://cainvas-static.s3.amazonaws.com/media/user_data/devanshchowd/tweets.csv'
data = pd.read_csv('tweets.csv',index_col='id')
data.reset_index(drop=True,inplace=True)
data = data[['text', 'target']]
data_train,data_test = model_selection.train_test_split(data,test_size=.2,random_state = 43,stratify=data.target)
data_train.reset_index(drop=True,inplace=True)
data_test.reset_index(drop=True,inplace=True)
A target of 0 indicates a non-disastrous tweet and 1 indicates a disastrous tweet, meaning the tweet is describing or reporting a disaster.
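To get a concrete feel for the two classes, a small inspection cell (a minimal sketch, simply indexing data_train) prints one example tweet per label:
# print one raw example tweet for each target value
for label in (0, 1):
    print(label, '->', data_train.loc[data_train['target'] == label, 'text'].iloc[0])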
display(data_train)
BASIC EDA
The roughly 9000 training samples are divided into roughly 7400 with target 0 (non-disaster) and 1700 with target 1 (disaster).
sns.set_style('darkgrid')
print(data_train['target'].value_counts())
sns.countplot(data = data_train,x= 'target')
Text Length Distribution
sns.histplot(data_train['text'].apply(lambda x:len(x)))
print('Shortest message in our data :',data_train.loc[data_train['text'].apply(lambda x:len(x)).idxmin()].values,sep='\n')
print('========')
print('A long message in our data :',data_train.loc[data_train['text'].apply(lambda x:len(x))>140].values[0],sep='\n')
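The histogram can also be summarised numerically; a quick check with pandas' string accessor gives the character-length statistics referred to later on:
# character-length statistics of the raw tweets
print(data_train['text'].str.len().describe())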
Transforming Data
stop_words = nltk.corpus.stopwords.words('english')
wnl = WordNetLemmatizer()
stemmer = SnowballStemmer('english')

def clean_text(doc):
    # expand contractions first, before the apostrophes are stripped out below
    doc = contractions.fix(doc)
    # remove URLs and HTML tags
    doc = re.sub(r'https?://\S+|www\.\S+', '', doc)
    doc = re.sub(r'<.*?>', '', doc)
    # keep only letters and whitespace (flags must be passed as a keyword,
    # since the fourth positional argument of re.sub is the substitution count)
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
    # lowercase and lemmatize each word
    doc = ' '.join(wnl.lemmatize(word) for word in doc.lower().split())
    # tokenize and drop stopwords
    tokens = nltk.word_tokenize(doc)
    filtered = [token for token in tokens if token not in stop_words]
    return ' '.join(filtered)

data_train['text'] = data_train['text'].apply(clean_text)
data_test['text'] = data_test['text'].apply(clean_text)
print('Shortest message in our data :',data_train.loc[data_train['text'].apply(lambda x:len(x)).idxmin()].values,sep='\n')
print('========')
print('A long message in our data :',data_train.loc[data_train['text'].apply(lambda x:len(x))>100].values[0],sep='\n')
You can see the effect of the cleaning. For example, the tweet
'British diver Neil Anthony Fears found dead by the wreck of a steamship - Daily Mail http://t.co/QP3GVvfoFq' got transformed into
'british diver neil anthony fear found dead wreck steamship daily mail'. As you can see, we were successful in doing two things:
1) Making the data a bit shorter
2) Not losing the meaning of the data in the conversion
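The same cleaning can be run on any raw string; for instance, feeding an invented example tweet (purely illustrative, not from the dataset) through clean_text defined above:
# illustrative input only
raw = "There's a huge wildfire near our town!! http://t.co/example <b>scary</b>"
print(clean_text(raw))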
X = data_train['text']
y = data_train['target']
X_test = data_test['text']
y_test = data_test['target']
The vocabulary size of 17293 is too big: it means that to represent a single sentence I would have to store a vector of length 17293, which seems very unnecessary given that most of the tweets are only about 120 - 140 characters long.
count_vectorizer = feature_extraction.text.CountVectorizer()
count_vectorizer.fit_transform(X)
print(len(count_vectorizer.vocabulary_))
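To see why a vector of this width is wasteful, a short sketch transforms a handful of tweets and compares the vector length with the number of entries that are actually non-zero:
bow = count_vectorizer.transform(X[:5])          # sparse matrix of shape (5, vocabulary size)
print('vector length per tweet   :', bow.shape[1])
print('non-zero entries per tweet:', bow.getnnz(axis=1))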
Now here comes the difficult part: how many words should be kept in our vocabulary? Each choice has its pros and cons:
1) If the number is high, we keep track of more words and store more information, but we use much more space and computation.
2) If the number is low, our model will train faster and use less space, but by eliminating words we lose some information that might have been useful to the model.
I will choose to keep the words that occur more than once; the quick coverage check after the snippet below shows how much of the corpus this retains.
freq_of_word = Counter()
for doc in X:
    freq_of_word.update(doc.split())
vocabulary = [word for word, freq in freq_of_word.items() if freq > 1]   # keep words that occur more than once
print(len(vocabulary))
vocab_size = len(vocabulary)
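As a rough check that dropping the words that occur only once does not throw away much of the corpus, the following estimates what fraction of all token occurrences the pruned vocabulary still covers:
kept = set(vocabulary)
total_tokens = sum(freq_of_word.values())
covered = sum(freq for word, freq in freq_of_word.items() if word in kept)
print('coverage of pruned vocabulary : {:.1%}'.format(covered / total_tokens))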
def create_tokenizer(post):
    # map each word to an integer index, keeping only the vocab_size most frequent words
    tokenizer = keras.preprocessing.text.Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(post)
    return tokenizer
vocab_set = set(vocabulary)   # set membership checks are far faster than scanning a list
def change(post):
    # drop any word that is not in the pruned vocabulary
    kept = [word for word in post.split() if word in vocab_set]
    return ' '.join(kept)
X_transformed = X.apply(change)
X_test_transformed = X_test.apply(change)
tokenizer = create_tokenizer(X_transformed)
print('Our new dictionary has',len(tokenizer.word_index),'unique words')
sequence = tokenizer.texts_to_sequences(X_transformed)
X_transformed_seq= keras.preprocessing.sequence.pad_sequences(sequence, maxlen=150)
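To make the tokenization and padding step concrete, it helps to look at a single tweet before and after the conversion (index 0 is an arbitrary choice):
print('cleaned text     :', X_transformed.iloc[0])
print('integer sequence :', sequence[0])
print('padded length    :', len(X_transformed_seq[0]))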
X_train, X_val, y_train, y_val = model_selection.train_test_split(X_transformed_seq,y ,test_size = .15,stratify = y)
print(X_train.shape,X_val.shape)
keras.backend.clear_session()
model = keras.models.Sequential()
model.add(keras.layers.Embedding(vocab_size, 32))
model.add(keras.layers.LSTM(3,recurrent_dropout=.5,dropout=.5,return_sequences=False,kernel_regularizer=keras.regularizers.l2()))
model.add(keras.layers.Dense(1, activation='sigmoid'))
model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
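As a sanity check on model.summary(), the parameter counts can be reproduced by hand: the embedding layer holds vocab_size * 32 weights, a 3-unit Keras LSTM has 4 * 3 * (32 + 3 + 1) weights (four gates, each with input, recurrent and bias terms), and the final dense layer adds 3 + 1. A small sketch compares this arithmetic with Keras' own total:
expected = vocab_size * 32 + 4 * 3 * (32 + 3 + 1) + (3 + 1)
print(expected, model.count_params())   # the two numbers should agree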
all_callbacks = [keras.callbacks.EarlyStopping(patience = 5,min_delta=.02,restore_best_weights=True),
keras.callbacks.ModelCheckpoint('LSTM.h5',save_best_only=True,monitor='val_accuracy')]
history = model.fit(X_train,y_train,validation_data= (X_val,y_val),batch_size = 64,epochs=10,callbacks=all_callbacks)
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.savefig('Accuracy vs Epochs.png')   # save before show(), otherwise an empty figure is written
plt.show()
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.savefig('Loss vs Epochs.png')   # save before show(), otherwise an empty figure is written
plt.show()
model = keras.models.load_model('LSTM.h5')
model.evaluate(X_train,y_train)
model.evaluate(X_val,y_val)
sequence = tokenizer.texts_to_sequences(X_test_transformed)
X_test_transformed_seq= keras.preprocessing.sequence.pad_sequences(sequence, maxlen=150)
model.evaluate(X_test_transformed_seq,y_test)
We achieve about 88% accuracy on the held-out test set.
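Accuracy alone can be misleading on an imbalanced dataset like this one, so a precision/recall breakdown is worth a look; a small sketch using sklearn.metrics (not imported above):
from sklearn import metrics
y_pred = (model.predict(X_test_transformed_seq) > .5).astype(int).flatten()
print(metrics.classification_report(y_test, y_pred))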
sample_pred = (model.predict(X_test_transformed_seq[:5])>.5).astype(int).flatten()
sample_text = X_test[:5]
truth = y_test[:5]
display(pd.DataFrame(zip(sample_pred,sample_text,truth),columns=['Model Predictions','TEXT','Truth Label']))
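Finally, scoring a brand-new tweet means replaying the same preprocessing chain: clean the text, drop out-of-vocabulary words, tokenize, pad, and predict. A minimal end-to-end sketch (the input string is invented for illustration):
new_tweet = 'Forest fire spreading fast near the highway, evacuations underway'
processed = change(clean_text(new_tweet))
seq = tokenizer.texts_to_sequences([processed])
padded = keras.preprocessing.sequence.pad_sequences(seq, maxlen=150)
print('P(disaster) =', float(model.predict(padded)[0][0]))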