Movie Review Sentiment Analysis¶

pip install wget

Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: wget in ./.local/lib/python3.7/site-packages (3.2)
WARNING: You are using pip version 20.3.1; however, version 21.2.1 is available.
You should consider upgrading via the '/opt/tljh/user/bin/python -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.

Importing Necessary Libraries¶

import os
import wget
import numpy as np
import pandas as pd

import tensorflow as tf
keras = tf.keras
tf.__version__

'2.3.0'

Importing Dataset¶

url = "https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/Movie.zip"
file = wget.download(url)

import zipfile
data = zipfile.ZipFile(file,"r")
data.extractall("Movie Reviews (2)")

Data Exploration¶

import pandas as pd
train_data = pd.read_csv('Movie Reviews (2)/Movie/train1.tsv', sep = '\t')
print(train_data.Phrase[0])
print(train_data.Phrase[1])
print(train_data.Phrase[2])
train_data.head(5)

A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .
A series of escapades demonstrating the adage that what is good for the goose
A series

print(train_data.Phrase[63])
train_data[62:64+5]

This quiet , introspective and entertaining independent is worth seeking .

Text Preprocessing¶

split_size = int(len(train_data) * 0.8)

total_phrases = train_data.Phrase.to_numpy()
train_phrases = total_phrases[:split_size]
valid_phrases = total_phrases[:split_size]

total_sentiments = train_data.Sentiment.to_numpy()
train_sentiments = total_sentiments[:split_size]
valid_sentiments = total_sentiments[:split_size]

train_ds = tf.data.Dataset.from_tensor_slices((train_phrases, train_sentiments))
valid_ds = tf.data.Dataset.from_tensor_slices((valid_phrases, valid_sentiments))
total_ds = tf.data.Dataset.from_tensor_slices((total_phrases, total_sentiments))

for phrase, sentiment in train_ds.take(4):
    print(phrase, sentiment)
print()    
for phrase, sentiment in valid_ds.take(1):
    print(phrase, sentiment)

tf.Tensor(b'A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .', shape=(), dtype=string) tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(b'A series of escapades demonstrating the adage that what is good for the goose', shape=(), dtype=string) tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(b'A series', shape=(), dtype=string) tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(b'A', shape=(), dtype=string) tf.Tensor(2, shape=(), dtype=int64)

tf.Tensor(b'A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .', shape=(), dtype=string) tf.Tensor(1, shape=(), dtype=int64)

word_set = set()
lines = []
for line, _ in train_ds:
    line = line.numpy().decode('utf-8')
    lines.append(line)
    for w in line.split(' '):
        word_set.add(w)

print(len(word_set))
for index, w in enumerate(word_set):
    if index >= 10:
        break
    print(f'{index:3}: {w}')

16007
  0: 
  1: Dylan
  2: wrote
  3: mistake
  4: series
  5: workshops
  6: conquers
  7: irrelevant
  8: gratuitous
  9: gently

from tensorflow.keras.preprocessing.text import Tokenizer
vocab_size = 5000
tokenizer = Tokenizer(num_words = vocab_size, oov_token = '<OOV>')
tokenizer.fit_on_texts(lines)
word_index = tokenizer.word_index
for index, (a, b) in enumerate(word_index.items()):
    if index >= 5:
        break
    print(a, b)

<OOV> 1
the 2
a 3
of 4
and 5

reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

for line in lines:
    print(line)
    sequences = tokenizer.texts_to_sequences([line])[0]
    print(sequences)
    print([f'{id}: {reverse_word_index[id]}' for id in sequences])
    break

A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .
[3, 318, 4, 1, 1, 2, 1, 10, 52, 9, 46, 14, 2, 2413, 9, 179, 46, 14, 2, 1, 65, 4, 79, 597, 1, 20, 516, 4, 79, 2293, 6, 55, 4, 3, 43]
['3: a', '318: series', '4: of', '1: <OOV>', '1: <OOV>', '2: the', '1: <OOV>', '10: that', '52: what', '9: is', '46: good', '14: for', '2: the', '2413: goose', '9: is', '179: also', '46: good', '14: for', '2: the', '1: <OOV>', '65: some', '4: of', '79: which', '597: occasionally', '1: <OOV>', '20: but', '516: none', '4: of', '79: which', '2293: amounts', '6: to', '55: much', '4: of', '3: a', '43: story']

import numpy as np
sequences = tokenizer.texts_to_sequences(lines[:30])
np.max(list(map(len, sequences)))

maxlen = 50

from tensorflow.keras.preprocessing.sequence import pad_sequences

def pad_f(sequences):
    return pad_sequences(sequences, maxlen = maxlen, padding = 'post', truncating = 'post')

batch_size = 16

def tokenize_and_pad_sequence(text_batch):
    texts = map(lambda t: t.numpy().decode('utf-8'), text_batch)
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_f(sequences)
    
# https://www.tensorflow.org/api_docs/python/tf/function
@tf.function
def encode_text_batch(text_batch):
    return tf.py_function(
        func = tokenize_and_pad_sequence,
        inp = [text_batch],
        Tout = tf.int32,
    )

def create_batch_ds_inner(ds):
    ds = ds.batch(batch_size)
    ds = ds.map(lambda text_batch, label_batch: (encode_text_batch(text_batch), label_batch))
    return ds.cache()

def create_batch_ds(ds, do_shuffle = True):
    ds = create_batch_ds_inner(ds)
    if do_shuffle:
        ds = ds.shuffle(100)
    return ds.prefetch(tf.data.experimental.AUTOTUNE)

for text, _ in train_ds:
    print(text)
    break
    
train_batch_ds = create_batch_ds(train_ds)
valid_batch_ds = create_batch_ds(valid_ds, do_shuffle = False)
total_batch_ds = create_batch_ds(total_ds, do_shuffle = False)

for text_batch, label_batch in create_batch_ds(train_ds, do_shuffle = False).take(1):
    print(label_batch.shape)
    print(text_batch.shape)
    print(text_batch[0])
    for index in text_batch[0]:
        index = index.numpy()
        if index > 0:
            print(f'{index}: {reverse_word_index[index]}')

tf.Tensor(b'A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .', shape=(), dtype=string)
(16,)
(16, 50)
tf.Tensor(
[   3  318    4    1    1    2    1   10   52    9   46   14    2 2413
    9  179   46   14    2    1   65    4   79  597    1   20  516    4
   79 2293    6   55    4    3   43    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0], shape=(50,), dtype=int32)
3: a
318: series
4: of
1: <OOV>
1: <OOV>
2: the
1: <OOV>
10: that
52: what
9: is
46: good
14: for
2: the
2413: goose
9: is
179: also
46: good
14: for
2: the
1: <OOV>
65: some
4: of
79: which
597: occasionally
1: <OOV>
20: but
516: none
4: of
79: which
2293: amounts
6: to
55: much
4: of
3: a
43: story

Graphs For Analysis¶

import pandas as pd
import matplotlib.pyplot as plt

def plot_item(history_df, colname = 'loss', f = np.min, ax = None):
    val_colname = f'val_{colname}'
    print(f'{colname}: {f(history_df[colname]):.4f} - {val_colname}: {f(history_df[val_colname]):.4f}')
    history_df.loc[:, [colname, val_colname]].plot(title = colname.capitalize() , ax = ax)

def show_history(history):
    history_df = pd.DataFrame(history.history)
    
    fig, axes = plt.subplots(nrows = 1, ncols = 2, figsize = (15, 5))
    plot_item(history_df, 'loss', ax = axes[0])
    plot_item(history_df, 'accuracy', ax = axes[1], f = np.max)

Model¶

def fit_model(model, train_batch_ds = train_batch_ds, epochs = 20, patience = 2):
    model.compile(
        optimizer = 'adam',
        loss = keras.losses.SparseCategoricalCrossentropy(from_logits = True),
        metrics = ['accuracy'],
    )

    early_stopping = keras.callbacks.EarlyStopping(
        patience = patience,
        restore_best_weights = True,
    )

    history = model.fit(
        train_batch_ds, 
        validation_data = valid_batch_ds,
        epochs = epochs,
        callbacks = [early_stopping],
    )
    return history

embedding_dim = 20
model_lstm_bi = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length = maxlen),
    keras.layers.Bidirectional(keras.layers.LSTM(embedding_dim)),
    keras.layers.Dense(512, activation = 'relu'),
    keras.layers.Dense(128, activation = 'relu'),
    keras.layers.Dense(64),
])

model_lstm_bi.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding (Embedding)        (None, 50, 20)            100000    
_________________________________________________________________
bidirectional (Bidirectional (None, 40)                6560      
_________________________________________________________________
dense (Dense)                (None, 512)               20992     
_________________________________________________________________
dense_1 (Dense)              (None, 128)               65664     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
=================================================================
Total params: 201,472
Trainable params: 201,472
Non-trainable params: 0
_________________________________________________________________

history_lstm_bi = fit_model(model_lstm_bi, patience = 5)  
history_lstm_bi

Epoch 1/20
7803/7803 [==============================] - 73s 9ms/step - loss: 1.0946 - accuracy: 0.5677 - val_loss: 0.9222 - val_accuracy: 0.6318
Epoch 2/20
7803/7803 [==============================] - 60s 8ms/step - loss: 0.9145 - accuracy: 0.6309 - val_loss: 0.8439 - val_accuracy: 0.6576
Epoch 3/20
7803/7803 [==============================] - 60s 8ms/step - loss: 0.8591 - accuracy: 0.6531 - val_loss: 0.8036 - val_accuracy: 0.6743
Epoch 4/20
7803/7803 [==============================] - 60s 8ms/step - loss: 0.8206 - accuracy: 0.6677 - val_loss: 0.7772 - val_accuracy: 0.6804
Epoch 5/20
7803/7803 [==============================] - 60s 8ms/step - loss: 0.7901 - accuracy: 0.6788 - val_loss: 0.7593 - val_accuracy: 0.6889
Epoch 6/20
7803/7803 [==============================] - 60s 8ms/step - loss: 0.7633 - accuracy: 0.6898 - val_loss: 0.7318 - val_accuracy: 0.7015
Epoch 7/20
7803/7803 [==============================] - 60s 8ms/step - loss: 0.7429 - accuracy: 0.6973 - val_loss: 0.7215 - val_accuracy: 0.7053
Epoch 8/20
7803/7803 [==============================] - 60s 8ms/step - loss: 0.7226 - accuracy: 0.7052 - val_loss: 0.7179 - val_accuracy: 0.7082
Epoch 9/20
7803/7803 [==============================] - 60s 8ms/step - loss: 0.7045 - accuracy: 0.7123 - val_loss: 0.6968 - val_accuracy: 0.7150
Epoch 10/20
7803/7803 [==============================] - 60s 8ms/step - loss: 0.6867 - accuracy: 0.7207 - val_loss: 0.6872 - val_accuracy: 0.7187
Epoch 11/20
7803/7803 [==============================] - 60s 8ms/step - loss: 0.6714 - accuracy: 0.7254 - val_loss: 0.6886 - val_accuracy: 0.7199
Epoch 12/20
7803/7803 [==============================] - 60s 8ms/step - loss: 0.6563 - accuracy: 0.7319 - val_loss: 0.6683 - val_accuracy: 0.7268
Epoch 13/20
7803/7803 [==============================] - 60s 8ms/step - loss: 0.6438 - accuracy: 0.7368 - val_loss: 0.6682 - val_accuracy: 0.7243
Epoch 14/20
7803/7803 [==============================] - 60s 8ms/step - loss: 0.6304 - accuracy: 0.7407 - val_loss: 0.6528 - val_accuracy: 0.7327
Epoch 15/20
7803/7803 [==============================] - 60s 8ms/step - loss: 0.6170 - accuracy: 0.7465 - val_loss: 0.6424 - val_accuracy: 0.7365
Epoch 16/20
7803/7803 [==============================] - 60s 8ms/step - loss: 0.6051 - accuracy: 0.7500 - val_loss: 0.6372 - val_accuracy: 0.7393
Epoch 17/20
7803/7803 [==============================] - 60s 8ms/step - loss: 0.5939 - accuracy: 0.7562 - val_loss: 0.6384 - val_accuracy: 0.7412
Epoch 18/20
7803/7803 [==============================] - 60s 8ms/step - loss: 0.5841 - accuracy: 0.7591 - val_loss: 0.6226 - val_accuracy: 0.7453
Epoch 19/20
7803/7803 [==============================] - 60s 8ms/step - loss: 0.5736 - accuracy: 0.7634 - val_loss: 0.6206 - val_accuracy: 0.7460
Epoch 20/20
7803/7803 [==============================] - 60s 8ms/step - loss: 0.5644 - accuracy: 0.7675 - val_loss: 0.6136 - val_accuracy: 0.7493

<tensorflow.python.keras.callbacks.History at 0x7f5c462d4ac8>

model_lstm_multiple_bi = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length = maxlen),
    keras.layers.Bidirectional(keras.layers.LSTM(embedding_dim, return_sequences = True)),
    keras.layers.Bidirectional(keras.layers.LSTM(embedding_dim)),
    keras.layers.Dense(512, activation = 'relu'),
    keras.layers.Dense(128, activation = 'relu'),
    keras.layers.Dense(64)
])

model_lstm_multiple_bi.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_1 (Embedding)      (None, 50, 20)            100000    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 50, 40)            6560      
_________________________________________________________________
bidirectional_2 (Bidirection (None, 40)                9760      
_________________________________________________________________
dense_3 (Dense)              (None, 512)               20992     
_________________________________________________________________
dense_4 (Dense)              (None, 128)               65664     
_________________________________________________________________
dense_5 (Dense)              (None, 64)                8256      
=================================================================
Total params: 211,232
Trainable params: 211,232
Non-trainable params: 0
_________________________________________________________________

history_lstm_multiple_bi = fit_model(model_lstm_multiple_bi, patience = 5)  
show_history(history_lstm_multiple_bi)

Epoch 1/20
7803/7803 [==============================] - 99s 13ms/step - loss: 1.1190 - accuracy: 0.5609 - val_loss: 0.9454 - val_accuracy: 0.6196
Epoch 2/20
7803/7803 [==============================] - 98s 13ms/step - loss: 0.9251 - accuracy: 0.6283 - val_loss: 0.8542 - val_accuracy: 0.6503
Epoch 3/20
7803/7803 [==============================] - 98s 13ms/step - loss: 0.8644 - accuracy: 0.6504 - val_loss: 0.8129 - val_accuracy: 0.6686
Epoch 4/20
7803/7803 [==============================] - 98s 13ms/step - loss: 0.8234 - accuracy: 0.6685 - val_loss: 0.7750 - val_accuracy: 0.6856
Epoch 5/20
7803/7803 [==============================] - 98s 13ms/step - loss: 0.7917 - accuracy: 0.6805 - val_loss: 0.7589 - val_accuracy: 0.6925
Epoch 6/20
7803/7803 [==============================] - 98s 13ms/step - loss: 0.7649 - accuracy: 0.6912 - val_loss: 0.7370 - val_accuracy: 0.7006
Epoch 7/20
7803/7803 [==============================] - 99s 13ms/step - loss: 0.7428 - accuracy: 0.7002 - val_loss: 0.7114 - val_accuracy: 0.7121
Epoch 8/20
7803/7803 [==============================] - 98s 13ms/step - loss: 0.7231 - accuracy: 0.7078 - val_loss: 0.6966 - val_accuracy: 0.7178
Epoch 9/20
7803/7803 [==============================] - 98s 13ms/step - loss: 0.7049 - accuracy: 0.7152 - val_loss: 0.6842 - val_accuracy: 0.7219
Epoch 10/20
7803/7803 [==============================] - 98s 13ms/step - loss: 0.6874 - accuracy: 0.7206 - val_loss: 0.6761 - val_accuracy: 0.7252
Epoch 11/20
7803/7803 [==============================] - 98s 13ms/step - loss: 0.6715 - accuracy: 0.7282 - val_loss: 0.6740 - val_accuracy: 0.7250
Epoch 12/20
7803/7803 [==============================] - 98s 13ms/step - loss: 0.6576 - accuracy: 0.7325 - val_loss: 0.6606 - val_accuracy: 0.7303
Epoch 13/20
7803/7803 [==============================] - 98s 13ms/step - loss: 0.6443 - accuracy: 0.7383 - val_loss: 0.6567 - val_accuracy: 0.7334
Epoch 14/20
7803/7803 [==============================] - 98s 13ms/step - loss: 0.6317 - accuracy: 0.7430 - val_loss: 0.6419 - val_accuracy: 0.7393
Epoch 15/20
7803/7803 [==============================] - 98s 13ms/step - loss: 0.6211 - accuracy: 0.7473 - val_loss: 0.6323 - val_accuracy: 0.7431
Epoch 16/20
7803/7803 [==============================] - 98s 13ms/step - loss: 0.6086 - accuracy: 0.7509 - val_loss: 0.6345 - val_accuracy: 0.7443
Epoch 17/20
7803/7803 [==============================] - 98s 13ms/step - loss: 0.5974 - accuracy: 0.7559 - val_loss: 0.6295 - val_accuracy: 0.7455
Epoch 18/20
7803/7803 [==============================] - 98s 13ms/step - loss: 0.5885 - accuracy: 0.7594 - val_loss: 0.6192 - val_accuracy: 0.7508
Epoch 19/20
7803/7803 [==============================] - 98s 13ms/step - loss: 0.5780 - accuracy: 0.7643 - val_loss: 0.6323 - val_accuracy: 0.7468
Epoch 20/20
7803/7803 [==============================] - 98s 13ms/step - loss: 0.5674 - accuracy: 0.7674 - val_loss: 0.6164 - val_accuracy: 0.7541
loss: 0.5674 - val_loss: 0.6164
accuracy: 0.7674 - val_accuracy: 0.7541

Prediction¶

test_ds = pd.read_csv('Movie Reviews (2)/Movie/test1.tsv', sep = '\t')
test_ds = tf.data.Dataset.from_tensor_slices(test_ds.Phrase.to_numpy())
test_ds = test_ds.batch(batch_size)
test_ds = test_ds.map(lambda text_batch: encode_text_batch(text_batch))

def predict_and_write_csv(model, csv_name):
    predicted = model.predict(test_ds)
    labels = list(map(tf.argmax, predicted))
    labels = list(map(lambda x: x.numpy(), labels))
    result_df = pd.DataFrame({'PhraseId': test_ds.PhraseId, 'Sentiment': labels})
    result_df.to_csv(csv_name, index = False)

	PhraseId	SentenceId	Phrase	Sentiment
0	1	1	A series of escapades demonstrating the adage ...	1
1	2	1	A series of escapades demonstrating the adage ...	2
2	3	1	A series	2
3	4	1	A	2
4	5	1	series	2

	PhraseId	SentenceId	Phrase	Sentiment
62	63	1	.	2
63	64	2	This quiet , introspective and entertaining in...	4
64	65	2	This quiet , introspective and entertaining in...	3
65	66	2	This	2
66	67	2	quiet , introspective and entertaining indepen...	4
67	68	2	quiet , introspective and entertaining	3
68	69	2	quiet	2