NOTE: This use case is not intended for resource-constrained devices.
In [1]:
!pip install wget
Importing Necessary Libraries
In [2]:
import os
import wget
import numpy as np
import pandas as pd
In [3]:
import tensorflow as tf
keras = tf.keras
tf.__version__
Out[3]:
Importing Dataset
In [4]:
url = "https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/Movie.zip"
file = wget.download(url)
In [5]:
import zipfile
data = zipfile.ZipFile(file,"r")
data.extractall("Movie Reviews (2)")
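As a quick sanity check, the extracted contents can be listed before reading the TSVs (the Movie/ subfolder is assumed from the paths used in the cells below):
# Optional check: train1.tsv and test1.tsv should appear under the Movie/ subfolder
print(os.listdir("Movie Reviews (2)"))
print(os.listdir("Movie Reviews (2)/Movie"))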
Data Exploration
In [6]:
import pandas as pd
train_data = pd.read_csv('Movie Reviews (2)/Movie/train1.tsv', sep = '\t')
print(train_data.Phrase[0])
print(train_data.Phrase[1])
print(train_data.Phrase[2])
train_data.head(5)
Out[6]:
In [7]:
print(train_data.Phrase[63])
train_data[62:64+5]
Out[7]:
Text Preprocessing
In [8]:
split_size = int(len(train_data) * 0.8)
total_phrases = train_data.Phrase.to_numpy()
train_phrases = total_phrases[:split_size]
valid_phrases = total_phrases[split_size:]
total_sentiments = train_data.Sentiment.to_numpy()
train_sentiments = total_sentiments[:split_size]
valid_sentiments = total_sentiments[split_size:]
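A small optional check of the 80/20 split and the label distribution (the exact counts depend on the dataset):
# Roughly 80% of the rows go to training, the remaining 20% to validation
print(len(train_phrases), len(valid_phrases))
# Distribution of the sentiment labels in the training portion
print(np.unique(train_sentiments, return_counts = True))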
In [9]:
train_ds = tf.data.Dataset.from_tensor_slices((train_phrases, train_sentiments))
valid_ds = tf.data.Dataset.from_tensor_slices((valid_phrases, valid_sentiments))
total_ds = tf.data.Dataset.from_tensor_slices((total_phrases, total_sentiments))
for phrase, sentiment in train_ds.take(4):
    print(phrase, sentiment)
print()
for phrase, sentiment in valid_ds.take(1):
    print(phrase, sentiment)
In [10]:
word_set = set()
lines = []
for line, _ in train_ds:
    line = line.numpy().decode('utf-8')
    lines.append(line)
    for w in line.split(' '):
        word_set.add(w)
print(len(word_set))
for index, w in enumerate(word_set):
    if index >= 10:
        break
    print(f'{index:3}: {w}')
In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
vocab_size = 5000
tokenizer = Tokenizer(num_words = vocab_size, oov_token = '<OOV>')
tokenizer.fit_on_texts(lines)
word_index = tokenizer.word_index
for index, (a, b) in enumerate(word_index.items()):
    if index >= 5:
        break
    print(a, b)
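Words outside the fitted vocabulary (or beyond the top vocab_size entries) are mapped to the <OOV> index; a small illustration with a made-up word (the example sentence is an assumption):
# 'qzxw' was never seen during fit_on_texts, so it maps to the OOV index (1)
print(tokenizer.texts_to_sequences(['this qzxw movie']))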
In [12]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
In [13]:
for line in lines:
    print(line)
    sequences = tokenizer.texts_to_sequences([line])[0]
    print(sequences)
    print([f'{id}: {reverse_word_index[id]}' for id in sequences])
    break
In [14]:
import numpy as np
sequences = tokenizer.texts_to_sequences(lines[:30])
np.max(list(map(len, sequences)))
maxlen = 50
In [15]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
def pad_f(sequences):
    return pad_sequences(sequences, maxlen = maxlen, padding = 'post', truncating = 'post')
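A short sketch of what pad_f does: shorter sequences are zero-padded at the end, longer ones truncated to maxlen (the toy sequences below are made up):
# One short and one overly long toy sequence
demo = pad_f([[5, 3, 8], list(range(1, 60))])
print(demo.shape)   # (2, 50): every row is forced to maxlen
print(demo[0][:6])  # [5 3 8 0 0 0]: 'post' padding with zeros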
In [16]:
batch_size = 16
def tokenize_and_pad_sequence(text_batch):
    texts = map(lambda t: t.numpy().decode('utf-8'), text_batch)
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_f(sequences)
# https://www.tensorflow.org/api_docs/python/tf/function
@tf.function
def encode_text_batch(text_batch):
    return tf.py_function(
        func = tokenize_and_pad_sequence,
        inp = [text_batch],
        Tout = tf.int32,
    )
def create_batch_ds_inner(ds):
    ds = ds.batch(batch_size)
    ds = ds.map(lambda text_batch, label_batch: (encode_text_batch(text_batch), label_batch))
    return ds.cache()
def create_batch_ds(ds, do_shuffle = True):
    ds = create_batch_ds_inner(ds)
    if do_shuffle:
        ds = ds.shuffle(100)
    return ds.prefetch(tf.data.experimental.AUTOTUNE)
for text, _ in train_ds:
    print(text)
    break
train_batch_ds = create_batch_ds(train_ds)
valid_batch_ds = create_batch_ds(valid_ds, do_shuffle = False)
total_batch_ds = create_batch_ds(total_ds, do_shuffle = False)
for text_batch, label_batch in create_batch_ds(train_ds, do_shuffle = False).take(1):
    print(label_batch.shape)
    print(text_batch.shape)
    print(text_batch[0])
    for index in text_batch[0]:
        index = index.numpy()
        if index > 0:
            print(f'{index}: {reverse_word_index[index]}')
Graphs For Analysis
In [17]:
import pandas as pd
import matplotlib.pyplot as plt
def plot_item(history_df, colname = 'loss', f = np.min, ax = None):
    val_colname = f'val_{colname}'
    print(f'{colname}: {f(history_df[colname]):.4f} - {val_colname}: {f(history_df[val_colname]):.4f}')
    history_df.loc[:, [colname, val_colname]].plot(title = colname.capitalize(), ax = ax)
def show_history(history):
    history_df = pd.DataFrame(history.history)
    fig, axes = plt.subplots(nrows = 1, ncols = 2, figsize = (15, 5))
    plot_item(history_df, 'loss', ax = axes[0])
    plot_item(history_df, 'accuracy', ax = axes[1], f = np.max)
Model
In [18]:
def fit_model(model, train_batch_ds = train_batch_ds, epochs = 20, patience = 2):
    model.compile(
        optimizer = 'adam',
        loss = keras.losses.SparseCategoricalCrossentropy(from_logits = True),
        metrics = ['accuracy'],
    )
    early_stopping = keras.callbacks.EarlyStopping(
        patience = patience,
        restore_best_weights = True,
    )
    history = model.fit(
        train_batch_ds,
        validation_data = valid_batch_ds,
        epochs = epochs,
        callbacks = [early_stopping],
    )
    return history
In [19]:
embedding_dim = 20
model_lstm_bi = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length = maxlen),
    keras.layers.Bidirectional(keras.layers.LSTM(embedding_dim)),
    keras.layers.Dense(512, activation = 'relu'),
    keras.layers.Dense(128, activation = 'relu'),
    keras.layers.Dense(5),  # one logit per sentiment class (labels 0-4)
])
model_lstm_bi.summary()
In [20]:
history_lstm_bi = fit_model(model_lstm_bi, patience = 5)
history_lstm_bi
Out[20]:
In [21]:
model_lstm_multiple_bi = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length = maxlen),
    keras.layers.Bidirectional(keras.layers.LSTM(embedding_dim, return_sequences = True)),
    keras.layers.Bidirectional(keras.layers.LSTM(embedding_dim)),
    keras.layers.Dense(512, activation = 'relu'),
    keras.layers.Dense(128, activation = 'relu'),
    keras.layers.Dense(5),  # one logit per sentiment class (labels 0-4)
])
model_lstm_multiple_bi.summary()
In [22]:
history_lstm_multiple_bi = fit_model(model_lstm_multiple_bi, patience = 5)
show_history(history_lstm_multiple_bi)
Prediction
In [23]:
# Keep the DataFrame (test_df) so PhraseId is still available when writing predictions
test_df = pd.read_csv('Movie Reviews (2)/Movie/test1.tsv', sep = '\t')
test_ds = tf.data.Dataset.from_tensor_slices(test_df.Phrase.to_numpy())
test_ds = test_ds.batch(batch_size)
test_ds = test_ds.map(lambda text_batch: encode_text_batch(text_batch))
In [24]:
def predict_and_write_csv(model, csv_name):
    predicted = model.predict(test_ds)
    labels = list(map(tf.argmax, predicted))
    labels = list(map(lambda x: x.numpy(), labels))
    # PhraseId comes from the original test DataFrame, not the encoded tf.data pipeline
    result_df = pd.DataFrame({'PhraseId': test_df.PhraseId, 'Sentiment': labels})
    result_df.to_csv(csv_name, index = False)
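An example call, writing the stacked bidirectional LSTM's predictions to a CSV (the file name is illustrative):
predict_and_write_csv(model_lstm_multiple_bi, 'submission_lstm_multiple_bi.csv')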