NOTE: This use case is not intended for resource-constrained devices.
Toxic Comment Detection¶
Credit: AITS Cainvas Community¶
Photo by Daniel Montero on Dribbble
- Discussing things you care about can be difficult. The threat of abuse and harassment online means that many people stop expressing themselves and give up on seeking different opinions. Platforms struggle to effectively facilitate conversations, leading many communities to limit or completely shut down user comments.
Setup: Installing necessary libraries¶
In [1]:
!pip install matplotlib-venn
Importing Libraries¶
In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow import keras
#visualization
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from wordcloud import WordCloud ,STOPWORDS
from PIL import Image
import matplotlib_venn as venn
#settings
color = sns.color_palette()
sns.set_style("dark")
%matplotlib inline
In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow import keras
#visualization
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from wordcloud import WordCloud ,STOPWORDS
from PIL import Image
import matplotlib_venn as venn
#settings
color = sns.color_palette()
sns.set_style("dark")
%matplotlib inline
Unzipping Dataset¶
In [4]:
# Download the dataset archive. -N turns on wget's timestamp check against a local copy;
# since the archive is deleted at the end of this cell, later runs download it again.
!wget -N "https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/toxic_comment.zip"
# Extract into the working directory: -o overwrites existing files without prompting, -q is quiet mode.
!unzip -oq toxic_comment.zip
# Remove the archive once its contents have been extracted.
!rm toxic_comment.zip
Data Pre-Processing and Visualization:¶
In [5]:
# Read the training split; pandas infers the zip compression from the file extension.
train_data = pd.read_csv(filepath_or_buffer="train.csv.zip")
train_data.head(n=5)
Out[5]:
In [6]:
# The raw comment strings are the model input.
X_train = train_data.loc[:, "comment_text"]
In [7]:
# Display the training comment texts.
X_train
Out[7]:
In [8]:
# Every column after the first two is used as the label matrix
# (presumably those two are the id and the comment text — confirm against train.csv).
y_train = train_data[train_data.columns[2:]]
y_train
Out[8]:
In [9]:
# Show only the label rows whose `toxic` flag equals 1.
y_train.loc[y_train["toxic"].eq(1)]
Out[9]:
Checking the count of the various types of comments¶
In [10]:
# Count how many comments carry each label and draw the totals as an annotated bar chart.
cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
val_counts = y_train[cols].sum()

plt.figure(figsize=(8,5))
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
ax = sns.barplot(x=val_counts.index, y=val_counts.values, alpha=0.8)
plt.title("Comments per Classes")
plt.xlabel("Various Comments Type")
plt.ylabel("Counts of the Comments")

# Write each count just above its bar, centred horizontally on the bar.
rects = ax.patches
labels = val_counts.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height+5, label, ha="center", va="bottom")

plt.show()
In [11]:
# Join every training comment into one string and render its words as a cloud.
words = ' '.join(X_train)

word_cloud = WordCloud(
    background_color="white",
    width=1600,
    height=800,
    margin=0,
    max_words=500,        # upper bound on the number of words drawn
    min_word_length=3,    # shorter words are left out of the cloud
    min_font_size=30,     # font size range: 30 .. 150
    max_font_size=150,
).generate(words)

plt.figure(figsize=(10, 16))
plt.imshow(word_cloud, interpolation="gaussian")
plt.title('Comments and their Nature', fontsize=40)
plt.axis("off")
plt.show()
Tokenization¶
In [12]:
# Keras text tokenizer created with its default settings; it is fitted on the
# training comments in a later cell.
tokenizer = keras.preprocessing.text.Tokenizer()
In [13]:
# Build the tokenizer's word index from the training comments.
tokenizer.fit_on_texts(X_train)
In [14]:
# Replace each comment with its sequence of integer word indices.
# NOTE: X_train is rebound here, so re-running this cell on its own would feed the
# already-encoded sequences back into the tokenizer instead of the raw text.
X_train = tokenizer.texts_to_sequences(X_train)
In [15]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Bring every encoded comment to a fixed length of 100 tokens.
X_train = pad_sequences(sequences=X_train, maxlen=100)
In [16]:
# After padding with maxlen=100, the second dimension should be 100.
X_train.shape
Out[16]:
Model creation and Training¶
In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training data for validation. The fixed random_state makes the
# split, and therefore the reported metrics, repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)
In [18]:
model = keras.Sequential([
keras.layers.Dense(20, activation="tanh"),
keras.layers.Dense(6, activation="softmax")
])
In [19]:
model.compile(loss="categorical_crossentropy", optimizer="sgd", metrics=["accuracy"])
In [20]:
# Train for five epochs, tracking the metrics on the validation split after each one.
model_history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=5,
)
In [21]:
## Plotting training and validation accuracy per epoch
In [22]:
# Accuracy per epoch on the training and validation sets, drawn on one set of axes.
plt.plot(model_history.history['accuracy'], label='Train')
plt.plot(model_history.history['val_accuracy'], label='Validation')
plt.title('Model accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='upper left')
plt.show()
Save model¶
In [23]:
# Write the trained model to final_model.h5 in the current working directory.
model.save('final_model.h5')
Evaluation¶
In [24]:
# Read the test split; pandas infers the zip compression from the file extension.
test_data = pd.read_csv(filepath_or_buffer='test.csv.zip')
In [25]:
# Preview the first rows of the test split.
test_data.head()
Out[25]:
In [26]:
# The raw test comment strings.
X_test = test_data.loc[:, 'comment_text']
In [27]:
# Display the test comment texts.
X_test
Out[27]:
In [28]:
# Check which type the comment_text column selection returns.
type(test_data['comment_text'])
Out[28]:
In [29]:
# Encode the test comments with the tokenizer that was fitted on the training data, so
# every word maps to the same integer index the model saw during training. Fitting a
# separate tokenizer on the test set would assign different indices to the same words
# and make the model's inputs meaningless.
# Words that never occurred in the training comments are dropped by texts_to_sequences,
# because the tokenizer was created without an out-of-vocabulary token.
X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=100)
In [30]:
# Inspect the first encoded and padded test comment.
X_test[0]
Out[30]:
In [31]:
# Load the test labels and drop the leading column so only the label columns remain
# (presumably the dropped column is the comment id — confirm against test_labels.csv).
y_test = pd.read_csv('test_labels.csv.zip').iloc[:, 1:]
y_test
Out[31]:
In [32]:
# Evaluate the model on the test data using `evaluate`
# NOTE(review): in the Kaggle Jigsaw release, test_labels marks comments that were not
# used for scoring with -1 in the label columns — confirm this file follows the same
# convention. Such rows carry no ground truth, so they are excluded from the evaluation.
# If no -1 values are present, the mask keeps every row and nothing changes.
scored_rows = (y_test != -1).all(axis=1).to_numpy()
print("Evaluate on test data")
results = model.evaluate(X_test[scored_rows], y_test[scored_rows], batch_size=128)
print("test accuracy:", results[1])