Transformers for Natural Language Processing (NLP)

Natural Language Processing

Figure 1: Natural Language Processing

What do the Transformers do?

Figure 2: Attention mechanism (Source: )

The architecture of Transformers

Figure 3: The Transformer — Model architecture (Source:

Showcase: Quotes classification

"Quote": "Don't cry because it's over, smile because it happened.",
"Author":"Dr. Seuss",
"Tags":[ "attributed-no-source", "cry", "crying", ...],
"Popularity": 0.15566615566615566,

Install and import libraries

!pip install torch
!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install seaborn
!pip install scikit-learn
!pip install transformers
!pip install pytorch-lightning
# Import all libraries
import pandas as pd
import numpy as np
import re
import os

# Huggingface transformers
import transformers
from transformers import BertModel,BertTokenizer,AdamW, get_linear_schedule_with_warmup

import torch
from torch import nn ,cuda
from import DataLoader,Dataset,RandomSampler, SequentialSampler

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
%matplotlib inline

is_gpu_available = torch.cuda.is_available()
device = torch.device("cuda:0" if is_gpu_available else "cpu")
if is_gpu_available:

Download the dataset

  1. Sign in to Kaggle, then click on your profile picture on the top right and select “My Account” from the menu.
  2. Scroll down to the “API” section and click “Create New API Token”. This will download a file kaggle.json.
  3. Upload the downloaded kaggle.json file in the next cell.
"""The Kaggle dataset path"""
KAGGLE_DATASET ='akmittal/quotes-dataset'

!pip install -q kaggle
from google.colab import files

!pip install -q kaggle
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json
!kaggle datasets download "{KAGGLE_DATASET}"
!mkdir /content/dataset
!unzip -q /content/ -d /content/dataset
!ls /content/dataset

Load dataset

import pandas as pd

# Load the unzipped Kaggle quotes dump, keep only the two columns the
# classifier needs, and collapse repeated quotes to a single record.
df = (
    pd.read_json('/content/dataset/quotes.json')
    [['Quote', 'Tags']]
    .drop_duplicates(['Quote'])
)
print(f'The data frame contains {len(df)} records.')

Normalize dataset

# Normalize every tag (lower-case, trimmed) so variants merge into one label.
df.Tags = df.Tags.transform(lambda tags: [tag.lower().strip() for tag in tags])

# Flatten the per-quote tag lists into one list of all tag occurrences.
tags = [element for list_ in df.Tags for element in list_]
tags = [tag.lower().strip() for tag in tags]

print(f'There are {len(tags)} tags.')
# There are 215664 tags.
# Keep only the 15 most frequent tags as classification targets.
classes = pd.Series(tags).value_counts()[:15].index
classes = list(set(classes))
# Restrict each quote's tags to the selected classes; drop untagged quotes.
df['Tags'] = df.Tags.transform(lambda tags: list(set(tags).intersection(classes)))
df = df[df.Tags.transform(lambda tags: len(tags) > 0)]

print(f'We will only consider the following tags: {classes}.')
print(f'The data frame contains {len(df)} records with one or more tags.')
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
# FIX: this call was truncated in extraction — one-hot encode the tag lists
# into one 0/1 column per class, aligned on df's index.
df = df.join(pd.DataFrame(mlb.fit_transform(df.pop('Tags')),
                          columns=mlb.classes_,
                          index=df.index))

Splitting dataset into train, test, and validation data

# Work on a 2000-record subset to keep demo training time reasonable.
df = df[:2000]

# NOTE(review): both train_test_split calls were truncated in extraction.
# Reconstructed as 80/10/10 train/test/val — confirm ratios against the
# original article.
train_data, temp_data = train_test_split(df, test_size=0.2, random_state=42)

test_data, val_data = train_test_split(temp_data, test_size=0.5, random_state=42)
print(f' - dataset for training model: {train_data.shape[0]}.')
print(f' - dataset for validate trained model: {val_data.shape[0]}.')
print(f' - dataset for test the model {test_data.shape[0]}.')

Preparing the Dataset and DataModule

class QuoteTagDataset (Dataset):
def __init__(self, data, tokenizer, max_len):
self.tokenizer = tokenizer = data
self.max_len = max_len

def __len__(self):
return len(

def __getitem__(self, item_idx):
item =[item_idx]
quote = item['Quote']
labels = item[classes]

inputs = self.tokenizer.encode_plus(
max_length= self.max_len,
padding = 'max_length',
return_token_type_ids= False,
return_attention_mask= True,
return_tensors = 'pt'

input_ids = inputs['input_ids'].flatten()
attn_mask = inputs['attention_mask'].flatten()

return {
'input_ids': input_ids ,
'attention_mask': attn_mask,
'label': torch.tensor(labels, dtype=torch.float)
class QuoteTagDataModule (pl.LightningDataModule):

def __init__(self, train_data, val_data, test_data,tokenizer,train_batch_size=8, val_batch_size=8, test_batch_size=8, max_token_len=150):
self.train_data = train_data
self.test_data = test_data
self.val_data = val_data
self.tokenizer = tokenizer
self.train_batch_size = train_batch_size
self.test_batch_size = test_batch_size
self.val_batch_size = val_batch_size
self.max_token_len = max_token_len

def setup(self):
self.train_dataset = QuoteTagDataset(data=self.train_data, tokenizer=self.tokenizer,max_len = self.max_token_len)
self.val_dataset = QuoteTagDataset(data=self.val_data,tokenizer=self.tokenizer,max_len = self.max_token_len)
self.test_dataset = QuoteTagDataset(data=self.test_data,tokenizer=self.tokenizer,max_len = self.max_token_len)

def train_dataloader(self):
return DataLoader (self.train_dataset, batch_size = self.train_batch_size, shuffle = True , num_workers=0)

def val_dataloader(self):
return DataLoader (self.val_dataset,batch_size=self.val_batch_size)

def test_dataloader(self):
return DataLoader (self.test_dataset,batch_size=self.test_batch_size)
# Initialize the Bert tokenizer
BERT_MODEL_NAME = 'bert-base-cased'
Bert_tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

# Initialize the parameters that will be use for training
MAX_LEN = 128
# NOTE(review): the batch-size / epoch constants were lost in extraction but
# are referenced below and in the training cell. Values mirror the
# DataModule defaults — confirm against the original article.
TRAIN_BATCH_SIZE = 8
VAL_BATCH_SIZE = 8
TEST_BATCH_SIZE = 8
N_EPOCHS = 3

# Instantiate and set up the data_module
data_module = QuoteTagDataModule(train_data, val_data, test_data, Bert_tokenizer,
                                 TRAIN_BATCH_SIZE, VAL_BATCH_SIZE, TEST_BATCH_SIZE, MAX_LEN)
data_module.setup()

Train the Model

class QuoteTagClassifier(pl.LightningModule):

def __init__(self, n_classes=15, steps_per_epoch=None, n_epochs=3, lr=2e-5 ):

self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
self.classifier = nn.Linear(self.bert.config.hidden_size,n_classes) # outputs = number of labels
self.steps_per_epoch = steps_per_epoch
self.n_epochs = n_epochs = lr
self.criterion = nn.BCEWithLogitsLoss()

def forward(self,input_ids, attn_mask):
output = self.bert(input_ids = input_ids ,attention_mask = attn_mask)
output = self.classifier(output.pooler_output)
return output

def training_step(self,batch,batch_idx):
input_ids = batch['input_ids']
attention_mask = batch['attention_mask']
labels = batch['label']

outputs = self(input_ids,attention_mask)
loss = self.criterion(outputs,labels)
self.log('train_loss',loss , prog_bar=True,logger=True)

return {"loss" :loss, "predictions":outputs, "labels": labels }

def validation_step(self,batch,batch_idx):
input_ids = batch['input_ids']
attention_mask = batch['attention_mask']
labels = batch['label']

outputs = self(input_ids,attention_mask)
loss = self.criterion(outputs,labels)
self.log('val_loss',loss , prog_bar=True,logger=True)

return loss

def test_step(self,batch,batch_idx):
input_ids = batch['input_ids']
attention_mask = batch['attention_mask']
labels = batch['label']
outputs = self(input_ids,attention_mask)
loss = self.criterion(outputs,labels)
self.log('test_loss',loss , prog_bar=True,logger=True)
return loss

def configure_optimizers(self):
optimizer = AdamW(self.parameters() ,
warmup_steps = self.steps_per_epoch//3
total_steps = self.steps_per_epoch * self.n_epochs - warmup_steps
scheduler = get_linear_schedule_with_warmup(optimizer,warmup_steps,total_steps)
return [optimizer], [scheduler]
LR = 2e-05
# One optimizer step per mini-batch.
steps_per_epoch = len(train_data) // TRAIN_BATCH_SIZE

model = QuoteTagClassifier(n_classes=len(classes), steps_per_epoch=steps_per_epoch, n_epochs=N_EPOCHS, lr=LR)
# Instantiate the Model Trainer
# NOTE(review): `gpus` and `progress_bar_refresh_rate` are pre-2.0 Lightning
# arguments — kept as written; confirm the pinned Lightning version.
trainer = pl.Trainer(max_epochs=N_EPOCHS, gpus=1, progress_bar_refresh_rate=20)
# Train the Classifier Model
# FIX: the fit call had been fused into the comment line above., data_module)
# Evaluate the model performance on the test dataset

Evaluate Model Performance on Test Set

from import TensorDataset

# Tokenize all quotes in test_data
input_ids = []
attention_masks = []

for quote in test_data.Quote:
encoded_quote = Bert_tokenizer.encode_plus(
max_length= MAX_LEN,
padding = 'max_length',
return_token_type_ids= False,
return_attention_mask= True,
return_tensors = 'pt'


# Now convert the lists into tensors.
input_ids =, dim=0)
attention_masks =, dim=0)
labels = torch.tensor(test_data[classes].values)

# Create the DataLoader.
pred_data = TensorDataset(input_ids, attention_masks, labels)
pred_sampler = SequentialSampler(pred_data)
pred_dataloader = DataLoader(pred_data, sampler=pred_sampler, batch_size=TEST_BATCH_SIZE)
flat_pred_outs = 0
flat_true_labels = 0
# Put model in evaluation mode
model = # moving model to cuda

# Tracking variables
pred_outs, true_labels = [], []
# Predict
for batch in pred_dataloader:
# Add batch to GPU
batch = tuple( for t in batch)

# Unpack the inputs from our dataloader
b_input_ids, b_attn_mask, b_labels = batch

with torch.no_grad():
pred_out = model(b_input_ids,b_attn_mask)
pred_out = torch.sigmoid(pred_out)
pred_out = pred_out.detach().cpu().numpy()
label_ids ='cpu').numpy()

flat_pred_outs = np.concatenate(pred_outs, axis=0)
flat_true_labels = np.concatenate(true_labels, axis=0)

Predictions of Tags in the Test set

# Candidate decision thresholds: 0.40, 0.41, ..., 0.50.
threshold = np.arange(0.4, 0.51, 0.01)


# convert probabilities into 0 or 1 based on a threshold value
def classify(pred_prob, thresh):
    """Binarize a matrix of per-tag probabilities.

    Returns one list of 0/1 labels per input row; a probability at or above
    `thresh` counts as a positive tag.
    """
    y_pred = []
    for tag_label_row in pred_prob:
        # FIX: the original inner loop had lost its append statements.
        row_labels = []
        for tag_label in tag_label_row:
            row_labels.append(1 if tag_label >= thresh else 0)
        y_pred.append(row_labels)
    return y_pred
from sklearn import metrics

scores = []  # Store the list of f1 scores for prediction on each threshold

# convert labels to 1D array
y_true = flat_true_labels.ravel()

for thresh in threshold:

    # classes for each threshold
    pred_bin_label = classify(flat_pred_outs, thresh)

    # convert to 1D array
    y_pred = np.array(pred_bin_label).ravel()

    # FIX: the score accumulation was lost in extraction; without it the
    # `scores.index(max(scores))` below cannot work.
    # NOTE(review): plain binary F1 over the flattened label matrix —
    # confirm the exact metric against the original article.
    scores.append(metrics.f1_score(y_true, y_pred))

# find the optimal threshold
opt_thresh = threshold[scores.index(max(scores))]
print(f'Optimal Threshold Value = {opt_thresh}')

Performance Score Evaluation

# Binarize the test-set probabilities at the best threshold found above.
#predictions for optimal threshold
y_pred_labels = classify(flat_pred_outs,opt_thresh)
y_pred = np.array(y_pred_labels).ravel() # Flatten
from sklearn.preprocessing import MultiLabelBinarizer

# Fit a fresh binarizer on the class list so inverse_transform can map the
# 0/1 columns back to tag names (column order = sorted class names).
mlb = MultiLabelBinarizer()
yt = mlb.fit_transform([classes])

# Convert 0/1 rows into tuples of tag names for side-by-side comparison.
# NOTE(review): this rebinds y_pred (previously the flat 0/1 array) — order
# of these statements matters.
y_pred = mlb.inverse_transform(np.array(y_pred_labels))
y_act = mlb.inverse_transform(flat_true_labels)

# Comparison table: quote text vs. actual vs. predicted tags.
# NOTE(review): `df` is rebound here, shadowing the dataset frame above.
df = pd.DataFrame({'Body':test_data['Quote'],'Actual Tags':y_act,'Predicted Tags':y_pred})






If tech can change the world... why don't we use it to make it better?

Love podcasts or audiobooks? Learn on the go with our new app.

Recommended from Medium

Binary cross-entropy loss — Special case of Categorical cross-entropy loss

Cross-Validation with Code in Python

What is Machine Learning: How It Works and Why it’s Scary

What is Machine Learning: How It Works and Why it's Scary

Bitcoin Daily High Prediction

Feature Scaling and its importance in data Preprocessing Normalization vs Standardization.

YOLOv3 PyTorch Video/Image Model

BASICS: Building a model which can identify diabetes

Review: Deep Learning Face Representation by Joint Identification-Verification (DeepID2)

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
Iván Corrales Solera

Iván Corrales Solera

If tech can change the world... why don't we use it to make it better?

More from Medium

BERT(Bidirectional Encoder Representation from Transformer)

Going the extra mile, lessons learnt from Kaggle on how to train better NLP models (Part II)

Two minutes NLP — Intro to Word Error Rate (WER) for Speech-to-Text

Paraphrasing in Natural Language Processing (NLP)