Natural Language Processing (NLP) in Python with 8 Projects - IMDB and Amazon Review Classification with SpaCy

1) Importing the datasets

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data_yelp = pd.read_csv('data/yelp_labelled.txt', sep='\t',header=None)

yelp_labelled.txt 데이터는 아래 링크를 클릭하면 보실 수 있습니다.

링크 -

# review and sentiment
# 0 = negative review, 1 = positive review
0 1
0 Wow... Loved this place. 1
1 Crust is not good. 0
2 Not tasty and the texture was just nasty. 0
3 Stopped by during the late May bank holiday of... 1
4 The selection on the menu was great and so wer... 1
# Assign column names
columan_name = ['Review', 'Sentiment']
data_yelp.columns = columan_name
Review Sentiment
0 Wow... Loved this place. 1
1 Crust is not good. 0
2 Not tasty and the texture was just nasty. 0
3 Stopped by during the late May bank holiday of... 1
4 The selection on the menu was great and so wer... 1
# 1000 rows (reviews), 2 columns (Review, Sentiment)
(1000, 2)
data_amazon = pd.read_csv('data/amazon_cells_labelled.txt',sep='\t',header=None)
# review and sentiment
# 0 = negative review, 1 = positive review
0 1
0 So there is no way for me to plug it in here i... 0
1 Good case, Excellent value. 1
2 Great for the jawbone. 1
3 Tied to charger for conversations lasting more... 0
4 The mic is great. 1

amazon_cells_labelled.txt 데이터는 아래 링크를 클릭하면 보실 수 있습니다.


data_amazon.columns = columan_name
Review Sentiment
0 So there is no way for me to plug it in here i... 0
1 Good case, Excellent value. 1
2 Great for the jawbone. 1
3 Tied to charger for conversations lasting more... 0
4 The mic is great. 1
(1000, 2)
data_imdb = pd.read_csv('data/imdb_labelled.txt',sep='\t',header=None)
0 1
0 A very, very, very slow-moving, aimless movie ... 0
1 Not sure who was more lost - the flat characte... 0
2 Attempting artiness with black & white and cle... 0
3 Very little music or anything to speak of. 0
4 The best scene in the movie was when Gerardo i... 1

imdb_labelled.txt 데이터는 아래 링크를 클릭하면 보실 수 있습니다.

링크 -

data_imdb.columns = columan_name
Review Sentiment
0 A very, very, very slow-moving, aimless movie ... 0
1 Not sure who was more lost - the flat characte... 0
2 Attempting artiness with black & white and cle... 0
3 Very little music or anything to speak of. 0
4 The best scene in the movie was when Gerardo i... 1
(748, 2)
# Combine the Yelp, Amazon and IMDB reviews into a single dataframe.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat with ignore_index=True builds the same combined frame with a
# fresh 0..n-1 index.
data = pd.concat([data_yelp, data_amazon, data_imdb], ignore_index=True)
(2748, 2)
Review Sentiment
0 Wow... Loved this place. 1
1 Crust is not good. 0
2 Not tasty and the texture was just nasty. 0
3 Stopped by during the late May bank holiday of... 1
4 The selection on the menu was great and so wer... 1
# check distribution of sentiments

# 1386 positive reviews
# 1362 Negative reviews
1    1386
0    1362
Name: Sentiment, dtype: int64
# check for null values

# no null values in the data
Review       0
Sentiment    0
dtype: int64
x = data['Review']
y = data['Sentiment']

2) Data Cleaning

# Here we remove stopwords and punctuation,
# and also apply lemmatization.

Create a function to clean the data

string.punctuation 은 무엇일까?

참고사이트 -

import string
punct = string.punctuation
from spacy.lang.en.stop_words import STOP_WORDS
stopwords = list(STOP_WORDS) # list of stopwords
# creating a function for data cleaning
import spacy 

nlp = spacy.load('en_core_web_sm')

def text_data_cleaning(sentence):
    """Tokenize *sentence* with spaCy and return its cleaned tokens.

    Each token is reduced to its lowercased, whitespace-stripped lemma. A
    token whose lemma is the "-PRON-" placeholder is kept as its lowercased
    surface form instead. Tokens found in ``stopwords`` or in ``punct`` are
    then dropped.

    Args:
        sentence: Raw text to clean.

    Returns:
        A list of cleaned token strings, in their original order.
    """
    doc = nlp(sentence)

    tokens = []  # normalized form of every token in the sentence
    for token in doc:
        # NOTE(review): "-PRON-" looks like the spaCy v2 pronoun lemma;
        # confirm against the installed spaCy version.
        if token.lemma_ != "-PRON-":
            temp = token.lemma_.lower().strip()
        else:
            # The placeholder lemma carries no information, so fall back to
            # the lowercased token text.
            temp = token.lower_
        tokens.append(temp)

    # Keep only tokens that are neither stopwords nor punctuation.
    return [tok for tok in tokens if tok not in stopwords and tok not in punct]
# If the lemma (root form) of a word is not the pronoun placeholder "-PRON-", we use the lowercased lemma;
# if the word is a pronoun, we take its lowercased text directly, because a pronoun has no useful lemma.
text_data_cleaning("Hello all, It's a beautiful day outside there!")
# stopwords and punctuations removed
['hello', 'beautiful', 'day', 'outside']

Vectorization Feature Engineering (TF-IDF)

from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
tfidf = TfidfVectorizer(tokenizer=text_data_cleaning)
# tokenizer=text_data_cleaning, tokenization will be done according to this function
classifier = LinearSVC()

3) Train the model

Splitting the dataset into the Train and Test set

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)
x_train.shape, x_test.shape
# 2198 samples in training dataset and 550 in test dataset
((2198,), (550,))
2572    An Italian reviewer called this "a small, grea...
526                          And it was way to expensive.
1509    As an earlier review noted, plug in this charg...
144     Nice blanket of moz over top but i feel like t...
2483    The film gives meaning to the phrase, "Never i...
Name: Review, dtype: object

Fit the x_train and y_train

# Chain the TF-IDF vectorizer and the LinearSVC classifier: raw text is
# vectorized first and then classified.
clf = Pipeline([('tfidf',tfidf), ('clf',classifier)])
# Train the pipeline on the raw training reviews. Without this call the
# later clf.predict(...) fails because the pipeline is not fitted.
clf.fit(x_train, y_train)
                 TfidfVectorizer(tokenizer=<function text_data_cleaning at 0x0000015BB7D451F0>)),
                ('clf', LinearSVC())])
# The pipeline vectorizes its input itself, so x_test needs no separate preprocessing before prediction.

4) Predict the Test set results

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
y_pred = clf.predict(x_test)
# confusion_matrix
confusion_matrix(y_test, y_pred)
array([[203,  76],
       [ 49, 222]], dtype=int64)
# classification_report
print(classification_report(y_test, y_pred))
# we are getting almost 77% accuracy
              precision    recall  f1-score   support

           0       0.81      0.73      0.76       279
           1       0.74      0.82      0.78       271

    accuracy                           0.77       550
   macro avg       0.78      0.77      0.77       550
weighted avg       0.78      0.77      0.77       550
accuracy_score(y_test, y_pred)
# 77% accuracy
clf.predict(["Wow, I am learning Natural Language Processing in fun fashion!"])
# output is 1, that means review is positive
array([1], dtype=int64)
clf.predict(["It's hard to learn new things!"])
# output is 0, that means review is Negative
array([0], dtype=int64)