sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] compiler_3.6.0  magrittr_1.5    tools_3.6.0     htmltools_0.3.6
##  [5] yaml_2.2.0      Rcpp_1.0.2      stringi_1.4.3   rmarkdown_1.15 
##  [9] knitr_1.25      stringr_1.4.0   xfun_0.9        digest_0.6.21  
## [13] evaluate_0.14

Source: https://tensorflow.rstudio.com/keras/articles/examples/imdb_lstm.html

Prepare data

From the documentation:

Dataset of 25,000 movie reviews from IMDB, labeled by sentiment (positive/negative). Reviews have been preprocessed, and each review is encoded as a sequence of word indexes (integers). For convenience, words are indexed by overall frequency in the dataset, so that for instance the integer “3” encodes the 3rd most frequent word in the data. This allows for quick filtering operations such as: “only consider the top 10,000 most common words, but eliminate the top 20 most common words”.
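
The filtering described above maps directly onto arguments of dataset_imdb(). A minimal sketch (not part of the original example, and using the keras package loaded in the next chunk):

# Sketch only: keep the 10,000 most frequent words but drop the 20 most
# frequent ones; skipped and out-of-vocabulary words become the reserved
# index 2 by default.
imdb_small <- dataset_imdb(num_words = 10000, skip_top = 20)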

Retrieve IMDB data:

library(keras)

max_features <- 20000
batch_size <- 32

# Cut texts after this number of words (among top max_features most common words)
maxlen <- 80  

cat('Loading data...\n')
## Loading data...
imdb <- dataset_imdb(num_words = max_features)
## Loaded Tensorflow version 2.8.0
imdb$train$x[[1]]
##   [1]     1    14    22    16    43   530   973  1622  1385    65   458
##  [12]  4468    66  3941     4   173    36   256     5    25   100    43
##  [23]   838   112    50   670     2     9    35   480   284     5   150
##  [34]     4   172   112   167     2   336   385    39     4   172  4536
##  [45]  1111    17   546    38    13   447     4   192    50    16     6
##  [56]   147  2025    19    14    22     4  1920  4613   469     4    22
##  [67]    71    87    12    16    43   530    38    76    15    13  1247
##  [78]     4    22    17   515    17    12    16   626    18 19193     5
##  [89]    62   386    12     8   316     8   106     5     4  2223  5244
## [100]    16   480    66  3785    33     4   130    12    16    38   619
## [111]     5    25   124    51    36   135    48    25  1415    33     6
## [122]    22    12   215    28    77    52     5    14   407    16    82
## [133] 10311     8     4   107   117  5952    15   256     4     2     7
## [144]  3766     5   723    36    71    43   530   476    26   400   317
## [155]    46     7     4 12118  1029    13   104    88     4   381    15
## [166]   297    98    32  2071    56    26   141     6   194  7486    18
## [177]     4   226    22    21   134   476    26   480     5   144    30
## [188]  5535    18    51    36    28   224    92    25   104     4   226
## [199]    65    16    38  1334    88    12    16   283     5    16  4472
## [210]   113   103    32    15    16  5345    19   178    32
imdb$train$y[[1]]
## [1] 1
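
As an aside (not in the original article), the integer encoding can be inverted with the word index that ships with the dataset. Indices in the loaded data are offset by 3, because 0, 1 and 2 are reserved by default for padding, start-of-sequence and out-of-vocabulary markers:

# Map words -> frequency rank, then invert it to rank -> word
word_index <- dataset_imdb_word_index()
reverse_index <- setNames(names(word_index), unlist(word_index))

decode_review <- function(ids) {
  # Shift each stored index back by the default offset of 3;
  # reserved indices become "?"
  words <- ifelse(ids > 3, reverse_index[as.character(ids - 3)], "?")
  paste(words, collapse = " ")
}

# e.g. decode_review(imdb$train$x[[1]])
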
x_train <- imdb$train$x
y_train <- imdb$train$y
x_test <- imdb$test$x
y_test <- imdb$test$y

cat(length(x_train), 'train sequences\n')
## 25000 train sequences
cat(length(x_test), 'test sequences\n')
## 25000 test sequences
cat('Pad sequences (samples x time)\n')
## Pad sequences (samples x time)
x_train <- pad_sequences(x_train, maxlen = maxlen)
x_test <- pad_sequences(x_test, maxlen = maxlen)
cat('x_train shape:', dim(x_train), '\n')
## x_train shape: 25000 80
cat('x_test shape:', dim(x_test), '\n')
## x_test shape: 25000 80
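
pad_sequences() turns the list of variable-length reviews into a 25000 x 80 integer matrix. A small added illustration of its default behaviour (both padding and truncating default to "pre"):

toy <- list(c(1, 2, 3), c(4, 5, 6, 7, 8, 9))
pad_sequences(toy, maxlen = 5)
# Short sequences are left-padded with zeros (0 0 1 2 3) and long ones are
# truncated from the front (5 6 7 8 9).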

Build model

cat('Build model...\n')
## Build model...
model <- keras_model_sequential()
model %>%
  layer_embedding(input_dim = max_features, output_dim = 128) %>% 
  layer_lstm(units = 64, dropout = 0.2, recurrent_dropout = 0.2) %>% 
  layer_dense(units = 1, activation = 'sigmoid')

# Try using different optimizers and different optimizer configs
model %>% compile(
  loss = 'binary_crossentropy',
  optimizer = 'adam',
  metrics = c('accuracy')
)
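
The comment above suggests trying other optimizers and configurations. A hedged variant (not in the original) that passes an explicit Adam learning rate; older keras versions spell the argument lr instead of learning_rate:

# Variant of the compile() call above with an explicit learning rate.
model %>% compile(
  loss = 'binary_crossentropy',
  optimizer = optimizer_adam(learning_rate = 1e-3),
  metrics = c('accuracy')
)
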
summary(model)
## Model: "sequential"
## ___________________________________________________________________________
##  Layer (type)                    Output Shape                  Param #     
## ===========================================================================
##  embedding (Embedding)           (None, None, 128)             2560000     
##                                                                            
##  lstm (LSTM)                     (None, 64)                    49408       
##                                                                            
##  dense (Dense)                   (None, 1)                     65          
##                                                                            
## ===========================================================================
## Total params: 2,609,473
## Trainable params: 2,609,473
## Non-trainable params: 0
## ___________________________________________________________________________
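
The parameter counts in the summary can be checked by hand (added arithmetic, not in the original):

20000 * 128                     # embedding: one 128-d vector per word = 2,560,000
4 * (128 * 64 + 64 * 64 + 64)   # LSTM: 4 gates x (input + recurrent weights + bias) = 49,408
64 * 1 + 1                      # dense: one weight per LSTM unit plus a bias = 65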

Training

cat('Train...\n')
## Train...
system.time({
model %>% fit(
  x_train, y_train,
  batch_size = batch_size,
  epochs = 10,
  validation_data = list(x_test, y_test)
)
})
##     user   system  elapsed 
## 4201.186  572.993 1623.973
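
The run above trains for a fixed 10 epochs. A common variation (sketch only, not part of the original run) is to stop once validation loss stops improving and keep the best weights:

history <- model %>% fit(
  x_train, y_train,
  batch_size = batch_size,
  epochs = 10,
  validation_data = list(x_test, y_test),
  callbacks = list(
    # Stop after 2 epochs without val_loss improvement; restore_best_weights
    # requires a reasonably recent keras version
    callback_early_stopping(monitor = "val_loss", patience = 2,
                            restore_best_weights = TRUE)
  )
)
plot(history)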

Testing

scores <- model %>% evaluate(
  x_test, y_test,
  batch_size = batch_size
)
cat('Test score:', scores[[1]])
## Test score: 0.9772567
cat('Test accuracy:', scores[[2]])
## Test accuracy: 0.802
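
evaluate() only reports aggregate metrics; per-review probabilities can be obtained with predict() on the padded matrix (added sketch):

# Positive-sentiment probability for the first five test reviews;
# values above 0.5 correspond to a predicted positive label.
probs <- model %>% predict(x_test[1:5, , drop = FALSE])
round(as.vector(probs), 3)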