# Report the R version, platform, and loaded packages for reproducibility
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] compiler_4.1.2  magrittr_2.0.1  fastmap_1.1.0   tools_4.1.2    
##  [5] htmltools_0.5.2 yaml_2.2.1      jquerylib_0.1.4 stringi_1.7.6  
##  [9] rmarkdown_2.11  knitr_1.37      stringr_1.4.0   xfun_0.29      
## [13] digest_0.6.29   rlang_0.4.12    evaluate_0.14

Load tidyverse and other packages for this lecture:

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
## 
##     guess_encoding
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

Web scraping

There is a wealth of data on the internet. How can we scrape and analyze it?

rvest

rvest is an R package written by Hadley Wickham which makes web scraping easy.

Example: Scraping from webpage

Rank

  • Use SelectorGadget to highlight the element we want to scrape

  • Use the CSS selector to get the rankings

    # Select the rank elements with the CSS selector, then print the
    # resulting node set
    rank_data_html <- webpage |> html_nodes('.text-primary')
    rank_data_html
    ## {xml_nodeset (100)}
    ##  [1] <span class="lister-item-index unbold text-primary">1.</span>
    ##  [2] <span class="lister-item-index unbold text-primary">2.</span>
    ##  [3] <span class="lister-item-index unbold text-primary">3.</span>
    ##  [4] <span class="lister-item-index unbold text-primary">4.</span>
    ##  [5] <span class="lister-item-index unbold text-primary">5.</span>
    ##  [6] <span class="lister-item-index unbold text-primary">6.</span>
    ##  [7] <span class="lister-item-index unbold text-primary">7.</span>
    ##  [8] <span class="lister-item-index unbold text-primary">8.</span>
    ##  [9] <span class="lister-item-index unbold text-primary">9.</span>
    ## [10] <span class="lister-item-index unbold text-primary">10.</span>
    ## [11] <span class="lister-item-index unbold text-primary">11.</span>
    ## [12] <span class="lister-item-index unbold text-primary">12.</span>
    ## [13] <span class="lister-item-index unbold text-primary">13.</span>
    ## [14] <span class="lister-item-index unbold text-primary">14.</span>
    ## [15] <span class="lister-item-index unbold text-primary">15.</span>
    ## [16] <span class="lister-item-index unbold text-primary">16.</span>
    ## [17] <span class="lister-item-index unbold text-primary">17.</span>
    ## [18] <span class="lister-item-index unbold text-primary">18.</span>
    ## [19] <span class="lister-item-index unbold text-primary">19.</span>
    ## [20] <span class="lister-item-index unbold text-primary">20.</span>
    ## ...
    # Extract the text of every rank node and show it
    rank_data <- rank_data_html |> html_text()
    rank_data
    ##   [1] "1."   "2."   "3."   "4."   "5."   "6."   "7."   "8."   "9."   "10." 
    ##  [11] "11."  "12."  "13."  "14."  "15."  "16."  "17."  "18."  "19."  "20." 
    ##  [21] "21."  "22."  "23."  "24."  "25."  "26."  "27."  "28."  "29."  "30." 
    ##  [31] "31."  "32."  "33."  "34."  "35."  "36."  "37."  "38."  "39."  "40." 
    ##  [41] "41."  "42."  "43."  "44."  "45."  "46."  "47."  "48."  "49."  "50." 
    ##  [51] "51."  "52."  "53."  "54."  "55."  "56."  "57."  "58."  "59."  "60." 
    ##  [61] "61."  "62."  "63."  "64."  "65."  "66."  "67."  "68."  "69."  "70." 
    ##  [71] "71."  "72."  "73."  "74."  "75."  "76."  "77."  "78."  "79."  "80." 
    ##  [81] "81."  "82."  "83."  "84."  "85."  "86."  "87."  "88."  "89."  "90." 
    ##  [91] "91."  "92."  "93."  "94."  "95."  "96."  "97."  "98."  "99."  "100."
    # Coerce the rank labels (e.g. "1.") to integers and show the result
    rank_data <- rank_data |> as.integer()
    rank_data
    ##   [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
    ##  [19]  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
    ##  [37]  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
    ##  [55]  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
    ##  [73]  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
    ##  [91]  91  92  93  94  95  96  97  98  99 100

Title

  • Use SelectorGadget to find the CSS selector .lister-item-header a.

    # Select the title links with the CSS selector, then print the node set
    title_data_html <- webpage |> html_nodes('.lister-item-header a')
    title_data_html
    ## {xml_nodeset (100)}
    ##  [1] <a href="/title/tt10023022/?ref_=adv_li_tt">Clean</a>
    ##  [2] <a href="/title/tt10886166/?ref_=adv_li_tt">365 Days</a>
    ##  [3] <a href="/title/tt10272386/?ref_=adv_li_tt">The Father</a>
    ##  [4] <a href="/title/tt6723592/?ref_=adv_li_tt">Tenet</a>
    ##  [5] <a href="/title/tt9620292/?ref_=adv_li_tt">Promising Young Woman</a>
    ##  [6] <a href="/title/tt9214832/?ref_=adv_li_tt">Emma.</a>
    ##  [7] <a href="/title/tt9731534/?ref_=adv_li_tt">The Night House</a>
    ##  [8] <a href="/title/tt8332922/?ref_=adv_li_tt">A Quiet Place Part II</a>
    ##  [9] <a href="/title/tt10362466/?ref_=adv_li_tt">After We Collided</a>
    ## [10] <a href="/title/tt10288566/?ref_=adv_li_tt">Another Round</a>
    ## [11] <a href="/title/tt7395114/?ref_=adv_li_tt">The Devil All the Time</a>
    ## [12] <a href="/title/tt8503618/?ref_=adv_li_tt">Hamilton</a>
    ## [13] <a href="/title/tt6673612/?ref_=adv_li_tt">Dolittle</a>
    ## [14] <a href="/title/tt9770150/?ref_=adv_li_tt">Nomadland</a>
    ## [15] <a href="/title/tt8368512/?ref_=adv_li_tt">The Courier</a>
    ## [16] <a href="/title/tt7126948/?ref_=adv_li_tt">Wonder Woman 1984</a>
    ## [17] <a href="/title/tt6475714/?ref_=adv_li_tt">Monster Hunter</a>
    ## [18] <a href="/title/tt2948372/?ref_=adv_li_tt">Soul</a>
    ## [19] <a href="/title/tt11655202/?ref_=adv_li_tt">Riders of Justice</a>
    ## [20] <a href="/title/tt9893250/?ref_=adv_li_tt">I Care a Lot</a>
    ## ...
    # Extract the text of every title link and show it
    title_data <- title_data_html |> html_text()
    title_data
    ##   [1] "Clean"                                          
    ##   [2] "365 Days"                                       
    ##   [3] "The Father"                                     
    ##   [4] "Tenet"                                          
    ##   [5] "Promising Young Woman"                          
    ##   [6] "Emma."                                          
    ##   [7] "The Night House"                                
    ##   [8] "A Quiet Place Part II"                          
    ##   [9] "After We Collided"                              
    ##  [10] "Another Round"                                  
    ##  [11] "The Devil All the Time"                         
    ##  [12] "Hamilton"                                       
    ##  [13] "Dolittle"                                       
    ##  [14] "Nomadland"                                      
    ##  [15] "The Courier"                                    
    ##  [16] "Wonder Woman 1984"                              
    ##  [17] "Monster Hunter"                                 
    ##  [18] "Soul"                                           
    ##  [19] "Riders of Justice"                              
    ##  [20] "I Care a Lot"                                   
    ##  [21] "Mainstream"                                     
    ##  [22] "Fantasy Island"                                 
    ##  [23] "The Dry"                                        
    ##  [24] "The Hunt"                                       
    ##  [25] "The Invisible Man"                              
    ##  [26] "Palm Springs"                                   
    ##  [27] "Birds of Prey"                                  
    ##  [28] "Underwater"                                     
    ##  [29] "The King of Staten Island"                      
    ##  [30] "Enola Holmes"                                   
    ##  [31] "The Forgotten Battle"                           
    ##  [32] "Rifkin's Festival"                              
    ##  [33] "Boss Level"                                     
    ##  [34] "Sonic the Hedgehog"                             
    ##  [35] "Demon Slayer: Mugen Train"                      
    ##  [36] "Mulan"                                          
    ##  [37] "The Trial of the Chicago 7"                     
    ##  [38] "The Empty Man"                                  
    ##  [39] "The Nest"                                       
    ##  [40] "Unhinged"                                       
    ##  [41] "Greyhound"                                      
    ##  [42] "The Old Guard"                                  
    ##  [43] "Love and Monsters"                              
    ##  [44] "I'm Thinking of Ending Things"                  
    ##  [45] "Shadow in the Cloud"                            
    ##  [46] "Escape from Pretoria"                           
    ##  [47] "Run"                                            
    ##  [48] "Extraction"                                     
    ##  [49] "Greenland"                                      
    ##  [50] "Zola"                                           
    ##  [51] "Rebecca"                                        
    ##  [52] "Minari"                                         
    ##  [53] "The New Mutants"                                
    ##  [54] "A Shot Through the Wall"                        
    ##  [55] "Run Hide Fight"                                 
    ##  [56] "The Croods: A New Age"                          
    ##  [57] "Bad Boys for Life"                              
    ##  [58] "The World to Come"                              
    ##  [59] "Ava"                                            
    ##  [60] "The Witches"                                    
    ##  [61] "Shiva Baby"                                     
    ##  [62] "Supernova"                                      
    ##  [63] "Black Bear"                                     
    ##  [64] "News of the World"                              
    ##  [65] "Eurovision Song Contest: The Story of Fire Saga"
    ##  [66] "Onward"                                         
    ##  [67] "Mank"                                           
    ##  [68] "Joe Bell"                                       
    ##  [69] "Freaky"                                         
    ##  [70] "Bruised"                                        
    ##  [71] "Bill & Ted Face the Music"                      
    ##  [72] "Spenser Confidential"                           
    ##  [73] "The Babysitter: Killer Queen"                   
    ##  [74] "Black Box"                                      
    ##  [75] "Pieces of a Woman"                              
    ##  [76] "A Perfect Enemy"                                
    ##  [77] "The Call of the Wild"                           
    ##  [78] "The Midnight Sky"                               
    ##  [79] "The Wrong Missy"                                
    ##  [80] "Ammonite"                                       
    ##  [81] "Amulet"                                         
    ##  [82] "Finding You"                                    
    ##  [83] "Monday"                                         
    ##  [84] "Death of a Telemarketer"                        
    ##  [85] "Falling for Figaro"                             
    ##  [86] "Honest Thief"                                   
    ##  [87] "Minamata"                                       
    ##  [88] "Attack on Titan: Chronicle"                     
    ##  [89] "Bloodshot"                                      
    ##  [90] "Trolls World Tour"                              
    ##  [91] "Possessor"                                      
    ##  [92] "All the Bright Places"                          
    ##  [93] "The Comeback Trail"                             
    ##  [94] "Let Him Go"                                     
    ##  [95] "Definition Please"                              
    ##  [96] "Project Power"                                  
    ##  [97] "The Paper Tigers"                               
    ##  [98] "#Alive"                                         
    ##  [99] "Persian Lessons"                                
    ## [100] "Inheritance"

Description

  • # Using CSS selectors to scrape the description section
    description_data_html <- webpage |>
      html_nodes('.ratings-bar+ .text-muted')
    description_data_html
    ## {xml_nodeset (100)}
    ##  [1] <p class="text-muted">\nTormented by his past, a garbage man named Clean ...
    ##  [2] <p class="text-muted">\nMassimo is a member of the Sicilian Mafia family ...
    ##  [3] <p class="text-muted">\nA man refuses all assistance from his daughter a ...
    ##  [4] <p class="text-muted">\nArmed with only one word, Tenet, and fighting fo ...
    ##  [5] <p class="text-muted">\nA young woman, traumatized by a tragic event in  ...
    ##  [6] <p class="text-muted">\nIn 1800s England, a well meaning but selfish you ...
    ##  [7] <p class="text-muted">\nA widow begins to uncover her recently deceased  ...
    ##  [8] <p class="text-muted">\nFollowing the events at home, the Abbott family  ...
    ##  [9] <p class="text-muted">\nBased on the 2014 romance novel of the same name ...
    ## [10] <p class="text-muted">\nFour high school teachers consume alcohol on a d ...
    ## [11] <p class="text-muted">\nSinister characters converge around a young man  ...
    ## [12] <p class="text-muted">\nThe real life of one of America's foremost found ...
    ## [13] <p class="text-muted">\nA physician who can talk to animals embarks on a ...
    ## [14] <p class="text-muted">\nA woman in her sixties, after losing everything  ...
    ## [15] <p class="text-muted">\nCold War spy Greville Wynne and his Russian sour ...
    ## [16] <p class="text-muted">\nDiana must contend with a work colleague and bus ...
    ## [17] <p class="text-muted">\nWhen Cpt. Artemis and her loyal soldiers are tra ...
    ## [18] <p class="text-muted">\nAfter landing the gig of a lifetime, a New York  ...
    ## [19] <p class="text-muted">\nMarkus goes home to his teenage daughter, Mathil ...
    ## [20] <p class="text-muted">\nA crooked legal guardian who drains the savings  ...
    ## ...
    # Extract the text of every description node
    description_data <- description_data_html |> html_text()
    # Preview the first few descriptions
    description_data |> head()
    ## [1] "\nTormented by his past, a garbage man named Clean attempts a quiet life of redemption. But, soon finds himself forced to reconcile with the violence of his past."                                                                     
    ## [2] "\nMassimo is a member of the Sicilian Mafia family and Laura is a sales director. She does not expect that on a trip to Sicily trying to save her relationship, Massimo will kidnap her and give her 365 days to fall in love with him."
    ## [3] "\nA man refuses all assistance from his daughter as he ages. As he tries to make sense of his changing circumstances, he begins to doubt his loved ones, his own mind and even the fabric of his reality."                              
    ## [4] "\nArmed with only one word, Tenet, and fighting for the survival of the entire world, a Protagonist journeys through a twilight world of international espionage on a mission that will unfold in something beyond real time."          
    ## [5] "\nA young woman, traumatized by a tragic event in her past, seeks out vengeance against those who crossed her path."                                                                                                                    
    ## [6] "\nIn 1800s England, a well meaning but selfish young woman meddles in the love lives of her friends."
    # Remove the newline that starts each description, then preview again
    description_data <- description_data |> str_replace("^\\n", "")
    description_data |> head()
    ## [1] "Tormented by his past, a garbage man named Clean attempts a quiet life of redemption. But, soon finds himself forced to reconcile with the violence of his past."                                                                     
    ## [2] "Massimo is a member of the Sicilian Mafia family and Laura is a sales director. She does not expect that on a trip to Sicily trying to save her relationship, Massimo will kidnap her and give her 365 days to fall in love with him."
    ## [3] "A man refuses all assistance from his daughter as he ages. As he tries to make sense of his changing circumstances, he begins to doubt his loved ones, his own mind and even the fabric of his reality."                              
    ## [4] "Armed with only one word, Tenet, and fighting for the survival of the entire world, a Protagonist journeys through a twilight world of international espionage on a mission that will unfold in something beyond real time."          
    ## [5] "A young woman, traumatized by a tragic event in her past, seeks out vengeance against those who crossed her path."                                                                                                                    
    ## [6] "In 1800s England, a well meaning but selfish young woman meddles in the love lives of her friends."

Runtime

  • Retrieve runtime data
# Scrape the runtime of each movie: select the runtime nodes, take their text,
# drop the " min" suffix, and convert what is left to integers
runtime_data <- webpage |>
  html_nodes('.runtime') |>
  html_text() |>
  str_replace(" min", "") |>
  as.integer()
runtime_data
##   [1]  94 114  97 150 113 124 107  97 105 117 138 160 101 107 112 151 103 100
##  [19] 116 118  94 109 117  90 124  90 109  95 136 123 124  88 100  99 117 115
##  [37] 129 137 107  90  91 125 109 134  83 106  90 116 119  86 123 115  94  90
##  [55] 109  95 124 105  96 106  77  95 104 118 123 102 131  94 102 129  91 111
##  [73] 101 100 126  89 100 118  90 120  99 119 116  88 104  99 115 122 109  90
##  [91] 103 107 104 113  91 113 108  98 127 111

Genre

  • Collect the (first) genre of each movie:

    # Scrape the genre text of each movie and keep only its first word
    genre_data <- webpage |>
      # select the genre element of every movie
      html_nodes('.genre') |>
      # take the text of each element
      html_text() |>
      # keep the first run of letters, i.e. the first word
      str_extract("[:alpha:]+")
    genre_data
    ##   [1] "Crime"     "Drama"     "Drama"     "Action"    "Crime"     "Comedy"   
    ##   [7] "Horror"    "Drama"     "Drama"     "Comedy"    "Crime"     "Biography"
    ##  [13] "Adventure" "Drama"     "Drama"     "Action"    "Action"    "Animation"
    ##  [19] "Action"    "Comedy"    "Comedy"    "Action"    "Crime"     "Action"   
    ##  [25] "Drama"     "Comedy"    "Action"    "Adventure" "Comedy"    "Action"   
    ##  [31] "Drama"     "Comedy"    "Mystery"   "Action"    "Animation" "Action"   
    ##  [37] "Drama"     "Horror"    "Drama"     "Action"    "Action"    "Action"   
    ##  [43] "Action"    "Drama"     "Action"    "Thriller"  "Mystery"   "Action"   
    ##  [49] "Action"    "Comedy"    "Drama"     "Drama"     "Action"    "Crime"    
    ##  [55] "Crime"     "Animation" "Action"    "Drama"     "Thriller"  "Adventure"
    ##  [61] "Comedy"    "Drama"     "Comedy"    "Action"    "Comedy"    "Animation"
    ##  [67] "Biography" "Biography" "Comedy"    "Drama"     "Adventure" "Action"   
    ##  [73] "Comedy"    "Horror"    "Drama"     "Adventure" "Adventure" "Adventure"
    ##  [79] "Comedy"    "Biography" "Horror"    "Drama"     "Drama"     "Comedy"   
    ##  [85] "Comedy"    "Action"    "Drama"     "Animation" "Action"    "Animation"
    ##  [91] "Horror"    "Drama"     "Comedy"    "Crime"     "Comedy"    "Action"   
    ##  [97] "Action"    "Action"    "Drama"     "Drama"

Rating

  • Rating data:

    # Scrape the rating of each movie and convert the text to numbers
    rating_data <- webpage |>
      html_nodes('.ratings-imdb-rating strong') |>
      html_text() |>
      as.numeric()
    rating_data
    ##   [1] 5.7 3.3 8.3 7.4 7.5 6.7 6.5 7.3 5.2 7.7 7.1 8.4 5.6 7.3 7.2 5.4 5.3 8.1
    ##  [19] 7.6 6.3 5.0 4.9 6.9 6.5 7.1 7.4 6.1 5.9 7.1 6.6 7.1 6.2 6.8 6.5 8.2 5.7
    ##  [37] 7.8 6.2 6.3 6.0 7.0 6.7 7.0 6.6 4.9 6.8 6.7 6.7 6.4 6.5 6.0 7.5 5.3 5.2
    ##  [55] 6.4 7.0 6.5 6.2 5.4 5.3 7.2 6.9 6.6 6.8 6.5 7.4 6.9 5.8 6.4 6.2 6.0 6.2
    ##  [73] 5.8 6.2 7.1 5.5 6.8 5.6 5.7 6.5 4.8 6.3 6.1 4.8 6.3 6.0 7.6 8.6 5.7 6.1
    ##  [91] 6.5 6.5 5.7 6.7 5.5 6.0 6.4 6.3 7.4 5.6

Votes

  • Vote data

    # Scrape the vote count of each movie and convert it to a number.
    # Counts use commas as thousands separators, so every comma must be
    # removed: str_replace() would strip only the first one, leaving counts
    # of 1,000,000 or more (two separators) to become NA in as.numeric().
    votes_data <- webpage %>%
      html_nodes('.sort-num_votes-visible span:nth-child(2)') %>%
      html_text() %>%
      str_replace_all(",", "") %>%
      as.numeric()
    votes_data
    ##   [1]   1719  72490 122060 450511 150403  45946  33609 194104  27974 131815
    ##  [11] 120985  81952  59472 143720  47005 246748  52801 297148  39183 123605
    ##  [21]   2820  46377  20255  96960 206854 142969 221078  76432  57626 156645
    ##  [31]  23522   5699  58849 113997  45887 141139 166635  23282  12473  61865
    ##  [41]  89424 152826 116353  78392  23790  33104  64348 187189 106744  10932
    ##  [51]  39186  70074  71524    244  21227  38036 151720   6762  51340  36581
    ##  [61]  16213   9661   9962  81579  89334 134889  69690   4921  49987  12778
    ##  [71]  43202  83009  35712  13231  46479   2031  46106  80331  37476  15618
    ##  [81]   3391   3385   3402    271   1429  46547  15964   7834  73903  21518
    ##  [91]  30962  27974   6624  23096    113  82853   3756  35231   6591  11994

Director

  • Director information

    # Scrape the first link in each movie's credits paragraph as its director
    directors_data <- webpage |>
      html_nodes('.text-muted+ p a:nth-child(1)') |>
      html_text()
    directors_data
    ##   [1] "Paul Solet"                  "Barbara Bialowas"           
    ##   [3] "Florian Zeller"              "Christopher Nolan"          
    ##   [5] "Emerald Fennell"             "Autumn de Wilde"            
    ##   [7] "David Bruckner"              "John Krasinski"             
    ##   [9] "Roger Kumble"                "Thomas Vinterberg"          
    ##  [11] "Antonio Campos"              "Thomas Kail"                
    ##  [13] "Stephen Gaghan"              "Chloé Zhao"                 
    ##  [15] "Dominic Cooke"               "Patty Jenkins"              
    ##  [17] "Paul W.S. Anderson"          "Pete Docter"                
    ##  [19] "Anders Thomas Jensen"        "J Blakeson"                 
    ##  [21] "Gia Coppola"                 "Jeff Wadlow"                
    ##  [23] "Robert Connolly"             "Craig Zobel"                
    ##  [25] "Leigh Whannell"              "Max Barbakow"               
    ##  [27] "Cathy Yan"                   "William Eubank"             
    ##  [29] "Judd Apatow"                 "Harry Bradbeer"             
    ##  [31] "Matthijs van Heijningen Jr." "Woody Allen"                
    ##  [33] "Joe Carnahan"                "Jeff Fowler"                
    ##  [35] "Haruo Sotozaki"              "Niki Caro"                  
    ##  [37] "Aaron Sorkin"                "David Prior"                
    ##  [39] "Sean Durkin"                 "Derrick Borte"              
    ##  [41] "Aaron Schneider"             "Gina Prince-Bythewood"      
    ##  [43] "Michael Matthews"            "Charlie Kaufman"            
    ##  [45] "Roseanne Liang"              "Francis Annan"              
    ##  [47] "Aneesh Chaganty"             "Sam Hargrave"               
    ##  [49] "Ric Roman Waugh"             "Janicza Bravo"              
    ##  [51] "Ben Wheatley"                "Lee Isaac Chung"            
    ##  [53] "Josh Boone"                  "Aimee Long"                 
    ##  [55] "Kyle Rankin"                 "Joel Crawford"              
    ##  [57] "Adil El Arbi"                "Mona Fastvold"              
    ##  [59] "Tate Taylor"                 "Robert Zemeckis"            
    ##  [61] "Emma Seligman"               "Harry Macqueen"             
    ##  [63] "Lawrence Michael Levine"     "Paul Greengrass"            
    ##  [65] "David Dobkin"                "Dan Scanlon"                
    ##  [67] "David Fincher"               "Reinaldo Marcus Green"      
    ##  [69] "Christopher Landon"          "Halle Berry"                
    ##  [71] "Dean Parisot"                "Peter Berg"                 
    ##  [73] "McG"                         "Emmanuel Osei-Kuffour"      
    ##  [75] "Kornél Mundruczó"            "Kike Maíllo"                
    ##  [77] "Chris Sanders"               "George Clooney"             
    ##  [79] "Tyler Spindel"               "Francis Lee"                
    ##  [81] "Romola Garai"                "Brian Baugh"                
    ##  [83] "Argyris Papadimitropoulos"   "Khaled Ridgeway"            
    ##  [85] "Ben Lewin"                   "Mark Williams"              
    ##  [87] "Andrew Levitas"              "Masashi Koizuka"            
    ##  [89] "Dave Wilson"                 "Walt Dohrn"                 
    ##  [91] "Brandon Cronenberg"          "Brett Haley"                
    ##  [93] "George Gallo"                "Thomas Bezucha"             
    ##  [95] "Sujata Day"                  "Henry Joost"                
    ##  [97] "Quoc Bao Tran"               "Il Cho"                     
    ##  [99] "Vadim Perelman"              "Vaughn Stein"

Actor

  • Only the first actor

    # Scrape the link that directly follows the ".ghost" separator in each
    # movie's credits, i.e. the first listed actor
    actors_data <- webpage |>
      html_nodes('.lister-item-content .ghost+ a') |>
      html_text()
    actors_data
    ##   [1] "Adrien Brody"           "Anna Maria Sieklucka"   "Anthony Hopkins"       
    ##   [4] "John David Washington"  "Carey Mulligan"         "Anya Taylor-Joy"       
    ##   [7] "Rebecca Hall"           "Emily Blunt"            "Josephine Langford"    
    ##  [10] "Mads Mikkelsen"         "Bill Skarsgård"         "Lin-Manuel Miranda"    
    ##  [13] "Robert Downey Jr."      "Frances McDormand"      "Benedict Cumberbatch"  
    ##  [16] "Gal Gadot"              "Milla Jovovich"         "Jamie Foxx"            
    ##  [19] "Mads Mikkelsen"         "Rosamund Pike"          "Andrew Garfield"       
    ##  [22] "Michael Peña"           "Eric Bana"              "Betty Gilpin"          
    ##  [25] "Elisabeth Moss"         "Andy Samberg"           "Margot Robbie"         
    ##  [28] "Kristen Stewart"        "Pete Davidson"          "Millie Bobby Brown"    
    ##  [31] "Gijs Blom"              "Wallace Shawn"          "Frank Grillo"          
    ##  [34] "Ben Schwartz"           "Natsuki Hanae"          "Liu Yifei"             
    ##  [37] "Eddie Redmayne"         "James Badge Dale"       "Jude Law"              
    ##  [40] "Russell Crowe"          "Tom Hanks"              "Charlize Theron"       
    ##  [43] "Dylan O'Brien"          "Jesse Plemons"          "Chloë Grace Moretz"    
    ##  [46] "Daniel Radcliffe"       "Sarah Paulson"          "Chris Hemsworth"       
    ##  [49] "Gerard Butler"          "Taylour Paige"          "Lily James"            
    ##  [52] "Steven Yeun"            "Maisie Williams"        "Kenny Leu"             
    ##  [55] "Isabel May"             "Nicolas Cage"           "Will Smith"            
    ##  [58] "Katherine Waterston"    "Jessica Chastain"       "Anne Hathaway"         
    ##  [61] "Rachel Sennott"         "Colin Firth"            "Aubrey Plaza"          
    ##  [64] "Tom Hanks"              "Will Ferrell"           "Tom Holland"           
    ##  [67] "Gary Oldman"            "Mark Wahlberg"          "Vince Vaughn"          
    ##  [70] "Halle Berry"            "Keanu Reeves"           "Mark Wahlberg"         
    ##  [73] "Judah Lewis"            "Mamoudou Athie"         "Vanessa Kirby"         
    ##  [76] "Dominique Pinon"        "Harrison Ford"          "George Clooney"        
    ##  [79] "David Spade"            "Kate Winslet"           "Carla Juri"            
    ##  [82] "Rose Reid"              "Sebastian Stan"         "Lamorne Morris"        
    ##  [85] "Danielle Macdonald"     "Liam Neeson"            "Akiko Iwase"           
    ##  [88] "Marina Inoue"           "Vin Diesel"             "Anna Kendrick"         
    ##  [91] "Andrea Riseborough"     "Elle Fanning"           "Robert De Niro"        
    ##  [94] "Diane Lane"             "Katrina Bowden"         "Jamie Foxx"            
    ##  [97] "Yuji Okumoto"           "Yoo Ah-in"              "Nahuel Pérez Biscayart"
    ## [100] "Lily Collins"

Metascore

  • We encounter the issue of missing data when scraping metascore.

  • The scrape below returns only 90 metascores, so 10 movies don’t have one. We could find the missing ones by hand, but that is tedious and not reproducible.

    # Using CSS selectors to scrape the metascore section
    ms_data_html <- html_nodes(webpage, '.metascore')
    # Converting the metascore data to text
    ms_data <- html_text(ms_data_html)
    # Strip trailing whitespace, convert to integer, then look at the metascores
    ms_data <- str_replace(ms_data, "\\s*$", "") %>% as.integer()
    ms_data
    ##  [1] 43 88 69 73 71 68 71 14 79 55 90 26 93 65 60 47 83 81 66 36 22 69 50 72 83
    ## [26] 60 48 67 68 43 56 47 75 66 76 80 40 64 70 63 78 66 56 67 56 64 76 46 89 43
    ## [51] 13 56 59 73 39 47 79 73 79 73 50 61 79 54 67 52 65 49 22 62 66 48 58 33 72
    ## [76] 62 41 58 51 46 51 44 51 72 61 63 71 51 67 31
  • First, let’s list each rank together with its metascore (if present).

    # Select ranks and metascores with one combined selector, take their text,
    # and remove trailing whitespace from each piece
    rank_and_metascore <- webpage |>
      html_nodes('.unfavorable , .text-primary , .favorable , .mixed') |>
      html_text() |>
      str_replace("\\s*$", "")
    print(rank_and_metascore)
    ##   [1] "1."   "43"   "2."   "3."   "88"   "4."   "69"   "5."   "73"   "6."  
    ##  [11] "71"   "7."   "68"   "8."   "71"   "9."   "14"   "10."  "79"   "11." 
    ##  [21] "55"   "12."  "90"   "13."  "26"   "14."  "93"   "15."  "65"   "16." 
    ##  [31] "60"   "17."  "47"   "18."  "83"   "19."  "81"   "20."  "66"   "21." 
    ##  [41] "36"   "22."  "22"   "23."  "69"   "24."  "50"   "25."  "72"   "26." 
    ##  [51] "83"   "27."  "60"   "28."  "48"   "29."  "67"   "30."  "68"   "31." 
    ##  [61] "32."  "43"   "33."  "56"   "34."  "47"   "35."  "75"   "36."  "66"  
    ##  [71] "37."  "76"   "38."  "39."  "80"   "40."  "40"   "41."  "64"   "42." 
    ##  [81] "70"   "43."  "63"   "44."  "78"   "45."  "66"   "46."  "56"   "47." 
    ##  [91] "67"   "48."  "56"   "49."  "64"   "50."  "76"   "51."  "46"   "52." 
    ## [101] "89"   "53."  "43"   "54."  "55."  "13"   "56."  "56"   "57."  "59"  
    ## [111] "58."  "73"   "59."  "39"   "60."  "47"   "61."  "79"   "62."  "73"  
    ## [121] "63."  "79"   "64."  "73"   "65."  "50"   "66."  "61"   "67."  "79"  
    ## [131] "68."  "54"   "69."  "67"   "70."  "52"   "71."  "65"   "72."  "49"  
    ## [141] "73."  "22"   "74."  "62"   "75."  "66"   "76."  "77."  "48"   "78." 
    ## [151] "58"   "79."  "33"   "80."  "72"   "81."  "62"   "82."  "41"   "83." 
    ## [161] "58"   "84."  "85."  "51"   "86."  "46"   "87."  "51"   "88."  "89." 
    ## [171] "44"   "90."  "51"   "91."  "72"   "92."  "61"   "93."  "94."  "63"  
    ## [181] "95."  "71"   "96."  "51"   "97."  "67"   "98."  "99."  "100." "31"
    # Rank labels end with "."; every other element is a metascore.
    isrank <- str_detect(rank_and_metascore, "\\.$")
    # A movie has no metascore when its rank is immediately followed by
    # another rank, or when its rank is the very last element. Shifting the
    # vector avoids 1:(n - 1) indexing, which misbehaves for short input.
    ismissing <- isrank & c(isrank[-1], TRUE)
    missingpos <- as.integer(rank_and_metascore[ismissing])
    # One slot per scraped rank instead of a hard-coded 100.
    metascore_data <- rep(NA_integer_, sum(isrank))
    # Fill only the slots that have a metascore. setdiff() keeps this correct
    # when nothing is missing, whereas x[-integer(0)] would select no slots
    # and leave every entry NA.
    haspos <- setdiff(seq_along(metascore_data), missingpos)
    metascore_data[haspos] <- ms_data
    metascore_data
    ##   [1] 43 NA 88 69 73 71 68 71 14 79 55 90 26 93 65 60 47 83 81 66 36 22 69 50 72
    ##  [26] 83 60 48 67 68 NA 43 56 47 75 66 76 NA 80 40 64 70 63 78 66 56 67 56 64 76
    ##  [51] 46 89 43 NA 13 56 59 73 39 47 79 73 79 73 50 61 79 54 67 52 65 49 22 62 66
    ##  [76] NA 48 58 33 72 62 41 58 NA 51 46 51 NA 44 51 72 61 NA 63 71 51 67 NA NA 31

Gross

  • Be careful with missing data.

    # Select the gross revenue nodes with a CSS selector
    gross_data_html <- webpage %>%
      html_nodes('.ghost~ .text-muted+ span')
    # Extract the text of the selected nodes
    gross_data <- gross_data_html %>%
      html_text()
    # Print the raw strings for inspection
    gross_data
    ##  [1] "$58.46M"  "$160.07M" "$2.39M"   "$77.05M"  "$46.37M"  "$27.31M" 
    ##  [7] "$70.41M"  "$84.16M"  "$148.97M" "$47.70M"  "$1.07M"   "$58.57M" 
    ## [13] "$206.31M" "$61.56M"  "$62.34M"
    # Data preprocessing: strip the '$' and 'M' signs by pattern, so the
    # result does not depend on their position or on the string length
    # (same pattern as the rank-and-gross pipeline uses)
    gross_data <- str_replace_all(gross_data, "[$M]", "")
    gross_data <- as.numeric(gross_data)
    # Let's have a look at the cleaned gross data
    gross_data
    ##  [1]  58.46 160.07   2.39  77.05  46.37  27.31  70.41  84.16 148.97  47.70
    ## [11]   1.07  58.57 206.31  61.56  62.34

    85 (out of 100) movies don’t have gross data yet! We need a better way to figure out missing entries.

    # Retrieve rank labels and gross figures together, in page order;
    # drop the first whitespace run and every '$' / 'M' character
    rank_and_gross <- webpage %>%
      html_nodes('.ghost~ .text-muted+ span , .text-primary') %>%
      html_text() %>%
      str_replace("\\s+", "") %>%
      str_replace_all("[$M]", "")
    rank_and_gross
    ##   [1] "1."     "2."     "3."     "4."     "58.46"  "5."     "6."     "7."    
    ##   [9] "8."     "160.07" "9."     "2.39"   "10."    "11."    "12."    "13."   
    ##  [17] "77.05"  "14."    "15."    "16."    "46.37"  "17."    "18."    "19."   
    ##  [25] "20."    "21."    "22."    "27.31"  "23."    "24."    "25."    "70.41" 
    ##  [33] "26."    "27."    "84.16"  "28."    "29."    "30."    "31."    "32."   
    ##  [41] "33."    "34."    "148.97" "35."    "47.70"  "36."    "37."    "38."   
    ##  [49] "39."    "40."    "41."    "42."    "43."    "1.07"   "44."    "45."   
    ##  [57] "46."    "47."    "48."    "49."    "50."    "51."    "52."    "53."   
    ##  [65] "54."    "55."    "56."    "58.57"  "57."    "206.31" "58."    "59."   
    ##  [73] "60."    "61."    "62."    "63."    "64."    "65."    "66."    "61.56" 
    ##  [81] "67."    "68."    "69."    "70."    "71."    "72."    "73."    "74."   
    ##  [89] "75."    "76."    "77."    "62.34"  "78."    "79."    "80."    "81."   
    ##  [97] "82."    "83."    "84."    "85."    "86."    "87."    "88."    "89."   
    ## [105] "90."    "91."    "92."    "93."    "94."    "95."    "96."    "97."   
    ## [113] "98."    "99."    "100."
    # Flag the elements that are rank labels (they end in a period, e.g. "12.")
    isrank <- str_detect(rank_and_gross, "\\.$")
    # A movie has no gross figure when its rank label is followed directly by
    # another rank label, or when the rank label is the very last element
    ismissing <- isrank & c(isrank[-1], TRUE)
    # Rank numbers of the movies without a gross figure (kept for inspection)
    missingpos <- as.integer(rank_and_gross[ismissing])
    # One flag per movie, in page order: TRUE when a gross figure is present
    has_gross <- !ismissing[isrank]
    # Fail loudly rather than silently recycle a misaligned gross vector
    stopifnot(sum(has_gross) == length(gross_data))
    gs_data <- rep(NA, sum(isrank))
    # Fill through a logical mask: a negative index such as `-missingpos`
    # selects nothing when no figure is missing, leaving every entry NA
    gs_data[has_gross] <- gross_data
    (gross_data <- gs_data)
    ##   [1]     NA     NA     NA  58.46     NA     NA     NA 160.07   2.39     NA
    ##  [11]     NA     NA  77.05     NA     NA  46.37     NA     NA     NA     NA
    ##  [21]     NA  27.31     NA     NA  70.41     NA  84.16     NA     NA     NA
    ##  [31]     NA     NA     NA 148.97  47.70     NA     NA     NA     NA     NA
    ##  [41]     NA     NA   1.07     NA     NA     NA     NA     NA     NA     NA
    ##  [51]     NA     NA     NA     NA     NA  58.57 206.31     NA     NA     NA
    ##  [61]     NA     NA     NA     NA     NA  61.56     NA     NA     NA     NA
    ##  [71]     NA     NA     NA     NA     NA     NA  62.34     NA     NA     NA
    ##  [81]     NA     NA     NA     NA     NA     NA     NA     NA     NA     NA
    ##  [91]     NA     NA     NA     NA     NA     NA     NA     NA     NA     NA

Visualizing movie data

  • Form a tibble:

    # Assemble the scraped pieces into a single tibble, one column per field
    movies <- tibble(
      Rank = rank_data,
      Title = title_data,
      Description = description_data,
      Runtime = runtime_data,
      Genre = genre_data,
      Rating = rating_data,
      Metascore = metascore_data,
      Votes = votes_data,
      Gross_Earning_in_Mil = gross_data,
      Director = directors_data,
      Actor = actors_data
    )
    # Show every column instead of truncating to the console width
    print(movies, width = Inf)
    ## # A tibble: 100 × 11
    ##     Rank Title                
    ##    <int> <chr>                
    ##  1     1 Clean                
    ##  2     2 365 Days             
    ##  3     3 The Father           
    ##  4     4 Tenet                
    ##  5     5 Promising Young Woman
    ##  6     6 Emma.                
    ##  7     7 The Night House      
    ##  8     8 A Quiet Place Part II
    ##  9     9 After We Collided    
    ## 10    10 Another Round        
    ##    Description                                                                  
    ##    <chr>                                                                        
    ##  1 Tormented by his past, a garbage man named Clean attempts a quiet life of re…
    ##  2 Massimo is a member of the Sicilian Mafia family and Laura is a sales direct…
    ##  3 A man refuses all assistance from his daughter as he ages. As he tries to ma…
    ##  4 Armed with only one word, Tenet, and fighting for the survival of the entire…
    ##  5 A young woman, traumatized by a tragic event in her past, seeks out vengeanc…
    ##  6 In 1800s England, a well meaning but selfish young woman meddles in the love…
    ##  7 A widow begins to uncover her recently deceased husband's disturbing secrets.
    ##  8 Following the events at home, the Abbott family now face the terrors of the …
    ##  9 Based on the 2014 romance novel of the same name, this follows the love life…
    ## 10 Four high school teachers consume alcohol on a daily basis to see how it aff…
    ##    Runtime Genre  Rating Metascore  Votes Gross_Earning_in_Mil Director         
    ##      <int> <chr>   <dbl>     <int>  <dbl>                <dbl> <chr>            
    ##  1      94 Crime     5.7        43   1719                NA    Paul Solet       
    ##  2     114 Drama     3.3        NA  72490                NA    Barbara Bialowas 
    ##  3      97 Drama     8.3        88 122060                NA    Florian Zeller   
    ##  4     150 Action    7.4        69 450511                58.5  Christopher Nolan
    ##  5     113 Crime     7.5        73 150403                NA    Emerald Fennell  
    ##  6     124 Comedy    6.7        71  45946                NA    Autumn de Wilde  
    ##  7     107 Horror    6.5        68  33609                NA    David Bruckner   
    ##  8      97 Drama     7.3        71 194104               160.   John Krasinski   
    ##  9     105 Drama     5.2        14  27974                 2.39 Roger Kumble     
    ## 10     117 Comedy    7.7        79 131815                NA    Thomas Vinterberg
    ##    Actor                
    ##    <chr>                
    ##  1 Adrien Brody         
    ##  2 Anna Maria Sieklucka 
    ##  3 Anthony Hopkins      
    ##  4 John David Washington
    ##  5 Carey Mulligan       
    ##  6 Anya Taylor-Joy      
    ##  7 Rebecca Hall         
    ##  8 Emily Blunt          
    ##  9 Josephine Langford   
    ## 10 Mads Mikkelsen       
    ## # … with 90 more rows
  • How many top 100 movies are in each genre? (Be careful with interpretation.)

    # Bar chart of the number of movies per genre
    ggplot(data = movies, mapping = aes(x = Genre)) +
      geom_bar()

  • Which genre is most profitable in terms of average gross earnings?

    # Mean gross per genre, ignoring NA entries, drawn as a column chart
    movies %>%
      group_by(Genre) %>%
      summarise(avg_earning = mean(Gross_Earning_in_Mil, na.rm = TRUE)) %>%
      ggplot(mapping = aes(x = Genre, y = avg_earning)) +
        geom_col() +
        labs(y = "avg earning in millions")
    ## Warning: Removed 6 rows containing missing values (position_stack).

    # Distribution of gross earnings within each genre
    movies %>%
      ggplot(mapping = aes(x = Genre, y = Gross_Earning_in_Mil)) +
      geom_boxplot() +
      labs(y = "Gross earning in millions")
    ## Warning: Removed 85 rows containing non-finite values (stat_boxplot).

  • Is there a relationship between gross earnings and rating? Find the best-selling movie (by gross earnings) in each genre.

    library(ggrepel)
    # Within each genre keep the row ranked first by descending gross
    # earning; the result stays grouped by Genre
    best_in_genre <- movies %>%
      group_by(Genre) %>%
      filter(row_number(desc(Gross_Earning_in_Mil)) == 1)
    # Show every column instead of truncating to the console width
    print(best_in_genre, width = Inf)
    ## # A tibble: 4 × 11
    ## # Groups:   Genre [4]
    ##    Rank Title                
    ##   <int> <chr>                
    ## 1     8 A Quiet Place Part II
    ## 2    13 Dolittle             
    ## 3    57 Bad Boys for Life    
    ## 4    66 Onward               
    ##   Description                                                                   
    ##   <chr>                                                                         
    ## 1 Following the events at home, the Abbott family now face the terrors of the o…
    ## 2 A physician who can talk to animals embarks on an adventure to find a legenda…
    ## 3 Miami detectives Mike Lowrey and Marcus Burnett must face off against a mothe…
    ## 4 Two elven brothers embark on a quest to bring their father back for one day.  
    ##   Runtime Genre     Rating Metascore  Votes Gross_Earning_in_Mil Director      
    ##     <int> <chr>      <dbl>     <int>  <dbl>                <dbl> <chr>         
    ## 1      97 Drama        7.3        71 194104                160.  John Krasinski
    ## 2     101 Adventure    5.6        26  59472                 77.0 Stephen Gaghan
    ## 3     124 Action       6.5        59 151720                206.  Adil El Arbi  
    ## 4     102 Animation    7.4        61 134889                 61.6 Dan Scanlon   
    ##   Actor            
    ##   <chr>            
    ## 1 Emily Blunt      
    ## 2 Robert Downey Jr.
    ## 3 Will Smith       
    ## 4 Tom Holland
    # Rating against gross earning: point size shows votes, color shows
    # genre, and the top earner of each genre is labeled with its title
    movies %>%
      ggplot(mapping = aes(x = Rating, y = Gross_Earning_in_Mil)) +
      geom_point(mapping = aes(size = Votes, color = Genre)) +
      ggrepel::geom_label_repel(
        mapping = aes(label = Title),
        data = best_in_genre
      ) +
      labs(y = "Gross earning in millions")
    ## Warning: Removed 85 rows containing missing values (geom_point).

Example: Scraping finance data

Example: Pull tweets into R

library(twitteR) # load package
# OAuth credentials (placeholders; substitute your own values)
consumer_key <- 'XXXXXXXXXX'
consumer_secret <- 'XXXXXXXXXX'
access_token <- 'XXXXXXXXXX'
access_secret <- 'XXXXXXXXXX'
setup_twitter_oauth(
  consumer_key,
  consumer_secret,
  access_token,
  access_secret
)
# Run the search query below, requesting n = 1000 tweets since 2020-01-01
virus <- searchTwitter(
  '#China + #Coronavirus',
  n = 1000,
  since = '2020-01-01',
  retryOnRateLimit = 1e3
)
# Convert the returned list of tweets to a tibble and print every column
virus_df <- virus %>%
  twListToDF() %>%
  as_tibble()
print(virus_df, width = Inf)