sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] compiler_4.1.2 magrittr_2.0.1 fastmap_1.1.0 tools_4.1.2
## [5] htmltools_0.5.2 yaml_2.2.1 jquerylib_0.1.4 stringi_1.7.6
## [9] rmarkdown_2.11 knitr_1.37 stringr_1.4.0 xfun_0.29
## [13] digest_0.6.29 rlang_0.4.12 evaluate_0.14
Load tidyverse and other packages for this lecture:
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
There is a wealth of data on the internet. How do we scrape and analyze it?
rvest is an R package written by Hadley Wickham which makes web scraping easy.
We follow the instructions in a blog post by Saurav Kaushik to find the most popular feature films of 2020.
Install the SelectorGadget extension for Chrome.
The 100 most popular feature films released in 2020 can be accessed at page https://www.imdb.com/search/title/?title_type=feature&release_date=2020-01-01,2020-12-31&count=100.
#Loading the rvest and tidyverse package
# URL of the IMDb search page for feature films released in 2020 (100 per page)
url <- "https://www.imdb.com/search/title/?title_type=feature&release_date=2020-01-01,2020-12-31&count=100"
# Download and parse the page's HTML, then print the parsed document
webpage <- url %>% read_html()
webpage
## {html_document}
## <html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body id="styleguide-v2" class="fixed">\n <img height="1" widt ...
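Suppose we want to scrape the following 11 features from this page: rank, title, description, runtime, genre, rating, metascore, votes, gross earnings, director, and (first) actor.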
Use the CSS selector to get the rankings
# Use a CSS selector to scrape the nodes that hold the rankings
rank_data_html <- webpage %>% html_nodes('.text-primary')
rank_data_html
## {xml_nodeset (100)}
## [1] <span class="lister-item-index unbold text-primary">1.</span>
## [2] <span class="lister-item-index unbold text-primary">2.</span>
## [3] <span class="lister-item-index unbold text-primary">3.</span>
## [4] <span class="lister-item-index unbold text-primary">4.</span>
## [5] <span class="lister-item-index unbold text-primary">5.</span>
## [6] <span class="lister-item-index unbold text-primary">6.</span>
## [7] <span class="lister-item-index unbold text-primary">7.</span>
## [8] <span class="lister-item-index unbold text-primary">8.</span>
## [9] <span class="lister-item-index unbold text-primary">9.</span>
## [10] <span class="lister-item-index unbold text-primary">10.</span>
## [11] <span class="lister-item-index unbold text-primary">11.</span>
## [12] <span class="lister-item-index unbold text-primary">12.</span>
## [13] <span class="lister-item-index unbold text-primary">13.</span>
## [14] <span class="lister-item-index unbold text-primary">14.</span>
## [15] <span class="lister-item-index unbold text-primary">15.</span>
## [16] <span class="lister-item-index unbold text-primary">16.</span>
## [17] <span class="lister-item-index unbold text-primary">17.</span>
## [18] <span class="lister-item-index unbold text-primary">18.</span>
## [19] <span class="lister-item-index unbold text-primary">19.</span>
## [20] <span class="lister-item-index unbold text-primary">20.</span>
## ...
# Extract the text of each ranking node (strings such as "1.")
rank_data <- rank_data_html %>% html_text()
rank_data
## [1] "1." "2." "3." "4." "5." "6." "7." "8." "9." "10."
## [11] "11." "12." "13." "14." "15." "16." "17." "18." "19." "20."
## [21] "21." "22." "23." "24." "25." "26." "27." "28." "29." "30."
## [31] "31." "32." "33." "34." "35." "36." "37." "38." "39." "40."
## [41] "41." "42." "43." "44." "45." "46." "47." "48." "49." "50."
## [51] "51." "52." "53." "54." "55." "56." "57." "58." "59." "60."
## [61] "61." "62." "63." "64." "65." "66." "67." "68." "69." "70."
## [71] "71." "72." "73." "74." "75." "76." "77." "78." "79." "80."
## [81] "81." "82." "83." "84." "85." "86." "87." "88." "89." "90."
## [91] "91." "92." "93." "94." "95." "96." "97." "98." "99." "100."
# Parse the rank strings as integers
rank_data <- rank_data %>% as.integer()
rank_data
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## [19] 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## [37] 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## [55] 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## [73] 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## [91] 91 92 93 94 95 96 97 98 99 100
Use SelectorGadget to find the CSS selector for the movie titles: .lister-item-header a
# Use a CSS selector to scrape the title links
title_data_html <- webpage %>% html_nodes('.lister-item-header a')
title_data_html
## {xml_nodeset (100)}
## [1] <a href="/title/tt10023022/?ref_=adv_li_tt">Clean</a>
## [2] <a href="/title/tt10886166/?ref_=adv_li_tt">365 Days</a>
## [3] <a href="/title/tt10272386/?ref_=adv_li_tt">The Father</a>
## [4] <a href="/title/tt6723592/?ref_=adv_li_tt">Tenet</a>
## [5] <a href="/title/tt9620292/?ref_=adv_li_tt">Promising Young Woman</a>
## [6] <a href="/title/tt9214832/?ref_=adv_li_tt">Emma.</a>
## [7] <a href="/title/tt9731534/?ref_=adv_li_tt">The Night House</a>
## [8] <a href="/title/tt8332922/?ref_=adv_li_tt">A Quiet Place Part II</a>
## [9] <a href="/title/tt10362466/?ref_=adv_li_tt">After We Collided</a>
## [10] <a href="/title/tt10288566/?ref_=adv_li_tt">Another Round</a>
## [11] <a href="/title/tt7395114/?ref_=adv_li_tt">The Devil All the Time</a>
## [12] <a href="/title/tt8503618/?ref_=adv_li_tt">Hamilton</a>
## [13] <a href="/title/tt6673612/?ref_=adv_li_tt">Dolittle</a>
## [14] <a href="/title/tt9770150/?ref_=adv_li_tt">Nomadland</a>
## [15] <a href="/title/tt8368512/?ref_=adv_li_tt">The Courier</a>
## [16] <a href="/title/tt7126948/?ref_=adv_li_tt">Wonder Woman 1984</a>
## [17] <a href="/title/tt6475714/?ref_=adv_li_tt">Monster Hunter</a>
## [18] <a href="/title/tt2948372/?ref_=adv_li_tt">Soul</a>
## [19] <a href="/title/tt11655202/?ref_=adv_li_tt">Riders of Justice</a>
## [20] <a href="/title/tt9893250/?ref_=adv_li_tt">I Care a Lot</a>
## ...
# Extract the title text from each link
title_data <- title_data_html %>% html_text()
title_data
## [1] "Clean"
## [2] "365 Days"
## [3] "The Father"
## [4] "Tenet"
## [5] "Promising Young Woman"
## [6] "Emma."
## [7] "The Night House"
## [8] "A Quiet Place Part II"
## [9] "After We Collided"
## [10] "Another Round"
## [11] "The Devil All the Time"
## [12] "Hamilton"
## [13] "Dolittle"
## [14] "Nomadland"
## [15] "The Courier"
## [16] "Wonder Woman 1984"
## [17] "Monster Hunter"
## [18] "Soul"
## [19] "Riders of Justice"
## [20] "I Care a Lot"
## [21] "Mainstream"
## [22] "Fantasy Island"
## [23] "The Dry"
## [24] "The Hunt"
## [25] "The Invisible Man"
## [26] "Palm Springs"
## [27] "Birds of Prey"
## [28] "Underwater"
## [29] "The King of Staten Island"
## [30] "Enola Holmes"
## [31] "The Forgotten Battle"
## [32] "Rifkin's Festival"
## [33] "Boss Level"
## [34] "Sonic the Hedgehog"
## [35] "Demon Slayer: Mugen Train"
## [36] "Mulan"
## [37] "The Trial of the Chicago 7"
## [38] "The Empty Man"
## [39] "The Nest"
## [40] "Unhinged"
## [41] "Greyhound"
## [42] "The Old Guard"
## [43] "Love and Monsters"
## [44] "I'm Thinking of Ending Things"
## [45] "Shadow in the Cloud"
## [46] "Escape from Pretoria"
## [47] "Run"
## [48] "Extraction"
## [49] "Greenland"
## [50] "Zola"
## [51] "Rebecca"
## [52] "Minari"
## [53] "The New Mutants"
## [54] "A Shot Through the Wall"
## [55] "Run Hide Fight"
## [56] "The Croods: A New Age"
## [57] "Bad Boys for Life"
## [58] "The World to Come"
## [59] "Ava"
## [60] "The Witches"
## [61] "Shiva Baby"
## [62] "Supernova"
## [63] "Black Bear"
## [64] "News of the World"
## [65] "Eurovision Song Contest: The Story of Fire Saga"
## [66] "Onward"
## [67] "Mank"
## [68] "Joe Bell"
## [69] "Freaky"
## [70] "Bruised"
## [71] "Bill & Ted Face the Music"
## [72] "Spenser Confidential"
## [73] "The Babysitter: Killer Queen"
## [74] "Black Box"
## [75] "Pieces of a Woman"
## [76] "A Perfect Enemy"
## [77] "The Call of the Wild"
## [78] "The Midnight Sky"
## [79] "The Wrong Missy"
## [80] "Ammonite"
## [81] "Amulet"
## [82] "Finding You"
## [83] "Monday"
## [84] "Death of a Telemarketer"
## [85] "Falling for Figaro"
## [86] "Honest Thief"
## [87] "Minamata"
## [88] "Attack on Titan: Chronicle"
## [89] "Bloodshot"
## [90] "Trolls World Tour"
## [91] "Possessor"
## [92] "All the Bright Places"
## [93] "The Comeback Trail"
## [94] "Let Him Go"
## [95] "Definition Please"
## [96] "Project Power"
## [97] "The Paper Tigers"
## [98] "#Alive"
## [99] "Persian Lessons"
## [100] "Inheritance"
# Use a CSS selector to scrape the description paragraphs
description_data_html <- webpage %>% html_nodes('.ratings-bar+ .text-muted')
description_data_html
## {xml_nodeset (100)}
## [1] <p class="text-muted">\nTormented by his past, a garbage man named Clean ...
## [2] <p class="text-muted">\nMassimo is a member of the Sicilian Mafia family ...
## [3] <p class="text-muted">\nA man refuses all assistance from his daughter a ...
## [4] <p class="text-muted">\nArmed with only one word, Tenet, and fighting fo ...
## [5] <p class="text-muted">\nA young woman, traumatized by a tragic event in ...
## [6] <p class="text-muted">\nIn 1800s England, a well meaning but selfish you ...
## [7] <p class="text-muted">\nA widow begins to uncover her recently deceased ...
## [8] <p class="text-muted">\nFollowing the events at home, the Abbott family ...
## [9] <p class="text-muted">\nBased on the 2014 romance novel of the same name ...
## [10] <p class="text-muted">\nFour high school teachers consume alcohol on a d ...
## [11] <p class="text-muted">\nSinister characters converge around a young man ...
## [12] <p class="text-muted">\nThe real life of one of America's foremost found ...
## [13] <p class="text-muted">\nA physician who can talk to animals embarks on a ...
## [14] <p class="text-muted">\nA woman in her sixties, after losing everything ...
## [15] <p class="text-muted">\nCold War spy Greville Wynne and his Russian sour ...
## [16] <p class="text-muted">\nDiana must contend with a work colleague and bus ...
## [17] <p class="text-muted">\nWhen Cpt. Artemis and her loyal soldiers are tra ...
## [18] <p class="text-muted">\nAfter landing the gig of a lifetime, a New York ...
## [19] <p class="text-muted">\nMarkus goes home to his teenage daughter, Mathil ...
## [20] <p class="text-muted">\nA crooked legal guardian who drains the savings ...
## ...
# Extract the description text from each paragraph node
description_data <- description_data_html %>% html_text()
# Take a look at the first few descriptions
description_data %>% head()
## [1] "\nTormented by his past, a garbage man named Clean attempts a quiet life of redemption. But, soon finds himself forced to reconcile with the violence of his past."
## [2] "\nMassimo is a member of the Sicilian Mafia family and Laura is a sales director. She does not expect that on a trip to Sicily trying to save her relationship, Massimo will kidnap her and give her 365 days to fall in love with him."
## [3] "\nA man refuses all assistance from his daughter as he ages. As he tries to make sense of his changing circumstances, he begins to doubt his loved ones, his own mind and even the fabric of his reality."
## [4] "\nArmed with only one word, Tenet, and fighting for the survival of the entire world, a Protagonist journeys through a twilight world of international espionage on a mission that will unfold in something beyond real time."
## [5] "\nA young woman, traumatized by a tragic event in her past, seeks out vengeance against those who crossed her path."
## [6] "\nIn 1800s England, a well meaning but selfish young woman meddles in the love lives of her friends."
# Drop the newline that starts each description
description_data <- description_data %>% str_remove("^\\n")
head(description_data)
## [1] "Tormented by his past, a garbage man named Clean attempts a quiet life of redemption. But, soon finds himself forced to reconcile with the violence of his past."
## [2] "Massimo is a member of the Sicilian Mafia family and Laura is a sales director. She does not expect that on a trip to Sicily trying to save her relationship, Massimo will kidnap her and give her 365 days to fall in love with him."
## [3] "A man refuses all assistance from his daughter as he ages. As he tries to make sense of his changing circumstances, he begins to doubt his loved ones, his own mind and even the fabric of his reality."
## [4] "Armed with only one word, Tenet, and fighting for the survival of the entire world, a Protagonist journeys through a twilight world of international espionage on a mission that will unfold in something beyond real time."
## [5] "A young woman, traumatized by a tragic event in her past, seeks out vengeance against those who crossed her path."
## [6] "In 1800s England, a well meaning but selfish young woman meddles in the love lives of her friends."
# Scrape each movie's runtime, drop the " min" suffix, and convert to integer
runtime_data <- webpage %>%
  html_nodes('.runtime') %>%
  html_text() %>%
  str_remove(" min") %>%
  as.integer()
runtime_data
## [1] 94 114 97 150 113 124 107 97 105 117 138 160 101 107 112 151 103 100
## [19] 116 118 94 109 117 90 124 90 109 95 136 123 124 88 100 99 117 115
## [37] 129 137 107 90 91 125 109 134 83 106 90 116 119 86 123 115 94 90
## [55] 109 95 124 105 96 106 77 95 104 118 123 102 131 94 102 129 91 111
## [73] 101 100 126 89 100 118 90 120 99 119 116 88 104 99 115 122 109 90
## [91] 103 107 104 113 91 113 108 98 127 111
Collect the (first) genre of each movie:
# Collect the first listed genre of each movie.
genre_data <- webpage %>%
  # Use a CSS selector to scrape the genre section
  html_nodes('.genre') %>%
  # Convert the genre nodes to text
  html_text() %>%
  # Keep the first genre name. The hyphen is part of the character class so
  # that hyphenated genres such as "Sci-Fi" are kept whole instead of being
  # cut off at "Sci".
  str_extract("[[:alpha:]\\-]+")
genre_data
## [1] "Crime" "Drama" "Drama" "Action" "Crime" "Comedy"
## [7] "Horror" "Drama" "Drama" "Comedy" "Crime" "Biography"
## [13] "Adventure" "Drama" "Drama" "Action" "Action" "Animation"
## [19] "Action" "Comedy" "Comedy" "Action" "Crime" "Action"
## [25] "Drama" "Comedy" "Action" "Adventure" "Comedy" "Action"
## [31] "Drama" "Comedy" "Mystery" "Action" "Animation" "Action"
## [37] "Drama" "Horror" "Drama" "Action" "Action" "Action"
## [43] "Action" "Drama" "Action" "Thriller" "Mystery" "Action"
## [49] "Action" "Comedy" "Drama" "Drama" "Action" "Crime"
## [55] "Crime" "Animation" "Action" "Drama" "Thriller" "Adventure"
## [61] "Comedy" "Drama" "Comedy" "Action" "Comedy" "Animation"
## [67] "Biography" "Biography" "Comedy" "Drama" "Adventure" "Action"
## [73] "Comedy" "Horror" "Drama" "Adventure" "Adventure" "Adventure"
## [79] "Comedy" "Biography" "Horror" "Drama" "Drama" "Comedy"
## [85] "Comedy" "Action" "Drama" "Animation" "Action" "Animation"
## [91] "Horror" "Drama" "Comedy" "Crime" "Comedy" "Action"
## [97] "Action" "Action" "Drama" "Drama"
Rating data:
# Scrape the IMDb rating of each movie and convert it to a number
rating_data <- as.numeric(
  html_text(html_nodes(webpage, '.ratings-imdb-rating strong'))
)
rating_data
## [1] 5.7 3.3 8.3 7.4 7.5 6.7 6.5 7.3 5.2 7.7 7.1 8.4 5.6 7.3 7.2 5.4 5.3 8.1
## [19] 7.6 6.3 5.0 4.9 6.9 6.5 7.1 7.4 6.1 5.9 7.1 6.6 7.1 6.2 6.8 6.5 8.2 5.7
## [37] 7.8 6.2 6.3 6.0 7.0 6.7 7.0 6.6 4.9 6.8 6.7 6.7 6.4 6.5 6.0 7.5 5.3 5.2
## [55] 6.4 7.0 6.5 6.2 5.4 5.3 7.2 6.9 6.6 6.8 6.5 7.4 6.9 5.8 6.4 6.2 6.0 6.2
## [73] 5.8 6.2 7.1 5.5 6.8 5.6 5.7 6.5 4.8 6.3 6.1 4.8 6.3 6.0 7.6 8.6 5.7 6.1
## [91] 6.5 6.5 5.7 6.7 5.5 6.0 6.4 6.3 7.4 5.6
Vote data
# Scrape the vote count of each movie and convert it to a number.
votes_data <- webpage %>%
  html_nodes('.sort-num_votes-visible span:nth-child(2)') %>%
  html_text() %>%
  # Remove every thousands separator. str_replace() drops only the first
  # comma, so a count of a million or more ("1,234,567") would become NA.
  str_replace_all(",", "") %>%
  as.numeric()
votes_data
## [1] 1719 72490 122060 450511 150403 45946 33609 194104 27974 131815
## [11] 120985 81952 59472 143720 47005 246748 52801 297148 39183 123605
## [21] 2820 46377 20255 96960 206854 142969 221078 76432 57626 156645
## [31] 23522 5699 58849 113997 45887 141139 166635 23282 12473 61865
## [41] 89424 152826 116353 78392 23790 33104 64348 187189 106744 10932
## [51] 39186 70074 71524 244 21227 38036 151720 6762 51340 36581
## [61] 16213 9661 9962 81579 89334 134889 69690 4921 49987 12778
## [71] 43202 83009 35712 13231 46479 2031 46106 80331 37476 15618
## [81] 3391 3385 3402 271 1429 46547 15964 7834 73903 21518
## [91] 30962 27974 6624 23096 113 82853 3756 35231 6591 11994
Director information
# Scrape the first person link in each credits paragraph (the director)
directors_data <- html_text(
  html_nodes(webpage, '.text-muted+ p a:nth-child(1)')
)
directors_data
## [1] "Paul Solet" "Barbara Bialowas"
## [3] "Florian Zeller" "Christopher Nolan"
## [5] "Emerald Fennell" "Autumn de Wilde"
## [7] "David Bruckner" "John Krasinski"
## [9] "Roger Kumble" "Thomas Vinterberg"
## [11] "Antonio Campos" "Thomas Kail"
## [13] "Stephen Gaghan" "Chloé Zhao"
## [15] "Dominic Cooke" "Patty Jenkins"
## [17] "Paul W.S. Anderson" "Pete Docter"
## [19] "Anders Thomas Jensen" "J Blakeson"
## [21] "Gia Coppola" "Jeff Wadlow"
## [23] "Robert Connolly" "Craig Zobel"
## [25] "Leigh Whannell" "Max Barbakow"
## [27] "Cathy Yan" "William Eubank"
## [29] "Judd Apatow" "Harry Bradbeer"
## [31] "Matthijs van Heijningen Jr." "Woody Allen"
## [33] "Joe Carnahan" "Jeff Fowler"
## [35] "Haruo Sotozaki" "Niki Caro"
## [37] "Aaron Sorkin" "David Prior"
## [39] "Sean Durkin" "Derrick Borte"
## [41] "Aaron Schneider" "Gina Prince-Bythewood"
## [43] "Michael Matthews" "Charlie Kaufman"
## [45] "Roseanne Liang" "Francis Annan"
## [47] "Aneesh Chaganty" "Sam Hargrave"
## [49] "Ric Roman Waugh" "Janicza Bravo"
## [51] "Ben Wheatley" "Lee Isaac Chung"
## [53] "Josh Boone" "Aimee Long"
## [55] "Kyle Rankin" "Joel Crawford"
## [57] "Adil El Arbi" "Mona Fastvold"
## [59] "Tate Taylor" "Robert Zemeckis"
## [61] "Emma Seligman" "Harry Macqueen"
## [63] "Lawrence Michael Levine" "Paul Greengrass"
## [65] "David Dobkin" "Dan Scanlon"
## [67] "David Fincher" "Reinaldo Marcus Green"
## [69] "Christopher Landon" "Halle Berry"
## [71] "Dean Parisot" "Peter Berg"
## [73] "McG" "Emmanuel Osei-Kuffour"
## [75] "Kornél Mundruczó" "Kike Maíllo"
## [77] "Chris Sanders" "George Clooney"
## [79] "Tyler Spindel" "Francis Lee"
## [81] "Romola Garai" "Brian Baugh"
## [83] "Argyris Papadimitropoulos" "Khaled Ridgeway"
## [85] "Ben Lewin" "Mark Williams"
## [87] "Andrew Levitas" "Masashi Koizuka"
## [89] "Dave Wilson" "Walt Dohrn"
## [91] "Brandon Cronenberg" "Brett Haley"
## [93] "George Gallo" "Thomas Bezucha"
## [95] "Sujata Day" "Henry Joost"
## [97] "Quoc Bao Tran" "Il Cho"
## [99] "Vadim Perelman" "Vaughn Stein"
Only the first actor
# Scrape the first actor listed for each movie
actors_data <- html_text(
  html_nodes(webpage, '.lister-item-content .ghost+ a')
)
actors_data
## [1] "Adrien Brody" "Anna Maria Sieklucka" "Anthony Hopkins"
## [4] "John David Washington" "Carey Mulligan" "Anya Taylor-Joy"
## [7] "Rebecca Hall" "Emily Blunt" "Josephine Langford"
## [10] "Mads Mikkelsen" "Bill Skarsgård" "Lin-Manuel Miranda"
## [13] "Robert Downey Jr." "Frances McDormand" "Benedict Cumberbatch"
## [16] "Gal Gadot" "Milla Jovovich" "Jamie Foxx"
## [19] "Mads Mikkelsen" "Rosamund Pike" "Andrew Garfield"
## [22] "Michael Peña" "Eric Bana" "Betty Gilpin"
## [25] "Elisabeth Moss" "Andy Samberg" "Margot Robbie"
## [28] "Kristen Stewart" "Pete Davidson" "Millie Bobby Brown"
## [31] "Gijs Blom" "Wallace Shawn" "Frank Grillo"
## [34] "Ben Schwartz" "Natsuki Hanae" "Liu Yifei"
## [37] "Eddie Redmayne" "James Badge Dale" "Jude Law"
## [40] "Russell Crowe" "Tom Hanks" "Charlize Theron"
## [43] "Dylan O'Brien" "Jesse Plemons" "Chloë Grace Moretz"
## [46] "Daniel Radcliffe" "Sarah Paulson" "Chris Hemsworth"
## [49] "Gerard Butler" "Taylour Paige" "Lily James"
## [52] "Steven Yeun" "Maisie Williams" "Kenny Leu"
## [55] "Isabel May" "Nicolas Cage" "Will Smith"
## [58] "Katherine Waterston" "Jessica Chastain" "Anne Hathaway"
## [61] "Rachel Sennott" "Colin Firth" "Aubrey Plaza"
## [64] "Tom Hanks" "Will Ferrell" "Tom Holland"
## [67] "Gary Oldman" "Mark Wahlberg" "Vince Vaughn"
## [70] "Halle Berry" "Keanu Reeves" "Mark Wahlberg"
## [73] "Judah Lewis" "Mamoudou Athie" "Vanessa Kirby"
## [76] "Dominique Pinon" "Harrison Ford" "George Clooney"
## [79] "David Spade" "Kate Winslet" "Carla Juri"
## [82] "Rose Reid" "Sebastian Stan" "Lamorne Morris"
## [85] "Danielle Macdonald" "Liam Neeson" "Akiko Iwase"
## [88] "Marina Inoue" "Vin Diesel" "Anna Kendrick"
## [91] "Andrea Riseborough" "Elle Fanning" "Robert De Niro"
## [94] "Diane Lane" "Katrina Bowden" "Jamie Foxx"
## [97] "Yuji Okumoto" "Yoo Ah-in" "Nahuel Pérez Biscayart"
## [100] "Lily Collins"
We encounter the issue of missing data when scraping metascore.
We see there are only 90 meta scores. 10 movies don’t have meta scores. We may manually find which movies don’t have meta scores but that’s tedious and not reproducible.
# Use a CSS selector to scrape the metascore nodes
ms_data_html <- webpage %>% html_nodes('.metascore')
# Extract the metascore text
ms_data <- ms_data_html %>% html_text()
# Strip trailing whitespace and convert the metascores to integers
ms_data <- as.integer(str_replace(ms_data, "\\s*$", ""))
# Let's have a look at the metascores
ms_data
## [1] 43 88 69 73 71 68 71 14 79 55 90 26 93 65 60 47 83 81 66 36 22 69 50 72 83
## [26] 60 48 67 68 43 56 47 75 66 76 80 40 64 70 63 78 66 56 67 56 64 76 46 89 43
## [51] 13 56 59 73 39 47 79 73 79 73 50 61 79 54 67 52 65 49 22 62 66 48 58 33 72
## [76] 62 41 58 51 46 51 44 51 72 61 63 71 51 67 31
First let’s tally index and corresponding metascore (if present).
# Scrape ranks and metascores with one selector so that each metascore
# directly follows the rank of the movie it belongs to
rank_and_metascore <- webpage %>%
  html_nodes('.unfavorable , .text-primary , .favorable , .mixed') %>%
  html_text() %>%
  str_replace("\\s*$", "")
print(rank_and_metascore)
## [1] "1." "43" "2." "3." "88" "4." "69" "5." "73" "6."
## [11] "71" "7." "68" "8." "71" "9." "14" "10." "79" "11."
## [21] "55" "12." "90" "13." "26" "14." "93" "15." "65" "16."
## [31] "60" "17." "47" "18." "83" "19." "81" "20." "66" "21."
## [41] "36" "22." "22" "23." "69" "24." "50" "25." "72" "26."
## [51] "83" "27." "60" "28." "48" "29." "67" "30." "68" "31."
## [61] "32." "43" "33." "56" "34." "47" "35." "75" "36." "66"
## [71] "37." "76" "38." "39." "80" "40." "40" "41." "64" "42."
## [81] "70" "43." "63" "44." "78" "45." "66" "46." "56" "47."
## [91] "67" "48." "56" "49." "64" "50." "76" "51." "46" "52."
## [101] "89" "53." "43" "54." "55." "13" "56." "56" "57." "59"
## [111] "58." "73" "59." "39" "60." "47" "61." "79" "62." "73"
## [121] "63." "79" "64." "73" "65." "50" "66." "61" "67." "79"
## [131] "68." "54" "69." "67" "70." "52" "71." "65" "72." "49"
## [141] "73." "22" "74." "62" "75." "66" "76." "77." "48" "78."
## [151] "58" "79." "33" "80." "72" "81." "62" "82." "41" "83."
## [161] "58" "84." "85." "51" "86." "46" "87." "51" "88." "89."
## [171] "44" "90." "51" "91." "72" "92." "61" "93." "94." "63"
## [181] "95." "71" "96." "51" "97." "67" "98." "99." "100." "31"
# Rebuild a full-length metascore vector, with NA for movies lacking a score.
# `rank_and_metascore` interleaves rank strings ("1.", "2.", ...) with the
# metascores that follow them; rank entries are the ones ending in a period.
isrank <- str_detect(rank_and_metascore, "\\.$")
# A rank has no metascore when the next entry is also a rank, or when it is
# the very last entry (nothing follows it).
ismissing <- isrank & c(isrank[-1], TRUE)
missingpos <- as.integer(rank_and_metascore[ismissing])
# One slot per scraped rank instead of a hard-coded 100.
metascore_data <- rep(NA_integer_, sum(isrank))
# Fill by positive index: `x[-missingpos]` selects nothing when `missingpos`
# is empty, so the scores would not be stored if every movie had one.
metascore_data[setdiff(seq_along(metascore_data), missingpos)] <- ms_data
metascore_data
## [1] 43 NA 88 69 73 71 68 71 14 79 55 90 26 93 65 60 47 83 81 66 36 22 69 50 72
## [26] 83 60 48 67 68 NA 43 56 47 75 66 76 NA 80 40 64 70 63 78 66 56 67 56 64 76
## [51] 46 89 43 NA 13 56 59 73 39 47 79 73 79 73 50 61 79 54 67 52 65 49 22 62 66
## [76] NA 48 58 33 72 62 41 58 NA 51 46 51 NA 44 51 72 61 NA 63 71 51 67 NA NA 31
Be careful with missing data.
# Use a CSS selector to scrape the gross revenue nodes
gross_data_html <- webpage %>% html_nodes('.ghost~ .text-muted+ span')
# Extract the gross revenue text
gross_data <- gross_data_html %>% html_text()
# Let's have a look at the gross data
gross_data
## [1] "$58.46M" "$160.07M" "$2.39M" "$77.05M" "$46.37M" "$27.31M"
## [7] "$70.41M" "$84.16M" "$148.97M" "$47.70M" "$1.07M" "$58.57M"
## [13] "$206.31M" "$61.56M" "$62.34M"
# Data preprocessing: strip the "$" and "M" markers wherever they occur,
# rather than by character position, and convert to numeric (in millions).
# This uses the same pattern as the rank-and-gross scrape later on.
gross_data <- gross_data %>%
  str_replace_all("[$M]", "") %>%
  as.numeric()
# Let's look at the cleaned gross data
gross_data
## [1] 58.46 160.07 2.39 77.05 46.37 27.31 70.41 84.16 148.97 47.70
## [11] 1.07 58.57 206.31 61.56 62.34
85 (out of 100) movies don’t have gross data yet! We need a better way to figure out missing entries.
# Scrape ranks and gross revenues with one selector so that each gross value
# directly follows the rank of the movie it belongs to
rank_and_gross <- webpage %>%
  html_nodes('.ghost~ .text-muted+ span , .text-primary') %>%
  html_text() %>%
  str_replace("\\s+", "") %>%
  str_remove_all("[$M]")
rank_and_gross
## [1] "1." "2." "3." "4." "58.46" "5." "6." "7."
## [9] "8." "160.07" "9." "2.39" "10." "11." "12." "13."
## [17] "77.05" "14." "15." "16." "46.37" "17." "18." "19."
## [25] "20." "21." "22." "27.31" "23." "24." "25." "70.41"
## [33] "26." "27." "84.16" "28." "29." "30." "31." "32."
## [41] "33." "34." "148.97" "35." "47.70" "36." "37." "38."
## [49] "39." "40." "41." "42." "43." "1.07" "44." "45."
## [57] "46." "47." "48." "49." "50." "51." "52." "53."
## [65] "54." "55." "56." "58.57" "57." "206.31" "58." "59."
## [73] "60." "61." "62." "63." "64." "65." "66." "61.56"
## [81] "67." "68." "69." "70." "71." "72." "73." "74."
## [89] "75." "76." "77." "62.34" "78." "79." "80." "81."
## [97] "82." "83." "84." "85." "86." "87." "88." "89."
## [105] "90." "91." "92." "93." "94." "95." "96." "97."
## [113] "98." "99." "100."
# Rebuild a full-length gross vector, with NA for movies lacking gross data.
# `rank_and_gross` interleaves rank strings ("1.", "2.", ...) with the gross
# values that follow them; rank entries are the ones ending in a period.
isrank <- str_detect(rank_and_gross, "\\.$")
# A rank has no gross value when the next entry is also a rank, or when it is
# the very last entry (nothing follows it).
ismissing <- isrank & c(isrank[-1], TRUE)
missingpos <- as.integer(rank_and_gross[ismissing])
# One slot per scraped rank instead of a hard-coded 100.
gs_data <- rep(NA_real_, sum(isrank))
# Fill by positive index: `x[-missingpos]` selects nothing when `missingpos`
# is empty, so the values would not be stored if every movie had gross data.
gs_data[setdiff(seq_along(gs_data), missingpos)] <- gross_data
gross_data <- gs_data
gross_data
## [1] NA NA NA 58.46 NA NA NA 160.07 2.39 NA
## [11] NA NA 77.05 NA NA 46.37 NA NA NA NA
## [21] NA 27.31 NA NA 70.41 NA 84.16 NA NA NA
## [31] NA NA NA 148.97 47.70 NA NA NA NA NA
## [41] NA NA 1.07 NA NA NA NA NA NA NA
## [51] NA NA NA NA NA 58.57 206.31 NA NA NA
## [61] NA NA NA NA NA 61.56 NA NA NA NA
## [71] NA NA NA NA NA NA 62.34 NA NA NA
## [81] NA NA NA NA NA NA NA NA NA NA
## [91] NA NA NA NA NA NA NA NA NA NA
Form a tibble:
# Combining all the lists to form a data frame
# One row per movie, one column per scraped feature
movies <- tibble(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)
# Print without truncating columns to the console width
print(movies, width = Inf)
## # A tibble: 100 × 11
## Rank Title
## <int> <chr>
## 1 1 Clean
## 2 2 365 Days
## 3 3 The Father
## 4 4 Tenet
## 5 5 Promising Young Woman
## 6 6 Emma.
## 7 7 The Night House
## 8 8 A Quiet Place Part II
## 9 9 After We Collided
## 10 10 Another Round
## Description
## <chr>
## 1 Tormented by his past, a garbage man named Clean attempts a quiet life of re…
## 2 Massimo is a member of the Sicilian Mafia family and Laura is a sales direct…
## 3 A man refuses all assistance from his daughter as he ages. As he tries to ma…
## 4 Armed with only one word, Tenet, and fighting for the survival of the entire…
## 5 A young woman, traumatized by a tragic event in her past, seeks out vengeanc…
## 6 In 1800s England, a well meaning but selfish young woman meddles in the love…
## 7 A widow begins to uncover her recently deceased husband's disturbing secrets.
## 8 Following the events at home, the Abbott family now face the terrors of the …
## 9 Based on the 2014 romance novel of the same name, this follows the love life…
## 10 Four high school teachers consume alcohol on a daily basis to see how it aff…
## Runtime Genre Rating Metascore Votes Gross_Earning_in_Mil Director
## <int> <chr> <dbl> <int> <dbl> <dbl> <chr>
## 1 94 Crime 5.7 43 1719 NA Paul Solet
## 2 114 Drama 3.3 NA 72490 NA Barbara Bialowas
## 3 97 Drama 8.3 88 122060 NA Florian Zeller
## 4 150 Action 7.4 69 450511 58.5 Christopher Nolan
## 5 113 Crime 7.5 73 150403 NA Emerald Fennell
## 6 124 Comedy 6.7 71 45946 NA Autumn de Wilde
## 7 107 Horror 6.5 68 33609 NA David Bruckner
## 8 97 Drama 7.3 71 194104 160. John Krasinski
## 9 105 Drama 5.2 14 27974 2.39 Roger Kumble
## 10 117 Comedy 7.7 79 131815 NA Thomas Vinterberg
## Actor
## <chr>
## 1 Adrien Brody
## 2 Anna Maria Sieklucka
## 3 Anthony Hopkins
## 4 John David Washington
## 5 Carey Mulligan
## 6 Anya Taylor-Joy
## 7 Rebecca Hall
## 8 Emily Blunt
## 9 Josephine Langford
## 10 Mads Mikkelsen
## # … with 90 more rows
How many top 100 movies are in each genre? (Be careful with interpretation.)
# Bar chart of the number of movies per (first) genre
ggplot(data = movies, mapping = aes(x = Genre)) +
  geom_bar()
Which genre is most profitable in terms of average gross earnings?
# Column chart of the mean gross earning per genre, ignoring missing values
movies %>%
  group_by(Genre) %>%
  summarise(avg_earning = mean(Gross_Earning_in_Mil, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = Genre, y = avg_earning)) +
  geom_col() +
  labs(y = "avg earning in millions")
## Warning: Removed 6 rows containing missing values (position_stack).
# Box plot of gross earnings within each genre
movies %>%
  ggplot(mapping = aes(x = Genre, y = Gross_Earning_in_Mil)) +
  geom_boxplot() +
  labs(y = "Gross earning in millions")
## Warning: Removed 85 rows containing non-finite values (stat_boxplot).
Is there a relationship between gross earning and rating? Find the best selling movie (by gross earning) in each genre.
library("ggrepel")
# Within each genre, keep the movie ranked first by descending gross earning
best_in_genre <- movies %>%
  group_by(Genre) %>%
  filter(row_number(desc(Gross_Earning_in_Mil)) == 1)
print(best_in_genre, width = Inf)
## # A tibble: 4 × 11
## # Groups: Genre [4]
## Rank Title
## <int> <chr>
## 1 8 A Quiet Place Part II
## 2 13 Dolittle
## 3 57 Bad Boys for Life
## 4 66 Onward
## Description
## <chr>
## 1 Following the events at home, the Abbott family now face the terrors of the o…
## 2 A physician who can talk to animals embarks on an adventure to find a legenda…
## 3 Miami detectives Mike Lowrey and Marcus Burnett must face off against a mothe…
## 4 Two elven brothers embark on a quest to bring their father back for one day.
## Runtime Genre Rating Metascore Votes Gross_Earning_in_Mil Director
## <int> <chr> <dbl> <int> <dbl> <dbl> <chr>
## 1 97 Drama 7.3 71 194104 160. John Krasinski
## 2 101 Adventure 5.6 26 59472 77.0 Stephen Gaghan
## 3 124 Action 6.5 59 151720 206. Adil El Arbi
## 4 102 Animation 7.4 61 134889 61.6 Dan Scanlon
## Actor
## <chr>
## 1 Emily Blunt
## 2 Robert Downey Jr.
## 3 Will Smith
## 4 Tom Holland
# Scatter plot of gross earning against rating; point size shows votes, color
# shows genre, and the best-selling movie of each genre is labelled
movies %>%
  ggplot(mapping = aes(x = Rating, y = Gross_Earning_in_Mil)) +
  geom_point(mapping = aes(size = Votes, color = Genre)) +
  ggrepel::geom_label_repel(mapping = aes(label = Title), data = best_in_genre) +
  labs(y = "Gross earning in millions")
## Warning: Removed 85 rows containing missing values (geom_point).
The quantmod package contains many utility functions for retrieving and plotting financial data. E.g.,
library(quantmod)
# Download TSLA price data from Yahoo starting 2020-01-01 and keep the
# returned object in `stock` (auto.assign = FALSE)
stock <- getSymbols(
  "TSLA",
  src = "yahoo",
  from = "2020-01-01",
  auto.assign = FALSE
)
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
head(stock)
## TSLA.Open TSLA.High TSLA.Low TSLA.Close TSLA.Volume TSLA.Adjusted
## 2020-01-02 84.900 86.140 84.342 86.052 47660500 86.052
## 2020-01-03 88.100 90.800 87.384 88.602 88892500 88.602
## 2020-01-06 88.094 90.312 88.000 90.308 50665000 90.308
## 2020-01-07 92.280 94.326 90.672 93.812 89410500 93.812
## 2020-01-08 94.740 99.698 93.646 98.428 155721500 98.428
## 2020-01-09 99.420 99.760 94.574 96.268 142202000 96.268
# Plot the downloaded series as a line chart with the white chart theme.
# Log scaling is switched off and TA is set to NULL
# (presumably so that no technical-indicator panels are drawn; confirm in ?chartSeries).
chartSeries(stock, theme = chartTheme("white"),
type = "line", log.scale = FALSE, TA = NULL)
Read blog: https://towardsdatascience.com/pulling-tweets-into-r-e17d4981cfe2
The twitteR package is useful for pulling tweet text data into R.
library(twitteR) #load package
Step 1: apply for a Twitter developer account. It takes some time to get approved.
Step 2: Generate and copy the Twitter App Keys.
# Twitter app credentials. The 'XXXXXXXXXX' values are placeholders to be
# replaced with the keys generated in Step 2.
# NOTE(review): avoid committing real keys in a shared script; consider
# reading them from environment variables (e.g. with Sys.getenv()) instead.
consumer_key <- 'XXXXXXXXXX'
consumer_secret <- 'XXXXXXXXXX'
access_token <- 'XXXXXXXXXX'
access_secret <- 'XXXXXXXXXX'
# Authenticate this R session with the four credentials above
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)
# Search for up to 1000 tweets matching both hashtags since 2020-01-01
virus <- searchTwitter(
  '#China + #Coronavirus',
  n = 1000,
  since = '2020-01-01',
  retryOnRateLimit = 1e3
)
# Convert the list of tweets to a data frame, then to a tibble
virus_df <- virus %>%
  twListToDF() %>%
  as_tibble()
print(virus_df, width = Inf)