Display machine information:
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.29 R6_2.5.1 jsonlite_1.7.2 magrittr_2.0.1
## [5] evaluate_0.14 stringi_1.7.6 rlang_1.0.1 cli_3.1.0
## [9] rstudioapi_0.13 jquerylib_0.1.4 bslib_0.3.1 rmarkdown_2.11
## [13] tools_3.6.0 stringr_1.4.0 xfun_0.29 yaml_2.2.1
## [17] fastmap_1.1.0 compiler_3.6.0 htmltools_0.5.2 knitr_1.37
## [21] sass_0.4.0
Load database libraries and the tidyverse frontend:
suppressPackageStartupMessages(library(dbplyr))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(lubridate))
library(DBI)
library(bigrquery)
Below is an outline of steps.
# path to the service account token
if (Sys.info()[["nodename"]] == "biostat-203b-teaching-server") {
# on teaching server
satoken <- "/mnt/mimiciv/1.0/biostat-203b-2022winter-3fdc2392ac39.json"
} else {
# on my own Mac laptop
satoken <- "/Users/huazhou/Documents/github.com/ucla-biostat-203b/2022winter/hw/hw3/biostat-203b-2022winter-3fdc2392ac39.json"
}
# BigQuery authentication using service account
bq_auth(
path = satoken,
# email = "mimiciv-bigquery@biostat-203b-2022winter.iam.gserviceaccount.com",
# scopes = c("https://www.googleapis.com/auth/bigquery",
# "https://www.googleapis.com/auth/cloud-platform")
)
mimic4_v1_0_203b
in GCP (Google Cloud Platform), using the billing account biostat-203b-2022winter
.# Connect to the BigQuery database `biostat-203b-2022winter.mimic4_v1_0_203b`
con <- dbConnect(
bigrquery::bigquery(),
project = "biostat-203b-2022winter",
dataset = "mimic4_v1_0_203b",
billing = "biostat-203b-2022winter"
)
con
## <BigQueryConnection>
## Dataset: biostat-203b-2022winter.mimic4_v1_0_203b
## Billing: biostat-203b-2022winter
mimic4_v1_0_203b
database.dbListTables(con)
## [1] "admissions" "chartevents" "d_items" "d_labitems" "icustays"
## [6] "labevents" "patients"
icustays
table.# full ICU stays table
icustays_tble <- tbl(con, "icustays") %>%
show_query() %>%
print(width = Inf)
## <SQL>
## SELECT *
## FROM `icustays`
## # Source: table<icustays> [?? x 8]
## # Database: BigQueryConnection
## subject_id hadm_id stay_id first_careunit last_careunit
## <int> <int> <int> <chr> <chr>
## 1 12776735 20817525 34547665 Neuro Stepdown Neuro Stepdown
## 2 16256226 20013290 39289362 Neuro Stepdown Neuro Stepdown
## 3 12974563 29618057 32563675 Neuro Stepdown Neuro Stepdown
## 4 14609218 20606189 34947848 Neuro Stepdown Neuro Stepdown
## 5 12687112 26132667 37445058 Neuro Stepdown Neuro Stepdown
## 6 18190935 24712081 30056748 Neuro Stepdown Neuro Stepdown
## 7 10404210 22880512 30254828 Neuro Stepdown Neuro Stepdown
## 8 12552973 21320643 30261549 Neuro Stepdown Neuro Stepdown
## 9 14303051 26559537 30401486 Neuro Stepdown Neuro Stepdown
## 10 12347959 22779073 30503984 Neuro Stepdown Neuro Stepdown
## intime outtime los
## <dttm> <dttm> <dbl>
## 1 2200-07-12 00:33:00 2200-07-13 16:44:40 1.67
## 2 2150-12-20 16:09:08 2150-12-21 14:58:40 0.951
## 3 2138-11-13 23:30:01 2138-11-15 16:25:19 1.71
## 4 2174-06-28 21:13:00 2174-07-05 17:01:32 6.83
## 5 2162-05-31 18:08:45 2162-06-04 10:16:13 3.67
## 6 2115-12-08 09:39:45 2115-12-12 02:36:52 3.71
## 7 2161-03-03 18:30:00 2161-03-05 18:10:31 1.99
## 8 2135-09-21 18:12:03 2135-09-23 12:56:46 1.78
## 9 2175-11-20 01:02:00 2175-11-21 16:06:17 1.63
## 10 2140-07-11 17:54:14 2140-07-19 13:01:05 7.80
## # … with more rows
We only keep the first ICU stay. Following code is kind of a hack, using the summarise_all(min)
function. It seems that slice_min()
, slice_head()
, distinct(, .keep_all = TRUE)
don’t work with dbplyr
+bigrquery
at the moment.
icustays_tble <- icustays_tble %>%
select(subject_id, intime) %>%
group_by(subject_id) %>%
summarise_all(min) %>%
left_join(icustays_tble, by = c("subject_id", "intime")) %>%
show_query() %>%
print(width = Inf)
## Warning: Missing values are always removed in SQL.
## Use `MIN(x, na.rm = TRUE)` to silence this warning
## This warning is displayed only once per session.
## <SQL>
## SELECT `LHS`.`subject_id` AS `subject_id`, `LHS`.`intime` AS `intime`, `hadm_id`, `stay_id`, `first_careunit`, `last_careunit`, `outtime`, `los`
## FROM (SELECT `subject_id`, MIN(`intime`) AS `intime`
## FROM (SELECT `subject_id`, `intime`
## FROM `icustays`) `q01`
## GROUP BY `subject_id`) `LHS`
## LEFT JOIN `icustays` AS `RHS`
## ON (`LHS`.`subject_id` = `RHS`.`subject_id` AND `LHS`.`intime` = `RHS`.`intime`)
##
## # Source: lazy query [?? x 8]
## # Database: BigQueryConnection
## subject_id intime hadm_id stay_id first_careunit
## <int> <dttm> <int> <int> <chr>
## 1 12776735 2200-07-12 00:33:00 20817525 34547665 Neuro Stepdown
## 2 16256226 2150-12-20 16:09:08 20013290 39289362 Neuro Stepdown
## 3 12974563 2138-11-13 23:30:01 29618057 32563675 Neuro Stepdown
## 4 14609218 2174-06-28 21:13:00 20606189 34947848 Neuro Stepdown
## 5 12687112 2162-05-31 18:08:45 26132667 37445058 Neuro Stepdown
## 6 18190935 2115-11-12 23:23:32 21137829 34338402 Neuro Stepdown
## 7 10404210 2161-03-03 18:30:00 22880512 30254828 Neuro Stepdown
## 8 12552973 2135-09-21 18:12:03 21320643 30261549 Neuro Stepdown
## 9 14303051 2175-11-20 01:02:00 26559537 30401486 Neuro Stepdown
## 10 12347959 2140-07-11 17:54:14 22779073 30503984 Neuro Stepdown
## last_careunit outtime los
## <chr> <dttm> <dbl>
## 1 Neuro Stepdown 2200-07-13 16:44:40 1.67
## 2 Neuro Stepdown 2150-12-21 14:58:40 0.951
## 3 Neuro Stepdown 2138-11-15 16:25:19 1.71
## 4 Neuro Stepdown 2174-07-05 17:01:32 6.83
## 5 Neuro Stepdown 2162-06-04 10:16:13 3.67
## 6 Neuro Stepdown 2115-11-15 18:06:26 2.78
## 7 Neuro Stepdown 2161-03-05 18:10:31 1.99
## 8 Neuro Stepdown 2135-09-23 12:56:46 1.78
## 9 Neuro Stepdown 2175-11-21 16:06:17 1.63
## 10 Neuro Stepdown 2140-07-19 13:01:05 7.80
## # … with more rows
admissions
table and only keep the patients who have a match in icustays_tble
(according to subject_id
and hadm_id
).# # TODO
# admissions_tble <-
patients
table and only keep the patients who have a match in icustays_tble
(according to subject_id
).# # TODO
# patients_tble <-
labevents
table and retrieve a subset that only contain subjects who appear in icustays_tble
and the lab items listed in HW2.# # TODO
# labevents_tble <-
Only keep the first lab measurements during ICU stay and pivot lab items to become variables/columns.
# # TODO
# labevents_tble <- labevents_tble %>% ...
chartevents
table and retrieve a subset that only contain subjects who appear in icustays_tble
and the chart events listed in HW2.# # TODO
# chartevents_tble <-
Only keep the first chart events during ICU stay and pivot chart events to become variables/columns.
# # TODO
# chartevents_tble <- chartevents_tble %>% ...
%>%
to perform following data wrangling steps: (i) start with the icustays_tble
for the first ICU stay of each unique patient, (ii) merge in admissions and patients tables, (iii) keep adults only (age at admission >= 18), (iv) merge in the labevents and chartevents tables, (v) create an indicator for 30-day mortality, (vi) save the final tibble to an icu_cohort.rds
R data file in the mimiciv_shiny
folder.# make a directory mimiciv_shiny
if (!dir.exists("mimiciv_shiny")) {
dir.create("mimiciv_shiny")
}
# # TODO
# icu_cohort <- icustays_tble %>% ...
Close database connection and clear workspace.
dbDisconnect(con)
rm(list = ls())
Develop a Shiny app for exploring the ICU cohort data created in Q1. The app should reside in the mimiciv_shiny
folder. The app should provide easy access to the graphical and numerical summaries of variables (demographics, lab measurements, vitals) in the ICU cohort.