This report was automatically generated with the R package knitr (version 1.14).
# knitr::stitch_rmd(script="./dal/correct/correct-units.R", output="./stitched-output/dal/correct/correct-units.md")
rm(list=ls(all=TRUE)) #Clear the variables from previous runs.
source("./dal/arch/population-proportion.R")
library(ggplot2, quietly=TRUE)
library(magrittr, quietly=TRUE)
requireNamespace("dplyr", quietly=TRUE)
requireNamespace("readr", quietly=TRUE)
requireNamespace("tidyr", quietly=TRUE)
# requireNamespace("digest", quietly=TRUE)
path_in <- "./data-unshared/derived/ds-visit.rds"
path_out_rds <- "./data-unshared/derived/ds-visit-corrected.rds"
path_out_csv <- "./data-unshared/derived/ds-visit-corrected.csv"
figure_path <- 'stitched-output/dal/correct/'
convert_inches_to_cm <- function( inches ) {
inches * 2.54 #Exact
}
convert_cm_to_inches <- function( inches ) {
inches / 2.54 #Exact
}
convert_pounds_to_kg <- function( pounds ) {
pounds / 2.204622622
}
convert_kg_to_pounds <- function( pounds ) {
pounds * 2.204622622
}
calculate_bmi <- function( kg, cm ) {
kg / (cm *cm) * 10000
}
col_types <- readr::cols(
hdid = readr::col_integer(),
header = readr::col_character(),
name_long = readr::col_character(),
min = readr::col_integer(),
max = readr::col_integer(),
desired = readr::col_logical()
)
ds <- readr::read_rds(path_in)
ds_header <- as.data.frame(readr::read_csv("./data-phi-free/metadata/metadata-obs-header.csv", col_types=col_types))
rownames(ds_header) <- ds_header$header
# OuhscMunge::column_rename_headstart(ds)
mean(is.na(ds$height_cm))
## [1] 0.2007884
mean(is.na(ds$weight_kg))
## [1] 0.0619131
ds <- ds %>%
dplyr::select_(
"record_id" = "`record_id`"
, "patient_index" = "`patient_index`"
, "yob" = "`yob`"
, "male" = "`male`"
, "ethnicity_race" = "`ethnicity_race`"
, "location" = "`location`"
, "obs_month" = "`obs_month`"
, "age" = "`age`"
, "bmi_centricity" = "`bmi`"
, "height_in" = "`height_in`"
, "height_cm" = "`height_cm`"
, "weight_lb" = "`weight_lb`"
, "weight_kg" = "`weight_kg`"
) %>%
dplyr::mutate(
location = dplyr::if_else(location=="adolesent", "adolescent", location),
# gender = dplyr::if_else(as.logical(male), "Male", "Female", missing = NA_character_),
gender = ifelse(male==1, "male", "female"),
age_truncated = floor(age),
height_cm = dplyr::coalesce(height_cm, convert_inches_to_cm(height_in)),
weight_kg = dplyr::coalesce(weight_kg, convert_pounds_to_kg(weight_lb)),
bmi_centricity = dplyr::if_else(dplyr::between(bmi_centricity, 5, 100), bmi_centricity, NA_real_)
# bmi = dplyr::if_else(dplyr::between(bmi, ds_header["bmi" , "min"], ds_header["bmi" , "max"]), bmi , NA_real_)
) %>%
dplyr::select(
-height_in,
-weight_lb
)
mean(is.na(ds$height_cm))
## [1] 0.1850399
mean(is.na(ds$weight_kg))
## [1] 0.02119199
# ggplot(ds, aes(x=height_in, y=height_cm)) +
# geom_abline(slope=2.54, color="gray70") +
# geom_point(shape=1, na.rm=T) +
# theme_light()
# ggplot(ds, aes(x=weight_lb, y=weight_kg)) +
# geom_abline(slope=1/2.204622622, color="gray70") +
# geom_point(shape=1, na.rm=T) +
# theme_light()
ggplot(ds, aes(x=bmi_centricity)) +
geom_histogram(bins=30, fill="gray70", na.rm=T) +
theme_light() +
labs(title="BMI before Corrections")
ggplot(ds, aes(x=height_cm)) +
geom_histogram(bins=30, fill="gray70", na.rm=T) +
theme_light() +
labs(title="Height before Corrections")
mean(is.na(ds$height_cm))
## [1] 0.1850399
ds_height <- ds %>%
determine_population_bounds(.outcome_name = "height_cm") %>%
dplyr::mutate(
# height_cm = dplyr::na_if(height_cm, range_extended_above),
height_cm = dplyr::if_else(range_extended_below, convert_inches_to_cm(height_cm), height_cm)
) %>%
dplyr::select(record_id, height_cm, gender, age_truncated) %>%
determine_population_bounds(.outcome_name = "height_cm") %>%
dplyr::mutate(
height_cm = dplyr::if_else(range_extended_below | range_extended_above, NA_real_, height_cm)
) %>%
dplyr::select(record_id, height_cm)
mean(is.na(ds_height$height_cm))
## [1] 0.203987
ggplot(ds_height, aes(x=height_cm)) +
geom_histogram(bins=30, fill="gray70", na.rm=T) +
theme_light() +
labs(title="Height after Corrections")
ggplot(ds, aes(x=weight_kg)) +
geom_histogram(bins=30, fill="gray70", na.rm=T) +
theme_light() +
labs(title="Weight before Corrections")
mean(is.na(ds$weight_kg))
## [1] 0.02119199
ds_weight <- ds %>%
determine_population_bounds(.outcome_name = "weight_kg") %>%
dplyr::mutate(
weight_kg = dplyr::if_else(range_extended_below, convert_pounds_to_kg(weight_kg), weight_kg)
) %>%
dplyr::select(record_id, weight_kg, gender, age_truncated) %>%
determine_population_bounds(.outcome_name = "weight_kg") %>%
dplyr::mutate(
weight_kg = dplyr::if_else(range_extended_below | range_extended_above, NA_real_, weight_kg)
) %>%
dplyr::select(record_id, weight_kg)
mean(is.na(ds_weight$weight_kg))
## [1] 0.1005849
ggplot(ds_weight, aes(x=weight_kg)) +
geom_histogram(bins=30, fill="gray70", na.rm=T) +
theme_light() +
labs(title="Weight after Corrections")
ds <- ds %>%
dplyr::select(-height_cm, -weight_kg) %>%
dplyr::left_join(ds_height, by="record_id") %>%
dplyr::left_join(ds_weight, by="record_id") %>%
dplyr::mutate(
height_in = convert_cm_to_inches(height_cm),
weight_lb = convert_kg_to_pounds(weight_kg),
bmi_corrected = calculate_bmi(weight_kg, height_cm)
)
ggplot(ds, aes(x=bmi_centricity, y=bmi_corrected)) +
geom_rug(color="tan") +
geom_abline(slope=1, color="blue") +
# geom_abline(slope=2.54, color="tomato") +
geom_abline(slope=2.204622622, color="tomato") +
geom_abline(slope=1/(2.54*2.54), color="tomato") +
# geom_abline(slope=1/(2.204622622*2.204622622), color="tomato") +
geom_point(shape=1, na.rm=T) +
coord_fixed(xlim=c(0, 100), ylim=c(0, 100)) +
theme_light()
ds <- ds %>%
dplyr::rename_(
"bmi" = "bmi_corrected"
)
readr::write_rds(ds, path_out_rds, compress="gz")
readr::write_csv(ds, path_out_csv)
The R session information (including the OS info, R version and all packages used):
sessionInfo()
## R version 3.3.1 Patched (2016-09-18 r71304)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 7 x64 (build 7601) Service Pack 1
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] mgcv_1.8-15 nlme_3.1-128 magrittr_1.5 lme4_1.1-12
## [5] Matrix_1.2-7.1 ggplot2_2.1.0 dplyr_0.5.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.7 knitr_1.14 REDCapR_0.9.7
## [4] splines_3.3.1 MASS_7.3-45 testit_0.5
## [7] munsell_0.4.3 colorspace_1.2-6 lattice_0.20-34
## [10] R6_2.1.3 minqa_1.2.4 stringr_1.1.0
## [13] plyr_1.8.4 tools_3.3.1 RODBCext_0.2.7
## [16] grid_3.3.1 gtable_0.2.0 DBI_0.5-1
## [19] htmltools_0.3.5 digest_0.6.10 lazyeval_0.2.0
## [22] assertthat_0.1 tibble_1.2 purrr_0.2.2
## [25] formatR_1.4 tidyr_0.6.0 readr_1.0.0
## [28] RColorBrewer_1.1-2 nloptr_1.0.4 RODBC_1.3-14
## [31] evaluate_0.9 rmarkdown_1.0.9014 labeling_0.3
## [34] stringi_1.1.2 scales_0.4.0 markdown_0.7.7
Sys.time()
## [1] "2016-10-06 00:37:25 CDT"