This report was automatically generated with the R package knitr (version 1.14).

# knitr::stitch_rmd(script="./dal/correct/correct-units.R", output="./stitched-output/dal/correct/correct-units.md")
rm(list=ls(all=TRUE))  #Clear the variables from previous runs. 
source("./dal/arch/population-proportion.R")
library(ggplot2, quietly=TRUE)
library(magrittr, quietly=TRUE)
requireNamespace("dplyr", quietly=TRUE)
requireNamespace("readr", quietly=TRUE)
requireNamespace("tidyr", quietly=TRUE)
# requireNamespace("digest", quietly=TRUE)
path_in        <- "./data-unshared/derived/ds-visit.rds"
path_out_rds   <- "./data-unshared/derived/ds-visit-corrected.rds"
path_out_csv   <- "./data-unshared/derived/ds-visit-corrected.csv"
figure_path    <- 'stitched-output/dal/correct/'

convert_inches_to_cm <- function( inches ) {
  inches * 2.54 #Exact
}
convert_cm_to_inches <- function( inches ) {
  inches / 2.54 #Exact
}
convert_pounds_to_kg <- function( pounds ) {
  pounds / 2.204622622
}
convert_kg_to_pounds <- function( pounds ) {
  pounds * 2.204622622
}
calculate_bmi <- function( kg, cm ) {
  kg / (cm *cm)  * 10000
}
col_types <- readr::cols(
  hdid            = readr::col_integer(),
  header          = readr::col_character(),
  name_long       = readr::col_character(),
  min             = readr::col_integer(),
  max             = readr::col_integer(),
  desired         = readr::col_logical()
)
ds    <- readr::read_rds(path_in)
ds_header <- as.data.frame(readr::read_csv("./data-phi-free/metadata/metadata-obs-header.csv", col_types=col_types))
rownames(ds_header) <- ds_header$header
# OuhscMunge::column_rename_headstart(ds)
mean(is.na(ds$height_cm))
## [1] 0.2007884
mean(is.na(ds$weight_kg))
## [1] 0.0619131
ds <- ds %>% 
  dplyr::select_(
    "record_id"                   = "`record_id`"
    , "patient_index"             = "`patient_index`"
    , "yob"                       = "`yob`"
    , "male"                      = "`male`"
    , "ethnicity_race"            = "`ethnicity_race`"
    , "location"                  = "`location`"
    , "obs_month"                 = "`obs_month`"
    , "age"                       = "`age`"
    , "bmi_centricity"            = "`bmi`"
    , "height_in"                 = "`height_in`"
    , "height_cm"                 = "`height_cm`"
    , "weight_lb"                 = "`weight_lb`"
    , "weight_kg"                 = "`weight_kg`"
  ) %>% 
  dplyr::mutate(
    location          = dplyr::if_else(location=="adolesent", "adolescent", location),
    # gender          = dplyr::if_else(as.logical(male), "Male", "Female", missing = NA_character_),
    gender            = ifelse(male==1, "male", "female"),
    age_truncated     = floor(age), 
    height_cm         = dplyr::coalesce(height_cm, convert_inches_to_cm(height_in)), 
    weight_kg         = dplyr::coalesce(weight_kg, convert_pounds_to_kg(weight_lb)),
    bmi_centricity    = dplyr::if_else(dplyr::between(bmi_centricity, 5, 100), bmi_centricity, NA_real_)
    # bmi               = dplyr::if_else(dplyr::between(bmi,      ds_header["bmi"    , "min"], ds_header["bmi"    , "max"]), bmi   , NA_real_)
  ) %>% 
  dplyr::select(
    -height_in, 
    -weight_lb
  )
mean(is.na(ds$height_cm))
## [1] 0.1850399
mean(is.na(ds$weight_kg))
## [1] 0.02119199
# ggplot(ds, aes(x=height_in, y=height_cm)) +
#   geom_abline(slope=2.54, color="gray70") +
#   geom_point(shape=1, na.rm=T) +
#   theme_light()
# ggplot(ds, aes(x=weight_lb, y=weight_kg)) +
#   geom_abline(slope=1/2.204622622, color="gray70") +
#   geom_point(shape=1, na.rm=T) +
#   theme_light()

ggplot(ds, aes(x=bmi_centricity)) +
  geom_histogram(bins=30, fill="gray70", na.rm=T) +
  theme_light() +
  labs(title="BMI before Corrections")

plot of chunk inspect

ggplot(ds, aes(x=height_cm)) +
  geom_histogram(bins=30, fill="gray70", na.rm=T) +
  theme_light() +
  labs(title="Height before Corrections")

plot of chunk height-against-growth-chart

mean(is.na(ds$height_cm))
## [1] 0.1850399
ds_height <- ds %>%
  determine_population_bounds(.outcome_name = "height_cm") %>% 
  dplyr::mutate(
    # height_cm     = dplyr::na_if(height_cm, range_extended_above),
    height_cm     = dplyr::if_else(range_extended_below, convert_inches_to_cm(height_cm), height_cm)
  )  %>% 
  dplyr::select(record_id, height_cm, gender, age_truncated) %>%
  determine_population_bounds(.outcome_name = "height_cm") %>% 
  dplyr::mutate(
    height_cm     = dplyr::if_else(range_extended_below | range_extended_above, NA_real_, height_cm)
  ) %>% 
  dplyr::select(record_id, height_cm)

mean(is.na(ds_height$height_cm))
## [1] 0.203987
ggplot(ds_height, aes(x=height_cm)) +
  geom_histogram(bins=30, fill="gray70", na.rm=T) +
  theme_light() +
  labs(title="Height after Corrections")

plot of chunk height-against-growth-chart

ggplot(ds, aes(x=weight_kg)) +
  geom_histogram(bins=30, fill="gray70", na.rm=T) +
  theme_light() +
  labs(title="Weight before Corrections")

plot of chunk weight-against-growth-chart

mean(is.na(ds$weight_kg))
## [1] 0.02119199
ds_weight <- ds %>%
  determine_population_bounds(.outcome_name = "weight_kg") %>% 
  dplyr::mutate(
    weight_kg     = dplyr::if_else(range_extended_below, convert_pounds_to_kg(weight_kg), weight_kg)
  )  %>% 
  dplyr::select(record_id, weight_kg, gender, age_truncated) %>%
  determine_population_bounds(.outcome_name = "weight_kg") %>% 
  dplyr::mutate(
    weight_kg     = dplyr::if_else(range_extended_below | range_extended_above, NA_real_, weight_kg)
  ) %>% 
  dplyr::select(record_id, weight_kg)

mean(is.na(ds_weight$weight_kg))
## [1] 0.1005849
ggplot(ds_weight, aes(x=weight_kg)) +
  geom_histogram(bins=30, fill="gray70", na.rm=T) +
  theme_light() +
  labs(title="Weight after Corrections")

plot of chunk weight-against-growth-chart

ds <- ds %>% 
  dplyr::select(-height_cm, -weight_kg) %>% 
  dplyr::left_join(ds_height, by="record_id") %>% 
  dplyr::left_join(ds_weight, by="record_id") %>% 
  dplyr::mutate(
    height_in      = convert_cm_to_inches(height_cm),
    weight_lb      = convert_kg_to_pounds(weight_kg),
    bmi_corrected  = calculate_bmi(weight_kg, height_cm)
  )

ggplot(ds, aes(x=bmi_centricity, y=bmi_corrected)) +
  geom_rug(color="tan") +
  geom_abline(slope=1, color="blue") +
  # geom_abline(slope=2.54, color="tomato") +
  geom_abline(slope=2.204622622, color="tomato") +
  geom_abline(slope=1/(2.54*2.54), color="tomato") +
  # geom_abline(slope=1/(2.204622622*2.204622622), color="tomato") +
  geom_point(shape=1, na.rm=T) +
  coord_fixed(xlim=c(0, 100), ylim=c(0, 100)) +
  theme_light()

plot of chunk reunite-measurements

ds <- ds %>% 
  dplyr::rename_(
    "bmi"  = "bmi_corrected"
  )
readr::write_rds(ds, path_out_rds, compress="gz")
readr::write_csv(ds, path_out_csv)

The R session information (including the OS info, R version and all packages used):

sessionInfo()
## R version 3.3.1 Patched (2016-09-18 r71304)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 7 x64 (build 7601) Service Pack 1
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] mgcv_1.8-15    nlme_3.1-128   magrittr_1.5   lme4_1.1-12   
## [5] Matrix_1.2-7.1 ggplot2_2.1.0  dplyr_0.5.0   
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.7        knitr_1.14         REDCapR_0.9.7     
##  [4] splines_3.3.1      MASS_7.3-45        testit_0.5        
##  [7] munsell_0.4.3      colorspace_1.2-6   lattice_0.20-34   
## [10] R6_2.1.3           minqa_1.2.4        stringr_1.1.0     
## [13] plyr_1.8.4         tools_3.3.1        RODBCext_0.2.7    
## [16] grid_3.3.1         gtable_0.2.0       DBI_0.5-1         
## [19] htmltools_0.3.5    digest_0.6.10      lazyeval_0.2.0    
## [22] assertthat_0.1     tibble_1.2         purrr_0.2.2       
## [25] formatR_1.4        tidyr_0.6.0        readr_1.0.0       
## [28] RColorBrewer_1.1-2 nloptr_1.0.4       RODBC_1.3-14      
## [31] evaluate_0.9       rmarkdown_1.0.9014 labeling_0.3      
## [34] stringi_1.1.2      scales_0.4.0       markdown_0.7.7
Sys.time()
## [1] "2016-10-06 00:37:25 CDT"