Calculate Compound-Level Polyphenol Intakes
This script calculates compound-level polyphenol intake (mg, mg/1000kcal) for provided dietary data.
INPUTS
- Diet_FooDB_polyphenol_content.csv.bz2: Disaggregated dietary data, mapped to FooDB polyphenol content, at the compound-level
- Diet_total_nutrients.csv - total daily nutrient data to go with dietary data.
- Diet_polyphenol_classtax_3072.csv - class taxonomy is derived from FooDB which uses ClassyFire, an automated chemical taxonomic classification application based on chemical structure
OUTPUTS
- summary_compound_intake_by_entry.csv, polyphenol compound intakes by recall for each participant
- summary_compound_intake_by_subject.csv, polyphenol compound intakes for each participant, provided in long format (compounds as rows)
- summary_compound_intake_by_subject_wide.csv, polyphenol compound intakes for each participant, provided in wide format (compounds as columns)
SCRIPTS
# Load packages
suppressMessages(library(dplyr))
suppressMessages(library(vroom))
suppressMessages(library(tidyr))
suppressMessages(library(stringr))
# Load provided file paths
source("provided_files.R")
# ---- Load inputs ----
# Compound-level polyphenol content of the disaggregated dietary data.
input_polyphenol_content <- vroom::vroom(
  "outputs/Diet_FooDB_polyphenol_content.csv.bz2",
  show_col_types = FALSE
)

# Total daily nutrient data; only the entry identifiers and energy are needed.
input_kcal <- vroom::vroom(
  "outputs/Diet_total_nutrients.csv",
  show_col_types = FALSE
)
# Harmonise the energy column name: ASA24 data already uses Total_KCAL,
# NHANES data uses Total_DRXIKCAL.
input_kcal <- dplyr::rename_with(
  input_kcal,
  function(old_name) "Total_KCAL",
  .cols = dplyr::any_of(c("Total_KCAL", "Total_DRXIKCAL"))
)
# Keep whichever entry identifiers are present (recall: RecallNo;
# record: RecordNo / RecordDayNo) together with subject and energy.
input_kcal <- dplyr::select(
  input_kcal,
  subject,
  dplyr::any_of(c("RecallNo", "RecordNo", "RecordDayNo")),
  Total_KCAL
)

# Class taxonomy for FooDB compounds.
# NOTE: `class_tax` holds the taxonomy file path on entry (presumably set by
# provided_files.R -- confirm) and is overwritten here with the loaded table.
class_tax <- vroom::vroom(class_tax, show_col_types = FALSE) %>%
  dplyr::select(compound_public_id, class)

# Attach energy intake, then compound class, to the content table.
# No `by` is supplied, so each join matches on all column names the two
# tables share.
input_polyphenol_kcal <- input_polyphenol_content %>%
  dplyr::left_join(input_kcal) %>%
  dplyr::left_join(class_tax)
## Joining with `by = join_by(subject, RecallNo)`
## Joining with `by = join_by(compound_public_id)`
Specify grouping variables
Column grouping depends on whether output is from a record or recall.
# Pick the grouping keys for per-entry sums. Recall data is identified by
# RecallNo; record data by RecordNo plus the day within the record.
# RecallNo takes precedence when both are present.
if (!any(c("RecallNo", "RecordNo") %in% names(input_polyphenol_kcal))) {
  stop("Data must contain RecallNo or RecordNo.")
}
group_vars <- if ("RecallNo" %in% names(input_polyphenol_kcal)) {
  c("subject", "RecallNo", "compound_public_id")
} else {
  c("subject", "RecordNo", "RecordDayNo", "compound_public_id")
}
Daily Compound Polyphenol Intake BY ENTRY (Record/Recall)
# Total intake of each compound within each entry (recall, or record day),
# plus the same intake standardised to energy (mg per 1000 kcal).
compound_intakes_entry <- input_polyphenol_kcal %>%
  # Rows without a compound ID come from foods that did not map; drop them
  # up front since they never contribute to a compound total.
  dplyr::filter(!is.na(compound_public_id)) %>%
  # Sum pp_consumed over every row sharing the same subject / entry /
  # compound; the total is repeated on each row of the group.
  dplyr::mutate(
    compound_intake_mg = sum(pp_consumed, na.rm = TRUE),
    .by = dplyr::all_of(group_vars)
  ) %>%
  dplyr::select(
    subject,
    dplyr::any_of(c("RecallNo", "RecordNo", "RecordDayNo")),
    class,
    compound_public_id,
    compound_name,
    compound_intake_mg,
    Total_KCAL
  ) %>%
  # Collapse to one row per subject / entry / compound (first row is kept).
  dplyr::distinct(dplyr::across(dplyr::all_of(group_vars)), .keep_all = TRUE) %>%
  # Standardise to caloric intake.
  dplyr::mutate(
    compound_intake_mg1000kcal = compound_intake_mg / (Total_KCAL / 1000)
  )

# Write output
vroom::vroom_write(
  compound_intakes_entry,
  "outputs/summary_compound_intake_by_entry.csv",
  delim = ","
)
Daily Compound Intakes by Subject
# ---- Subject-level average compound intakes ----

# Mean energy intake per subject across all of their entries.
kcal_subject <- input_kcal %>%
  dplyr::group_by(subject) %>%
  dplyr::summarise(avg_Total_KCAL = mean(Total_KCAL, na.rm = TRUE))

# Number of distinct entries (recalls, or record days) per subject in the
# dietary data. This is the denominator for the average intake: an entry in
# which a compound was not consumed has no row for that compound, but it must
# still count as a 0 mg day. Averaging only over the rows that exist would
# overstate intake of compounds that are not eaten in every entry, and would
# be inconsistent with avg_Total_KCAL (averaged over all entries) and with the
# zero-fill used in the wide output.
entry_id_vars <- setdiff(group_vars, "compound_public_id")
entries_subject <- input_polyphenol_kcal %>%
  dplyr::distinct(dplyr::across(dplyr::all_of(entry_id_vars))) %>%
  dplyr::count(subject, name = "n_entries")

# Average each compound's intake over all of the subject's entries.
compound_intakes_subject <- compound_intakes_entry %>%
  # Entry-level energy columns are replaced by the subject averages below.
  dplyr::select(-c(Total_KCAL, compound_intake_mg1000kcal)) %>%
  dplyr::left_join(entries_subject, by = "subject") %>%
  dplyr::group_by(subject, compound_public_id) %>%
  # Total intake across entries divided by the number of entries, so entries
  # without the compound contribute zero.
  dplyr::mutate(
    Avg_compound_intake_mg = sum(compound_intake_mg) / n_entries
  ) %>%
  dplyr::ungroup() %>%
  # One row per subject / compound.
  # NOTE: the entry identifier column(s) carried along here hold the value
  # from the first entry only; they are kept so the output columns are
  # unchanged, but they are not meaningful at subject level.
  dplyr::distinct(subject, compound_public_id, .keep_all = TRUE) %>%
  dplyr::select(-c(compound_intake_mg, n_entries)) %>%
  # Add average energy intake
  dplyr::left_join(kcal_subject, by = "subject") %>%
  # Standardise to caloric intake (mg per 1000 kcal)
  dplyr::mutate(
    compound_intake_mg1000kcal = Avg_compound_intake_mg / (avg_Total_KCAL / 1000)
  )

# Write output
vroom::vroom_write(
  compound_intakes_subject,
  "outputs/summary_compound_intake_by_subject.csv",
  delim = ","
)
Available for users who prefer a wide format
# Wide layout: one row per subject and one column per compound (named by
# compound_public_id for simplicity). Cells hold the energy-standardised
# intake (mg per 1000 kcal); subject / compound pairs with no row are
# filled with 0.
compound_intakes_subject_wide <- tidyr::pivot_wider(
  compound_intakes_subject,
  id_cols = subject,
  names_from = compound_public_id,
  values_from = compound_intake_mg1000kcal,
  values_fill = 0
)

# Write output
vroom::vroom_write(
  compound_intakes_subject_wide,
  "outputs/summary_compound_intake_by_subject_wide.csv",
  delim = ","
)