# Point avoncap at the data input directory used on my development machine.
options(avoncap.input = "~/Data/avoncap/")
Data loading
The raw data is expected to be in the `urine-antigens` subfolder of the input directory:
# Print the directory tree of the "urine-antigens" input folder.
fs::dir_tree(avoncap::input("urine-antigens"))
#> /home/vp22681/Data/avoncap/urine-antigens
#> ├── 2022-09-15
#> │ └── uad-cases.csv
#> ├── 2023-05-15
#> │ ├── gp-cases.csv
#> │ └── uad-cases.csv
#> ├── B1851202_SAP V1.0-27Sep2021_Final.pdf
#> ├── README.md
#> └── Table Template Serotype Distribution Scientific Affairs Proposal_FINAL.docx
The data is distributed as a single file with both cases and controls together without distinguishing features. There are BinaxNOW and serotype specific UAD tests and their results. Not all tests are run for every person.
# List the files found for the "urine-antigens" source; the output below shows
# only the files from the latest dated folder (2023-05-15).
most_recent_files("urine-antigens")
#> # A tibble: 2 × 7
#> filename directory path date hospital study_year filetype
#> <chr> <chr> <fs::path> <date> <chr> <int> <chr>
#> 1 gp-cases urine-antigens …gp-cases.csv 2023-05-15 NA NA csv
#> 2 uad-cases urine-antigens …ad-cases.csv 2023-05-15 NA NA csv
# Load the raw urine antigen data (all matching files combined, see output).
rawUA <- load_data("urine-antigens")
#> caching item: ~/.cache/avoncap/data-6c0a3f301ee14020e3907a7472c55225-a34df7f73813316db44634d7c8101b95.rda
#> Loaded 101647 rows from 2 files, (7925+93722=101647)
Data normalisation
The data can be subdivided into `serotype` versus `binax` groups. Normalisation handles converting the serotype into the format `^[0-9]+[A-Z]$`, aligning the format of the patient identifier to match `[0-9]{4}-[0-9]+`, and parsing test dates (which are supplied in the `%e-%b-%y` format).
# Normalise the serotype-specific (UAD) results, bypassing the cache.
normUA <- rawUA %>%
  normalise_data(instrument = "serotype", .nocache = TRUE)
#> Normalising data using: normalise.urine_antigens.serotype
#> caching item: ~/.cache/avoncap/norm-0ef4812a2d1bc751062531c384a2b89e-54799b57a745498112d81b06bd78e6c8.rda
#> mapping .RESULT to pneumo.urine_antigen_result
#> mapping .EVENT_DATE to pneumo.test_date
#> mapping .ANALYSIS to pneumo.urine_antigen_test
#> mapping .SUBJECT to admin.consented_record_number
#> mapping .BARCODE to pneumo.urine_antigen_sample_id
#> Mapped 5 columns
#> Did not map 23 columns
# Normalise the BinaxNOW results, bypassing the cache.
normBinax <- rawUA %>%
  normalise_data(instrument = "binax", .nocache = TRUE)
#> Normalising data using: normalise.urine_antigens.binax
#> caching item: ~/.cache/avoncap/norm-0ef4812a2d1bc751062531c384a2b89e-90f11bb9b43480ac102fc888a3d2d068.rda
#> mapping .RESULT to pneumo.binax_result
#> mapping .EVENT_DATE to pneumo.test_date
#> mapping .SUBJECT to admin.consented_record_number
#> mapping .BARCODE to pneumo.urine_antigen_sample_id
#> Mapped 4 columns
#> Did not map 24 columns
The normalised UAD data has the individual serotype-specific tests and results in a nested column. The single BinaxNOW result is not nested. Results are keyed off a surveillance number (admission episode identifier) and test date.
UAD format:
- admin.consented_record_number: character
- pneumo.test_date: Date
- key.consent: character
- key.sample: character
- pneumo.urine_antigen: list
BinaxNOW format:
- pneumo.binax_result: factor
- pneumo.test_date: Date
- admin.consented_record_number: character
- pneumo.urine_antigen_sample_id: character
- key.consent: character
- key.sample: character
Data augmentation
For the UAD results we commonly want to know the following:
- was the serotype testing complete for all serotypes?
- was the whole UAD panel positive for any serotype, if so how many?
- was the UAD1 panel positive? UAD2?
- was the UAD positive for a PCV vaccine covered serotype?
For the BinaxNOW results there are no real additional questions that are relevant.
# Augment the normalised UAD data with the derived summary columns
# listed in the output below.
augUA <- normUA %>%
  augment_data()
#> Augmenting data using: augment.urine_antigens.serotype
#> caching item: ~/.cache/avoncap/augment-4c0c0c577381837fb7b8ee36b440288a-9ed7ba378591057ee91c20db48f25d48.rda
#> Created pneumo.pcv_group using: key.sample, pneumo.urine_antigen
#> Created pneumo.non_uad_panel_result, pneumo.uad1_panel_result, pneumo.uad2_panel_result, pneumo.serotype_summary_result, pneumo.serotype_positive_count, pneumo.serotype_results_complete using: key.sample, pneumo.urine_antigen
# augUA %>% filter(pneumo.serotype_results_complete) %>%
# group_by(pneumo.serotype_summary_result) %>%
# count()
#
# augUA %>% unnest(pneumo.pcv_group) %>% with(table(group,result))
#
# ggplot(
# augUA %>% unnest(pneumo.pcv_group) %>%
# inner_join(avoncap::serotype_data$names, by=c("group"="label")) %>%
# filter(indent==2, result == "Positive"), aes(x=group))+
# geom_histogram(stat="count")
Common operations
# normUA %>% unnest(pneumo.urine_antigen) %>% group_by(result) %>% count()
# ggplot(normUA %>% unnest(pneumo.urine_antigen) %>% filter(result == "Positive"), aes(x=test))+geom_histogram(stat="count")
# normBinax %>% group_by(pneumo.binax_result) %>% count()
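Find all the PCV groupings, here using the `avoncap::uad_pcv_map` mapping and writing them to a custom output column: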
# Derive PCV groupings into the `pcv_custom` column and preview the result.
tmp <- augUA %>%
  derive_pcv_groupings(
    pcv_map = avoncap::uad_pcv_map,
    col_name = pcv_custom
  ) %>%
  glimpse()
#> Rows: 4,077
#> Columns: 13
#> $ admin.consented_record_number <chr> "1001-00045", "1001-00057", "1001-000…
#> $ pneumo.test_date <date> 2020-11-06, 2021-02-04, 2021-02-04, …
#> $ key.consent <chr> "1001-00045", "1001-00057", "1001-000…
#> $ key.sample <chr> "1001-00045-2020-11-06", "1001-00057-…
#> $ pneumo.urine_antigen <list> [<tbl_df[24 x 3]>], [<tbl_df[24 x 3]…
#> $ pneumo.pcv_group <list> [<tbl_df[38 x 2]>], [<tbl_df[38 x 2]…
#> $ pneumo.non_uad_panel_result <fct> Unknown, Unknown, Unknown, Unknown, U…
#> $ pneumo.uad1_panel_result <fct> Negative, Negative, Negative, Negativ…
#> $ pneumo.uad2_panel_result <fct> Negative, Negative, Negative, Negativ…
#> $ pneumo.serotype_summary_result <fct> Negative, Negative, Negative, Negativ…
#> $ pneumo.serotype_positive_count <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
#> $ pneumo.serotype_results_complete <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, T…
#> $ pcv_custom <list> [<tbl_df[38 x 2]>], [<tbl_df[38 x 2]…
# Cross-tabulate result by PCV group after unnesting the custom groupings.
tmp %>%
  unnest(pcv_custom) %>%
  with(table(group, result))
#> result
#> group Negative Positive Other Unknown
#> PCV7 (plus 9A,18A/B/F,23B) 4002 43 26 6
#> PCV13 (plus 9A,18A/B/F,23B,6C) 3907 131 33 6
#> PCV15 (plus 9A,18A/B/F,23B,6C) 3864 153 44 16
#> PCV20 (plus 9A,18A/B/F,23B,6C,15C) 3740 268 53 16
#> PPV23 (plus 9A,18A/B/F,23B,15C) 3727 280 54 16
#> PCV10 (Serum Study Institute) 3953 85 33 6
#> PCV10 (GSK) 3953 85 33 6
#> PCV15 (Zhifei) 3896 132 33 16
#> PCV24 (Vaxcyte) 3711 296 54 16
#> PCV24 (Affinivax) 3711 296 54 16
#> Additional PCV13 on 7 (plus 6C) 3949 89 33 6
#> Additional PCV15 on 13 4008 24 33 12
#> Additional PCV20 on 15 (plus 15C) 3895 124 46 12
#> Additional PPV23 on PCV20 4028 29 8 12
#> 4 4040 11 20 6
#> 6B 4051 0 20 6
#> 9V+A 4050 1 20 6
#> 14 4037 11 23 6
#> 18C+A/B/F 4048 1 23 5
#> 19F 4048 3 20 6
#> 23F+B 4031 17 23 6
#> 1 4037 9 25 6
#> 3 4025 26 20 6
#> 5 4046 3 22 6
#> 6A+C 4032 17 22 6
#> 7F 4011 33 28 5
#> 19A 4044 6 21 6
#> 22F 4029 13 23 12
#> 33F 4030 11 24 12
#> 8 3931 97 37 12
#> 10A 4021 16 28 12
#> 11A 4053 9 3 12
#> 12F 4061 0 4 12
#> 15B+C 4058 3 4 12
#> 2 4060 1 4 12
#> 9N 4050 11 4 12
#> 17F 4048 12 5 12
#> 20 4056 5 4 12
#> Other 0 0 0 0