# Point avoncap at the data input directory used on my development machine.
options(avoncap.input = "~/Data/avoncap/")
Data loading
The raw data is expected to be in the urine-antigens subdirectory of the input directory:
# Print the layout of the urine-antigens input directory.
fs::dir_tree(avoncap::input("urine-antigens"))
#> /home/vp22681/Data/avoncap/urine-antigens
#> ├── 2022-09-15
#> │ └── uad-cases.csv
#> ├── 2023-05-15
#> │ ├── gp-cases.csv
#> │ └── uad-cases.csv
#> ├── B1851202_SAP V1.0-27Sep2021_Final.pdf
#> ├── README.md
#> └── Table Template Serotype Distribution Scientific Affairs Proposal_FINAL.docx
The data is distributed as a single file with both cases and controls together without distinguishing features. There are BinaxNOW and serotype specific UAD tests and their results. Not all tests are run for every person.
#> # A tibble: 2 × 7
#> filename directory path date hospital study_year filetype
#> <chr> <chr> <fs::path> <date> <chr> <int> <chr>
#> 1 gp-cases urine-antigens …gp-cases.csv 2023-05-15 NA NA csv
#> 2 uad-cases urine-antigens …ad-cases.csv 2023-05-15 NA NA csv
# Load the raw urine antigen files into a single data frame.
rawUA <- load_data("urine-antigens")
#> caching item: ~/.cache/avoncap/data-6c0a3f301ee14020e3907a7472c55225-a34df7f73813316db44634d7c8101b95.rda
#> Loaded 101647 rows from 2 files, (7925+93722=101647)
Data normalisation
The data can be subdivided into serotype groups, and normalisation handles processing the serotype to be in the format ^[0-9]+[A-Z]$, aligning the format of the patient identifier to match [0-9]{4}-[0-9]+, and processing test dates (which are supplied in the %e-%b-%y format).
# Normalise the serotype-specific UAD results, bypassing the cache.
normUA <- rawUA %>%
  normalise_data(instrument = "serotype", .nocache = TRUE)
#> Normalising data using: normalise.urine_antigens.serotype
#> caching item: ~/.cache/avoncap/norm-0ef4812a2d1bc751062531c384a2b89e-54799b57a745498112d81b06bd78e6c8.rda
#> mapping .RESULT to pneumo.urine_antigen_result
#> mapping .EVENT_DATE to pneumo.test_date
#> mapping .ANALYSIS to pneumo.urine_antigen_test
#> mapping .SUBJECT to admin.consented_record_number
#> mapping .BARCODE to pneumo.urine_antigen_sample_id
#> Mapped 5 columns
#> Did not map 23 columns
# Normalise the BinaxNOW results from the same raw data, bypassing the cache.
normBinax <- rawUA %>%
  normalise_data(instrument = "binax", .nocache = TRUE)
#> Normalising data using: normalise.urine_antigens.binax
#> caching item: ~/.cache/avoncap/norm-0ef4812a2d1bc751062531c384a2b89e-90f11bb9b43480ac102fc888a3d2d068.rda
#> mapping .RESULT to pneumo.binax_result
#> mapping .EVENT_DATE to pneumo.test_date
#> mapping .SUBJECT to admin.consented_record_number
#> mapping .BARCODE to pneumo.urine_antigen_sample_id
#> Mapped 4 columns
#> Did not map 24 columns
The normalised UAD data has the individual serotype-specific tests and results in a nested column. The single BinaxNOW result is not nested. Results are keyed off a surveillance number (admission episode identifier) and test date.
UAD format:
- admin.consented_record_number: character
- pneumo.test_date: Date
- key.consent: character
- key.sample: character
- pneumo.urine_antigen: list
BinaxNOW format:
- pneumo.binax_result: factor
- pneumo.test_date: Date
- admin.consented_record_number: character
- pneumo.urine_antigen_sample_id: character
- key.consent: character
- key.sample: character
Data augmentation
For the UAD results we commonly want to know the following:
- was the serotype testing complete for all serotypes?
- was the whole UAD panel positive for any serotype, if so how many?
- was the UAD1 panel positive? UAD2?
- was the UAD positive for a PCV vaccine covered serotype?
For the BinaxNOW results there are no real additional questions that are relevant.
# Add the derived UAD panel and PCV group summary columns.
augUA <- normUA %>%
  augment_data()
#> Augmenting data using: augment.urine_antigens.serotype
#> caching item: ~/.cache/avoncap/augment-4c0c0c577381837fb7b8ee36b440288a-9ed7ba378591057ee91c20db48f25d48.rda
#> Created pneumo.pcv_group using: key.sample, pneumo.urine_antigen
#> Created pneumo.non_uad_panel_result, pneumo.uad1_panel_result, pneumo.uad2_panel_result, pneumo.serotype_summary_result, pneumo.serotype_positive_count, pneumo.serotype_results_complete using: key.sample, pneumo.urine_antigen
# augUA %>% filter(pneumo.serotype_results_complete) %>%
# group_by(pneumo.serotype_summary_result) %>%
# count()
# augUA %>% unnest(pneumo.pcv_group) %>% with(table(group,result))
# ggplot(
# augUA %>% unnest(pneumo.pcv_group) %>%
# inner_join(avoncap::serotype_data$names, by=c("group"="label")) %>%
# filter(indent==2, result == "Positive"), aes(x=group))+
# geom_histogram(stat="count")
Common operations
# normUA %>% unnest(pneumo.urine_antigen) %>% group_by(result) %>% count()
# ggplot(normUA %>% unnest(pneumo.urine_antigen) %>% filter(result == "Positive"), aes(x=test))+geom_histogram(stat="count")
# normBinax %>% group_by(pneumo.binax_result) %>% count()
Find all the results for a custom set of PCV groupings:
# Derive PCV group results into a new `pcv_custom` column using the packaged
# UAD PCV map, then print a column overview of the result.
tmp <- augUA %>%
  derive_pcv_groupings(
    pcv_map = avoncap::uad_pcv_map,
    col_name = pcv_custom
  ) %>%
  glimpse()
#> Rows: 4,077
#> Columns: 13
#> $ admin.consented_record_number <chr> "1001-00045", "1001-00057", "1001-000…
#> $ pneumo.test_date <date> 2020-11-06, 2021-02-04, 2021-02-04, …
#> $ key.consent <chr> "1001-00045", "1001-00057", "1001-000…
#> $ key.sample <chr> "1001-00045-2020-11-06", "1001-00057-…
#> $ pneumo.urine_antigen <list> [<tbl_df[24 x 3]>], [<tbl_df[24 x 3]…
#> $ pneumo.pcv_group <list> [<tbl_df[38 x 2]>], [<tbl_df[38 x 2]…
#> $ pneumo.non_uad_panel_result <fct> Unknown, Unknown, Unknown, Unknown, U…
#> $ pneumo.uad1_panel_result <fct> Negative, Negative, Negative, Negativ…
#> $ pneumo.uad2_panel_result <fct> Negative, Negative, Negative, Negativ…
#> $ pneumo.serotype_summary_result <fct> Negative, Negative, Negative, Negativ…
#> $ pneumo.serotype_positive_count <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
#> $ pneumo.serotype_results_complete <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, T…
#> $ pcv_custom <list> [<tbl_df[38 x 2]>], [<tbl_df[38 x 2]…
# Cross-tabulate PCV group against result across all samples.
tmp %>%
  unnest(pcv_custom) %>%
  with(table(group, result))
#> result
#> group Negative Positive Other Unknown
#> PCV7 (plus 9A,18A/B/F,23B) 4002 43 26 6
#> PCV13 (plus 9A,18A/B/F,23B,6C) 3907 131 33 6
#> PCV15 (plus 9A,18A/B/F,23B,6C) 3864 153 44 16
#> PCV20 (plus 9A,18A/B/F,23B,6C,15C) 3740 268 53 16
#> PPV23 (plus 9A,18A/B/F,23B,15C) 3727 280 54 16
#> PCV10 (Serum Study Institute) 3953 85 33 6
#> PCV10 (GSK) 3953 85 33 6
#> PCV15 (Zhifei) 3896 132 33 16
#> PCV24 (Vaxcyte) 3711 296 54 16
#> PCV24 (Affinivax) 3711 296 54 16
#> Additional PCV13 on 7 (plus 6C) 3949 89 33 6
#> Additional PCV15 on 13 4008 24 33 12
#> Additional PCV20 on 15 (plus 15C) 3895 124 46 12
#> Additional PPV23 on PCV20 4028 29 8 12
#> 4 4040 11 20 6
#> 6B 4051 0 20 6
#> 9V+A 4050 1 20 6
#> 14 4037 11 23 6
#> 18C+A/B/F 4048 1 23 5
#> 19F 4048 3 20 6
#> 23F+B 4031 17 23 6
#> 1 4037 9 25 6
#> 3 4025 26 20 6
#> 5 4046 3 22 6
#> 6A+C 4032 17 22 6
#> 7F 4011 33 28 5
#> 19A 4044 6 21 6
#> 22F 4029 13 23 12
#> 33F 4030 11 24 12
#> 8 3931 97 37 12
#> 10A 4021 16 28 12
#> 11A 4053 9 3 12
#> 12F 4061 0 4 12
#> 15B+C 4058 3 4 12
#> 2 4060 1 4 12
#> 9N 4050 11 4 12
#> 17F 4048 12 5 12
#> 20 4056 5 4 12
#> Other 0 0 0 0