Skip to contents

# The data input directory on my development machine:
options("avoncap.input" = "~/Data/avoncap/")

Data loading

The raw data is expected to be in the urine-antigens subfolder of the input directory.

fs::dir_tree(path = avoncap::input("urine-antigens"))
#> /home/vp22681/Data/avoncap/urine-antigens
#> ├── 2022-09-15
#> │   └── uad-cases.csv
#> ├── 2023-05-15
#> │   ├── gp-cases.csv
#> │   └── uad-cases.csv
#> ├── B1851202_SAP V1.0-27Sep2021_Final.pdf
#> ├── README.md
#> └── Table Template Serotype Distribution Scientific Affairs Proposal_FINAL.docx

The data is distributed as a single file with both cases and controls together without distinguishing features. There are BinaxNOW and serotype specific UAD tests and their results. Not all tests are run for every person.

most_recent_files("urine-antigens")
#> # A tibble: 2 × 7
#>   filename  directory      path          date       hospital study_year filetype
#>   <chr>     <chr>          <fs::path>    <date>     <chr>         <int> <chr>   
#> 1 gp-cases  urine-antigens …gp-cases.csv 2023-05-15 NA               NA csv     
#> 2 uad-cases urine-antigens …ad-cases.csv 2023-05-15 NA               NA csv
rawUA = load_data("urine-antigens")
#> caching item: ~/.cache/avoncap/data-6c0a3f301ee14020e3907a7472c55225-a34df7f73813316db44634d7c8101b95.rda
#> Loaded 101647 rows from 2 files, (7925+93722=101647)

Data normalisation

The data can be subdivided into serotype versus binax groups. Normalisation handles processing the serotype to be in the format ^[0-9]+[A-Z]$, aligning the format of the patient identifier to match [0-9]{4}-[0-9]+, and processing test dates (which are supplied in the %e-%b-%y format).


normUA = rawUA %>% normalise_data(instrument = "serotype", .nocache=TRUE)
#> Normalising data using: normalise.urine_antigens.serotype
#> caching item: ~/.cache/avoncap/norm-0ef4812a2d1bc751062531c384a2b89e-54799b57a745498112d81b06bd78e6c8.rda
#> mapping .RESULT to pneumo.urine_antigen_result
#> mapping .EVENT_DATE to pneumo.test_date
#> mapping .ANALYSIS to pneumo.urine_antigen_test
#> mapping .SUBJECT to admin.consented_record_number
#> mapping .BARCODE to pneumo.urine_antigen_sample_id
#> Mapped 5 columns
#> Did not map 23 columns
normBinax = rawUA %>% normalise_data(instrument = "binax", .nocache=TRUE)
#> Normalising data using: normalise.urine_antigens.binax
#> caching item: ~/.cache/avoncap/norm-0ef4812a2d1bc751062531c384a2b89e-90f11bb9b43480ac102fc888a3d2d068.rda
#> mapping .RESULT to pneumo.binax_result
#> mapping .EVENT_DATE to pneumo.test_date
#> mapping .SUBJECT to admin.consented_record_number
#> mapping .BARCODE to pneumo.urine_antigen_sample_id
#> Mapped 4 columns
#> Did not map 24 columns

The normalised UAD data has the individual serotype-specific tests and results in a nested column. The single BinaxNOW result is not nested. Results are keyed off a surveillance number (admission episode identifier) and test date.

UAD format:

  • admin.consented_record_number: character
  • pneumo.test_date: Date
  • key.consent: character
  • key.sample: character
  • pneumo.urine_antigen: list

BinaxNOW format:

  • pneumo.binax_result: factor
  • pneumo.test_date: Date
  • admin.consented_record_number: character
  • pneumo.urine_antigen_sample_id: character
  • key.consent: character
  • key.sample: character

Data augmentation

For the UAD results we commonly want to know the following:

  • was the serotype testing complete for all serotypes?
  • was the whole UAD panel positive for any serotype, if so how many?
  • was the UAD1 panel positive? UAD2?
  • was the UAD positive for a PCV vaccine covered serotype?

For the BinaxNOW results there are no real additional questions that are relevant.


augUA = normUA %>% augment_data()
#> Augmenting data using: augment.urine_antigens.serotype
#> caching item: ~/.cache/avoncap/augment-4c0c0c577381837fb7b8ee36b440288a-9ed7ba378591057ee91c20db48f25d48.rda
#> Created pneumo.pcv_group using: key.sample, pneumo.urine_antigen
#> Created pneumo.non_uad_panel_result, pneumo.uad1_panel_result, pneumo.uad2_panel_result, pneumo.serotype_summary_result, pneumo.serotype_positive_count, pneumo.serotype_results_complete using: key.sample, pneumo.urine_antigen


# augUA %>% filter(pneumo.serotype_results_complete) %>% 
#   group_by(pneumo.serotype_summary_result) %>% 
#   count()
# 
# augUA %>% unnest(pneumo.pcv_group) %>% with(table(group,result))
# 
# ggplot(
#   augUA %>% unnest(pneumo.pcv_group) %>% 
#     inner_join(avoncap::serotype_data$names, by=c("group"="label")) %>%
#     filter(indent==2, result == "Positive"), aes(x=group))+
#   geom_histogram(stat="count")

Common operations


# normUA %>% unnest(pneumo.urine_antigen) %>% group_by(result) %>% count()
# ggplot(normUA %>% unnest(pneumo.urine_antigen) %>% filter(result == "Positive"), aes(x=test))+geom_histogram(stat="count")
# normBinax %>% group_by(pneumo.binax_result) %>% count()

Find all the results by PCV grouping, stored in a custom column:


tmp = augUA %>% derive_pcv_groupings(pcv_map = avoncap::uad_pcv_map, col_name = pcv_custom) %>% glimpse()
#> Rows: 4,077
#> Columns: 13
#> $ admin.consented_record_number    <chr> "1001-00045", "1001-00057", "1001-000…
#> $ pneumo.test_date                 <date> 2020-11-06, 2021-02-04, 2021-02-04, …
#> $ key.consent                      <chr> "1001-00045", "1001-00057", "1001-000…
#> $ key.sample                       <chr> "1001-00045-2020-11-06", "1001-00057-…
#> $ pneumo.urine_antigen             <list> [<tbl_df[24 x 3]>], [<tbl_df[24 x 3]…
#> $ pneumo.pcv_group                 <list> [<tbl_df[38 x 2]>], [<tbl_df[38 x 2]…
#> $ pneumo.non_uad_panel_result      <fct> Unknown, Unknown, Unknown, Unknown, U…
#> $ pneumo.uad1_panel_result         <fct> Negative, Negative, Negative, Negativ…
#> $ pneumo.uad2_panel_result         <fct> Negative, Negative, Negative, Negativ…
#> $ pneumo.serotype_summary_result   <fct> Negative, Negative, Negative, Negativ…
#> $ pneumo.serotype_positive_count   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
#> $ pneumo.serotype_results_complete <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, T…
#> $ pcv_custom                       <list> [<tbl_df[38 x 2]>], [<tbl_df[38 x 2]…
tmp %>% unnest(pcv_custom) %>% with(table(group,result))
#>                                     result
#> group                                Negative Positive Other Unknown
#>   PCV7 (plus 9A,18A/B/F,23B)             4002       43    26       6
#>   PCV13 (plus 9A,18A/B/F,23B,6C)         3907      131    33       6
#>   PCV15 (plus 9A,18A/B/F,23B,6C)         3864      153    44      16
#>   PCV20 (plus 9A,18A/B/F,23B,6C,15C)     3740      268    53      16
#>   PPV23 (plus 9A,18A/B/F,23B,15C)        3727      280    54      16
#>   PCV10 (Serum Study Institute)          3953       85    33       6
#>   PCV10 (GSK)                            3953       85    33       6
#>   PCV15 (Zhifei)                         3896      132    33      16
#>   PCV24 (Vaxcyte)                        3711      296    54      16
#>   PCV24 (Affinivax)                      3711      296    54      16
#>   Additional PCV13 on 7 (plus 6C)        3949       89    33       6
#>   Additional PCV15 on 13                 4008       24    33      12
#>   Additional PCV20 on 15 (plus 15C)      3895      124    46      12
#>   Additional PPV23 on PCV20              4028       29     8      12
#>   4                                      4040       11    20       6
#>   6B                                     4051        0    20       6
#>   9V+A                                   4050        1    20       6
#>   14                                     4037       11    23       6
#>   18C+A/B/F                              4048        1    23       5
#>   19F                                    4048        3    20       6
#>   23F+B                                  4031       17    23       6
#>   1                                      4037        9    25       6
#>   3                                      4025       26    20       6
#>   5                                      4046        3    22       6
#>   6A+C                                   4032       17    22       6
#>   7F                                     4011       33    28       5
#>   19A                                    4044        6    21       6
#>   22F                                    4029       13    23      12
#>   33F                                    4030       11    24      12
#>   8                                      3931       97    37      12
#>   10A                                    4021       16    28      12
#>   11A                                    4053        9     3      12
#>   12F                                    4061        0     4      12
#>   15B+C                                  4058        3     4      12
#>   2                                      4060        1     4      12
#>   9N                                     4050       11     4      12
#>   17F                                    4048       12     5      12
#>   20                                     4056        5     4      12
#>   Other                                     0        0     0       0