Merge branch 'master' of github.com:Wytamma/GISAIDR

Wytamma · Apr 25, 2023 · 71a887a · 71a887a
2 parents f496a54 + f21b25d
commit 71a887a
Show file tree

Hide file tree

Showing 6 changed files with 103 additions and 25 deletions.
diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml
@@ -32,7 +32,7 @@ jobs:
     steps:
       - uses: actions/checkout@v2
       - name: Set up R ${{ matrix.config.os }} (${{ matrix.config.r }})
-        uses: r-lib/actions/setup-r@v1
+        uses: r-lib/actions/setup-r@v2
         with:
           r-version: ${{ matrix.config.r }}
           http-user-agent: ${{ matrix.config.http-user-agent }}

diff --git a/R/internal_query.R b/R/internal_query.R
@@ -21,6 +21,8 @@
 #' @param collection_date_complete include only entries with a complete collection date in the results.
 #' @param total returns the total number of sequences matching the query.
 #' @param fast returns all of the accession_ids that match the query.
+#' @param aa_substitution returns all sequences with the amino acid mutation(s), negative selection by '-' prefix
+#' @param nucl_mutation returns all sequences with the nucleotide mutation(s), negative selection by '-' prefix
 #' @return Dataframe.
 internal_query <-
   function(credentials,
@@ -34,6 +36,8 @@ internal_query <-
            to_subm = NULL,
            virus_name = NULL,
            order_by = NULL,
+           aa_substitution = NULL,
+           nucl_mutation = NULL,
            order_asc = TRUE,
            start_index = 0,
            nrows = 50,
@@ -170,6 +174,35 @@ internal_query <-
           )
       }
 
+      # amino acid changes
+      if (!is.null(aa_substitution)) {
+        queue <-
+          append(
+            queue,
+            create_search_queue(
+              credentials,
+              credentials$aa_substitution_ceid,
+              aa_substitution,
+              'FilterChange'
+            )
+          )
+      }
+
+      # nucleotide changes
+      if (!is.null(nucl_mutation)) {
+        queue <-
+          append (
+            queue,
+            create_search_queue(
+              credentials,
+              credentials$nucl_mutation_ceid,
+              nucl_mutation,
+              'FilterChange'
+            )
+          )
+      }
+
+
       if (low_coverage_excl) {
         queue <-
           append(
@@ -342,6 +375,8 @@ internal_query <-
             from_subm = from_subm,
             to = to,
             to_subm = to_subm,
+            aa_substitution = aa_substitution,
+            nucl_mutation = nucl_mutation,
             nrows = j$totalRecords,
             # set load_all to false to break the recursion
             load_all = FALSE,

diff --git a/R/login.R b/R/login.R
@@ -200,6 +200,10 @@ login <- function(username, password, database="EpiCoV") {
     linage_ceid <- NULL
   }
 
+  # AA substitution/mutation: values separated by ", "
+  aa_substitution_ceid <- extract_search_ceid('mutation', customSearch_page_text)
+  # nucleotide substitution/mutation: values separated by ", "
+  nucl_mutation_ceid <- extract_search_ceid('nuc_mutation', customSearch_page_text)
   # Virus Name
   virus_name_ceid <- extract_search_ceid('covv_virus_name', customSearch_page_text)
 
@@ -250,6 +254,12 @@ login <- function(username, password, database="EpiCoV") {
     # collection date complete
     collection_date_complete_ceid <-
       extract_search_ceid('quality2', customSearch_page_text)
+    # AA substitution
+    aa_substitution_ceid <-
+      extract_search_ceid('mutation', customSearch_page_text)
+    # nucleotide mutation
+    nucl_mutation_ceid <-
+      extract_search_ceid('nuc_mutation', customSearch_page_text)
   }
 
   # send selection command
@@ -281,6 +291,8 @@ login <- function(username, password, database="EpiCoV") {
       text_ceid = text_ceid,
       location_ceid = location_ceid,
       search_cid = search_cid,
+      aa_substitution_ceid = aa_substitution_ceid,
+      nucl_mutation_ceid = nucl_mutation_ceid,
       linage_ceid = linage_ceid,
       virus_name_ceid = virus_name_ceid,
       from_ceid = from_ceid,

diff --git a/R/query.R b/R/query.R
@@ -22,6 +22,8 @@
 #' @param collection_date_complete include only entries with a complete collection date in the results.
 #' @param total returns the total number of sequences matching the query.
 #' @param fast returns all of the accession_ids that match the query.
+#' @param aa_substitution returns all sequences with the amino acid mutation(s), negative selection by '-' prefix
+#' @param nucl_mutation returns all sequences with the nucleotide mutation(s), negative selection by '-' prefix
 #' @return data.frame
 query <-
   function(credentials,
@@ -35,6 +37,8 @@ query <-
            to_subm = NULL,
            virus_name = NULL,
            order_by = NULL,
+           aa_substitution = NULL,
+           nucl_mutation = NULL,
            order_asc = TRUE,
            start_index = 0,
            nrows = 50,
@@ -69,7 +73,9 @@ query <-
           low_coverage_excl = low_coverage_excl,
           complete = complete,
           high_coverage = high_coverage,
-          collection_date_complete = collection_date_complete
+          collection_date_complete = collection_date_complete,
+          aa_substitution = aa_substitution,
+          nucl_mutation = nucl_mutation
         ))
       }
       return(results)
@@ -93,6 +99,8 @@ query <-
           complete = complete,
           high_coverage = high_coverage,
           collection_date_complete = collection_date_complete,
+          aa_substitution = aa_substitution,
+          nucl_mutation = nucl_mutation,
           total = total,
           fast = fast
         )
@@ -117,7 +125,9 @@ query <-
           low_coverage_excl = low_coverage_excl,
           complete = complete,
           high_coverage = high_coverage,
-          collection_date_complete = collection_date_complete
+          collection_date_complete = collection_date_complete,
+          aa_substitution = aa_substitution,
+          nucl_mutation = nucl_mutation
         )
       )
     }

diff --git a/README.md b/README.md
@@ -283,12 +283,27 @@ total
 
 ## Download
 
-To download the full data set you need a list of accession IDs (which can be obtained from `query` results).
+To download the full data set you need a list of accession IDs (which can be obtained from `query` results). This will also download the sequence data for each entry.
+
+``` r
+full_df_with_seq <- download(
+    credentials = credentials, 
+    list_of_accession_ids = list_of_accession_ids, 
+)
+full_df_with_seq$sequence
+```
+
+[1] "AGATCTGTTCTCTAAACGAACTTTAAAATCT...  
+[2] "AGATCTGTTCTCTAAACGAACTTTAAAATCT...  
+[3] "AGATCTGTTCTCTAAACGAACTTTAAAATCT...  
+...
+
+You can stop GISAIDR from loading the sequence data into memory by setting `get_sequence = FALSE`. Note: the sequence data will still be downloaded.
 
 ``` r
 df <- query(credentials = credentials)
 list_of_accession_ids <- df$accession_id
-full_df <- download(credentials = credentials, list_of_accession_ids = list_of_accession_ids)
+full_df <- download(credentials = credentials, list_of_accession_ids = list_of_accession_ids, get_sequence=FALSE)
 colnames(full_df)
 ```
 
@@ -302,24 +317,6 @@ colnames(full_df)
 
 Note: a maximum of 5000 results can be downloaded at a time.
 
-### Get sequence data
-
-Use the `get_sequence` argument to download the sequences with the full data.
-
-``` r
-full_df_with_seq <- download(
-    credentials = credentials, 
-    list_of_accession_ids = list_of_accession_ids, 
-    get_sequence=TRUE
-)
-full_df_with_seq$sequence
-```
-
-[1] "AGATCTGTTCTCTAAACGAACTTTAAAATCT...  
-[2] "AGATCTGTTCTCTAAACGAACTTTAAAATCT...  
-[3] "AGATCTGTTCTCTAAACGAACTTTAAAATCT...  
-...
-
 ### Export to fasta file
 
 Use the `export_fasta` function to write sequence data to a file in fasta format. The sequence names will be `country@pango_lineage@accession_id@date`, with the date in decimal format (requires the [lubridate](https://cran.r-project.org/web/packages/lubridate/index.html) package). The default is to only export sequences for which a decimal date could be set. To prevent this, use the argument `export_dated_only = FALSE`.
@@ -417,8 +414,9 @@ quality_ceid <- extract_search_ceid("quality'", customSearch_page_text)
 ```
 
 1.  Add the extracted `ceid` to the list of `credentials` e.g. `complete_ceid = complete_ceid`
-2.  Add the new argument and default value to the `query()` function in `query.R` e.g. `complete = FALSE`.
-3.  Create and append a search queue to the main queue if the `complete` argument is used. Create the command using the `create_search_queue()` function. Use the `complete_ceid` for the `ceid` and the checkbox value (identified in step 4) for the `cvalue` e.g.
+2.  Add the new argument and default value to **both** query functions: `query()` in `query.R` and `internal_query()` in `internal_query.R`, e.g. `complete = FALSE`.
+3.  Pass the new argument through in the recursive call inside the `if (load_all && j$totalRecords > nrows)` block, so that every paginated request made by `load_all` keeps using the argument.
+4.  Create and append a search queue to the main queue if the `complete` argument is used. Create the command using the `create_search_queue()` function. Use the `complete_ceid` for the `ceid` and the checkbox value (identified in step 4) for the `cvalue` e.g.
 
 ``` r
 if (complete) {

diff --git a/tests/testthat/test-query.R b/tests/testthat/test-query.R
@@ -145,6 +145,29 @@ test_that("order_by works", {
   expect_true(df$submission_date[1] == "2020-01-10")
 })
 
+test_that("aa_substitution works", {
+  df <- query(credentials = credentials,
+      aa_substitution = 'Spike_E484Q, Spike_H69del, -N_P13L',  # ", "-separated; '-' prefix = negative selection
+      to_subm =  '2023-02-22',  # presumably bounds the submission date so the expected values stay stable -- TODO confirm
+      load_all = TRUE,  # presumably loads every page so nrow(df) is the full match count -- TODO confirm
+      order_by='submission_date')
+  expect_true(is.data.frame(df))
+  expect_equal(df$submission_date[1], "2021-01-25")
+  expect_equal(nrow(df),576)
+  ## NOTE(review): to test accuracy, a set of 4 rarely co-existing mutations could be used to verify: Spike_H69del, Spike_A222V, Spike_G476S, -N_P13L
+})
+
+test_that("nucl_mutation works", {  # checks that query() filters by nucleotide mutation(s)
+  df <- query(credentials = credentials, 
+    nucl_mutation = '-T23599G, -C10029T, -C14408T, -A23403G, T22679C, G28881A, A24424T',  # '-' prefix = negative selection
+    to_subm = '2023-02-22',  # presumably bounds the submission date so the expected values stay stable -- TODO confirm
+    load_all = TRUE,  # presumably loads every page so nrow(df) is the full match count -- TODO confirm
+    order_by='submission_date')
+  expect_true(is.data.frame(df))
+  expect_equal(df$submission_date[1],"2021-12-29")
+  expect_equal(nrow(df),55)
+})
+
 test_that("text search works", {
   accession_ids = c("EPI_ISL_17398411", "EPI_ISL_17199001", "EPI_ISL_17409201", "EPI_ISL_17243716")
   df <- query(credentials = credentials, text = paste(accession_ids, collapse = "\n"))